diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..ef06b6f7fca0dc5cafbb4c9d8082510479d54a3d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,73 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+head_extractor/assets/001.jpg filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/_ext.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/.ninja_deps filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/active_rotated_filter_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/bezier_align_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/border_align_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/box_iou_quadri_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/carafe_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/chamfer_distance_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/convex_iou.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/correlation_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/cudabind.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/diff_iou_rotated_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/furthest_point_sample_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/fused_spconv_ops_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/group_points_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/knn_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/min_area_polygons.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/nms_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/nms_quadri_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/points_in_polygons_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/prroi_pool_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/psamask_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/riroi_align_rotated_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/rotated_feature_align_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/sparse_indice.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/sparse_maxpool.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/sparse_pool_ops_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/sparse_reordering.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/spconv_ops_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/stack_ball_query_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/stack_group_points_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/deform_conv.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/modulated_deform_conv.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/build/temp.linux-x86_64-cpython-311/mmcv/ops/csrc/pytorch/pybind.o filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/docs/en/_static/community/3.png filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/docs/en/_static/flow_raw_images.png filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/docs/en/_static/flow_warp.png filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/docs/en/_static/flow_warp_diff.png filter=lfs diff=lfs merge=lfs -text
+head_extractor/mmcv-2.1.0/docs/en/_static/progress.gif filter=lfs diff=lfs merge=lfs -text
diff --git a/head_extractor/assets/001.jpg b/head_extractor/assets/001.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e6a6db3ddbb4fccaa4f53b29e93a1c57b195f445
--- /dev/null
+++ b/head_extractor/assets/001.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33562c08290fdd1576ebfe8da41bae3b8f7e21b7e0971ba0568d1fa259e1f409
+size 112582
diff --git a/head_extractor/assets/001_head-black-bg.webp b/head_extractor/assets/001_head-black-bg.webp
new file mode 100644
index 0000000000000000000000000000000000000000..674399cf24059f65fd763ff219033f8bdc5be2a8
Binary files /dev/null and b/head_extractor/assets/001_head-black-bg.webp differ
diff --git a/head_extractor/assets/001_head-default.webp b/head_extractor/assets/001_head-default.webp
new file mode 100644
index 0000000000000000000000000000000000000000..a0799a9b3cf8eac122569cbedff61125dba44cf9
Binary files /dev/null and b/head_extractor/assets/001_head-default.webp differ
diff --git a/head_extractor/assets/001_head-pad2square-false.webp b/head_extractor/assets/001_head-pad2square-false.webp
new file mode 100644
index 0000000000000000000000000000000000000000..3b7f5e7bf61b83467b7512ec49966b0bd985cdb3
Binary files /dev/null and b/head_extractor/assets/001_head-pad2square-false.webp differ
diff --git a/head_extractor/build/lib/head_extractor/__init__.py b/head_extractor/build/lib/head_extractor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2988acd2b20e4bd119a8ebcb0e8798069b953f0
--- /dev/null
+++ b/head_extractor/build/lib/head_extractor/__init__.py
@@ -0,0 +1,6 @@
+from .processor import ProcessorPipeline, TaskType
+
+__version__ = "0.1.0"
+
+# Allow callers to import directly: from head_extractor import ProcessorPipeline
+__all__ = ['ProcessorPipeline', 'TaskType']
\ No newline at end of file
diff --git a/head_extractor/build/lib/head_extractor/models/__init__.py b/head_extractor/build/lib/head_extractor/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/head_extractor/build/lib/head_extractor/models/depth_anything_large_mask2former_16xb1_160k_human_parsing_fashion_1024x1024.py b/head_extractor/build/lib/head_extractor/models/depth_anything_large_mask2former_16xb1_160k_human_parsing_fashion_1024x1024.py
new file mode 100644
index 0000000000000000000000000000000000000000..a42645384835b5a843ec3acbcc16a6d497953d0b
--- /dev/null
+++ b/head_extractor/build/lib/head_extractor/models/depth_anything_large_mask2former_16xb1_160k_human_parsing_fashion_1024x1024.py
@@ -0,0 +1,573 @@
+auto_scale_lr = dict(base_batch_size=16, enable=False)
+backbone_embed_multi = dict(decay_mult=0.0, lr_mult=0.1)
+backbone_norm_multi = dict(decay_mult=0.0, lr_mult=0.1)
+crop_size = (
+    896,
+    896,
+)
+custom_keys = dict({
+    'backbone.dinov2':
+    dict(decay_mult=1.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.0.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.1.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.10.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.11.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.12.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.13.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.14.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.15.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.16.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.17.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.18.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.19.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
+    'backbone.dinov2.blocks.2.norm':
+    dict(decay_mult=0.0, lr_mult=0.1),
'backbone.dinov2.blocks.20.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.21.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.22.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.23.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.3.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.4.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.5.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.6.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.7.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.8.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.9.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'level_embed': + dict(decay_mult=0.0, lr_mult=1.0), + 'pos_embed': + dict(decay_mult=0.0, lr_mult=0.1), + 'query_embed': + dict(decay_mult=0.0, lr_mult=1.0), + 'query_feat': + dict(decay_mult=0.0, lr_mult=1.0) +}) +data_preprocessor = dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 896, + 896, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor') +data_root = '/mnt/data_ssd/limaopeng/limaopeng/segmentation/dataset/deep_fashion_10k' +dataset_type = 'HumanParsingDataset' +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=2000, + max_keep_ckpts=50, + save_best='mIoU', + type='CheckpointHook'), + logger=dict(interval=50, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +embed_multi = dict(decay_mult=0.0, lr_mult=1.0) +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +find_unused_parameters = True +img_ratios = [ + 0.5, + 0.75, + 1.0, + 1.25, + 1.5, + 1.75, +] +launcher = 'none' +load_from = '/mnt/data_ssd/limaopeng/limaopeng/segmentation/mmsegmentation/work_dirs/depth_anything_large_mask2former_16xb1_160k_human_parsing_896x896/best_mIoU_iter_110000.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + backbone=dict( + freeze=False, + # load_from='./checkpoints/depth_anything_vitl14.pth', + type='DINOv2', + version='large'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=( + 896, + 896, + ), + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + enforce_decoder_input_project=False, + feat_channels=1024, + in_channels=[ + 1024, + 1024, + 1024, + 1024, + ], + loss_boundary=dict(loss_weight=5.0, type='BoundaryLoss'), + loss_cls=dict( + class_weight=[ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.1, + ], + loss_weight=2.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=False), + loss_dice=dict( + activate=True, + eps=1.0, + loss_weight=5.0, + naive_dice=True, + reduction='mean', + type='mmdet.DiceLoss', + 
use_sigmoid=True), + loss_mask=dict( + loss_weight=5.0, + reduction='mean', + type='mmdet.CrossEntropyLoss', + use_sigmoid=True), + num_classes=43, + num_queries=200, + num_transformer_feat_level=3, + out_channels=1024, + pixel_decoder=dict( + act_cfg=dict(type='ReLU'), + encoder=dict( + init_cfg=None, + layer_cfg=dict( + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + embed_dims=1024, + feedforward_channels=4096, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + batch_first=True, + dropout=0.0, + embed_dims=1024, + im2col_step=64, + init_cfg=None, + norm_cfg=None, + num_heads=32, + num_levels=3, + num_points=4)), + num_layers=6), + init_cfg=None, + norm_cfg=dict(num_groups=32, type='GN'), + num_outs=3, + positional_encoding=dict(normalize=True, num_feats=512), + type='mmdet.MSDeformAttnPixelDecoder'), + positional_encoding=dict(normalize=True, num_feats=512), + train_cfg=dict( + assigner=dict( + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + use_sigmoid=True, + weight=5.0), + dict( + eps=1.0, + pred_act=True, + type='mmdet.DiceCost', + weight=5.0), + ], + type='mmdet.HungarianAssigner'), + importance_sample_ratio=0.75, + num_points=12544, + oversample_ratio=3.0, + sampler=dict(type='mmdet.MaskPseudoSampler')), + transformer_decoder=dict( + init_cfg=None, + layer_cfg=dict( + cross_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=1024, + num_heads=32, + proj_drop=0.0), + ffn_cfg=dict( + act_cfg=dict(inplace=True, type='ReLU'), + add_identity=True, + dropout_layer=None, + embed_dims=1024, + feedforward_channels=4096, + ffn_drop=0.0, + num_fcs=2), + self_attn_cfg=dict( + attn_drop=0.0, + batch_first=True, + dropout_layer=None, + embed_dims=1024, + num_heads=32, + proj_drop=0.0)), + num_layers=9, + return_intermediate=True), + type='Mask2FormerHead'), + neck=dict( + embed_dim=1024, rescales=[ + 4, + 2, + 1, + 0.5, + ], type='Feature2Pyramid'), + test_cfg=dict(crop_size=( + 896, + 896, + ), mode='slide', stride=( + 426, + 426, + )), + train_cfg=dict(), + type='EncoderDecoder') +num_classes = 43 +optim_wrapper = dict( + clip_grad=dict(max_norm=0.01, norm_type=2), + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + eps=1e-08, + lr=3e-05, + type='AdamW', + weight_decay=0.05), + paramwise_cfg=dict( + custom_keys=dict({ + 'backbone.dinov2': + dict(decay_mult=1.0, lr_mult=0.1), + 'backbone.dinov2.blocks.0.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.1.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.10.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.11.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.12.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.13.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.14.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.15.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.16.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.17.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.18.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.19.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.2.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.20.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.21.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.22.norm': 
+ dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.23.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.3.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.4.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.5.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.6.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.7.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.8.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.blocks.9.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'backbone.dinov2.norm': + dict(decay_mult=0.0, lr_mult=0.1), + 'level_embed': + dict(decay_mult=0.0, lr_mult=1.0), + 'pos_embed': + dict(decay_mult=0.0, lr_mult=0.1), + 'query_embed': + dict(decay_mult=0.0, lr_mult=1.0), + 'query_feat': + dict(decay_mult=0.0, lr_mult=1.0) + }), + norm_decay_mult=0.0), + type='OptimWrapper') +optimizer = dict( + betas=( + 0.9, + 0.999, + ), eps=1e-08, lr=3e-05, type='AdamW', weight_decay=0.05) +param_scheduler = [ + dict( + begin=0, by_epoch=False, end=1500, start_factor=1e-06, + type='LinearLR'), + dict( + begin=1500, + by_epoch=False, + end=300000, + eta_min=0.0, + power=0.9, + type='PolyLR'), +] +resume = False +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + data_prefix=dict( + img_path='val20250512/images', seg_map_path='val20250512/labels'), + data_root= + '/mnt/data_ssd/limaopeng/limaopeng/segmentation/dataset/deep_fashion_10k', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(keep_ratio=False, scale=( + 896, + 896, + ), type='Resize'), + dict(reduce_zero_label=False, type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type='HumanParsingDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(keep_ratio=False, scale=( + 896, + 896, + ), type='Resize'), + dict(reduce_zero_label=False, type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict( + max_iters=300000, type='IterBasedTrainLoop', val_interval=2000) +train_dataloader = dict( + batch_size=3, + dataset=dict( + data_prefix=dict( + img_path='train20250512/images', + seg_map_path='train20250512/labels'), + data_root= + '/mnt/data_ssd/limaopeng/limaopeng/segmentation/dataset/deep_fashion_10k', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + keep_ratio=True, + ratio_range=( + 0.2, + 2.0, + ), + scale=( + 896, + 896, + ), + type='RandomResize'), + dict( + cat_max_ratio=0.75, crop_size=( + 896, + 896, + ), type='RandomCrop'), + dict(keep_ratio=True, scale=( + 896, + 896, + ), type='Resize'), + dict(degree=45, prob=0.5, seg_pad_val=0, type='RandomRotate'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type='HumanParsingDataset'), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + keep_ratio=True, + ratio_range=( + 0.2, + 2.0, + ), + scale=( + 896, + 896, + ), + type='RandomResize'), + dict(cat_max_ratio=0.75, crop_size=( + 896, + 896, + ), type='RandomCrop'), + dict(keep_ratio=True, scale=( + 896, + 896, + ), type='Resize'), + dict(degree=45, prob=0.5, seg_pad_val=0, type='RandomRotate'), + dict(type='PhotoMetricDistortion'), + 
+    dict(type='PackSegInputs'),
+]
+tta_model = dict(type='SegTTAModel')
+tta_pipeline = [
+    dict(backend_args=None, type='LoadImageFromFile'),
+    dict(
+        transforms=[
+            [
+                dict(keep_ratio=True, scale_factor=0.5, type='Resize'),
+                dict(keep_ratio=True, scale_factor=0.75, type='Resize'),
+                dict(keep_ratio=True, scale_factor=1.0, type='Resize'),
+                dict(keep_ratio=True, scale_factor=1.25, type='Resize'),
+                dict(keep_ratio=True, scale_factor=1.5, type='Resize'),
+                dict(keep_ratio=True, scale_factor=1.75, type='Resize'),
+            ],
+            [
+                dict(direction='horizontal', prob=0.0, type='RandomFlip'),
+                dict(direction='horizontal', prob=1.0, type='RandomFlip'),
+            ],
+            [
+                dict(type='LoadAnnotations'),
+            ],
+            [
+                dict(type='PackSegInputs'),
+            ],
+        ],
+        type='TestTimeAug'),
+]
+val_cfg = dict(type='ValLoop')
+val_dataloader = dict(
+    batch_size=1,
+    dataset=dict(
+        data_prefix=dict(
+            img_path='val20250512/images', seg_map_path='val20250512/labels'),
+        data_root=
+        '/mnt/data_ssd/limaopeng/limaopeng/segmentation/dataset/deep_fashion_10k',
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(keep_ratio=False, scale=(
+                896,
+                896,
+            ), type='Resize'),
+            dict(reduce_zero_label=False, type='LoadAnnotations'),
+            dict(type='PackSegInputs'),
+        ],
+        type='HumanParsingDataset'),
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(shuffle=False, type='DefaultSampler'))
+val_evaluator = dict(
+    iou_metrics=[
+        'mIoU',
+    ], type='IoUMetric')
+vis_backends = [
+    dict(type='LocalVisBackend'),
+]
+visualizer = dict(
+    name='visualizer',
+    type='SegLocalVisualizer',
+    vis_backends=[
+        dict(type='LocalVisBackend'),
+    ])
+work_dir = './work_dirs/depth_anything_large_mask2former_16xb1_160k_human_pasing_fasion_1024x1024_boundary_20250521'
diff --git a/head_extractor/build/lib/head_extractor/processor.py b/head_extractor/build/lib/head_extractor/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..868ae0e69b9ec01f118660ffcd50b40d95be8dd7
--- /dev/null
+++ b/head_extractor/build/lib/head_extractor/processor.py
@@ -0,0 +1,585 @@
+import os
+import numpy as np
+from mmseg.apis import inference_model, init_model
+from PIL import Image
+import cv2
+from enum import Enum
+import importlib.resources
+
+
+'''
+Labels:
+0: 'background' 1: 'top' 2: 'outer' 3: 'skirt'
+4: 'dress' 5: 'pants' 6: 'leggings' 7: 'headwear'
+8: 'eyeglass' 9: 'neckwear' 10: 'belt' 11: 'footwear'
+12: 'bag' 13: 'hair' 14: 'face' 15: 'skin'
+16: 'ring' 17: 'wrist_wearing' 18: 'socks' 19: 'gloves'
+20: 'necklace' 21: 'rompers' 22: 'earrings' 23: 'tie'
+24: Left_Foot
+25: Left_Hand
+26: Left_Lower_Arm
+27: Left_Lower_Leg
+28: Left_Upper_Arm
+29: Left_Upper_Leg
+30: Right_Foot
+31: Right_Hand
+32: Right_Lower_Arm
+33: Right_Lower_Leg
+34: Right_Upper_Arm
+35: Right_Upper_Leg
+36: Torso
+'''
+
+class PersonSeg:
+    def __init__(self, config_path, model_path, device='cuda'):
+        # init model
+        self.model = init_model(config_path, model_path, device=device)
+
+    def process(self, image):
+        result = inference_model(self.model, image)
+        pred_seg = result.pred_sem_seg.data.cpu().numpy()[0]
+        return pred_seg
+
+class TaskType(Enum):
+    face = "face"
+    head = "head"
+    head_plus_shoulders = "head_plus_shoulders"
+
+    # clothing-related tasks
+    top_cloth = "top_cloth"
+    bottom_cloth = "bottom_cloth"
+    full_clothes = "full_clothes"
+
+    # full-body task
+    full_character = "full_character"
+
+class ProcessorPipeline:
+    """
+    Extracts a mask of the requested content from a single image.
+    """
+    def __init__(self, seg_pipe: PersonSeg):
+        self.seg_pipe = seg_pipe
+
+    @classmethod
+    def load(cls, device: str = 'cuda') -> "ProcessorPipeline":
+        """
+        Initialize the pipeline from the model and config bundled with the
+        package; no external paths are required.
+        """
+        # Use importlib.resources to safely resolve files inside the package
+        with importlib.resources.path('head_extractor.models', 'depth_anything_large_mask2former_16xb1_160k_human_parsing_fashion_1024x1024.py') as config_path:
+            with importlib.resources.path('head_extractor.models', 'ckpt.pth') as model_path:
+                seg_pipe = PersonSeg(str(config_path), str(model_path), device=device)
+
+        return cls(seg_pipe)
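+
+    # A minimal usage sketch (added for illustration; not part of the original
+    # file). It assumes the checkpoint ships inside the wheel as
+    # head_extractor/models/ckpt.pth, exactly as `load()` above expects:
+    #
+    #     from head_extractor import ProcessorPipeline, TaskType
+    #     pipe = ProcessorPipeline.load(device='cuda')
+    #     image_np, mask_u8 = pipe.process(Image.open('001.jpg'), TaskType.head)
+    #     # image_np: resized RGB array; mask_u8: uint8 mask with values in [0, 255]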
-> "ProcessorPipeline": + """ + 从包内加载模型和配置来初始化 Pipeline。 + 不再需要外部路径。 + """ + # 使用 importlib.resources 安全地获取包内文件的路径 + with importlib.resources.path('head_extractor.models', 'depth_anything_large_mask2former_16xb1_160k_human_parsing_fashion_1024x1024.py') as config_path: + with importlib.resources.path('head_extractor.models', 'ckpt.pth') as model_path: + seg_pipe = PersonSeg(str(config_path), str(model_path), device=device) + + return cls(seg_pipe) + + def process( + self, + image: Image.Image, + task_type: TaskType, + long_edge: int = 1024 + ) -> tuple[np.ndarray, np.ndarray]: + """ + 从图像中提取mask,内部流程优化为返回NumPy数组。 + + Args: + image: 输入图像 + task_type: 任务类型 ('head' or 'face') + long_edge (int): 用于缩放图像的长边尺寸,值越小速度越快。 + + Returns: + (处理后的图像 NumPy 数组, 生成的mask NumPy 数组) + """ + # 1. 预处理图像:统一转换为numpy array (RGB) + if isinstance(image, Image.Image): + image_np = np.array(image.convert("RGB")) + else: # 假设是numpy array + image_np = image + + if len(image_np.shape) == 2: + image_np = cv2.cvtColor(image_np, cv2.COLOR_GRAY2RGB) + elif image_np.shape[2] == 4: + image_np = cv2.cvtColor(image_np, cv2.COLOR_RGBA2RGB) + + processed_image_np = self.resize_long_edge(image_np, long_edge=long_edge) + ori_h, ori_w = processed_image_np.shape[:2] + + # 2. 运行分割 + pred_mask_map = self.seg_pipe.process(processed_image_np) + + if task_type == TaskType.head_plus_shoulders: + # 2.1 先做“头部”基础mask + head_labels = [7, 8, 13, 14] # headwear, eyeglass, hair, face + head_mask = np.isin(pred_mask_map, head_labels).astype(np.float32) + head_mask = cv2.resize(head_mask, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST) + + # 2.2 计算头部bbox并向下和左右扩展一段 + rows = np.any(head_mask > 0, axis=1) + cols = np.any(head_mask > 0, axis=0) + if np.any(rows) and np.any(cols): + rmin, rmax = np.where(rows)[0][[0, -1]] + cmin, cmax = np.where(cols)[0][[0, -1]] + h_box = max(1, rmax - rmin) + w_box = max(1, cmax - cmin) + + down_ratio = 0.1 # 向下扩展比例(相对头bbox高) + side_ratio = 0.6 # 左右扩展比例(相对头bbox宽) + + r2max = min(ori_h, rmax + int(h_box * down_ratio)) + c2min = max(0, cmin - int(w_box * side_ratio)) + c2max = min(ori_w, cmax + int(w_box * side_ratio)) + + rect_mask = np.zeros((ori_h, ori_w), dtype=np.float32) + rect_mask[rmin:r2max, c2min:c2max] = 1.0 + + # 2.3 在扩展矩形内,仅保留“人物相关像素”(过滤掉背景) + person_labels = list(range(1, 37)) # 1..36 都是人物部件 + person_mask = np.isin(pred_mask_map, person_labels).astype(np.float32) + person_mask = cv2.resize(person_mask, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST) + + initial_mask = np.clip(head_mask + (person_mask * rect_mask), 0, 1) + else: + initial_mask = head_mask + else: + # 其它任务保持原逻辑 + labels_map = self._get_labels_for_task(task_type) + primary_labels = labels_map['primary'] + initial_mask = np.isin(pred_mask_map, primary_labels).astype(np.float32) + initial_mask = cv2.resize(initial_mask, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST) + + # 3. 后处理(不同任务的形态学策略) + final_mask_np = self._apply_task_specific_mask_processing(initial_mask, task_type, ori_h, ori_w) + + # 4. 
+
+    def _get_labels_for_task(self, task_type: TaskType) -> dict:
+        """Return the label mapping for the given task type."""
+        labels_map = {
+            TaskType.face: { 'primary': [8, 14] },  # eyeglass, face
+            TaskType.head: { 'primary': [7, 8, 13, 14] },  # headwear, eyeglass, hair, face
+            TaskType.top_cloth: { 'primary': [1, 2] },  # top, outer
+            TaskType.bottom_cloth: { 'primary': [3, 4, 5, 6] },  # skirt, dress, pants, leggings
+            TaskType.full_clothes: { 'primary': [1, 2, 3, 4, 5, 6] },  # all clothes
+            TaskType.full_character: { 'primary': list(range(1, 37)) },  # all person-related parts
+        }
+        return labels_map.get(task_type, {'primary': []})
+
+    def _apply_task_specific_mask_processing(self, mask: np.ndarray, task_type: TaskType, ori_h: int, ori_w: int) -> np.ndarray:
+        """Apply task-specific post-processing to the mask."""
+        if task_type == TaskType.face:
+            # Face task: simple dilation
+            expand_kernel = 5
+            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (expand_kernel, expand_kernel))
+            mask = cv2.dilate((mask > 0.5).astype(np.float32), kernel)
+
+        elif task_type == TaskType.head:
+            # Head task: erode first, then dilate
+            kernel = np.ones((7, 7), dtype=np.uint8)
+            mask = cv2.erode(mask, kernel, iterations=1)
+
+            expand_kernel = 11
+            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (expand_kernel, expand_kernel))
+            mask = cv2.dilate((mask > 0.5).astype(np.float32), kernel)
+
+        elif task_type == TaskType.head_plus_shoulders:
+            # Compared to head, bias the dilation downwards and sideways (kernel height > width)
+            # Slight erosion first, to avoid ragged boundaries
+            erode_k = 5
+            kernel = np.ones((erode_k, erode_k), dtype=np.uint8)
+            mask = cv2.erode(mask, kernel, iterations=1)
+
+            max_side = max(ori_h, ori_w)
+            h_kernel = max(15, int(max_side * 0.05))  # taller
+            w_kernel = max(11, int(max_side * 0.03))  # slightly narrower
+            # Force odd kernel sizes
+            h_kernel = h_kernel // 2 * 2 + 1
+            w_kernel = w_kernel // 2 * 2 + 1
+
+            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (w_kernel, h_kernel))
+            mask = cv2.dilate((mask > 0.5).astype(np.float32), kernel)
+
+        if task_type in [TaskType.top_cloth, TaskType.bottom_cloth, TaskType.full_clothes, TaskType.full_character]:
+            # Clothing-related tasks: dilation plus blur
+            expand_ratio = 0.01
+            max_side = max(ori_h, ori_w)
+            blur_kernel = 1
+            expand_kernel = int(max_side * expand_ratio) // 2 * 2 + 1
+
+            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (expand_kernel, expand_kernel))
+            expanded = cv2.dilate((mask > 0.5).astype(np.uint8), kernel)
+
+            blurred = cv2.GaussianBlur(
+                expanded.astype(np.float32),
+                (blur_kernel, blur_kernel),
+                sigmaX=0,
+            )
+            mask = np.clip(blurred / (blurred.max() + 1e-6), 0, 1)
+
+        return mask
+
+    @staticmethod
+    def resize_long_edge(image_np: np.ndarray, long_edge=1024) -> np.ndarray:
+        """Proportionally resize the image so its long edge matches the given size (using OpenCV)."""
+        original_height, original_width = image_np.shape[:2]
+
+        max_dimension = max(original_width, original_height)
+        if max_dimension <= long_edge:
+            return image_np
+
+        ratio = long_edge / max_dimension
+        new_width = int(original_width * ratio)
+        new_height = int(original_height * ratio)
+
+        # cv2.INTER_AREA works well for downscaling and is fast
+        return cv2.resize(image_np, (new_width, new_height), interpolation=cv2.INTER_AREA)
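+
+    # Note on the kernel arithmetic above (added commentary): `k // 2 * 2 + 1`
+    # snaps a size to an odd value without going above k + 1, e.g. 14 -> 15
+    # and 15 -> 15. For a 1024-px long edge the head_plus_shoulders dilation
+    # therefore uses roughly a 31x51 (w x h) kernel
+    # (int(1024 * 0.03) = 30 -> 31, int(1024 * 0.05) = 51 -> 51), which is
+    # what biases the mask growth vertically.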
+
+    @staticmethod
+    def _pad_to_square_np(image_np: np.ndarray, background_value: tuple) -> np.ndarray:
+        """Pad a NumPy image to a square."""
+        height, width = image_np.shape[:2]
+        if width == height:
+            return image_np
+
+        max_dim = max(width, height)
+
+        # Determine the background shape from the channel count
+        channels = image_np.shape[2] if len(image_np.shape) > 2 else 1
+
+        # Create a background canvas of the right size
+        padded_image = np.full((max_dim, max_dim, channels), background_value, dtype=image_np.dtype)
+
+        paste_x = (max_dim - width) // 2
+        paste_y = (max_dim - height) // 2
+
+        padded_image[paste_y:paste_y+height, paste_x:paste_x+width] = image_np
+        return padded_image
+
+    @staticmethod
+    def pad_to_square(image: Image.Image, background_color: tuple = (255, 255, 255)) -> Image.Image:
+        """
+        Pad an image to a square.
+
+        Args:
+            image: Input image.
+            background_color: Background color used for padding.
+
+        Returns:
+            The square-padded image.
+        """
+        width, height = image.size
+        if width == height:
+            return image
+
+        max_dim = max(width, height)
+        padded_image = Image.new(image.mode, (max_dim, max_dim), background_color)
+        paste_x = (max_dim - width) // 2
+        paste_y = (max_dim - height) // 2
+        padded_image.paste(image, (paste_x, paste_y))
+        return padded_image
+
+    def crop_image_by_mask(self, image: Image.Image, mask: Image.Image, padding: int = 20) -> Image.Image:
+        """
+        Crop an image by its mask, keeping only the region the mask covers.
+
+        Args:
+            image: Original image.
+            mask: Binary mask image.
+            padding: Extra margin in pixels around the cropped region.
+
+        Returns:
+            The cropped image.
+        """
+        # Convert to numpy arrays
+        mask_np = np.array(mask)
+        image_np = np.array(image)
+
+        # Find the bounding box of the non-zero mask pixels
+        rows = np.any(mask_np > 0, axis=1)
+        cols = np.any(mask_np > 0, axis=0)
+
+        if not np.any(rows) or not np.any(cols):
+            # Empty mask: return the original image
+            return image
+
+        # Bounding box coordinates
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+
+        # Add padding, clamped to the image bounds
+        h, w = image_np.shape[:2]
+        rmin = max(0, rmin - padding)
+        rmax = min(h, rmax + padding + 1)
+        cmin = max(0, cmin - padding)
+        cmax = min(w, cmax + padding + 1)
+
+        # Crop the image
+        cropped_image = image_np[rmin:rmax, cmin:cmax]
+
+        return Image.fromarray(cropped_image)
+
+    def _crop_image_and_mask_np(self, image_np: np.ndarray, mask_np: np.ndarray, padding: int = 20) -> tuple[np.ndarray, np.ndarray]:
+        """Crop a NumPy image and its mask together, based on the mask."""
+        rows = np.any(mask_np > 0, axis=1)
+        cols = np.any(mask_np > 0, axis=0)
+
+        if not np.any(rows) or not np.any(cols):
+            return image_np, mask_np
+
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+
+        h, w = image_np.shape[:2]
+        rmin = max(0, rmin - padding)
+        rmax = min(h, rmax + padding + 1)
+        cmin = max(0, cmin - padding)
+        cmax = min(w, cmax + padding + 1)
+
+        cropped_image_np = image_np[rmin:rmax, cmin:cmax]
+        cropped_mask_np = mask_np[rmin:rmax, cmin:cmax]
+
+        return cropped_image_np, cropped_mask_np
+
+    def crop_image_and_mask(self, image: Image.Image, mask: Image.Image, padding: int = 20) -> tuple[Image.Image, Image.Image]:
+        """Crop the image and the mask together, avoiding a duplicate bounding-box computation."""
+        mask_np = np.array(mask)
+        image_np = np.array(image)
+
+        rows = np.any(mask_np > 0, axis=1)
+        cols = np.any(mask_np > 0, axis=0)
+
+        if not np.any(rows) or not np.any(cols):
+            return image, mask
+
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+
+        h, w = image_np.shape[:2]
+        rmin = max(0, rmin - padding)
+        rmax = min(h, rmax + padding + 1)
+        cmin = max(0, cmin - padding)
+        cmax = min(w, cmax + padding + 1)
+
+        cropped_image_np = image_np[rmin:rmax, cmin:cmax]
+        cropped_mask_np = mask_np[rmin:rmax, cmin:cmax]
+
+        return Image.fromarray(cropped_image_np), Image.fromarray(cropped_mask_np)
+
+    def _apply_mask_to_image_np(self, image_np: np.ndarray, mask_np: np.ndarray, background_color: tuple) -> np.ndarray:
+        """Apply a NumPy mask to a NumPy image."""
+        mask_normalized = mask_np.astype(np.float32) / 255.0
+        background = np.full_like(image_np, background_color)
+        result = image_np * mask_normalized[..., np.newaxis] + background * (1 - mask_normalized[..., np.newaxis])
+        return result.astype(np.uint8)
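+
+    # The compositing above is a plain per-pixel lerp (added commentary with
+    # made-up numbers): result = image * m + background * (1 - m), where
+    # m = mask / 255. A pixel with mask 255 keeps its original color, mask 0
+    # becomes the background, and e.g. mask 128 on pixel value 200 over a
+    # white background gives 200 * 0.502 + 255 * 0.498 ≈ 227, a feathered edge.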
+
+    def apply_mask_to_image(self, image: Image.Image, mask: Image.Image, background_color: tuple = (255, 255, 255)) -> Image.Image:
+        """
+        Apply a mask to an image, setting the area outside the mask to the
+        given background color.
+
+        Args:
+            image: Original image.
+            mask: Binary mask image.
+            background_color: Background color (R, G, B).
+
+        Returns:
+            The masked image.
+        """
+        # Convert to numpy arrays
+        image_np = np.array(image)
+        mask_np = np.array(mask)
+
+        # Normalize the mask to the 0-1 range
+        mask_normalized = mask_np.astype(np.float32) / 255.0
+
+        # Create the background
+        background = np.full_like(image_np, background_color)
+
+        # Apply the mask: masked area keeps the original image, the rest becomes background
+        result = image_np * mask_normalized[..., np.newaxis] + background * (1 - mask_normalized[..., np.newaxis])
+
+        return Image.fromarray(result.astype(np.uint8))
+
+    def extract_head(
+        self,
+        image: Image.Image,
+        crop_padding: int = 10,
+        background_color: tuple = (255, 255, 255),
+        pad2square: bool = True,
+        output_mode: str = 'RGB',
+        long_edge: int = 1024,
+        include_shoulders: bool = False
+    ) -> Image.Image:
+        """
+        Extract the head region from the input image and return a cropped,
+        square-padded image.
+
+        Args:
+            image: Input image (PIL.Image or np.ndarray).
+            crop_padding: Extra margin around the crop bounding box.
+            background_color: Background fill color when `output_mode` is 'RGB'.
+            pad2square (bool): Whether to pad the final result to a square. Defaults to True.
+            output_mode (str): Output image mode, 'RGB' (solid background) or 'RGBA' (transparent background). Defaults to 'RGB'.
+            long_edge (int): Long-edge size the image is resized to before the model; smaller is faster but may reduce accuracy. Defaults to 1024.
+            include_shoulders (bool): Whether to also keep the shoulder region. Defaults to False.
+
+        Returns:
+            The processed head image (PIL.Image).
+        """
+        # 1. Task type is selectable via include_shoulders
+        task = TaskType.head_plus_shoulders if include_shoulders else TaskType.head
+        processed_image_np, head_mask_np = self.process(
+            image=image,
+            task_type=task,
+            long_edge=long_edge
+        )
+
+        # 2. NumPy-based crop
+        face_cropped_np, mask_cropped_np = self._crop_image_and_mask_np(
+            processed_image_np, head_mask_np, padding=crop_padding
+        )
+
+        # 3. Apply the mask according to the output mode (RGB/RGBA)
+        output_mode = output_mode.upper()
+        if output_mode == 'RGBA':
+            # Build an RGBA image with a transparency channel
+            # First make sure the image has 3 channels
+            if face_cropped_np.shape[2] == 4:
+                face_cropped_np = face_cropped_np[:,:,:3]
+            # Create the RGBA image
+            result_image_np = cv2.cvtColor(face_cropped_np, cv2.COLOR_RGB2RGBA)
+            result_image_np[:, :, 3] = mask_cropped_np  # set the alpha channel
+
+        elif output_mode == 'RGB':
+            # NumPy-based mask application
+            result_image_np = self._apply_mask_to_image_np(
+                face_cropped_np,
+                mask_cropped_np,
+                background_color=background_color
+            )
+        else:
+            raise ValueError("output_mode must be 'RGB' or 'RGBA'")
+
+        # 4. Optional: NumPy-based square padding
+        if pad2square:
+            if output_mode == 'RGBA':
+                pad_color = (255, 255, 255, 0)  # transparent background
+            else:  # RGB
+                pad_color = background_color
+
+            final_image_np = self._pad_to_square_np(
+                result_image_np,
+                background_value=pad_color
+            )
+        else:
+            final_image_np = result_image_np
+
+        # 5. Convert to a PIL Image only at the very end
+        if output_mode == 'RGBA':
+            return Image.fromarray(final_image_np, 'RGBA')
+        else:
+            return Image.fromarray(final_image_np, 'RGB')
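+
+    # Quick sketch of the two public entry points (added for illustration;
+    # paths are placeholders):
+    #
+    #     pipe = ProcessorPipeline.load()
+    #     head = pipe.extract_head(Image.open('001.jpg'),
+    #                              include_shoulders=True, output_mode='RGBA')
+    #     clothes = pipe.extract(TaskType.full_clothes, Image.open('001.jpg'))
+    #     head.save('001_head.png'); clothes.save('001_clothes.png')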
+
+    def extract(
+        self,
+        task_type: TaskType,
+        image: Image.Image,
+        crop_padding: int = 10,
+        background_color: tuple = (255, 255, 255),
+        pad2square: bool = True,
+        output_mode: str = 'RGB',
+        long_edge: int = 1024
+    ) -> Image.Image:
+        """
+        Extract the region for the given task type from the input image and
+        return a cropped, square-padded image.
+
+        Args:
+            task_type: Which content to extract (see TaskType).
+            image: Input image (PIL.Image or np.ndarray).
+            crop_padding: Extra margin around the crop bounding box.
+            background_color: Background fill color when `output_mode` is 'RGB'.
+            pad2square (bool): Whether to pad the final result to a square. Defaults to True.
+            output_mode (str): Output image mode, 'RGB' (solid background) or 'RGBA' (transparent background). Defaults to 'RGB'.
+            long_edge (int): Long-edge size the image is resized to before the model; smaller is faster but may reduce accuracy. Defaults to 1024.
+
+        Returns:
+            The processed image (PIL.Image).
+        """
+        # 1. Run segmentation and get NumPy results directly
+        processed_image_np, head_mask_np = self.process(
+            image=image,
+            task_type=task_type,
+            long_edge=long_edge
+        )
+
+        # 2. NumPy-based crop
+        face_cropped_np, mask_cropped_np = self._crop_image_and_mask_np(
+            processed_image_np, head_mask_np, padding=crop_padding
+        )
+
+        # 3. Apply the mask according to the output mode (RGB/RGBA)
+        output_mode = output_mode.upper()
+        if output_mode == 'RGBA':
+            # Build an RGBA image with a transparency channel
+            # First make sure the image has 3 channels
+            if face_cropped_np.shape[2] == 4:
+                face_cropped_np = face_cropped_np[:,:,:3]
+            # Create the RGBA image
+            result_image_np = cv2.cvtColor(face_cropped_np, cv2.COLOR_RGB2RGBA)
+            result_image_np[:, :, 3] = mask_cropped_np  # set the alpha channel
+
+        elif output_mode == 'RGB':
+            # NumPy-based mask application
+            result_image_np = self._apply_mask_to_image_np(
+                face_cropped_np,
+                mask_cropped_np,
+                background_color=background_color
+            )
+        else:
+            raise ValueError("output_mode must be 'RGB' or 'RGBA'")
+
+        # 4. Optional: NumPy-based square padding
+        if pad2square:
+            if output_mode == 'RGBA':
+                pad_color = (255, 255, 255, 0)  # transparent background
+            else:  # RGB
+                pad_color = background_color
+
+            final_image_np = self._pad_to_square_np(
+                result_image_np,
+                background_value=pad_color
+            )
+        else:
+            final_image_np = result_image_np
+
+        # 5. Convert to a PIL Image only at the very end
+        if output_mode == 'RGBA':
+            return Image.fromarray(final_image_np, 'RGBA')
+        else:
+            return Image.fromarray(final_image_np, 'RGB')
+
+if __name__ == '__main__':
+    # Example of how to initialize and use the Pipeline
+    print("Initializing pipeline from package resources...")
+    pipeline = ProcessorPipeline.load()
+    print("Pipeline initialized.")
+
+    # Usage example (an image must be provided):
+
+    # Replace this with your own image path
+    image_path = "001.jpg"
+    if os.path.exists(image_path):
+        print(f"Processing image: {image_path}")
+        image = Image.open(image_path)
+
+        print("Extracting head...")
+        extracted_head = pipeline.extract_head(image)
+
+        # Save the final result
+        output_path = "output_head_extracted.png"
+        extracted_head.save(output_path)
+
+        print("\nDone!")
+        print(f"Saved the extracted head image to '{output_path}'")
+
+    else:
+        print(f"Sample image not found: {image_path}")
+
diff --git a/head_extractor/build/lib/mmdet/__init__.py b/head_extractor/build/lib/mmdet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ac884ac8b40c1543ed840dfcafe367fbe4bda62
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import mmengine
+from mmengine.utils import digit_version
+
+from .version import __version__, version_info
+
+mmcv_minimum_version = '2.0.0rc4'
+mmcv_maximum_version = '2.2.0'
+mmcv_version = digit_version(mmcv.__version__)
+
+mmengine_minimum_version = '0.7.1'
+mmengine_maximum_version = '1.0.0'
+mmengine_version = digit_version(mmengine.__version__)
+
+assert (mmcv_version >= digit_version(mmcv_minimum_version)
+        and mmcv_version < digit_version(mmcv_maximum_version)), \
+    f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+    f'Please install mmcv>={mmcv_minimum_version}, <{mmcv_maximum_version}.'
+
+assert (mmengine_version >= digit_version(mmengine_minimum_version)
+        and mmengine_version < digit_version(mmengine_maximum_version)), \
+    f'MMEngine=={mmengine.__version__} is used but incompatible. ' \
+    f'Please install mmengine>={mmengine_minimum_version}, ' \
+    f'<{mmengine_maximum_version}.'
+
+__all__ = ['__version__', 'version_info', 'digit_version']
diff --git a/head_extractor/build/lib/mmdet/apis/__init__.py b/head_extractor/build/lib/mmdet/apis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c89dc72914b11a73e91dc7e9404f41bf10b93c6c
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/apis/__init__.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .det_inferencer import DetInferencer +from .inference import (async_inference_detector, inference_detector, + inference_mot, init_detector, init_track_model) + +__all__ = [ + 'init_detector', 'async_inference_detector', 'inference_detector', + 'DetInferencer', 'inference_mot', 'init_track_model' +] diff --git a/head_extractor/build/lib/mmdet/apis/det_inferencer.py b/head_extractor/build/lib/mmdet/apis/det_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8532eb786558ca3807195781d8e380741cea00 --- /dev/null +++ b/head_extractor/build/lib/mmdet/apis/det_inferencer.py @@ -0,0 +1,652 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import warnings +from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union + +import mmcv +import mmengine +import numpy as np +import torch.nn as nn +from mmcv.transforms import LoadImageFromFile +from mmengine.dataset import Compose +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file) +from mmengine.infer.infer import BaseInferencer, ModelType +from mmengine.model.utils import revert_sync_batchnorm +from mmengine.registry import init_default_scope +from mmengine.runner.checkpoint import _load_checkpoint_to_model +from mmengine.visualization import Visualizer +from rich.progress import track + +from mmdet.evaluation import INSTANCE_OFFSET +from mmdet.registry import DATASETS +from mmdet.structures import DetDataSample +from mmdet.structures.mask import encode_mask_results, mask2bbox +from mmdet.utils import ConfigType +from ..evaluation import get_classes + +try: + from panopticapi.evaluation import VOID + from panopticapi.utils import id2rgb +except ImportError: + id2rgb = None + VOID = None + +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = List[DetDataSample] +ImgType = Union[np.ndarray, Sequence[np.ndarray]] + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +class DetInferencer(BaseInferencer): + """Object Detection Inferencer. + + Args: + model (str, optional): Path to the config file or the model name + defined in metafile. For example, it could be + "rtmdet-s" or 'rtmdet_s_8xb32-300e_coco' or + "configs/rtmdet/rtmdet_s_8xb32-300e_coco.py". + If model is not specified, user must provide the + `weights` saved by MMEngine which contains the config string. + Defaults to None. + weights (str, optional): Path to the checkpoint. If it is not specified + and model is a model name of metafile, the weights will be loaded + from metafile. Defaults to None. + device (str, optional): Device to run inference. If None, the available + device will be automatically used. Defaults to None. + scope (str, optional): The scope of the model. Defaults to mmdet. + palette (str): Color palette used for visualization. The order of + priority is palette -> config -> checkpoint. Defaults to 'none'. + show_progress (bool): Control whether to display the progress + bar during the inference process. Defaults to True. 
+ """ + + preprocess_kwargs: set = set() + forward_kwargs: set = set() + visualize_kwargs: set = { + 'return_vis', + 'show', + 'wait_time', + 'draw_pred', + 'pred_score_thr', + 'img_out_dir', + 'no_save_vis', + } + postprocess_kwargs: set = { + 'print_result', + 'pred_out_dir', + 'return_datasamples', + 'no_save_pred', + } + + def __init__(self, + model: Optional[Union[ModelType, str]] = None, + weights: Optional[str] = None, + device: Optional[str] = None, + scope: Optional[str] = 'mmdet', + palette: str = 'none', + show_progress: bool = True) -> None: + # A global counter tracking the number of images processed, for + # naming of the output images + self.num_visualized_imgs = 0 + self.num_predicted_imgs = 0 + self.palette = palette + init_default_scope(scope) + super().__init__( + model=model, weights=weights, device=device, scope=scope) + self.model = revert_sync_batchnorm(self.model) + self.show_progress = show_progress + + def _load_weights_to_model(self, model: nn.Module, + checkpoint: Optional[dict], + cfg: Optional[ConfigType]) -> None: + """Loading model weights and meta information from cfg and checkpoint. + + Args: + model (nn.Module): Model to load weights and meta information. + checkpoint (dict, optional): The loaded checkpoint. + cfg (Config or ConfigDict, optional): The loaded config. + """ + + if checkpoint is not None: + _load_checkpoint_to_model(model, checkpoint) + checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmdet 3.x, all keys should be lowercase + model.dataset_meta = { + k.lower(): v + for k, v in checkpoint_meta['dataset_meta'].items() + } + elif 'CLASSES' in checkpoint_meta: + # < mmdet 3.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'classes': classes} + else: + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + else: + warnings.warn('Checkpoint is not loaded, and the inference ' + 'result is calculated by the randomly initialized ' + 'model!') + warnings.warn('weights is None, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + + # Priority: args.palette -> config -> checkpoint + if self.palette != 'none': + model.dataset_meta['palette'] = self.palette + else: + test_dataset_cfg = copy.deepcopy(cfg.test_dataloader.dataset) + # lazy init. We only need the metainfo. + test_dataset_cfg['lazy_init'] = True + metainfo = DATASETS.build(test_dataset_cfg).metainfo + cfg_palette = metainfo.get('palette', None) + if cfg_palette is not None: + model.dataset_meta['palette'] = cfg_palette + else: + if 'palette' not in model.dataset_meta: + warnings.warn( + 'palette does not exist, random is used by default. ' + 'You can also set the palette to customize.') + model.dataset_meta['palette'] = 'random' + + def _init_pipeline(self, cfg: ConfigType) -> Compose: + """Initialize the test pipeline.""" + pipeline_cfg = cfg.test_dataloader.dataset.pipeline + + # For inference, the key of ``img_id`` is not used. 
+ if 'meta_keys' in pipeline_cfg[-1]: + pipeline_cfg[-1]['meta_keys'] = tuple( + meta_key for meta_key in pipeline_cfg[-1]['meta_keys'] + if meta_key != 'img_id') + + load_img_idx = self._get_transform_idx( + pipeline_cfg, ('LoadImageFromFile', LoadImageFromFile)) + if load_img_idx == -1: + raise ValueError( + 'LoadImageFromFile is not found in the test pipeline') + pipeline_cfg[load_img_idx]['type'] = 'mmdet.InferencerLoader' + return Compose(pipeline_cfg) + + def _get_transform_idx(self, pipeline_cfg: ConfigType, + name: Union[str, Tuple[str, type]]) -> int: + """Returns the index of the transform in a pipeline. + + If the transform is not found, returns -1. + """ + for i, transform in enumerate(pipeline_cfg): + if transform['type'] in name: + return i + return -1 + + def _init_visualizer(self, cfg: ConfigType) -> Optional[Visualizer]: + """Initialize visualizers. + + Args: + cfg (ConfigType): Config containing the visualizer information. + + Returns: + Visualizer or None: Visualizer initialized with config. + """ + visualizer = super()._init_visualizer(cfg) + visualizer.dataset_meta = self.model.dataset_meta + return visualizer + + def _inputs_to_list(self, inputs: InputsType) -> list: + """Preprocess the inputs to a list. + + Preprocess inputs to a list according to its type: + + - list or tuple: return inputs + - str: + - Directory path: return all files in the directory + - other cases: return a list containing the string. The string + could be a path to file, a url or other types of string according + to the task. + + Args: + inputs (InputsType): Inputs for the inferencer. + + Returns: + list: List of input for the :meth:`preprocess`. + """ + if isinstance(inputs, str): + backend = get_file_backend(inputs) + if hasattr(backend, 'isdir') and isdir(inputs): + # Backends like HttpsBackend do not implement `isdir`, so only + # those backends that implement `isdir` could accept the inputs + # as a directory + filename_list = list_dir_or_file( + inputs, list_dir=False, suffix=IMG_EXTENSIONS) + inputs = [ + join_path(inputs, filename) for filename in filename_list + ] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) + + def preprocess(self, inputs: InputsType, batch_size: int = 1, **kwargs): + """Process the inputs into a model-feedable format. + + Customize your preprocess by overriding this method. Preprocess should + return an iterable object, of which each item will be used as the + input of ``model.test_step``. + + ``BaseInferencer.preprocess`` will return an iterable chunked data, + which will be used in __call__ like this: + + .. code-block:: python + + def __call__(self, inputs, batch_size=1, **kwargs): + chunked_data = self.preprocess(inputs, batch_size, **kwargs) + for batch in chunked_data: + preds = self.forward(batch, **kwargs) + + Args: + inputs (InputsType): Inputs given by user. + batch_size (int): batch size. Defaults to 1. + + Yields: + Any: Data processed by the ``pipeline`` and ``collate_fn``. + """ + chunked_data = self._get_chunk_data(inputs, batch_size) + yield from map(self.collate_fn, chunked_data) + + def _get_chunk_data(self, inputs: Iterable, chunk_size: int): + """Get batch data from inputs. + + Args: + inputs (Iterable): An iterable dataset. + chunk_size (int): Equivalent to batch size. + + Yields: + list: batch data. 
+ """ + inputs_iter = iter(inputs) + while True: + try: + chunk_data = [] + for _ in range(chunk_size): + inputs_ = next(inputs_iter) + if isinstance(inputs_, dict): + if 'img' in inputs_: + ori_inputs_ = inputs_['img'] + else: + ori_inputs_ = inputs_['img_path'] + chunk_data.append( + (ori_inputs_, + self.pipeline(copy.deepcopy(inputs_)))) + else: + chunk_data.append((inputs_, self.pipeline(inputs_))) + yield chunk_data + except StopIteration: + if chunk_data: + yield chunk_data + break + + # TODO: Video and Webcam are currently not supported and + # may consume too much memory if your input folder has a lot of images. + # We will be optimized later. + def __call__( + self, + inputs: InputsType, + batch_size: int = 1, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + no_save_vis: bool = False, + draw_pred: bool = True, + pred_score_thr: float = 0.3, + return_datasamples: bool = False, + print_result: bool = False, + no_save_pred: bool = True, + out_dir: str = '', + # by open image task + texts: Optional[Union[str, list]] = None, + # by open panoptic task + stuff_texts: Optional[Union[str, list]] = None, + # by GLIP and Grounding DINO + custom_entities: bool = False, + # by Grounding DINO + tokens_positive: Optional[Union[int, list]] = None, + **kwargs) -> dict: + """Call the inferencer. + + Args: + inputs (InputsType): Inputs for the inferencer. + batch_size (int): Inference batch size. Defaults to 1. + show (bool): Whether to display the visualization results in a + popup window. Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + no_save_vis (bool): Whether to force not to save prediction + vis results. Defaults to False. + draw_pred (bool): Whether to draw predicted bounding boxes. + Defaults to True. + pred_score_thr (float): Minimum score of bboxes to draw. + Defaults to 0.3. + return_datasamples (bool): Whether to return results as + :obj:`DetDataSample`. Defaults to False. + print_result (bool): Whether to print the inference result w/o + visualization to the console. Defaults to False. + no_save_pred (bool): Whether to force not to save prediction + results. Defaults to True. + out_dir: Dir to save the inference results or + visualization. If left as empty, no file will be saved. + Defaults to ''. + texts (str | list[str]): Text prompts. Defaults to None. + stuff_texts (str | list[str]): Stuff text prompts of open + panoptic task. Defaults to None. + custom_entities (bool): Whether to use custom entities. + Defaults to False. Only used in GLIP and Grounding DINO. + **kwargs: Other keyword arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + Returns: + dict: Inference and visualization results. 
+ """ + ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) = self._dispatch_kwargs(**kwargs) + + ori_inputs = self._inputs_to_list(inputs) + + if texts is not None and isinstance(texts, str): + texts = [texts] * len(ori_inputs) + if stuff_texts is not None and isinstance(stuff_texts, str): + stuff_texts = [stuff_texts] * len(ori_inputs) + + # Currently only supports bs=1 + tokens_positive = [tokens_positive] * len(ori_inputs) + + if texts is not None: + assert len(texts) == len(ori_inputs) + for i in range(len(texts)): + if isinstance(ori_inputs[i], str): + ori_inputs[i] = { + 'text': texts[i], + 'img_path': ori_inputs[i], + 'custom_entities': custom_entities, + 'tokens_positive': tokens_positive[i] + } + else: + ori_inputs[i] = { + 'text': texts[i], + 'img': ori_inputs[i], + 'custom_entities': custom_entities, + 'tokens_positive': tokens_positive[i] + } + if stuff_texts is not None: + assert len(stuff_texts) == len(ori_inputs) + for i in range(len(stuff_texts)): + ori_inputs[i]['stuff_text'] = stuff_texts[i] + + inputs = self.preprocess( + ori_inputs, batch_size=batch_size, **preprocess_kwargs) + + results_dict = {'predictions': [], 'visualization': []} + for ori_imgs, data in (track(inputs, description='Inference') + if self.show_progress else inputs): + preds = self.forward(data, **forward_kwargs) + visualization = self.visualize( + ori_imgs, + preds, + return_vis=return_vis, + show=show, + wait_time=wait_time, + draw_pred=draw_pred, + pred_score_thr=pred_score_thr, + no_save_vis=no_save_vis, + img_out_dir=out_dir, + **visualize_kwargs) + results = self.postprocess( + preds, + visualization, + return_datasamples=return_datasamples, + print_result=print_result, + no_save_pred=no_save_pred, + pred_out_dir=out_dir, + **postprocess_kwargs) + results_dict['predictions'].extend(results['predictions']) + if results['visualization'] is not None: + results_dict['visualization'].extend(results['visualization']) + return results_dict + + def visualize(self, + inputs: InputsType, + preds: PredType, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + draw_pred: bool = True, + pred_score_thr: float = 0.3, + no_save_vis: bool = False, + img_out_dir: str = '', + **kwargs) -> Union[List[np.ndarray], None]: + """Visualize predictions. + + Args: + inputs (List[Union[str, np.ndarray]]): Inputs for the inferencer. + preds (List[:obj:`DetDataSample`]): Predictions of the model. + return_vis (bool): Whether to return the visualization result. + Defaults to False. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + draw_pred (bool): Whether to draw predicted bounding boxes. + Defaults to True. + pred_score_thr (float): Minimum score of bboxes to draw. + Defaults to 0.3. + no_save_vis (bool): Whether to force not to save prediction + vis results. Defaults to False. + img_out_dir (str): Output directory of visualization results. + If left as empty, no file will be saved. Defaults to ''. + + Returns: + List[np.ndarray] or None: Returns visualization results only if + applicable. 
+ """ + if no_save_vis is True: + img_out_dir = '' + + if not show and img_out_dir == '' and not return_vis: + return None + + if self.visualizer is None: + raise ValueError('Visualization needs the "visualizer" term' + 'defined in the config, but got None.') + + results = [] + + for single_input, pred in zip(inputs, preds): + if isinstance(single_input, str): + img_bytes = mmengine.fileio.get(single_input) + img = mmcv.imfrombytes(img_bytes) + img = img[:, :, ::-1] + img_name = osp.basename(single_input) + elif isinstance(single_input, np.ndarray): + img = single_input.copy() + img_num = str(self.num_visualized_imgs).zfill(8) + img_name = f'{img_num}.jpg' + else: + raise ValueError('Unsupported input type: ' + f'{type(single_input)}') + + out_file = osp.join(img_out_dir, 'vis', + img_name) if img_out_dir != '' else None + + self.visualizer.add_datasample( + img_name, + img, + pred, + show=show, + wait_time=wait_time, + draw_gt=False, + draw_pred=draw_pred, + pred_score_thr=pred_score_thr, + out_file=out_file, + ) + results.append(self.visualizer.get_image()) + self.num_visualized_imgs += 1 + + return results + + def postprocess( + self, + preds: PredType, + visualization: Optional[List[np.ndarray]] = None, + return_datasamples: bool = False, + print_result: bool = False, + no_save_pred: bool = False, + pred_out_dir: str = '', + **kwargs, + ) -> Dict: + """Process the predictions and visualization results from ``forward`` + and ``visualize``. + + This method should be responsible for the following tasks: + + 1. Convert datasamples into a json-serializable dict if needed. + 2. Pack the predictions and visualization results and return them. + 3. Dump or log the predictions. + + Args: + preds (List[:obj:`DetDataSample`]): Predictions of the model. + visualization (Optional[np.ndarray]): Visualized predictions. + return_datasamples (bool): Whether to use Datasample to store + inference results. If False, dict will be used. + print_result (bool): Whether to print the inference result w/o + visualization to the console. Defaults to False. + no_save_pred (bool): Whether to force not to save prediction + results. Defaults to False. + pred_out_dir: Dir to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Inference and visualization results with key ``predictions`` + and ``visualization``. + + - ``visualization`` (Any): Returned by :meth:`visualize`. + - ``predictions`` (dict or DataSample): Returned by + :meth:`forward` and processed in :meth:`postprocess`. + If ``return_datasamples=False``, it usually should be a + json-serializable dict containing only basic data elements such + as strings and numbers. + """ + if no_save_pred is True: + pred_out_dir = '' + + result_dict = {} + results = preds + if not return_datasamples: + results = [] + for pred in preds: + result = self.pred2dict(pred, pred_out_dir) + results.append(result) + elif pred_out_dir != '': + warnings.warn('Currently does not support saving datasample ' + 'when return_datasamples is set to True. ' + 'Prediction results are not saved!') + # Add img to the results after printing and dumping + result_dict['predictions'] = results + if print_result: + print(result_dict) + result_dict['visualization'] = visualization + return result_dict + + # TODO: The data format and fields saved in json need further discussion. + # Maybe should include model name, timestamp, filename, image info etc. 
+ def pred2dict(self, + data_sample: DetDataSample, + pred_out_dir: str = '') -> Dict: + """Extract elements necessary to represent a prediction into a + dictionary. + + It's better to contain only basic data elements such as strings and + numbers in order to guarantee it's json-serializable. + + Args: + data_sample (:obj:`DetDataSample`): Predictions of the model. + pred_out_dir: Dir to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Prediction results. + """ + is_save_pred = True + if pred_out_dir == '': + is_save_pred = False + + if is_save_pred and 'img_path' in data_sample: + img_path = osp.basename(data_sample.img_path) + img_path = osp.splitext(img_path)[0] + out_img_path = osp.join(pred_out_dir, 'preds', + img_path + '_panoptic_seg.png') + out_json_path = osp.join(pred_out_dir, 'preds', img_path + '.json') + elif is_save_pred: + out_img_path = osp.join( + pred_out_dir, 'preds', + f'{self.num_predicted_imgs}_panoptic_seg.png') + out_json_path = osp.join(pred_out_dir, 'preds', + f'{self.num_predicted_imgs}.json') + self.num_predicted_imgs += 1 + + result = {} + if 'pred_instances' in data_sample: + masks = data_sample.pred_instances.get('masks') + pred_instances = data_sample.pred_instances.numpy() + result = { + 'labels': pred_instances.labels.tolist(), + 'scores': pred_instances.scores.tolist() + } + if 'bboxes' in pred_instances: + result['bboxes'] = pred_instances.bboxes.tolist() + if masks is not None: + if 'bboxes' not in pred_instances or pred_instances.bboxes.sum( + ) == 0: + # Fake bbox, such as the SOLO. + bboxes = mask2bbox(masks.cpu()).numpy().tolist() + result['bboxes'] = bboxes + encode_masks = encode_mask_results(pred_instances.masks) + for encode_mask in encode_masks: + if isinstance(encode_mask['counts'], bytes): + encode_mask['counts'] = encode_mask['counts'].decode() + result['masks'] = encode_masks + + if 'pred_panoptic_seg' in data_sample: + if VOID is None: + raise RuntimeError( + 'panopticapi is not installed, please install it by: ' + 'pip install git+https://github.com/cocodataset/' + 'panopticapi.git.') + + pan = data_sample.pred_panoptic_seg.sem_seg.cpu().numpy()[0] + pan[pan % INSTANCE_OFFSET == len( + self.model.dataset_meta['classes'])] = VOID + pan = id2rgb(pan).astype(np.uint8) + + if is_save_pred: + mmcv.imwrite(pan[:, :, ::-1], out_img_path) + result['panoptic_seg_path'] = out_img_path + else: + result['panoptic_seg'] = pan + + if is_save_pred: + mmengine.dump(result, out_json_path) + + return result diff --git a/head_extractor/build/lib/mmdet/apis/inference.py b/head_extractor/build/lib/mmdet/apis/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..7e6f914ecabf4b9c110a4fd15310bc97d0197db9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/apis/inference.py @@ -0,0 +1,372 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
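Before the `apis/inference.py` module below, a quick usage sketch of the inferencer implemented above (it corresponds to mmdet 3.x's `DetInferencer`; the model alias and image path are placeholders for whatever metafile/checkpoint you actually use):

from mmdet.apis import DetInferencer

# 'rtmdet_tiny_8xb32-300e_coco' and 'demo.jpg' are placeholders.
inferencer = DetInferencer(model='rtmdet_tiny_8xb32-300e_coco')
out = inferencer('demo.jpg', out_dir='outputs/', no_save_pred=False)

# `pred2dict` above fixes the schema of each entry in out['predictions'].
pred = out['predictions'][0]
print(pred['labels'][:3])   # class indices
print(pred['scores'][:3])   # confidence scores
print(pred['bboxes'][:1])   # [x1, y1, x2, y2] boxes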
+import copy +import warnings +from pathlib import Path +from typing import Optional, Sequence, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.ops import RoIPool +from mmcv.transforms import Compose +from mmengine.config import Config +from mmengine.dataset import default_collate +from mmengine.model.utils import revert_sync_batchnorm +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint + +from mmdet.registry import DATASETS +from mmdet.utils import ConfigType +from ..evaluation import get_classes +from ..registry import MODELS +from ..structures import DetDataSample, SampleList +from ..utils import get_test_pipeline_cfg + + +def init_detector( + config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + palette: str = 'none', + device: str = 'cuda:0', + cfg_options: Optional[dict] = None, +) -> nn.Module: + """Initialize a detector from config file. + + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + palette (str): Color palette used for visualization. If palette + is stored in checkpoint, use checkpoint's palette first, otherwise + use externally passed palette. Currently, supports 'coco', 'voc', + 'citys' and 'random'. Defaults to none. + device (str): The device where the anchors will be put on. + Defaults to cuda:0. + cfg_options (dict, optional): Options to override some settings in + the used config. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + if cfg_options is not None: + config.merge_from_dict(cfg_options) + elif 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + + scope = config.get('default_scope', 'mmdet') + if scope is not None: + init_default_scope(config.get('default_scope', 'mmdet')) + + model = MODELS.build(config.model) + model = revert_sync_batchnorm(model) + if checkpoint is None: + warnings.simplefilter('once') + warnings.warn('checkpoint is None, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + else: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + # Weights converted from elsewhere may not have meta fields. + checkpoint_meta = checkpoint.get('meta', {}) + + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmdet 3.x, all keys should be lowercase + model.dataset_meta = { + k.lower(): v + for k, v in checkpoint_meta['dataset_meta'].items() + } + elif 'CLASSES' in checkpoint_meta: + # < mmdet 3.x + classes = checkpoint_meta['CLASSES'] + model.dataset_meta = {'classes': classes} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use COCO classes by default.') + model.dataset_meta = {'classes': get_classes('coco')} + + # Priority: args.palette -> config -> checkpoint + if palette != 'none': + model.dataset_meta['palette'] = palette + else: + test_dataset_cfg = copy.deepcopy(config.test_dataloader.dataset) + # lazy init. We only need the metainfo. 
+ test_dataset_cfg['lazy_init'] = True + metainfo = DATASETS.build(test_dataset_cfg).metainfo + cfg_palette = metainfo.get('palette', None) + if cfg_palette is not None: + model.dataset_meta['palette'] = cfg_palette + else: + if 'palette' not in model.dataset_meta: + warnings.warn( + 'palette does not exist, random is used by default. ' + 'You can also set the palette to customize.') + model.dataset_meta['palette'] = 'random' + + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]] + + +def inference_detector( + model: nn.Module, + imgs: ImagesType, + test_pipeline: Optional[Compose] = None, + text_prompt: Optional[str] = None, + custom_entities: bool = False, +) -> Union[DetDataSample, SampleList]: + """Inference image(s) with the detector. + + Args: + model (nn.Module): The loaded detector. + imgs (str, ndarray, Sequence[str/ndarray]): + Either image files or loaded images. + test_pipeline (:obj:`Compose`): Test pipeline. + + Returns: + :obj:`DetDataSample` or list[:obj:`DetDataSample`]: + If imgs is a list or tuple, the same length list type results + will be returned, otherwise return the detection results directly. + """ + + if isinstance(imgs, (list, tuple)): + is_batch = True + else: + imgs = [imgs] + is_batch = False + + cfg = model.cfg + + if test_pipeline is None: + cfg = cfg.copy() + test_pipeline = get_test_pipeline_cfg(cfg) + if isinstance(imgs[0], np.ndarray): + # Calling this method across libraries will result + # in module unregistered error if not prefixed with mmdet. + test_pipeline[0].type = 'mmdet.LoadImageFromNDArray' + + test_pipeline = Compose(test_pipeline) + + if model.data_preprocessor.device.type == 'cpu': + for m in model.modules(): + assert not isinstance( + m, RoIPool + ), 'CPU inference with RoIPool is not supported currently.' + + result_list = [] + for i, img in enumerate(imgs): + # prepare data + if isinstance(img, np.ndarray): + # TODO: remove img_id. + data_ = dict(img=img, img_id=0) + else: + # TODO: remove img_id. + data_ = dict(img_path=img, img_id=0) + + if text_prompt: + data_['text'] = text_prompt + data_['custom_entities'] = custom_entities + + # build the data pipeline + data_ = test_pipeline(data_) + + data_['inputs'] = [data_['inputs']] + data_['data_samples'] = [data_['data_samples']] + + # forward the model + with torch.no_grad(): + results = model.test_step(data_)[0] + + result_list.append(results) + + if not is_batch: + return result_list[0] + else: + return result_list + + +# TODO: Awaiting refactoring +async def async_inference_detector(model, imgs): + """Async inference image(s) with the detector. + + Args: + model (nn.Module): The loaded detector. + img (str | ndarray): Either image files or loaded images. + + Returns: + Awaitable detection results. 
+ """ + if not isinstance(imgs, (list, tuple)): + imgs = [imgs] + + cfg = model.cfg + + if isinstance(imgs[0], np.ndarray): + cfg = cfg.copy() + # set loading pipeline type + cfg.data.test.pipeline[0].type = 'LoadImageFromNDArray' + + # cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) + test_pipeline = Compose(cfg.data.test.pipeline) + + datas = [] + for img in imgs: + # prepare data + if isinstance(img, np.ndarray): + # directly add img + data = dict(img=img) + else: + # add information into dict + data = dict(img_info=dict(filename=img), img_prefix=None) + # build the data pipeline + data = test_pipeline(data) + datas.append(data) + + for m in model.modules(): + assert not isinstance( + m, + RoIPool), 'CPU inference with RoIPool is not supported currently.' + + # We don't restore `torch.is_grad_enabled()` value during concurrent + # inference since execution can overlap + torch.set_grad_enabled(False) + results = await model.aforward_test(data, rescale=True) + return results + + +def build_test_pipeline(cfg: ConfigType) -> ConfigType: + """Build test_pipeline for mot/vis demo. In mot/vis infer, original + test_pipeline should remove the "LoadImageFromFile" and + "LoadTrackAnnotations". + + Args: + cfg (ConfigDict): The loaded config. + Returns: + ConfigType: new test_pipeline + """ + # remove the "LoadImageFromFile" and "LoadTrackAnnotations" in pipeline + transform_broadcaster = cfg.test_dataloader.dataset.pipeline[0].copy() + for transform in transform_broadcaster['transforms']: + if transform['type'] == 'Resize': + transform_broadcaster['transforms'] = transform + pack_track_inputs = cfg.test_dataloader.dataset.pipeline[-1].copy() + test_pipeline = Compose([transform_broadcaster, pack_track_inputs]) + + return test_pipeline + + +def inference_mot(model: nn.Module, img: np.ndarray, frame_id: int, + video_len: int) -> SampleList: + """Inference image(s) with the mot model. + + Args: + model (nn.Module): The loaded mot model. + img (np.ndarray): Loaded image. + frame_id (int): frame id. + video_len (int): demo video length + Returns: + SampleList: The tracking data samples. + """ + cfg = model.cfg + data = dict( + img=[img.astype(np.float32)], + frame_id=[frame_id], + ori_shape=[img.shape[:2]], + img_id=[frame_id + 1], + ori_video_length=[video_len]) + + test_pipeline = build_test_pipeline(cfg) + data = test_pipeline(data) + + if not next(model.parameters()).is_cuda: + for m in model.modules(): + assert not isinstance( + m, RoIPool + ), 'CPU inference with RoIPool is not supported currently.' + + # forward the model + with torch.no_grad(): + data = default_collate([data]) + result = model.test_step(data)[0] + return result + + +def init_track_model(config: Union[str, Config], + checkpoint: Optional[str] = None, + detector: Optional[str] = None, + reid: Optional[str] = None, + device: str = 'cuda:0', + cfg_options: Optional[dict] = None) -> nn.Module: + """Initialize a model from config file. + + Args: + config (str or :obj:`mmengine.Config`): Config file path or the config + object. + checkpoint (Optional[str], optional): Checkpoint path. Defaults to + None. + detector (Optional[str], optional): Detector Checkpoint path, use in + some tracking algorithms like sort. Defaults to None. + reid (Optional[str], optional): Reid checkpoint path. use in + some tracking algorithms like sort. Defaults to None. + device (str, optional): The device that the model inferences on. + Defaults to `cuda:0`. 
+        cfg_options (Optional[dict], optional): Options to override some
+            settings in the used config. Defaults to None.
+
+    Returns:
+        nn.Module: The constructed model.
+    """
+    if isinstance(config, str):
+        config = Config.fromfile(config)
+    elif not isinstance(config, Config):
+        raise TypeError('config must be a filename or Config object, '
+                        f'but got {type(config)}')
+    if cfg_options is not None:
+        config.merge_from_dict(cfg_options)
+
+    model = MODELS.build(config.model)
+
+    if checkpoint is not None:
+        checkpoint = load_checkpoint(model, checkpoint, map_location='cpu')
+        # Weights converted from elsewhere may not have meta fields.
+        checkpoint_meta = checkpoint.get('meta', {})
+        # save the dataset_meta in the model for convenience
+        if 'dataset_meta' in checkpoint_meta:
+            if 'CLASSES' in checkpoint_meta['dataset_meta']:
+                value = checkpoint_meta['dataset_meta'].pop('CLASSES')
+                checkpoint_meta['dataset_meta']['classes'] = value
+            model.dataset_meta = checkpoint_meta['dataset_meta']
+
+    if detector is not None:
+        assert not (checkpoint and detector), \
+            'Error: checkpoint and detector checkpoint cannot both exist'
+        load_checkpoint(model.detector, detector, map_location='cpu')
+
+    if reid is not None:
+        assert not (checkpoint and reid), \
+            'Error: checkpoint and reid checkpoint cannot both exist'
+        load_checkpoint(model.reid, reid, map_location='cpu')
+
+    # Some methods don't load checkpoints, or the loaded checkpoints don't
+    # contain 'dataset_meta'.
+    # VIS needs dataset_meta, while MOT does not.
+    if not hasattr(model, 'dataset_meta'):
+        warnings.warn('dataset_meta or class names are missing, '
+                      'using None by default.')
+        model.dataset_meta = {'classes': None}
+
+    model.cfg = config  # save the config in the model for convenience
+    model.to(device)
+    model.eval()
+    return model
diff --git a/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_detection.py b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..45041f6d236be95eb7592035d31f155c61bfcb25
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_detection.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
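The `coco_detection.py` dataset config follows below. First, a minimal sketch of the inference API just defined above (the config and checkpoint paths are hypothetical placeholders):

from mmdet.apis import inference_detector, init_detector

model = init_detector(
    'configs/faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py',  # placeholder
    'checkpoints/faster_rcnn_r50_fpn_1x_coco.pth',         # placeholder
    device='cuda:0')
result = inference_detector(model, 'demo.jpg')  # single image -> DetDataSample
print(result.pred_instances.bboxes.shape)       # (num_dets, 4)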
+from mmcv.transforms import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs, + RandomFlip, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. 
+# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=dataset_type, +# data_root=data_root, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoMetric, +# metric='bbox', +# format_only=True, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_detection/test') diff --git a/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_instance.py b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..b9575432e26b7e861c4dfcf535773b7a1990eeab --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco import CocoDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_metric import CocoMetric + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) 
+test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=2, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=CocoDataset, +# data_root=data_root, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoMetric, +# metric=['bbox', 'segm'], +# format_only=True, +# ann_file=data_root + 'annotations/image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_instance/test') diff --git a/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_instance_semantic.py b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_instance_semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..7cf5b2cfab8a98a6c97e23a8df663e8f1e90b355 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_instance_semantic.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco import CocoDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_metric import CocoMetric + +# dataset settings +dataset_type = 'CocoDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True, with_seg=True), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + # If you don't have a gt annotation, delete the pipeline + dict(type=LoadAnnotations, with_bbox=True, with_mask=True, with_seg=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/', seg='stuffthingmaps/train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + 
pipeline=train_pipeline, + backend_args=backend_args)) + +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoDataset, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) + +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator diff --git a/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_panoptic.py b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_panoptic.py new file mode 100644 index 0000000000000000000000000000000000000000..29d655ff619c74c5976d5f06c0c623a0d3459997 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/datasets/coco_panoptic.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.dataset.sampler import DefaultSampler + +from mmdet.datasets.coco_panoptic import CocoPanopticDataset +from mmdet.datasets.samplers.batch_sampler import AspectRatioBatchSampler +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadPanopticAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation.metrics.coco_panoptic_metric import CocoPanopticMetric + +# dataset settings +dataset_type = 'CocoPanopticDataset' +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadPanopticAnnotations, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadPanopticAnnotations, backend_args=backend_args), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=CocoPanopticDataset, + data_root=data_root, + ann_file='annotations/panoptic_train2017.json', + data_prefix=dict( + img='train2017/', seg='annotations/panoptic_train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=CocoPanopticDataset, + data_root=data_root, + 
ann_file='annotations/panoptic_val2017.json', + data_prefix=dict(img='val2017/', seg='annotations/panoptic_val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoPanopticMetric, + ann_file=data_root + 'annotations/panoptic_val2017.json', + seg_prefix=data_root + 'annotations/panoptic_val2017/', + backend_args=backend_args) +test_evaluator = val_evaluator + +# inference on test dataset and +# format the output results for submission. +# test_dataloader = dict( +# batch_size=1, +# num_workers=1, +# persistent_workers=True, +# drop_last=False, +# sampler=dict(type=DefaultSampler, shuffle=False), +# dataset=dict( +# type=CocoPanopticDataset, +# data_root=data_root, +# ann_file='annotations/panoptic_image_info_test-dev2017.json', +# data_prefix=dict(img='test2017/'), +# test_mode=True, +# pipeline=test_pipeline)) +# test_evaluator = dict( +# type=CocoPanopticMetric, +# format_only=True, +# ann_file=data_root + 'annotations/panoptic_image_info_test-dev2017.json', +# outfile_prefix='./work_dirs/coco_panoptic/test') diff --git a/head_extractor/build/lib/mmdet/configs/_base_/datasets/mot_challenge.py b/head_extractor/build/lib/mmdet/configs/_base_/datasets/mot_challenge.py new file mode 100644 index 0000000000000000000000000000000000000000..a71520a84e52a812f83862920040d96746829285 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/datasets/mot_challenge.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms import (LoadImageFromFile, RandomResize, + TransformBroadcaster) + +from mmdet.datasets import MOTChallengeDataset +from mmdet.datasets.samplers import TrackImgSampler +from mmdet.datasets.transforms import (LoadTrackAnnotations, PackTrackInputs, + PhotoMetricDistortion, RandomCrop, + RandomFlip, Resize, + UniformRefFrameSample) +from mmdet.evaluation import MOTChallengeMetric + +# dataset settings +dataset_type = MOTChallengeDataset +data_root = 'data/MOT17/' +img_scale = (1088, 1088) + +backend_args = None +# data pipeline +train_pipeline = [ + dict( + type=UniformRefFrameSample, + num_ref_imgs=1, + frame_range=10, + filter_key_img=True), + dict( + type=TransformBroadcaster, + share_random_params=True, + transforms=[ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadTrackAnnotations), + dict( + type=RandomResize, + scale=img_scale, + ratio_range=(0.8, 1.2), + keep_ratio=True, + clip_object_border=False), + dict(type=PhotoMetricDistortion) + ]), + dict( + type=TransformBroadcaster, + # different cropped positions for different frames + share_random_params=False, + transforms=[ + dict(type=RandomCrop, crop_size=img_scale, bbox_clip_border=False) + ]), + dict( + type=TransformBroadcaster, + share_random_params=True, + transforms=[ + dict(type=RandomFlip, prob=0.5), + ]), + dict(type=PackTrackInputs) +] + +test_pipeline = [ + dict( + type=TransformBroadcaster, + transforms=[ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=img_scale, keep_ratio=True), + dict(type=LoadTrackAnnotations) + ]), + dict(type=PackTrackInputs) +] + +# dataloader +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=TrackImgSampler), # image-based sampling + dataset=dict( + type=dataset_type, + data_root=data_root, + visibility_thr=-1, + ann_file='annotations/half-train_cocoformat.json', + data_prefix=dict(img_path='train'), + 
metainfo=dict(classes=('pedestrian', )), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + # Now we support two ways to test, image_based and video_based + # if you want to use video_based sampling, you can use as follows + # sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + sampler=dict(type=TrackImgSampler), # image-based sampling + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/half-val_cocoformat.json', + data_prefix=dict(img_path='train'), + test_mode=True, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# evaluator +val_evaluator = dict( + type=MOTChallengeMetric, metric=['HOTA', 'CLEAR', 'Identity']) +test_evaluator = val_evaluator diff --git a/head_extractor/build/lib/mmdet/configs/_base_/default_runtime.py b/head_extractor/build/lib/mmdet/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..ff96dbf29f3c90266a268d3831878b0a437d98b2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/default_runtime.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.runner import LogProcessor +from mmengine.visualization import LocalVisBackend + +from mmdet.engine.hooks import DetVisualizationHook +from mmdet.visualization import DetLocalVisualizer + +default_scope = None + +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, interval=1), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=DetVisualizationHook)) + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +vis_backends = [dict(type=LocalVisBackend)] +visualizer = dict( + type=DetLocalVisualizer, vis_backends=vis_backends, name='visualizer') +log_processor = dict(type=LogProcessor, window_size=50, by_epoch=True) + +log_level = 'INFO' +load_from = None +resume = False diff --git a/head_extractor/build/lib/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/head_extractor/build/lib/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b9132ac40330c67e03ebc608f9527c678c72210e --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
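The cascade R-CNN model config follows below. First, a note on the runtime above: a downstream pure-Python config can inherit `default_runtime.py` via `read_base` and override individual fields. A sketch, assuming the relative import layout of this config tree:

from mmengine.config import read_base
from mmengine.hooks import LoggerHook

with read_base():
    from .._base_.default_runtime import *  # noqa: F401,F403

# e.g. log every 10 iterations instead of the inherited 50
default_hooks.update(logger=dict(type=LoggerHook, interval=10))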
+from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.cascade_rcnn import CascadeRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import SmoothL1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.cascade_roi_head import CascadeRoIHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=CascadeRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type=CascadeRoIHead, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + 
reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type=FCNMaskHead, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/head_extractor/build/lib/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py b/head_extractor/build/lib/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8e6654f381f4993a57b81e6ed1f86c0558b56616 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
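The box-only cascade variant follows below; its bbox_head stages mirror the ones above. Note how the three cascade stages tighten `target_stds` from [0.1, 0.1, 0.2, 0.2] to [0.033, 0.033, 0.067, 0.067]: later stages refine proposals that already sit close to the ground truth, so the raw deltas shrink and a smaller std keeps the regression targets well scaled. An illustrative re-implementation of the encoding math (not mmdet's code):

import numpy as np

def encode(proposal, gt, means=(0., 0., 0., 0.), stds=(0.1, 0.1, 0.2, 0.2)):
    # DeltaXYWHBBoxCoder-style encoding of a gt box against a proposal.
    px, py = (proposal[0] + proposal[2]) / 2, (proposal[1] + proposal[3]) / 2
    pw, ph = proposal[2] - proposal[0], proposal[3] - proposal[1]
    gx, gy = (gt[0] + gt[2]) / 2, (gt[1] + gt[3]) / 2
    gw, gh = gt[2] - gt[0], gt[3] - gt[1]
    deltas = np.array([(gx - px) / pw, (gy - py) / ph,
                       np.log(gw / pw), np.log(gh / ph)])
    return (deltas - np.array(means)) / np.array(stds)

print(encode([0, 0, 10, 10], [1, 1, 12, 12]))  # ~[1.5, 1.5, 0.48, 0.48]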
+from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.cascade_rcnn import CascadeRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import SmoothL1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.cascade_roi_head import CascadeRoIHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=CascadeRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type=CascadeRoIHead, + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, 
loss_weight=1.0), + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)) + ]), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100))) diff --git a/head_extractor/build/lib/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py b/head_extractor/build/lib/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..7e18de2224d5b4d2cd16a930daf3a9b360455b36 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/models/faster_rcnn_r50_fpn.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
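The same base-config pattern repeats for Faster R-CNN below. For reference, instantiating one of these model configs programmatically might look like the following sketch (path hypothetical; assumes mmdet 3.x so the class-based `type=` fields resolve directly):

from mmengine.config import Config
from mmdet.registry import MODELS

cfg = Config.fromfile('mmdet/configs/_base_/models/cascade_rcnn_r50_fpn.py')
model = MODELS.build(cfg.model)  # type= fields are classes, built directly
print(type(model).__name__)      # CascadeRCNN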
+from mmcv.ops import RoIAlign, nms +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.faster_rcnn import FasterRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=FasterRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + 
rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100) + # soft-nms is also supported for rcnn testing + # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05) + )) diff --git a/head_extractor/build/lib/mmdet/configs/_base_/models/mask_rcnn_r50_caffe_c4.py b/head_extractor/build/lib/mmdet/configs/_base_/models/mask_rcnn_r50_caffe_c4.py new file mode 100644 index 0000000000000000000000000000000000000000..3054818375f708826ee41901650a11bbbe3afca9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/models/mask_rcnn_r50_caffe_c4.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.ops import RoIAlign, nms +from mmengine.model.weight_init import PretrainedInit +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.mask_rcnn import MaskRCNN +from mmdet.models.layers import ResLayer +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +norm_cfg = dict(type=BatchNorm2d, requires_grad=False) +# model settings +model = dict( + type=MaskRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=3, + strides=(1, 2, 2), + dilations=(1, 1, 1), + out_indices=(2, ), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + type=RPNHead, + in_channels=1024, + feat_channels=1024, + anchor_generator=dict( + type=AnchorGenerator, + scales=[2, 4, 8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[16]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + shared_head=dict( + type=ResLayer, + depth=50, + stage=3, + stride=2, + dilation=1, + style='caffe', + norm_cfg=norm_cfg, + norm_eval=True), + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + out_channels=1024, + featmap_strides=[16]), + bbox_head=dict( + type=BBoxHead, + with_avg_pool=True, + roi_feat_size=7, + in_channels=2048, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + 
target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + mask_roi_extractor=None, + mask_head=dict( + type=FCNMaskHead, + num_convs=0, + in_channels=2048, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=12000, + max_per_img=2000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=14, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=6000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/head_extractor/build/lib/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py b/head_extractor/build/lib/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..c8a0b031da51c8147c8ed5c5f29502bd0c4bbe7f --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
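+
+# Mask R-CNN R50-FPN base model: the Faster R-CNN detection branch below
+# plus a parallel FCN mask head on 14x14 RoI-aligned features.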
+from mmcv.ops import RoIAlign, nms +from mmengine.model.weight_init import PretrainedInit +from torch.nn import BatchNorm2d + +from mmdet.models.backbones.resnet import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rpn_head import RPNHead +from mmdet.models.detectors.mask_rcnn import MaskRCNN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.necks.fpn import FPN +from mmdet.models.roi_heads.bbox_heads.convfc_bbox_head import \ + Shared2FCBBoxHead +from mmdet.models.roi_heads.mask_heads.fcn_mask_head import FCNMaskHead +from mmdet.models.roi_heads.roi_extractors.single_level_roi_extractor import \ + SingleRoIExtractor +from mmdet.models.roi_heads.standard_roi_head import StandardRoIHead +from mmdet.models.task_modules.assigners.max_iou_assigner import MaxIoUAssigner +from mmdet.models.task_modules.coders.delta_xywh_bbox_coder import \ + DeltaXYWHBBoxCoder +from mmdet.models.task_modules.prior_generators.anchor_generator import \ + AnchorGenerator +from mmdet.models.task_modules.samplers.random_sampler import RandomSampler + +# model settings +model = dict( + type=MaskRCNN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_mask=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type=RPNHead, + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + roi_head=dict( + type=StandardRoIHead, + bbox_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type=Shared2FCBBoxHead, + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type=CrossEntropyLoss, use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + mask_roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type=FCNMaskHead, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type=CrossEntropyLoss, use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + 
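+                # 256 anchors are sampled per image at a 1:1 pos/neg ratio;
+                # unlike the R-CNN stage below, GT boxes are not injected
+                # as proposals for RPN training.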
add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=2000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type=RandomSampler, + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_pre=1000, + max_per_img=1000, + nms=dict(type=nms, iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/head_extractor/build/lib/mmdet/configs/_base_/models/retinanet_r50_fpn.py b/head_extractor/build/lib/mmdet/configs/_base_/models/retinanet_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..33e5cc4f1fe69f66801abdfedc578293e96cd23d --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/models/retinanet_r50_fpn.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.ops import nms +from torch.nn import BatchNorm2d + +from mmdet.models import (FPN, DetDataPreprocessor, FocalLoss, L1Loss, ResNet, + RetinaHead, RetinaNet) +from mmdet.models.task_modules import (AnchorGenerator, DeltaXYWHBBoxCoder, + MaxIoUAssigner, PseudoSampler) + +# model settings +model = dict( + type=RetinaNet, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + neck=dict( + type=FPN, + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_input', + num_outs=5), + bbox_head=dict( + type=RetinaHead, + num_classes=80, + in_channels=256, + stacked_convs=4, + feat_channels=256, + anchor_generator=dict( + type=AnchorGenerator, + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + bbox_coder=dict( + type=DeltaXYWHBBoxCoder, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.5, + neg_iou_thr=0.4, + min_pos_iou=0, + ignore_iof_thr=-1), + sampler=dict( + type=PseudoSampler), # Focal loss should use PseudoSampler + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type=nms, iou_threshold=0.5), + max_per_img=100)) diff --git a/head_extractor/build/lib/mmdet/configs/_base_/schedules/schedule_1x.py b/head_extractor/build/lib/mmdet/configs/_base_/schedules/schedule_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..47d1fa6a4852c40f3f9962a47ec90e365671c61c --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/_base_/schedules/schedule_1x.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
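+
+# "1x" schedule: 12 epochs of SGD with a 500-iteration linear warm-up and
+# 10x LR drops after epochs 8 and 11; see `param_scheduler` below.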
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.sgd import SGD
+
+# training schedule for 1x
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning rate
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[8, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+#   or not by default.
+# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/head_extractor/build/lib/mmdet/configs/_base_/schedules/schedule_2x.py b/head_extractor/build/lib/mmdet/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000000000000000000000000000000000000..51ba09a4723bc6ba41b8b4cb6e623ade7db26511
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.sgd import SGD
+
+# training schedule for 2x
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=24, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning rate
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[16, 22],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))
+
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+#   or not by default.
+# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/head_extractor/build/lib/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..a81c25af8b9506acfa8755ff4ec99d33c661442b
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/configs/cascade_rcnn/cascade_mask_rcnn_r50_fpn_1x_coco.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details.
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.cascade_mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/head_extractor/build/lib/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..883f09be67066283e1b59484d3483e73d82af776 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/cascade_rcnn/cascade_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + from .._base_.models.cascade_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/head_extractor/build/lib/mmdet/configs/common/lsj_100e_coco_detection.py b/head_extractor/build/lib/mmdet/configs/common/lsj_100e_coco_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..ea2d6bad7f500417ad1eb3e16ca7761c6cadca0e --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/lsj_100e_coco_detection.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmengine.dataset.sampler import DefaultSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import CocoDataset, RepeatDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +image_size = (1024, 1024) + +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomResize, + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=RepeatDataset, + times=4, # simply change this from 2 to 16 for 50e - 400e training. 
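+        # With max_epochs=25 below, times=4 yields 4 x 25 = 100 effective
+        # passes over COCO, hence the "100e" in this config's name.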
+ dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +max_epochs = 25 + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=5) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# optimizer assumes bs=64 +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.00004)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks.update(dict(checkpoint=dict(max_keep_ckpts=2))) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/head_extractor/build/lib/mmdet/configs/common/lsj_100e_coco_instance.py b/head_extractor/build/lib/mmdet/configs/common/lsj_100e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..90104ee503b22ef395a9b87d74ee80431575d90c --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/lsj_100e_coco_instance.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmengine.dataset.sampler import DefaultSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import CocoDataset, RepeatDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +image_size = (1024, 1024) + +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomResize, + scale=image_size, + ratio_range=(0.1, 2.0), + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=image_size, + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1e-2, 1e-2)), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +# Use RepeatDataset to speed up training +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + dataset=dict( + type=RepeatDataset, + times=4, # simply change this from 2 to 16 for 50e - 400e training. 
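+        # RepeatDataset replays the already-built dataset object, so the
+        # annotation-loading cost is paid once rather than per epoch.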
+ dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader = dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args) +test_evaluator = val_evaluator + +max_epochs = 25 + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=5) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# optimizer assumes bs=64 +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.00004)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks.update(dict(checkpoint=dict(max_keep_ckpts=2))) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (32 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=64) diff --git a/head_extractor/build/lib/mmdet/configs/common/lsj_200e_coco_detection.py b/head_extractor/build/lib/mmdet/configs/common/lsj_200e_coco_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..5759499e95dde6ef99246ab00c21264192ff511c --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/lsj_200e_coco_detection.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .lsj_100e_coco_detection import * + +# 8x25=200e +train_dataloader.update(dict(dataset=dict(times=8))) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] diff --git a/head_extractor/build/lib/mmdet/configs/common/lsj_200e_coco_instance.py b/head_extractor/build/lib/mmdet/configs/common/lsj_200e_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..77c5cdd44c488a763d320768e80b314f999ac555 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/lsj_200e_coco_instance.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .lsj_100e_coco_instance import * + +# 8x25=200e +train_dataloader.update(dict(dataset=dict(times=8))) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.067, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=25, + by_epoch=True, + milestones=[22, 24], + gamma=0.1) +] diff --git a/head_extractor/build/lib/mmdet/configs/common/ms_3x_coco.py b/head_extractor/build/lib/mmdet/configs/common/ms_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c32b24d96aeed59a7340cd7e743dd16b7c728bf1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/ms_3x_coco.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomResize, scale=[(1333, 640), (1333, 800)], keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=RepeatDataset, + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, 
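+            # times=3 above makes one loop epoch traverse COCO three times,
+            # so max_epochs=12 below is the usual "3x" (36-epoch) schedule.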
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+    persistent_workers=True,
+    drop_last=False,
+    sampler=dict(type=DefaultSampler, shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='annotations/instances_val2017.json',
+        data_prefix=dict(img='val2017/'),
+        test_mode=True,
+        pipeline=test_pipeline,
+        backend_args=backend_args))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(
+    type=CocoMetric,
+    ann_file=data_root + 'annotations/instances_val2017.json',
+    metric='bbox',
+    backend_args=backend_args)
+test_evaluator = val_evaluator
+
+# training schedule for 3x with `RepeatDataset`
+train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+# learning rate
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[9, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper = dict(
+    type=OptimWrapper,
+    optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+#   or not by default.
+# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/head_extractor/build/lib/mmdet/configs/common/ms_3x_coco_instance.py b/head_extractor/build/lib/mmdet/configs/common/ms_3x_coco_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c78909df80173eb37ff83c4ba12614e73848f29
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/configs/common/ms_3x_coco_instance.py
@@ -0,0 +1,136 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details.
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' + +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type='RandomResize', scale=[(1333, 640), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=RepeatDataset, + times=3, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args)))) +val_dataloader.update( + dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + backend_args=backend_args)) +test_evaluator = val_evaluator + +# training schedule for 3x with `RepeatDataset` +train_cfg.update(dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, 
by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=12, + by_epoch=False, + milestones=[9, 11], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=16)) diff --git a/head_extractor/build/lib/mmdet/configs/common/ms_90k_coco.py b/head_extractor/build/lib/mmdet/configs/common/ms_90k_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3abf1d4a4a8cf53a4abfa43722e306ac04770e18 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/ms_90k_coco.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.dataset import RepeatDataset +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler +from mmengine.optim import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim import SGD + +from mmdet.datasets import AspectRatioBatchSampler, CocoDataset +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations, + LoadImageFromFile) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + RandomResize, Resize) +from mmdet.evaluation import CocoMetric + +# dataset settings +dataset_type = CocoDataset +data_root = 'data/coco/' +# Example to use different file client +# Method 1: simply set the data root and let the file I/O module +# automatically infer from prefix (not support LMDB and Memcache yet) + +# data_root = 's3://openmmlab/datasets/detection/coco/' + +# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6 +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/': 's3://openmmlab/datasets/detection/', +# 'data/': 's3://openmmlab/datasets/detection/' +# })) +backend_args = None + +# Align with Detectron2 +backend = 'pillow' +train_pipeline = [ + dict( + type=LoadImageFromFile, + backend_args=backend_args, + imdecode_backend=backend), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True, + backend=backend), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] +test_pipeline = [ + dict( + type=LoadImageFromFile, + backend_args=backend_args, + imdecode_backend=backend), + dict(type=Resize, scale=(1333, 800), keep_ratio=True, backend=backend), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type=InfiniteSampler, shuffle=True), + 
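+        # InfiniteSampler streams indices with no epoch boundary, matching
+        # the iteration-based 90k loop (`IterBasedTrainLoop`) set up below.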
batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader.update( + dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric='bbox', + format_only=False, + backend_args=backend_args)) +test_evaluator = val_evaluator + +# training schedule for 90k +max_iter = 90000 +train_cfg.update( + dict(type=IterBasedTrainLoop, max_iters=max_iter, val_interval=10000)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=16)) + +default_hooks.update(dict(checkpoint=dict(by_epoch=False, interval=10000))) +log_processor.update(dict(by_epoch=False)) diff --git a/head_extractor/build/lib/mmdet/configs/common/ms_poly_3x_coco_instance.py b/head_extractor/build/lib/mmdet/configs/common/ms_poly_3x_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..53913a059a4db9230ebd777934cc8db5595479fe --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/ms_poly_3x_coco_instance.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa
+# mmcv >= 2.0.1
+# mmengine >= 0.8.0
+
+from mmengine.config import read_base
+
+with read_base():
+    from .._base_.default_runtime import *
+
+from mmcv.transforms import RandomChoiceResize
+from mmengine.dataset import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler
+from mmengine.optim import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import SGD
+
+from mmdet.datasets import AspectRatioBatchSampler, CocoDataset
+from mmdet.datasets.transforms.formatting import PackDetInputs
+from mmdet.datasets.transforms.loading import (FilterAnnotations,
+                                               LoadAnnotations,
+                                               LoadImageFromFile)
+from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic,
+                                                  Pad, RandomCrop, RandomFlip,
+                                                  RandomResize, Resize)
+from mmdet.evaluation import CocoMetric
+
+# dataset settings
+dataset_type = CocoDataset
+data_root = 'data/coco/'
+# Example of using a different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (LMDB and Memcache are not supported yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)],
+# multiscale_mode='range'
+train_pipeline = [
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(
+        type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False),
+    dict(
+        type=RandomResize, scale=[(1333, 640), (1333, 800)],
+        keep_ratio=True),
+    dict(type=RandomFlip, prob=0.5),
+    dict(type=PackDetInputs)
+]
+test_pipeline = [
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(type=Resize, scale=(1333, 800), keep_ratio=True),
+    dict(
+        type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False),
+    dict(
+        type=PackDetInputs,
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader.update(
+    dict(
+        batch_size=2,
+        num_workers=2,
+        persistent_workers=True,
+        pin_memory=True,
+        sampler=dict(type=DefaultSampler, shuffle=True),
+        batch_sampler=dict(type=AspectRatioBatchSampler),
+        dataset=dict(
+            type=RepeatDataset,
+            times=3,
+            dataset=dict(
+                type=dataset_type,
+                data_root=data_root,
+                ann_file='annotations/instances_train2017.json',
+                data_prefix=dict(img='train2017/'),
+                filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                pipeline=train_pipeline,
+                backend_args=backend_args))))
+val_dataloader.update(
+    dict(
+        batch_size=2,
+        num_workers=2,
+        persistent_workers=True,
+        drop_last=False,
+        pin_memory=True,
+        sampler=dict(type=DefaultSampler, shuffle=False),
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_val2017.json',
+            data_prefix=dict(img='val2017/'),
+            test_mode=True,
+            pipeline=test_pipeline,
+            backend_args=backend_args)))
+test_dataloader = val_dataloader
+
+val_evaluator.update(
+    dict(
+        type=CocoMetric,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        metric=['bbox', 'segm'],
+        backend_args=backend_args))
+test_evaluator = val_evaluator
+
+# training schedule for 3x with `RepeatDataset`
+train_cfg.update(dict(type=EpochBasedTrainLoop, max_epochs=12, val_interval=1))
+val_cfg.update(dict(type=ValLoop))
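+# Note: the `.update()` calls mutate config dicts inherited via `read_base()`
+# in place, the pure-Python-config idiom for overriding base settings.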
+test_cfg.update(dict(type=TestLoop))
+
+# learning rate
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=12,
+        by_epoch=True,
+        milestones=[9, 11],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper.update(
+    dict(
+        type=OptimWrapper,
+        optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001)))
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+#   or not by default.
+# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr.update(dict(enable=False, base_batch_size=16))
diff --git a/head_extractor/build/lib/mmdet/configs/common/ms_poly_90k_coco_instance.py b/head_extractor/build/lib/mmdet/configs/common/ms_poly_90k_coco_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..52367350137035604ea167e5732a791c2e9cae87
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/configs/common/ms_poly_90k_coco_instance.py
@@ -0,0 +1,153 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details.  # noqa
+# mmcv >= 2.0.1
+# mmengine >= 0.8.0
+
+from mmengine.config import read_base
+
+with read_base():
+    from .._base_.default_runtime import *
+
+from mmcv.transforms import RandomChoiceResize
+from mmengine.dataset import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler
+from mmengine.optim import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import SGD
+
+from mmdet.datasets import AspectRatioBatchSampler, CocoDataset
+from mmdet.datasets.transforms.formatting import PackDetInputs
+from mmdet.datasets.transforms.loading import (FilterAnnotations,
+                                               LoadAnnotations,
+                                               LoadImageFromFile)
+from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic,
+                                                  Pad, RandomCrop, RandomFlip,
+                                                  RandomResize, Resize)
+from mmdet.evaluation import CocoMetric
+
+# dataset settings
+dataset_type = CocoDataset
+data_root = 'data/coco/'
+# Example of using a different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (LMDB and Memcache are not supported yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# Align with Detectron2
+backend = 'pillow'
+train_pipeline = [
+    dict(
+        type=LoadImageFromFile,
+        backend_args=backend_args,
+        imdecode_backend=backend),
+    dict(
+        type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False),
+    dict(
+        type=RandomChoiceResize,
+        scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736),
+                (1333, 768), (1333, 800)],
+        keep_ratio=True,
+        backend=backend),
+    dict(type=RandomFlip, prob=0.5),
+    dict(type=PackDetInputs)
+]
+test_pipeline = [
+    dict(
+        type=LoadImageFromFile,
+        backend_args=backend_args,
+        imdecode_backend=backend),
+    dict(type=Resize, scale=(1333, 800), keep_ratio=True, backend=backend),
+    dict(
+        type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False),
+    dict(
+        type=PackDetInputs,
meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] +train_dataloader.update( + dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + pin_memory=True, + sampler=dict(type=InfiniteSampler, shuffle=True), + batch_sampler=dict(type=AspectRatioBatchSampler), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_train2017.json', + data_prefix=dict(img='train2017/'), + filter_cfg=dict(filter_empty_gt=True, min_size=32), + pipeline=train_pipeline, + backend_args=backend_args))) +val_dataloader.update( + dict( + batch_size=1, + num_workers=2, + persistent_workers=True, + drop_last=False, + pin_memory=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='annotations/instances_val2017.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=test_pipeline, + backend_args=backend_args))) +test_dataloader = val_dataloader + +val_evaluator.update( + dict( + type=CocoMetric, + ann_file=data_root + 'annotations/instances_val2017.json', + metric=['bbox', 'segm'], + format_only=False, + backend_args=backend_args)) +test_evaluator = val_evaluator + +# training schedule for 90k +max_iter = 90000 +train_cfg.update( + dict(type=IterBasedTrainLoop, max_iters=max_iter, val_interval=10000)) +val_cfg.update(dict(type=ValLoop)) +test_cfg.update(dict(type=TestLoop)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000), + dict( + type=MultiStepLR, + begin=0, + end=max_iter, + by_epoch=False, + milestones=[60000, 80000], + gamma=0.1) +] + +# optimizer +optim_wrapper.update( + dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, weight_decay=0.0001))) +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr.update(dict(enable=False, base_batch_size=16)) + +default_hooks.update(dict(checkpoint=dict(by_epoch=False, interval=10000))) +log_processor.update(dict(by_epoch=False)) diff --git a/head_extractor/build/lib/mmdet/configs/common/ssj_270_coco_instance.py b/head_extractor/build/lib/mmdet/configs/common/ssj_270_coco_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..ee86fdad4eca5b87ac0066b635e098d6a927bb49 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/common/ssj_270_coco_instance.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa
+# mmcv >= 2.0.1
+# mmengine >= 0.8.0
+
+from mmengine.config import read_base
+
+with read_base():
+    from .._base_.default_runtime import *
+
+from mmcv.transforms import RandomChoiceResize
+from mmengine.dataset import RepeatDataset
+from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler
+from mmengine.optim import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR
+from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop
+from torch.optim import SGD
+
+from mmdet.datasets import AspectRatioBatchSampler, CocoDataset
+from mmdet.datasets.transforms.formatting import PackDetInputs
+from mmdet.datasets.transforms.loading import (FilterAnnotations,
+                                               LoadAnnotations,
+                                               LoadImageFromFile)
+from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic,
+                                                  Pad, RandomCrop, RandomFlip,
+                                                  RandomResize, Resize)
+from mmdet.evaluation import CocoMetric
+
+# dataset settings
+dataset_type = CocoDataset
+data_root = 'data/coco/'
+image_size = (1024, 1024)
+# Example of using a different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (LMDB and Memcache are not supported yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# Standard Scale Jittering (SSJ) resizes and crops an image
+# with a resize range of 0.8 to 1.25 of the original image size.
+train_pipeline = [
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(type=LoadAnnotations, with_bbox=True, with_mask=True),
+    dict(
+        type=RandomResize,
+        scale=image_size,
+        ratio_range=(0.8, 1.25),
+        keep_ratio=True),
+    dict(
+        type=RandomCrop,
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type=FilterAnnotations, min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type=RandomFlip, prob=0.5),
+    dict(type=PackDetInputs)
+]
+test_pipeline = [
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(type=Resize, scale=(1333, 800), keep_ratio=True),
+    dict(type=LoadAnnotations, with_bbox=True, with_mask=True),
+    dict(
+        type=PackDetInputs,
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader.update(
+    dict(
+        batch_size=2,
+        num_workers=2,
+        persistent_workers=True,
+        sampler=dict(type=InfiniteSampler),
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_train2017.json',
+            data_prefix=dict(img='train2017/'),
+            filter_cfg=dict(filter_empty_gt=True, min_size=32),
+            pipeline=train_pipeline,
+            backend_args=backend_args)))
+val_dataloader.update(
+    dict(
+        batch_size=1,
+        num_workers=2,
+        persistent_workers=True,
+        drop_last=False,
+        sampler=dict(type=DefaultSampler, shuffle=False),
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='annotations/instances_val2017.json',
+            data_prefix=dict(img='val2017/'),
+            test_mode=True,
+            pipeline=test_pipeline,
+            backend_args=backend_args)))
+test_dataloader = val_dataloader
+
+val_evaluator.update(
+    dict(
+        type=CocoMetric,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        metric=['bbox', 'segm'],
+        format_only=False,
+        backend_args=backend_args))
+test_evaluator = val_evaluator
+
+# The model is trained by 270k iterations with batch_size 64,
+# which is roughly equivalent to 144 epochs.
+
+max_iter = 270000
+train_cfg.update(
+    dict(type=IterBasedTrainLoop, max_iters=max_iter, val_interval=10000))
+val_cfg.update(dict(type=ValLoop))
+test_cfg.update(dict(type=TestLoop))
+
+# learning rate
+param_scheduler = [
+    dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=1000),
+    dict(
+        type=MultiStepLR,
+        begin=0,
+        end=max_iter,
+        by_epoch=False,
+        milestones=[243000, 256500, 263250],
+        gamma=0.1)
+]
+
+# optimizer
+optim_wrapper.update(
+    dict(
+        type=OptimWrapper,
+        optimizer=dict(type=SGD, lr=0.1, momentum=0.9, weight_decay=0.00004)))
+# Default setting for scaling LR automatically
+# - `enable` means enable scaling LR automatically
+#   or not by default.
+# - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
+auto_scale_lr.update(dict(base_batch_size=64))
+
+default_hooks.update(dict(checkpoint=dict(by_epoch=False, interval=10000)))
+log_processor.update(dict(by_epoch=False))
diff --git a/head_extractor/build/lib/mmdet/configs/common/ssj_scp_270k_coco_instance.py b/head_extractor/build/lib/mmdet/configs/common/ssj_scp_270k_coco_instance.py
new file mode 100644
index 0000000000000000000000000000000000000000..68bb1f0904fcb4de3e2f892355e489f52f53d960
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/configs/common/ssj_scp_270k_coco_instance.py
@@ -0,0 +1,70 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details.  # noqa
+# mmcv >= 2.0.1
+# mmengine >= 0.8.0
+
+from mmengine.config import read_base
+
+with read_base():
+    from .ssj_270_coco_instance import *
+
+from mmdet.datasets import MultiImageMixDataset
+from mmdet.datasets.transforms import CopyPaste
+
+# dataset settings
+dataset_type = CocoDataset
+data_root = 'data/coco/'
+image_size = (1024, 1024)
+# Example of using a different file client
+# Method 1: simply set the data root and let the file I/O module
+# automatically infer from prefix (LMDB and Memcache are not supported yet)
+
+# data_root = 's3://openmmlab/datasets/detection/coco/'
+
+# Method 2: Use `backend_args`, `file_client_args` in versions before 3.0.0rc6
+# backend_args = dict(
+#     backend='petrel',
+#     path_mapping=dict({
+#         './data/': 's3://openmmlab/datasets/detection/',
+#         'data/': 's3://openmmlab/datasets/detection/'
+#     }))
+backend_args = None
+
+# Standard Scale Jittering (SSJ) resizes and crops an image
+# with a resize range of 0.8 to 1.25 of the original image size.
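+# SCP = Simple Copy-Paste: `load_pipeline` below prepares SSJ-augmented
+# samples via MultiImageMixDataset, and the CopyPaste step then pastes up
+# to `max_num_pasted` instances from one sample onto another.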
+load_pipeline = [
+    dict(type=LoadImageFromFile, backend_args=backend_args),
+    dict(type=LoadAnnotations, with_bbox=True, with_mask=True),
+    dict(
+        type=RandomResize,
+        scale=image_size,
+        ratio_range=(0.8, 1.25),
+        keep_ratio=True),
+    dict(
+        type=RandomCrop,
+        crop_type='absolute_range',
+        crop_size=image_size,
+        recompute_bbox=True,
+        allow_negative_crop=True),
+    dict(type=FilterAnnotations, min_gt_bbox_wh=(1e-2, 1e-2)),
+    dict(type=RandomFlip, prob=0.5),
+    dict(type=Pad, size=image_size),
+]
+train_pipeline = [
+    dict(type=CopyPaste, max_num_pasted=100),
+    dict(type=PackDetInputs)
+]
+
+train_dataloader.update(
+    dict(
+        dataset=dict(
+            type=MultiImageMixDataset,
+            dataset=dict(
+                type=dataset_type,
+                data_root=data_root,
+                ann_file='annotations/instances_train2017.json',
+                data_prefix=dict(img='train2017/'),
+                filter_cfg=dict(filter_empty_gt=True, min_size=32),
+                pipeline=load_pipeline,
+                backend_args=backend_args),
+            pipeline=train_pipeline)))
diff --git a/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_r50_16xb2_50e_coco.py b/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_r50_16xb2_50e_coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee2a41639d84ed8e278af45229b451b742ac8974
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_r50_16xb2_50e_coco.py
@@ -0,0 +1,186 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details.  # noqa
+# mmcv >= 2.0.1
+# mmengine >= 0.8.0
+
+from mmengine.config import read_base
+
+with read_base():
+    from .._base_.datasets.coco_detection import *
+    from .._base_.default_runtime import *
+
+from mmcv.transforms import LoadImageFromFile, RandomChoice, RandomChoiceResize
+from mmengine.optim.optimizer import OptimWrapper
+from mmengine.optim.scheduler import MultiStepLR
+from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.adamw import AdamW
+
+from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs,
+                                       RandomCrop, RandomFlip, Resize)
+from mmdet.models.backbones import ResNet
+from mmdet.models.data_preprocessors import DetDataPreprocessor
+from mmdet.models.dense_heads import DeformableDETRHead
+from mmdet.models.detectors import DeformableDETR
+from mmdet.models.losses import FocalLoss, GIoULoss, L1Loss
+from mmdet.models.necks import ChannelMapper
+from mmdet.models.task_modules import (BBoxL1Cost, FocalLossCost,
+                                       HungarianAssigner, IoUCost)
+
+model = dict(
+    type=DeformableDETR,
+    num_queries=300,
+    num_feature_levels=4,
+    with_box_refine=False,
+    as_two_stage=False,
+    data_preprocessor=dict(
+        type=DetDataPreprocessor,
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        bgr_to_rgb=True,
+        pad_size_divisor=1),
+    backbone=dict(
+        type=ResNet,
+        depth=50,
+        num_stages=4,
+        out_indices=(1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='pytorch',
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    neck=dict(
+        type=ChannelMapper,
+        in_channels=[512, 1024, 2048],
+        kernel_size=1,
+        out_channels=256,
+        act_cfg=None,
+        norm_cfg=dict(type='GN', num_groups=32),
+        num_outs=4),
+    encoder=dict(  # DeformableDetrTransformerEncoder
+        num_layers=6,
+        layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+            self_attn_cfg=dict(  # MultiScaleDeformableAttention
+                embed_dims=256,
batch_first=True), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.1))), + decoder=dict( # DeformableDetrTransformerDecoder + num_layers=6, + return_intermediate=True, + layer_cfg=dict( # DeformableDetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, feedforward_channels=1024, ffn_drop=0.1)), + post_norm_cfg=None), + positional_encoding=dict(num_feats=128, normalize=True, offset=-0.5), + bbox_head=dict( + type=DeformableDETRHead, + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type=L1Loss, loss_weight=5.0), + loss_iou=dict(type=GIoULoss, loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=FocalLossCost, weight=2.0), + dict(type=BBoxL1Cost, weight=5.0, box_format='xywh'), + dict(type=IoUCost, iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=100)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different +# from the default setting in mmdet. +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomFlip, prob=0.5), + dict( + type=RandomChoice, + transforms=[ + [ + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + # The aspect ratio of all images in the train dataset is < 7, + # following the original implementation + scales=[(400, 4200), (500, 4200), (600, 4200)], + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ] + ]), + dict(type=PackDetInputs) +] +train_dataloader.update( + dict( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline))) + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=AdamW, lr=0.0002, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1) + })) + +# learning policy +max_epochs = 50 +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[40], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES.
+# base_batch_size = (16 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=32) diff --git a/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_refine_r50_16xb2_50e_coco.py b/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_refine_r50_16xb2_50e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..4f232d6111026488020e586440852c012dd94608 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_refine_r50_16xb2_50e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .deformable_detr_r50_16xb2_50e_coco import * + +model.update(dict(with_box_refine=True)) diff --git a/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_refine_twostage_r50_16xb2_50e_coco.py b/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_refine_twostage_r50_16xb2_50e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..1fac4d8c4f2020b6d87857fbe157419e4c4f0712 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/deformable_detr/deformable_detr_refine_twostage_r50_16xb2_50e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .deformable_detr_refine_r50_16xb2_50e_coco import * + +model.update(dict(as_two_stage=True)) diff --git a/head_extractor/build/lib/mmdet/configs/detr/detr_r101_8xb2_500e_coco.py b/head_extractor/build/lib/mmdet/configs/detr/detr_r101_8xb2_500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b961468114ce3adb0582378ac422649ef3bd5013 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/detr/detr_r101_8xb2_500e_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +with read_base(): + from .detr_r50_8xb2_500e_coco import * + +model.update( + dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101')))) diff --git a/head_extractor/build/lib/mmdet/configs/detr/detr_r18_8xb2_500e_coco.py b/head_extractor/build/lib/mmdet/configs/detr/detr_r18_8xb2_500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..11360af18de729bfd9e8d8cb6597067a588852c9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/detr/detr_r18_8xb2_500e_coco.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
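+# Illustrative note (not in the original file): ResNet-18's final stage +# outputs 512 channels, versus 2048 for ResNet-50, so this variant must also +# shrink the neck's in_channels from [2048] to [512] to match; see the +# override below.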
+from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +with read_base(): + from .detr_r50_8xb2_500e_coco import * + +model.update( + dict( + backbone=dict( + depth=18, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[512]))) diff --git a/head_extractor/build/lib/mmdet/configs/detr/detr_r50_8xb2_150e_coco.py b/head_extractor/build/lib/mmdet/configs/detr/detr_r50_8xb2_150e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c50726c7890cb59bee4b921179be1949ff12199e --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/detr/detr_r50_8xb2_150e_coco.py @@ -0,0 +1,182 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms import RandomChoice, RandomChoiceResize +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.activation import ReLU +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs, + RandomCrop, RandomFlip, Resize) +from mmdet.models import (DETR, ChannelMapper, DetDataPreprocessor, DETRHead, + ResNet) +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.task_modules import (BBoxL1Cost, ClassificationCost, + HungarianAssigner, IoUCost) + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + +model = dict( + type=DETR, + num_queries=100, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + neck=dict( + type=ChannelMapper, + in_channels=[2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=None, + num_outs=1), + encoder=dict( # DetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True)))), + decoder=dict( # DetrTransformerDecoder + num_layers=6, + layer_cfg=dict( # DetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True))), + return_intermediate=True), + positional_encoding=dict(num_feats=128, normalize=True), + bbox_head=dict( + type=DETRHead, + num_classes=80, + embed_dims=256, + loss_cls=dict( + type=CrossEntropyLoss, + bg_cls_weight=0.1, + 
use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type=L1Loss, loss_weight=5.0), + loss_iou=dict(type=GIoULoss, loss_weight=2.0)), + # training and testing settings + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=ClassificationCost, weight=1.), + dict(type=BBoxL1Cost, weight=5.0, box_format='xywh'), + dict(type=IoUCost, iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=100)) + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different +# from the default setting in mmdet. +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomFlip, prob=0.5), + dict( + type=RandomChoice, + transforms=[[ + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(400, 1333), (500, 1333), (600, 1333)], + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + keep_ratio=True) + ]]), + dict(type=PackDetInputs) +] +train_dataloader.update(dataset=dict(pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=AdamW, lr=0.0001, weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) + +# learning policy +max_epochs = 150 +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[100], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/head_extractor/build/lib/mmdet/configs/detr/detr_r50_8xb2_500e_coco.py b/head_extractor/build/lib/mmdet/configs/detr/detr_r50_8xb2_500e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d7d0817766255a84237f0aea917806e191d161df --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/detr/detr_r50_8xb2_500e_coco.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved.
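+# Illustrative note (not in the original file): this variant reuses the 150e +# model and data settings unchanged and only stretches the schedule to 500 +# epochs, with a single LR drop at epoch 334, roughly two thirds of the way +# through training.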
+from mmengine.config import read_base +from mmengine.optim.scheduler.lr_scheduler import MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .detr_r50_8xb2_150e_coco import * + +# learning policy +max_epochs = 500 +train_cfg.update( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=10) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[334], + gamma=0.1) +] + +# only keep latest 2 checkpoints +default_hooks.update(checkpoint=dict(max_keep_ckpts=2)) diff --git a/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_12e_coco.py b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_12e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ab8e95a9a76c0cedb78c66993fc7fb7f4623029c --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_12e_coco.py @@ -0,0 +1,190 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms import RandomChoice, RandomChoiceResize +from mmcv.transforms.loading import LoadImageFromFile +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import MultiStepLR +from mmengine.runner.loops import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.nn.modules.normalization import GroupNorm +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms import (LoadAnnotations, PackDetInputs, + RandomCrop, RandomFlip, Resize) +from mmdet.models import (DINO, ChannelMapper, DetDataPreprocessor, DINOHead, + ResNet) +from mmdet.models.losses.focal_loss import FocalLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from mmdet.models.task_modules import (BBoxL1Cost, FocalLossCost, + HungarianAssigner, IoUCost) + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + +model = dict( + type=DINO, + num_queries=900, # num_matching_queries + with_box_refine=True, + as_two_stage=True, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1), + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + neck=dict( + type=ChannelMapper, + in_channels=[512, 1024, 2048], + kernel_size=1, + out_channels=256, + act_cfg=None, + norm_cfg=dict(type=GroupNorm, num_groups=32), + num_outs=4), + encoder=dict( + num_layers=6, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0))), # 0.1 for DeformDETR + decoder=dict( + num_layers=6, + return_intermediate=True, + layer_cfg=dict( + self_attn_cfg=dict(embed_dims=256, num_heads=8, + dropout=0.0), # 0.1 for DeformDETR + cross_attn_cfg=dict(embed_dims=256, num_levels=4, + dropout=0.0), # 0.1 for DeformDETR + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, # 1024 for DeformDETR + ffn_drop=0.0)), # 0.1 for DeformDETR + post_norm_cfg=None), + 
positional_encoding=dict( + num_feats=128, + normalize=True, + offset=0.0, # -0.5 for DeformDETR + temperature=20), # 10000 for DeformDETR + bbox_head=dict( + type=DINOHead, + num_classes=80, + sync_cls_avg_factor=True, + loss_cls=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), # 2.0 in DeformDETR + loss_bbox=dict(type=L1Loss, loss_weight=5.0), + loss_iou=dict(type=GIoULoss, loss_weight=2.0)), + dn_cfg=dict( # TODO: Move to model.train_cfg ? + label_noise_scale=0.5, + box_noise_scale=1.0, # 0.4 for DN-DETR + group_cfg=dict(dynamic=True, num_groups=None, + num_dn_queries=100)), # TODO: half num_dn_queries + # training and testing settings + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=FocalLossCost, weight=2.0), + dict(type=BBoxL1Cost, weight=5.0, box_format='xywh'), + dict(type=IoUCost, iou_mode='giou', weight=2.0) + ])), + test_cfg=dict(max_per_img=300)) # 100 for DeformDETR + +# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different +# from the default setting in mmdet. +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=RandomFlip, prob=0.5), + dict( + type=RandomChoice, + transforms=[ + [ + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + resize_type=Resize, + # The aspect ratio of all images in the train dataset is < 7, + # following the original implementation + scales=[(400, 4200), (500, 4200), (600, 4200)], + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + resize_type=Resize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + keep_ratio=True) + ] + ]), + dict(type=PackDetInputs) +] +train_dataloader.update( + dataset=dict( + filter_cfg=dict(filter_empty_gt=False), pipeline=train_pipeline)) + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict( + type=AdamW, + lr=0.0001, # 0.0002 for DeformDETR + weight_decay=0.0001), + clip_grad=dict(max_norm=0.1, norm_type=2), + paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}) +) # custom_keys contains sampling_offsets and reference_points in DeformDETR # noqa + +# learning policy +max_epochs = 12 +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) + +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +param_scheduler = [ + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[11], + gamma=0.1) +] + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (8 GPUs) x (2 samples per GPU) +auto_scale_lr = dict(base_batch_size=16) diff --git a/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_24e_coco.py b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_24e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c10cc2184de8f71571759ecbeac56696afceb5eb --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_24e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved.
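+# Illustrative note (not in the original file): the 24e and 36e variants +# below share one pattern: inherit the 12e config via read_base(), raise +# max_epochs in train_cfg, and move the MultiStepLR milestone so the single +# LR drop still lands shortly before the end of training.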
+from mmengine.config import read_base +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +max_epochs = 24 +train_cfg.update( + dict(type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1)) + +param_scheduler[0].update(dict(milestones=[20])) diff --git a/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_36e_coco.py b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_36e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..3779744322a19d2865f1e6299aba564c4ec1e3d5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_8xb2_36e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +max_epochs = 36 +train_cfg.update( + dict(type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1)) + +param_scheduler[0].update(dict(milestones=[30])) diff --git a/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_improved_8xb2_12e_coco.py b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_improved_8xb2_12e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..43c07201079fcdbad3c9ea7a471306080e006cdc --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/dino/dino_4scale_r50_improved_8xb2_12e_coco.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +# hyperparameters taken from Deformable DETR +model.update( + dict( + backbone=dict(frozen_stages=-1), + bbox_head=dict(loss_cls=dict(loss_weight=2.0)), + positional_encoding=dict(offset=-0.5, temperature=10000), + dn_cfg=dict(group_cfg=dict(num_dn_queries=300)))) + +# optimizer +optim_wrapper.update( + dict( + optimizer=dict(lr=0.0002), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1), + 'sampling_offsets': dict(lr_mult=0.1), + 'reference_points': dict(lr_mult=0.1) + }))) diff --git a/head_extractor/build/lib/mmdet/configs/dino/dino_5scale_swin_l_8xb2_12e_coco.py b/head_extractor/build/lib/mmdet/configs/dino/dino_5scale_swin_l_8xb2_12e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..25aac0187ab2472dd062514fecf988dcd47504a5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/dino/dino_5scale_swin_l_8xb2_12e_coco.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved.
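+# Illustrative note (not in the original file): the backbone override below +# sets _delete_=True, which tells the config system to discard all inherited +# ResNet backbone keys instead of merging them, as required when swapping in +# a class with different constructor arguments such as SwinTransformer.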
+from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models import SwinTransformer + +with read_base(): + from .dino_4scale_r50_8xb2_12e_coco import * + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +num_levels = 5 +model.merge( + dict( + num_feature_levels=num_levels, + backbone=dict( + _delete_=True, + type=SwinTransformer, + pretrain_img_size=384, + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.2, + patch_norm=True, + out_indices=(0, 1, 2, 3), + # Please only add indices that would be used + # in FPN, otherwise some parameter will not be used + with_cp=True, + convert_weights=True, + init_cfg=dict(type=PretrainedInit, checkpoint=pretrained)), + neck=dict(in_channels=[192, 384, 768, 1536], num_outs=num_levels), + encoder=dict( + layer_cfg=dict(self_attn_cfg=dict(num_levels=num_levels))), + decoder=dict( + layer_cfg=dict(cross_attn_cfg=dict(num_levels=num_levels))))) diff --git a/head_extractor/build/lib/mmdet/configs/dino/dino_5scale_swin_l_8xb2_36e_coco.py b/head_extractor/build/lib/mmdet/configs/dino/dino_5scale_swin_l_8xb2_36e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..494acf59f1c31fe419415920e8b65fbfb9267df1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/dino/dino_5scale_swin_l_8xb2_36e_coco.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.runner.loops import EpochBasedTrainLoop + +with read_base(): + from .dino_5scale_swin_l_8xb2_12e_coco import * + +max_epochs = 36 +train_cfg.update( + dict(type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1)) + +param_scheduler[0].update(dict(milestones=[27, 33])) diff --git a/head_extractor/build/lib/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..f0a6d5a21470752fd26fa162edf5c2241afb1fed --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_detection import * + from .._base_.default_runtime import * + from .._base_.models.faster_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2780f4afddc05ccd4ae1746206a6a6ad8cece39e --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_1x_coco.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_poly_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8a1badfc4f04f6ad5466d9ec3aa2d07708887927 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_caffe_fpn_ms_poly_3x_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + depth=101, + norm_cfg=dict(requires_grad=False), + norm_eval=True, + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet101_caffe'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6770cec8eebe8c5130abd15f9bc44d5b5c5db875 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_1x_coco.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fd2aafb912ca84f776637e498d2743213a05d18a --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_2x_coco.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_2x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..665808d5dc479ecb7c5a328af3861f59e460ac78 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_8xb8_amp_lsj_200e_coco.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_ms_poly_3x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..14688795963cb28018f5897429b191b235a86b6b --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r101_fpn_ms_poly_3x_coco.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..67bd86fa0e8f8b414eec681852511db3b3d4c9c6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r18_fpn_8xb8_amp_lsj_200e_coco.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + backbone=dict( + depth=18, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet18')), + neck=dict(in_channels=[64, 128, 256, 512])) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..494e6ba593efa663f06e1383ceba8b57b9d097b5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_c4_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_caffe_c4 import * + from .._base_.schedules.schedule_1x import * diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6481fcfd49eeac603eced8e46ee3a8705add8367 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..5952ed587a431740bc3d17ac9d2e6b5a3d326061 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_1x_coco.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True, with_mask=True), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs), +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d62b9ebe958b3a8f790a6e9581942494f42bf7d6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmcv.transforms import RandomChoiceResize +from mmengine.model.weight_init import PretrainedInit + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fa41b7e00ca153814f28ac29638cc497e7a2d3e9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_2x_coco.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details.
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco import * + +train_cfg = dict(max_epochs=24) +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=24, + by_epoch=True, + milestones=[16, 22], + gamma=0.1) +] diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c5f9b977b2dfaebfe01d834ac4ad8cf4522fe9c0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_ms_poly_3x_coco.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_caffe_fpn_ms_poly_1x_coco import * + +train_cfg = dict(max_epochs=36) +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=36, + by_epoch=True, + milestones=[28, 34], + gamma=0.1) +] diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..28ba7c77ddf10d295d371db2f46d6c1f117ac7c6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_poly_1x_coco_v1.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models.losses import SmoothL1Loss + +model = dict( + # use caffe img_norm + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False), + backbone=dict( + norm_cfg=dict(requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe')), + rpn_head=dict( + loss_bbox=dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + bbox_roi_extractor=dict( + roi_layer=dict( + type=RoIAlign, output_size=7, sampling_ratio=2, + aligned=False)), + bbox_head=dict( + loss_bbox=dict(type=SmoothL1Loss, beta=1.0, loss_weight=1.0)), + mask_roi_extractor=dict( + roi_layer=dict( + type=RoIAlign, output_size=14, sampling_ratio=2, + aligned=False)))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8145d08fee85c1758d3794cee952a3b7200b14bd --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab.
All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d2c0876541289d10832a1b26ddb6e91f6a66d89a --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_wandb_coco.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * + +from mmengine.visualization import LocalVisBackend, WandbVisBackend + +vis_backends = [dict(type=LocalVisBackend), dict(type=WandbVisBackend)] +visualizer.update(dict(vis_backends=vis_backends)) + +# MMEngine supports the following two ways of overriding base settings; +# users can choose whichever is more convenient. + +default_hooks.update(dict(checkpoint=dict(interval=4))) + +train_cfg.update(dict(val_interval=2)) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6be010b4508d6ba300a1305a1d405ec9a265ae07 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_2x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_2x import * diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_8xb8_amp_lsj_200e_coco.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved.
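Every config file added in this diff is consumed the same way, so a brief usage sketch may help readers unfamiliar with the pure-Python config style. This is a minimal illustration, assuming an mmdet source checkout on disk and mmengine >= 0.8.0; the path is just one example from this diff, not a required entry point.

from mmengine.config import Config

# Resolve a pure-Python style config; any read_base() chain inside the file
# is flattened into a single nested configuration at load time.
cfg = Config.fromfile('mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py')

# The merged result behaves like a nested dict.
print(cfg.model['backbone']['depth'])  # 50 for the r50 baseline
print(cfg.train_cfg['max_epochs'])     # 12 for the 1x schedule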
diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_amp_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_amp_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..110c3c475429701a92321676d17f829f82cbfb76 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_amp_1x_coco.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_1x_coco import * + +from mmengine.optim.optimizer.amp_optimizer_wrapper import AmpOptimWrapper + +optim_wrapper.update(dict(type=AmpOptimWrapper)) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_ms_poly_-3x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_ms_poly_-3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4eec6d2be0f4bd61c7bd04057fd58b303120c8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_ms_poly_-3x_coco.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.mask_rcnn_r50_fpn import * + from ..common.ms_poly_3x_coco_instance import * diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..012e711cb96f9aa67460b86694838d592fd1ae25 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_r50_fpn_poly_1x_coco.py @@ -0,0 +1,23 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.coco_instance import * + from .._base_.default_runtime import * + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.schedules.schedule_1x import * + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs), +] +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..5429b1bd5a62f4786936d19e65d6281807d800bf --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_1x_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+ +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r101_fpn_1x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models.backbones.resnext import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..ebae6c1dbc3a234ede68ba5b7a6e199edf966ead --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_2x_coco.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r50_fpn_2x_coco import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..aff45d89f351037cec3115271feab678eac3382f --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x4d_fpn_ms_poly_3x_coco.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmengine.model.weight_init import PretrainedInit + +from mmdet.models.backbones import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_32x4d'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f2095dc2dff7396896a9b2af2fb05bcd765c69 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_1x_coco.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_x101_32x4d_fpn_1x_coco import * + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8eded941751ce71b9c63baa565275802c7ee9bb2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_1x_coco.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_r101_fpn_1x_coco import * + +from mmcv.transforms import RandomChoiceResize, RandomFlip +from mmcv.transforms.loading import LoadImageFromFile + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.models.backbones import ResNeXt + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. 
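+ # Illustrative note (not in the original file): because bgr_to_rgb=False + # keeps the BGR channel order expected by the Caffe2-trained weights, the + # mean/std below are listed in [B, G, R] order rather than the + # torchvision-style [R, G, B].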
+ data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) + +backend_args = None +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomChoiceResize, + scales=[(1333, 640), (1333, 672), (1333, 704), (1333, 736), + (1333, 768), (1333, 800)], + keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs), +] + +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..b3f584675f6da93ec7c188753c2f0478bac25ba8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_32x8d_fpn_ms_poly_3x_coco.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmdet.models.backbones import ResNeXt + +model = dict( + # ResNeXt-101-32x8d model trained with Caffe2 at FB, + # so the mean and std need to be changed. + data_preprocessor=dict( + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + bgr_to_rgb=False), + backbone=dict( + type=ResNeXt, + depth=101, + groups=32, + base_width=8, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnext101_32x8d'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64_4d_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64_4d_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..8bb6f636e641138b902f69a543da0bd8a656db3d --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64_4d_fpn_1x_coco.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_x101_32x4d_fpn_1x_coco import * + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d661076dcf37df9668d6cdf726ecfc5720c561df --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_2x_coco.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .mask_rcnn_x101_32x4d_fpn_2x_coco import * + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d9ab3643ec27665f7a9411d95c7e01711dfe7623 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/mask_rcnn/mask_rcnn_x101_64x4d_fpn_ms_poly_3x_coco.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from ..common.ms_poly_3x_coco_instance import * + from .._base_.models.mask_rcnn_r50_fpn import * + +from mmdet.models.backbones import ResNeXt + +model = dict( + backbone=dict( + type=ResNeXt, + depth=101, + groups=64, + base_width=4, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=True), + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='open-mmlab://resnext101_64x4d'))) diff --git a/head_extractor/build/lib/mmdet/configs/maskformer/maskformer_r50_ms_16xb1_75e_coco.py b/head_extractor/build/lib/mmdet/configs/maskformer/maskformer_r50_ms_16xb1_75e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..70744013afcad76834d05ccc8aa6303dc6399bc0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/maskformer/maskformer_r50_ms_16xb1_75e_coco.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
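The Mask R-CNN variants above differ only in the ResNeXt cardinality settings (`groups`, `base_width`); at the first stage, the width of the bottleneck's grouped 3x3 convolution is simply their product, which is why the 32x8d and 64x4d backbones end up equally wide. A minimal sketch of that arithmetic, using a hypothetical helper that is not part of this diff:

```python
# Hypothetical helper, not part of mmdet: first-stage grouped-conv width
# for the ResNeXt variants configured above (32x4d, 32x8d, 64x4d).
def grouped_conv_width(groups: int, base_width: int) -> int:
    # ResNeXt's 3x3 conv runs `groups` parallel paths of `base_width`
    # channels each, so its first-stage width is groups * base_width.
    return groups * base_width

assert grouped_conv_width(32, 4) == 128   # x101_32x4d
assert grouped_conv_width(32, 8) == 256   # x101_32x8d
assert grouped_conv_width(64, 4) == 256   # x101_64x4d
```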
+from mmcv.transforms import RandomChoice, RandomChoiceResize +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim.optimizer import OptimWrapper +from mmengine.optim.scheduler import MultiStepLR +from mmengine.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.activation import ReLU +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.nn.modules.normalization import GroupNorm +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms.transforms import RandomCrop +from mmdet.models import MaskFormer +from mmdet.models.backbones import ResNet +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.maskformer_head import MaskFormerHead +from mmdet.models.layers.pixel_decoder import TransformerEncoderPixelDecoder +from mmdet.models.losses import CrossEntropyLoss, DiceLoss, FocalLoss +from mmdet.models.seg_heads.panoptic_fusion_heads import MaskFormerFusionHead +from mmdet.models.task_modules.assigners.hungarian_assigner import \ + HungarianAssigner +from mmdet.models.task_modules.assigners.match_cost import (ClassificationCost, + DiceCost, + FocalLossCost) +from mmdet.models.task_modules.samplers import MaskPseudoSampler + +with read_base(): + from .._base_.datasets.coco_panoptic import * + from .._base_.default_runtime import * + +data_preprocessor = dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=1, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255) + +num_things_classes = 80 +num_stuff_classes = 53 +num_classes = num_things_classes + num_stuff_classes +model = dict( + type=MaskFormer, + data_preprocessor=data_preprocessor, + backbone=dict( + type=ResNet, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet50')), + panoptic_head=dict( + type=MaskFormerHead, + in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside + feat_channels=256, + out_channels=256, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + num_queries=100, + pixel_decoder=dict( + type=TransformerEncoderPixelDecoder, + norm_cfg=dict(type=GroupNorm, num_groups=32), + act_cfg=dict(type=ReLU), + encoder=dict( # DetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True)))), + positional_encoding=dict(num_feats=128, normalize=True)), + enforce_decoder_input_project=False, + positional_encoding=dict(num_feats=128, normalize=True), + transformer_decoder=dict( # DetrTransformerDecoder + num_layers=6, + layer_cfg=dict( # DetrTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type=ReLU, inplace=True))), + return_intermediate=True), + loss_cls=dict( + 
type=CrossEntropyLoss, + use_sigmoid=False, + loss_weight=1.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type=FocalLoss, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=20.0), + loss_dice=dict( + type=DiceLoss, + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=1.0)), + panoptic_fusion_head=dict( + type=MaskFormerFusionHead, + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + loss_panoptic=None, + init_cfg=None), + train_cfg=dict( + assigner=dict( + type=HungarianAssigner, + match_costs=[ + dict(type=ClassificationCost, weight=1.0), + dict(type=FocalLossCost, weight=20.0, binary_input=True), + dict(type=DiceCost, weight=1.0, pred_act=True, eps=1.0) + ]), + sampler=dict(type=MaskPseudoSampler)), + test_cfg=dict( + panoptic_on=True, + # For now, the dataset does not support + # evaluating semantic segmentation metric. + semantic_on=False, + instance_on=False, + # max_per_image is for instance segmentation. + max_per_image=100, + object_mask_thr=0.8, + iou_thr=0.8, + # In MaskFormer's panoptic postprocessing, + # it will not filter masks whose score is smaller than 0.5 . + filter_low_score=False), + init_cfg=None) + +# dataset settings +train_pipeline = [ + dict(type=LoadImageFromFile), + dict( + type=LoadPanopticAnnotations, + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type=RandomFlip, prob=0.5), + # dict(type=Resize, scale=(1333, 800), keep_ratio=True), + dict( + type=RandomChoice, + transforms=[[ + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), (576, 1333), + (608, 1333), (640, 1333), (672, 1333), (704, 1333), + (736, 1333), (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ], + [ + dict( + type=RandomChoiceResize, + scales=[(400, 1333), (500, 1333), (600, 1333)], + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=True), + dict( + type=RandomChoiceResize, + scales=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + resize_type=Resize, + keep_ratio=True) + ]]), + dict(type=PackDetInputs) +] + +train_dataloader.update( + dict(batch_size=1, num_workers=1, dataset=dict(pipeline=train_pipeline))) + +val_dataloader.update(dict(batch_size=1, num_workers=1)) + +test_dataloader = val_dataloader + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict( + type=AdamW, + lr=0.0001, + weight_decay=0.0001, + eps=1e-8, + betas=(0.9, 0.999)), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': dict(lr_mult=1.0, decay_mult=0.0) + }, + norm_decay_mult=0.0), + clip_grad=dict(max_norm=0.01, norm_type=2)) + +max_epochs = 75 + +# learning rate +param_scheduler = dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[50], + gamma=0.1) + +train_cfg = dict( + type=EpochBasedTrainLoop, max_epochs=max_epochs, val_interval=1) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (16 GPUs) x (1 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/head_extractor/build/lib/mmdet/configs/maskformer/maskformer_swin_l_p4_w12_64xb1_ms_300e_coco.py b/head_extractor/build/lib/mmdet/configs/maskformer/maskformer_swin_l_p4_w12_64xb1_ms_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2affe520918d0f26c0a858f97bb69646a2860f87 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/maskformer/maskformer_swin_l_p4_w12_64xb1_ms_300e_coco.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.optim.scheduler import LinearLR + +from mmdet.models.backbones import SwinTransformer +from mmdet.models.layers import PixelDecoder + +with read_base(): + from .maskformer_r50_ms_16xb1_75e_coco import * + +pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' # noqa +depths = [2, 2, 18, 2] +model.update( + dict( + backbone=dict( + _delete_=True, + type=SwinTransformer, + pretrain_img_size=384, + embed_dims=192, + patch_size=4, + window_size=12, + mlp_ratio=4, + depths=depths, + num_heads=[6, 12, 24, 48], + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + convert_weights=True, + init_cfg=dict(type=PretrainedInit, checkpoint=pretrained)), + panoptic_head=dict( + in_channels=[192, 384, 768, 1536], # pass to pixel_decoder inside + pixel_decoder=dict( + _delete_=True, + type=PixelDecoder, + norm_cfg=dict(type=GroupNorm, num_groups=32), + act_cfg=dict(type=ReLU)), + enforce_decoder_input_project=True))) + +# optimizer + +# weight_decay = 0.01 +# norm_weight_decay = 0.0 +# embed_weight_decay = 0.0 +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +norm_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'norm': norm_multi, + 'absolute_pos_embed': embed_multi, + 'relative_position_bias_table': embed_multi, + 'query_embed': embed_multi +} + +optim_wrapper.update( + dict( + optimizer=dict(lr=6e-5, weight_decay=0.01), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0))) + +max_epochs = 300 + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type=MultiStepLR, + begin=0, + end=max_epochs, + by_epoch=True, + milestones=[250], + gamma=0.1) +] + +train_cfg.update(dict(max_epochs=max_epochs)) + +# NOTE: `auto_scale_lr` is for automatically scaling LR, +# USER SHOULD NOT CHANGE ITS VALUES. +# base_batch_size = (64 GPUs) x (1 samples per GPU) +auto_scale_lr.update(dict(base_batch_size=64)) diff --git a/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c6059780da15b24d4845cdac9ad33d65a6b24e75 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r101_fpn_1x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
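The `auto_scale_lr` entries above record the total batch size each learning rate was tuned for (16 x 1 for the R-50 MaskFormer, 64 x 1 for Swin-L); when enabled, the runner rescales the LR linearly with the actual batch size. A sketch of the rule, assuming mmengine's linear-scaling convention (illustrative, not the runner's code):

```python
# Illustrative sketch of the linear scaling rule behind `auto_scale_lr`,
# assuming mmengine's convention: scale lr by actual_batch / base_batch.
def auto_scaled_lr(base_lr: float, base_batch_size: int,
                   num_gpus: int, samples_per_gpu: int) -> float:
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size

# e.g. running the Swin-L recipe (lr=6e-5, tuned for 64 GPUs x 1 image)
# on 16 GPUs would give a 4x smaller learning rate:
print(auto_scaled_lr(6e-5, base_batch_size=64, num_gpus=16,
                     samples_per_gpu=1))  # 1.5e-05
```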
+from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +with read_base(): + from .panoptic_fpn_r50_fpn_1x_coco import * + +model.update( + dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101')))) diff --git a/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r101_fpn_ms_3x_coco.py b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r101_fpn_ms_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..c02c3237f81df5823ebf60a6d485365cdb655e32 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r101_fpn_ms_3x_coco.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.model.weight_init import PretrainedInit + +with read_base(): + from .panoptic_fpn_r50_fpn_ms_3x_coco import * + +model.update( + dict( + backbone=dict( + depth=101, + init_cfg=dict( + type=PretrainedInit, checkpoint='torchvision://resnet101')))) diff --git a/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fc8932803ca0d1fd52bee7d450fc12898e0ec7b3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_1x_coco.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.mask_rcnn_r50_fpn import * + from .._base_.datasets.coco_panoptic import * + from .._base_.schedules.schedule_1x import * + from .._base_.default_runtime import * + +from mmcv.ops import nms +from torch.nn import GroupNorm + +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.detectors.panoptic_fpn import PanopticFPN +from mmdet.models.losses.cross_entropy_loss import CrossEntropyLoss +from mmdet.models.seg_heads.panoptic_fpn_head import PanopticFPNHead +from mmdet.models.seg_heads.panoptic_fusion_heads import HeuristicFusionHead + +model.update( + dict( + type=PanopticFPN, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_size_divisor=32, + pad_mask=True, + mask_pad_value=0, + pad_seg=True, + seg_pad_value=255), + semantic_head=dict( + type=PanopticFPNHead, + num_things_classes=80, + num_stuff_classes=53, + in_channels=256, + inner_channels=128, + start_level=0, + end_level=4, + norm_cfg=dict(type=GroupNorm, num_groups=32, requires_grad=True), + conv_cfg=None, + loss_seg=dict( + type=CrossEntropyLoss, ignore_index=255, loss_weight=0.5)), + panoptic_fusion_head=dict( + type=HeuristicFusionHead, + num_things_classes=80, + num_stuff_classes=53), + test_cfg=dict( + rcnn=dict( + score_thr=0.6, + nms=dict(type=nms, iou_threshold=0.5, class_agnostic=True), + max_per_img=100, + mask_thr_binary=0.5), + # used in HeuristicFusionHead + panoptic=dict(mask_overlap=0.5, stuff_area_limit=4096)))) + +# Forced to remove NumClassCheckHook +custom_hooks = [] diff --git a/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_ms_3x_coco.py 
b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_ms_3x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..25ebe5d67c44831b5a95978ffc0bfacec7c15de6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/panoptic_fpn/panoptic_fpn_r50_fpn_ms_3x_coco.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base +from mmengine.optim.scheduler.lr_scheduler import LinearLR, MultiStepLR + +with read_base(): + from .panoptic_fpn_r50_fpn_1x_coco import * + +from mmcv.transforms import RandomResize +from mmcv.transforms.loading import LoadImageFromFile + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadPanopticAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip + +# In mstrain 3x config, img_scale=[(1333, 640), (1333, 800)], +# multiscale_mode='range' +train_pipeline = [ + dict(type=LoadImageFromFile), + dict( + type=LoadPanopticAnnotations, + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type=RandomResize, scale=[(1333, 640), (1333, 800)], keep_ratio=True), + dict(type=RandomFlip, prob=0.5), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) + +# TODO: Use RepeatDataset to speed up training +# training schedule for 3x +train_cfg.update(dict(max_epochs=36, val_interval=3)) + +# learning rate +param_scheduler = [ + dict(type=LinearLR, start_factor=0.001, by_epoch=False, begin=0, end=500), + dict( + type=MultiStepLR, + begin=0, + end=36, + by_epoch=True, + milestones=[24, 33], + gamma=0.1) +] diff --git a/head_extractor/build/lib/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_4e_base.py b/head_extractor/build/lib/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_4e_base.py new file mode 100644 index 0000000000000000000000000000000000000000..c672e82c6498092b57c389be01af64a9e26d14bc --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_4e_base.py @@ -0,0 +1,141 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
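The `param_scheduler` pair in the 3x schedule above (a 500-iteration `LinearLR` warmup followed by `MultiStepLR` with milestones at epochs 24 and 33) composes into a single LR multiplier over training. A rough sketch of that multiplier, which mimics but does not reproduce mmengine's scheduler code:

```python
# Rough sketch of the combined 3x schedule above: linear warmup over the
# first 500 iterations, then step decay (gamma=0.1) at epochs 24 and 33.
# Mimics, but is not, mmengine's LinearLR/MultiStepLR implementation.
def lr_factor(epoch: int, iteration: int, warmup_iters: int = 500,
              start_factor: float = 0.001, milestones=(24, 33),
              gamma: float = 0.1) -> float:
    if epoch == 0 and iteration < warmup_iters:
        # LinearLR ramps the factor from start_factor up to 1.0
        return start_factor + (1 - start_factor) * iteration / warmup_iters
    # MultiStepLR multiplies by gamma at each milestone already passed
    return gamma ** sum(epoch >= m for m in milestones)

print(lr_factor(0, 0), lr_factor(12, 0), lr_factor(24, 0), lr_factor(33, 0))
# ~0.001, 1.0, 0.1, 0.01
```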
+from mmengine.config import read_base + +with read_base(): + from .._base_.models.faster_rcnn_r50_fpn import * + from .._base_.models.faster_rcnn_r50_fpn import model + from .._base_.default_runtime import * + +from mmcv.ops import RoIAlign +from mmengine.hooks import LoggerHook, SyncBuffersHook +from mmengine.model.weight_init import PretrainedInit +from mmengine.optim import MultiStepLR, OptimWrapper +from mmengine.runner.runner import EpochBasedTrainLoop, TestLoop, ValLoop +from torch.nn.modules.batchnorm import BatchNorm2d +from torch.nn.modules.normalization import GroupNorm +from torch.optim import SGD + +from mmdet.engine.hooks import TrackVisualizationHook +from mmdet.models import (QDTrack, QuasiDenseEmbedHead, QuasiDenseTracker, + QuasiDenseTrackHead, SingleRoIExtractor, + TrackDataPreprocessor) +from mmdet.models.losses import (L1Loss, MarginL2Loss, + MultiPosCrossEntropyLoss, SmoothL1Loss) +from mmdet.models.task_modules import (CombinedSampler, + InstanceBalancedPosSampler, + MaxIoUAssigner, RandomSampler) +from mmdet.visualization import TrackLocalVisualizer + +detector = model +detector.pop('data_preprocessor') + +detector['backbone'].update( + dict( + norm_cfg=dict(type=BatchNorm2d, requires_grad=False), + style='caffe', + init_cfg=dict( + type=PretrainedInit, + checkpoint='open-mmlab://detectron2/resnet50_caffe'))) +detector.rpn_head.loss_bbox.update( + dict(type=SmoothL1Loss, beta=1.0 / 9.0, loss_weight=1.0)) +detector.rpn_head.bbox_coder.update(dict(clip_border=False)) +detector.roi_head.bbox_head.update(dict(num_classes=1)) +detector.roi_head.bbox_head.bbox_coder.update(dict(clip_border=False)) +detector['init_cfg'] = dict( + type=PretrainedInit, + checkpoint= # noqa: E251 + 'https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/' + 'faster_rcnn_r50_fpn_1x_coco-person/' + 'faster_rcnn_r50_fpn_1x_coco-person_20201216_175929-d022e227.pth' + # noqa: E501 +) +del model + +model = dict( + type=QDTrack, + data_preprocessor=dict( + type=TrackDataPreprocessor, + mean=[103.530, 116.280, 123.675], + std=[1.0, 1.0, 1.0], + bgr_to_rgb=False, + pad_size_divisor=32), + detector=detector, + track_head=dict( + type=QuasiDenseTrackHead, + roi_extractor=dict( + type=SingleRoIExtractor, + roi_layer=dict(type=RoIAlign, output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + embed_head=dict( + type=QuasiDenseEmbedHead, + num_convs=4, + num_fcs=1, + embed_channels=256, + norm_cfg=dict(type=GroupNorm, num_groups=32), + loss_track=dict(type=MultiPosCrossEntropyLoss, loss_weight=0.25), + loss_track_aux=dict( + type=MarginL2Loss, + neg_pos_ub=3, + pos_margin=0, + neg_margin=0.1, + hard_mining=True, + loss_weight=1.0)), + loss_bbox=dict(type=L1Loss, loss_weight=1.0), + train_cfg=dict( + assigner=dict( + type=MaxIoUAssigner, + pos_iou_thr=0.7, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type=CombinedSampler, + num=256, + pos_fraction=0.5, + neg_pos_ub=3, + add_gt_as_proposals=True, + pos_sampler=dict(type=InstanceBalancedPosSampler), + neg_sampler=dict(type=RandomSampler)))), + tracker=dict( + type=QuasiDenseTracker, + init_score_thr=0.9, + obj_score_thr=0.5, + match_score_thr=0.5, + memo_tracklet_frames=30, + memo_backdrop_frames=1, + memo_momentum=0.8, + nms_conf_thr=0.5, + nms_backdrop_iou_thr=0.3, + nms_class_iou_thr=0.7, + with_cats=True, + match_metric='bisoftmax')) +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=SGD, lr=0.02, momentum=0.9, 
weight_decay=0.0001), + clip_grad=dict(max_norm=35, norm_type=2)) +# learning policy +param_scheduler = [ + dict(type=MultiStepLR, begin=0, end=4, by_epoch=True, milestones=[3]) +] + +# runtime settings +train_cfg = dict(type=EpochBasedTrainLoop, max_epochs=4, val_interval=4) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +default_hooks.update( + logger=dict(type=LoggerHook, interval=50), + visualization=dict(type=TrackVisualizationHook, draw=False)) + +visualizer.update( + type=TrackLocalVisualizer, vis_backends=vis_backends, name='visualizer') + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type=SyncBuffersHook) +] diff --git a/head_extractor/build/lib/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py b/head_extractor/build/lib/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py new file mode 100644 index 0000000000000000000000000000000000000000..2fa715e1b3806f9f9816e3b23a100167c791f0b8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/qdtrack/qdtrack_faster_rcnn_r50_fpn_8xb2-4e_mot17halftrain_test-mot17halfval.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import read_base + +with read_base(): + from .._base_.datasets.mot_challenge import * + from .qdtrack_faster_rcnn_r50_fpn_4e_base import * + +from mmdet.evaluation import CocoVideoMetric, MOTChallengeMetric + +# evaluator +val_evaluator = [ + dict(type=CocoVideoMetric, metric=['bbox'], classwise=True), + dict(type=MOTChallengeMetric, metric=['HOTA', 'CLEAR', 'Identity']) +] diff --git a/head_extractor/build/lib/mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py b/head_extractor/build/lib/mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..847600e61b3daf556ff24d06af2f08249deb2284 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.models.retinanet_r50_fpn import * + from .._base_.datasets.coco_detection import * + from .._base_.schedules.schedule_1x import * + from .._base_.default_runtime import * + from .retinanet_tta import * + +from torch.optim.sgd import SGD + +# optimizer +optim_wrapper.update( + dict(optimizer=dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0001))) diff --git a/head_extractor/build/lib/mmdet/configs/retinanet/retinanet_tta.py b/head_extractor/build/lib/mmdet/configs/retinanet/retinanet_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..4e340e5854e58a332ee174b0e69e7f3f9ec2c486 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/retinanet/retinanet_tta.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
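All of these files use the pure-Python config style, composing through `read_base()` rather than string-based `_base_` lists, and they load the same way as classic configs. A minimal loading sketch (the relative path is illustrative and depends on your checkout):

```python
# Minimal sketch: pure-Python-style configs load via mmengine as usual.
# The path is illustrative; point it at your local mmdet checkout.
from mmengine.config import Config

cfg = Config.fromfile('mmdet/configs/retinanet/retinanet_r50_fpn_1x_coco.py')
# Module-level names (model, optim_wrapper, ...) become config keys:
print(cfg.optim_wrapper['optimizer']['lr'])  # 0.01 per the 1x recipe above
```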
+from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import TestTimeAug + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import RandomFlip, Resize +from mmdet.models.test_time_augs.det_tta import DetTTAModel + +tta_model = dict( + type=DetTTAModel, + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.5), max_per_img=100)) + +img_scales = [(1333, 800), (666, 400), (2000, 1200)] +tta_pipeline = [ + dict(type=LoadImageFromFile, backend_args=None), + dict( + type=TestTimeAug, + transforms=[ + [dict(type=Resize, scale=s, keep_ratio=True) for s in img_scales], + [dict(type=RandomFlip, prob=1.), + dict(type=RandomFlip, prob=0.)], + [dict(type=LoadAnnotations, with_bbox=True)], + [ + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_l_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_l_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..302d7cda110b7a598ba525549e1d96d27ee51990 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_l_8xb32_300e_coco.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook +from torch.nn.modules.activation import SiLU + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.dense_heads.rtmdet_ins_head import RTMDetInsSepBNHead +from mmdet.models.layers.ema import ExpMomentumEMA +from mmdet.models.losses.dice_loss import DiceLoss +from mmdet.models.losses.gfocal_loss import QualityFocalLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.task_modules.coders.distance_point_bbox_coder import \ + DistancePointBBoxCoder +from mmdet.models.task_modules.prior_generators.point_generator import \ + MlvlPointGenerator + +model.merge( + dict( + bbox_head=dict( + _delete_=True, + type=RTMDetInsSepBNHead, + num_classes=80, + in_channels=256, + stacked_convs=2, + share_conv=True, + pred_kernel_size=1, + feat_channels=256, + act_cfg=dict(type=SiLU, inplace=True), + norm_cfg=dict(type='SyncBN', requires_grad=True), + anchor_generator=dict( + type=MlvlPointGenerator, offset=0, strides=[8, 16, 32]), + bbox_coder=dict(type=DistancePointBBoxCoder), + loss_cls=dict( + type=QualityFocalLoss, + use_sigmoid=True, + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type=GIoULoss, loss_weight=2.0), + loss_mask=dict( + type=DiceLoss, loss_weight=2.0, eps=5e-6, reduction='mean')), + test_cfg=dict( + nms_pre=1000, + min_bbox_size=0, + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.6), 
+ max_per_img=100, + mask_thr_binary=0.5), + )) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=PackDetInputs) +] + +train_dataloader.update( + dict(pin_memory=True, dataset=dict(pipeline=train_pipeline))) + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] + +val_evaluator.update(dict(metric=['bbox', 'segm'])) +test_evaluator = val_evaluator diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_m_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_m_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..d90be9293a18cfd703ee1d9993b03237fb3c3dab --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_m_8xb32_300e_coco.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_l_8xb32_300e_coco import * + +model.update( + dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict( + in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192))) diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_s_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_s_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..58b5b1aff0cff8d770798288b74237bc5183d37b --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_s_8xb32_300e_coco.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_l_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.layers.ema import ExpMomentumEMA + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa +model.update( + dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.5, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), + bbox_head=dict(in_channels=128, feat_channels=128))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict( + type=RandomCrop, + crop_size=(640, 640), + recompute_bbox=True, + allow_negative_crop=True), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..0356b1951da584034cf65014a39c7440fc3da56d --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_tiny_8xb32_300e_coco.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_s_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import (FilterAnnotations, + LoadAnnotations) +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +model.update( + dict( + backbone=dict( + deepen_factor=0.167, + widen_factor=0.375, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1), + bbox_head=dict(in_channels=96, feat_channels=96))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict( + type=LoadAnnotations, with_bbox=True, with_mask=True, poly2mask=False), + dict( + type=CachedMosaic, + img_scale=(640, 640), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=10, + random_pop=False, + pad_val=(114, 114, 114), + prob=0.5), + dict(type=FilterAnnotations, min_gt_bbox_wh=(1, 1)), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_x_8xb16_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_x_8xb16_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..555b10102f67ee625d65dbfe0894eb4b41198595 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_ins_x_8xb16_300e_coco.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_ins_l_8xb32_300e_coco import * +from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR, LinearLR + +model.update( + dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320))) + +base_lr = 0.002 + +# optimizer +optim_wrapper.update(dict(optimizer=dict(lr=base_lr))) + +# learning rate +param_scheduler = [ + dict( + type=LinearLR, start_factor=1.0e-5, by_epoch=False, begin=0, end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type=CosineAnnealingLR, + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..5dcda7bf994db9f3f5c785d8dea824b3ab8e56a2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_l_8xb32_300e_coco.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .._base_.default_runtime import * + from .._base_.schedules.schedule_1x import * + from .._base_.datasets.coco_detection import * + from .rtmdet_tta import * + +from mmcv.ops import nms +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import CosineAnnealingLR, LinearLR +from torch.nn import SyncBatchNorm +from torch.nn.modules.activation import SiLU +from torch.optim.adamw import AdamW + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.backbones.cspnext import CSPNeXt +from mmdet.models.data_preprocessors.data_preprocessor import \ + DetDataPreprocessor +from mmdet.models.dense_heads.rtmdet_head import RTMDetSepBNHead +from mmdet.models.detectors.rtmdet import RTMDet +from mmdet.models.layers.ema import ExpMomentumEMA +from mmdet.models.losses.gfocal_loss import QualityFocalLoss +from mmdet.models.losses.iou_loss import GIoULoss +from mmdet.models.necks.cspnext_pafpn import CSPNeXtPAFPN +from mmdet.models.task_modules.assigners.dynamic_soft_label_assigner import \ + DynamicSoftLabelAssigner +from mmdet.models.task_modules.coders.distance_point_bbox_coder import \ + DistancePointBBoxCoder +from mmdet.models.task_modules.prior_generators.point_generator import \ + MlvlPointGenerator + +model = dict( + type=RTMDet, + data_preprocessor=dict( + type=DetDataPreprocessor, + mean=[103.53, 116.28, 123.675], + std=[57.375, 57.12, 58.395], + bgr_to_rgb=False, + batch_augments=None), + backbone=dict( + type=CSPNeXt, + arch='P5', + 
expand_ratio=0.5, + deepen_factor=1, + widen_factor=1, + channel_attention=True, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + neck=dict( + type=CSPNeXtPAFPN, + in_channels=[256, 512, 1024], + out_channels=256, + num_csp_blocks=3, + expand_ratio=0.5, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + bbox_head=dict( + type=RTMDetSepBNHead, + num_classes=80, + in_channels=256, + stacked_convs=2, + feat_channels=256, + anchor_generator=dict( + type=MlvlPointGenerator, offset=0, strides=[8, 16, 32]), + bbox_coder=dict(type=DistancePointBBoxCoder), + loss_cls=dict( + type=QualityFocalLoss, use_sigmoid=True, beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type=GIoULoss, loss_weight=2.0), + with_objectness=False, + exp_on_reg=True, + share_conv=True, + pred_kernel_size=1, + norm_cfg=dict(type=SyncBatchNorm), + act_cfg=dict(type=SiLU, inplace=True)), + train_cfg=dict( + assigner=dict(type=DynamicSoftLabelAssigner, topk=13), + allowed_border=-1, + pos_weight=-1, + debug=False), + test_cfg=dict( + nms_pre=30000, + min_bbox_size=0, + score_thr=0.001, + nms=dict(type=nms, iou_threshold=0.65), + max_per_img=300), +) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.1, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +test_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=Resize, scale=(640, 640), keep_ratio=True), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor')) +] + +train_dataloader.update( + dict( + batch_size=32, + num_workers=10, + batch_sampler=None, + pin_memory=True, + dataset=dict(pipeline=train_pipeline))) +val_dataloader.update( + dict(batch_size=5, num_workers=10, dataset=dict(pipeline=test_pipeline))) +test_dataloader = val_dataloader + +max_epochs = 300 +stage2_num_epochs = 20 +base_lr = 0.004 +interval = 10 + +train_cfg.update( + dict( + max_epochs=max_epochs, + val_interval=interval, + dynamic_intervals=[(max_epochs - stage2_num_epochs, 1)])) + +val_evaluator.update(dict(proposal_nums=(100, 1, 10))) +test_evaluator = val_evaluator + +# optimizer +optim_wrapper = dict( + type=OptimWrapper, + optimizer=dict(type=AdamW, lr=base_lr, weight_decay=0.05), + paramwise_cfg=dict( + norm_decay_mult=0, bias_decay_mult=0, bypass_duplicate=True)) + +# learning rate +param_scheduler = [ + dict( + type=LinearLR, start_factor=1.0e-5, 
by_epoch=False, begin=0, end=1000), + dict( + # use cosine lr from 150 to 300 epoch + type=CosineAnnealingLR, + eta_min=base_lr * 0.05, + begin=max_epochs // 2, + end=max_epochs, + T_max=max_epochs // 2, + by_epoch=True, + convert_to_iter_based=True), +] + +# hooks +default_hooks.update( + dict( + checkpoint=dict( + interval=interval, + max_keep_ckpts=3 # only keep latest 3 checkpoints + ))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=max_epochs - stage2_num_epochs, + switch_pipeline=train_pipeline_stage2) +] diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_m_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_m_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..e741d8220fe8831894b7b803060031e18dbac62b --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_m_8xb32_300e_coco.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +model.update( + dict( + backbone=dict(deepen_factor=0.67, widen_factor=0.75), + neck=dict( + in_channels=[192, 384, 768], out_channels=192, num_csp_blocks=2), + bbox_head=dict(in_channels=192, feat_channels=192))) diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..db21b747e95a15c69af1c17c16a5e6cfd4a2be78 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_s_8xb32_300e_coco.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize +from mmengine.hooks.ema_hook import EMAHook + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) +from mmdet.engine.hooks.pipeline_switch_hook import PipelineSwitchHook +from mmdet.models.layers.ema import ExpMomentumEMA + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-s_imagenet_600e.pth' # noqa +model.update( + dict( + backbone=dict( + deepen_factor=0.33, + widen_factor=0.5, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[128, 256, 512], out_channels=128, num_csp_blocks=1), + bbox_head=dict(in_channels=128, feat_channels=128, exp_on_reg=False))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict(type=CachedMosaic, img_scale=(640, 640), pad_val=114.0), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=20, + pad_val=(114, 114, 114)), + dict(type=PackDetInputs) +] + +train_pipeline_stage2 = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=RandomResize, + scale=(640, 640), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) + +custom_hooks = [ + dict( + type=EMAHook, + ema_type=ExpMomentumEMA, + momentum=0.0002, + update_buffers=True, + priority=49), + dict( + type=PipelineSwitchHook, + switch_epoch=280, + switch_pipeline=train_pipeline_stage2) +] diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_tiny_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_tiny_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..949d056f16303751d121aeba8f3d859de07b06d2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_tiny_8xb32_300e_coco.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. 
# noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_s_8xb32_300e_coco import * + +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import RandomResize + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import (CachedMixUp, CachedMosaic, + Pad, RandomCrop, RandomFlip, + Resize, YOLOXHSVRandomAug) + +checkpoint = 'https://download.openmmlab.com/mmdetection/v3.0/rtmdet/cspnext_rsb_pretrain/cspnext-tiny_imagenet_600e.pth' # noqa + +model.update( + dict( + backbone=dict( + deepen_factor=0.167, + widen_factor=0.375, + init_cfg=dict( + type='Pretrained', prefix='backbone.', checkpoint=checkpoint)), + neck=dict( + in_channels=[96, 192, 384], out_channels=96, num_csp_blocks=1), + bbox_head=dict(in_channels=96, feat_channels=96, exp_on_reg=False))) + +train_pipeline = [ + dict(type=LoadImageFromFile, backend_args=backend_args), + dict(type=LoadAnnotations, with_bbox=True), + dict( + type=CachedMosaic, + img_scale=(640, 640), + pad_val=114.0, + max_cached_images=20, + random_pop=False), + dict( + type=RandomResize, + scale=(1280, 1280), + ratio_range=(0.5, 2.0), + resize_type=Resize, + keep_ratio=True), + dict(type=RandomCrop, crop_size=(640, 640)), + dict(type=YOLOXHSVRandomAug), + dict(type=RandomFlip, prob=0.5), + dict(type=Pad, size=(640, 640), pad_val=dict(img=(114, 114, 114))), + dict( + type=CachedMixUp, + img_scale=(640, 640), + ratio_range=(1.0, 1.0), + max_cached_images=10, + random_pop=False, + pad_val=(114, 114, 114), + prob=0.5), + dict(type=PackDetInputs) +] + +train_dataloader.update(dict(dataset=dict(pipeline=train_pipeline))) diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_tta.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..f27b7aa4a3bf13a28cab3e25be755a9792620ece --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_tta.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import TestTimeAug + +from mmdet.datasets.transforms.formatting import PackDetInputs +from mmdet.datasets.transforms.loading import LoadAnnotations +from mmdet.datasets.transforms.transforms import Pad, RandomFlip, Resize +from mmdet.models.test_time_augs.det_tta import DetTTAModel + +tta_model = dict( + type=DetTTAModel, + tta_cfg=dict(nms=dict(type='nms', iou_threshold=0.6), max_per_img=100)) + +img_scales = [(640, 640), (320, 320), (960, 960)] + +tta_pipeline = [ + dict(type=LoadImageFromFile, backend_args=None), + dict( + type=TestTimeAug, + transforms=[ + [dict(type=Resize, scale=s, keep_ratio=True) for s in img_scales], + [ + # ``RandomFlip`` must be placed before ``Pad``, otherwise + # bounding box coordinates after flipping cannot be + # recovered correctly. + dict(type=RandomFlip, prob=1.), + dict(type=RandomFlip, prob=0.) 
+ ], + [ + dict( + type=Pad, + size=(960, 960), + pad_val=dict(img=(114, 114, 114))), + ], + [dict(type=LoadAnnotations, with_bbox=True)], + [ + dict( + type=PackDetInputs, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')) + ] + ]) +] diff --git a/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_x_8xb32_300e_coco.py b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_x_8xb32_300e_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..04d67d0ca8f08860462eb0eafc645c403e792394 --- /dev/null +++ b/head_extractor/build/lib/mmdet/configs/rtmdet/rtmdet_x_8xb32_300e_coco.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Please refer to https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta for more details. # noqa +# mmcv >= 2.0.1 +# mmengine >= 0.8.0 + +from mmengine.config import read_base + +with read_base(): + from .rtmdet_l_8xb32_300e_coco import * + +model.update( + dict( + backbone=dict(deepen_factor=1.33, widen_factor=1.25), + neck=dict( + in_channels=[320, 640, 1280], out_channels=320, num_csp_blocks=4), + bbox_head=dict(in_channels=320, feat_channels=320))) diff --git a/head_extractor/build/lib/mmdet/datasets/__init__.py b/head_extractor/build/lib/mmdet/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..670c207cacf9ed0f9fee88bada119ee3aaa85eae --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/__init__.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .ade20k import (ADE20KInstanceDataset, ADE20KPanopticDataset, + ADE20KSegDataset) +from .base_det_dataset import BaseDetDataset +from .base_semseg_dataset import BaseSegDataset +from .base_video_dataset import BaseVideoDataset +from .cityscapes import CityscapesDataset +from .coco import CocoDataset +from .coco_caption import CocoCaptionDataset +from .coco_panoptic import CocoPanopticDataset +from .coco_semantic import CocoSegDataset +from .crowdhuman import CrowdHumanDataset +from .dataset_wrappers import ConcatDataset, MultiImageMixDataset +from .deepfashion import DeepFashionDataset +from .dod import DODDataset +from .dsdl import DSDLDetDataset +from .flickr30k import Flickr30kDataset +from .isaid import iSAIDDataset +from .lvis import LVISDataset, LVISV1Dataset, LVISV05Dataset +from .mdetr_style_refcoco import MDETRStyleRefCocoDataset +from .mot_challenge_dataset import MOTChallengeDataset +from .objects365 import Objects365V1Dataset, Objects365V2Dataset +from .odvg import ODVGDataset +from .openimages import OpenImagesChallengeDataset, OpenImagesDataset +from .refcoco import RefCocoDataset +from .reid_dataset import ReIDDataset +from .samplers import (AspectRatioBatchSampler, ClassAwareSampler, + CustomSampleSizeSampler, GroupMultiSourceSampler, + MultiSourceSampler, TrackAspectRatioBatchSampler, + TrackImgSampler) +from .utils import get_loading_pipeline +from .v3det import V3DetDataset +from .voc import VOCDataset +from .wider_face import WIDERFaceDataset +from .xml_style import XMLDataset +from .youtube_vis_dataset import YouTubeVISDataset + +__all__ = [ + 'XMLDataset', 'CocoDataset', 'DeepFashionDataset', 'VOCDataset', + 'CityscapesDataset', 'LVISDataset', 'LVISV05Dataset', 'LVISV1Dataset', + 'WIDERFaceDataset', 'get_loading_pipeline', 'CocoPanopticDataset', + 'MultiImageMixDataset', 'OpenImagesDataset', 'OpenImagesChallengeDataset', + 'AspectRatioBatchSampler', 'ClassAwareSampler', 
'MultiSourceSampler', + 'GroupMultiSourceSampler', 'BaseDetDataset', 'CrowdHumanDataset', + 'Objects365V1Dataset', 'Objects365V2Dataset', 'DSDLDetDataset', + 'BaseVideoDataset', 'MOTChallengeDataset', 'TrackImgSampler', + 'ReIDDataset', 'YouTubeVISDataset', 'TrackAspectRatioBatchSampler', + 'ADE20KPanopticDataset', 'CocoCaptionDataset', 'RefCocoDataset', + 'BaseSegDataset', 'ADE20KSegDataset', 'CocoSegDataset', + 'ADE20KInstanceDataset', 'iSAIDDataset', 'V3DetDataset', 'ConcatDataset', + 'ODVGDataset', 'MDETRStyleRefCocoDataset', 'DODDataset', + 'CustomSampleSizeSampler', 'Flickr30kDataset' +] diff --git a/head_extractor/build/lib/mmdet/datasets/ade20k.py b/head_extractor/build/lib/mmdet/datasets/ade20k.py new file mode 100644 index 0000000000000000000000000000000000000000..573271cb5d0cb83571564272895bddde9a5f6ad7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/ade20k.py @@ -0,0 +1,260 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List + +from mmengine import fileio + +from mmdet.registry import DATASETS +from .base_semseg_dataset import BaseSegDataset +from .coco import CocoDataset +from .coco_panoptic import CocoPanopticDataset + +ADE_PALETTE = [(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50), + (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255), + (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7), + (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82), + (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3), + (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255), + (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220), + (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224), + (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153), + (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255), + (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0), + (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255), + (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255), + (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255), + (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0), + (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0), + (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255), + (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255), + (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20), + (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255), + (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255), + (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255), + (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0), + (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255), + (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112), + (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160), + (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163), + (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0), + (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0), + (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255), + (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204), + (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255), + (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255), + (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255)] + + 
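+# ADE_PALETTE is shared by the panoptic and semantic ADE20K datasets
+# below and is expected to provide one RGB color per category; an
+# illustrative sanity check (not executed anywhere in this module):
+#
+#     assert len(ADE_PALETTE) == 150
+#     assert all(len(rgb) == 3 for rgb in ADE_PALETTE)
+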
+@DATASETS.register_module() +class ADE20KPanopticDataset(CocoPanopticDataset): + METAINFO = { + 'classes': + ('bed', 'window', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror', + 'armchair', 'seat', 'fence', 'desk', 'wardrobe, closet, press', + 'lamp', 'tub', 'rail', 'cushion', 'box', 'column, pillar', + 'signboard, sign', 'chest of drawers, chest, bureau, dresser', + 'counter', 'sink', 'fireplace', 'refrigerator, icebox', 'stairs', + 'case, display case, showcase, vitrine', + 'pool table, billiard table, snooker table', 'pillow', + 'screen door, screen', 'bookcase', 'coffee table', + 'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower', + 'book', 'bench', 'countertop', 'stove', 'palm, palm tree', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning, sunshade, sunblind', 'street lamp', 'booth', 'tv', + 'airplane', 'clothes', 'pole', + 'bannister, banister, balustrade, balusters, handrail', + 'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'van', 'ship', + 'fountain', 'washer, automatic washer, washing machine', + 'plaything, toy', 'stool', 'barrel, cask', 'basket, handbasket', + 'bag', 'minibike, motorbike', 'oven', 'ball', 'food, solid food', + 'step, stair', 'trade name', 'microwave', 'pot', 'animal', 'bicycle', + 'dishwasher', 'screen', 'sculpture', 'hood, exhaust hood', 'sconce', + 'vase', 'traffic light', 'tray', 'trash can', 'fan', 'plate', + 'monitor', 'bulletin board', 'radiator', 'glass, drinking glass', + 'clock', 'flag', 'wall', 'building', 'sky', 'floor', 'tree', + 'ceiling', 'road, route', 'grass', 'sidewalk, pavement', + 'earth, ground', 'mountain, mount', 'plant', 'water', 'house', 'sea', + 'rug', 'field', 'rock, stone', 'base, pedestal, stand', 'sand', + 'skyscraper', 'grandstand, covered stand', 'path', 'runway', + 'stairway, staircase', 'river', 'bridge, span', 'blind, screen', + 'hill', 'bar', 'hovel, hut, hutch, shack, shanty', 'tower', + 'dirt track', 'land, ground, soil', + 'escalator, moving staircase, moving stairway', + 'buffet, counter, sideboard', + 'poster, posting, placard, notice, bill, card', 'stage', + 'conveyer belt, conveyor belt, conveyer, conveyor, transporter', + 'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank', + 'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'), + 'thing_classes': + ('bed', 'window', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting, picture', 'sofa', 'shelf', 'mirror', + 'armchair', 'seat', 'fence', 'desk', 'wardrobe, closet, press', + 'lamp', 'tub', 'rail', 'cushion', 'box', 'column, pillar', + 'signboard, sign', 'chest of drawers, chest, bureau, dresser', + 'counter', 'sink', 'fireplace', 'refrigerator, icebox', 'stairs', + 'case, display case, showcase, vitrine', + 'pool table, billiard table, snooker table', 'pillow', + 'screen door, screen', 'bookcase', 'coffee table', + 'toilet, can, commode, crapper, pot, potty, stool, throne', 'flower', + 'book', 'bench', 'countertop', 'stove', 'palm, palm tree', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning, sunshade, sunblind', 'street lamp', 'booth', 'tv', + 'airplane', 'clothes', 'pole', + 'bannister, banister, balustrade, balusters, handrail', + 'ottoman, pouf, pouffe, puff, hassock', 'bottle', 'van', 'ship', + 'fountain', 'washer, automatic washer, washing machine', + 'plaything, toy', 
'stool', 'barrel, cask', 'basket, handbasket', + 'bag', 'minibike, motorbike', 'oven', 'ball', 'food, solid food', + 'step, stair', 'trade name', 'microwave', 'pot', 'animal', 'bicycle', + 'dishwasher', 'screen', 'sculpture', 'hood, exhaust hood', 'sconce', + 'vase', 'traffic light', 'tray', 'trash can', 'fan', 'plate', + 'monitor', 'bulletin board', 'radiator', 'glass, drinking glass', + 'clock', 'flag'), + 'stuff_classes': + ('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road, route', + 'grass', 'sidewalk, pavement', 'earth, ground', 'mountain, mount', + 'plant', 'water', 'house', 'sea', 'rug', 'field', 'rock, stone', + 'base, pedestal, stand', 'sand', 'skyscraper', + 'grandstand, covered stand', 'path', 'runway', 'stairway, staircase', + 'river', 'bridge, span', 'blind, screen', 'hill', 'bar', + 'hovel, hut, hutch, shack, shanty', 'tower', 'dirt track', + 'land, ground, soil', 'escalator, moving staircase, moving stairway', + 'buffet, counter, sideboard', + 'poster, posting, placard, notice, bill, card', 'stage', + 'conveyer belt, conveyor belt, conveyer, conveyor, transporter', + 'canopy', 'pool', 'falls', 'tent', 'cradle', 'tank, storage tank', + 'lake', 'blanket, cover', 'pier', 'crt screen', 'shower'), + 'palette': + ADE_PALETTE + } + + +@DATASETS.register_module() +class ADE20KInstanceDataset(CocoDataset): + METAINFO = { + 'classes': + ('bed', 'windowpane', 'cabinet', 'person', 'door', 'table', 'curtain', + 'chair', 'car', 'painting', 'sofa', 'shelf', 'mirror', 'armchair', + 'seat', 'fence', 'desk', 'wardrobe', 'lamp', 'bathtub', 'railing', + 'cushion', 'box', 'column', 'signboard', 'chest of drawers', + 'counter', 'sink', 'fireplace', 'refrigerator', 'stairs', 'case', + 'pool table', 'pillow', 'screen door', 'bookcase', 'coffee table', + 'toilet', 'flower', 'book', 'bench', 'countertop', 'stove', 'palm', + 'kitchen island', 'computer', 'swivel chair', 'boat', + 'arcade machine', 'bus', 'towel', 'light', 'truck', 'chandelier', + 'awning', 'streetlight', 'booth', 'television receiver', 'airplane', + 'apparel', 'pole', 'bannister', 'ottoman', 'bottle', 'van', 'ship', + 'fountain', 'washer', 'plaything', 'stool', 'barrel', 'basket', 'bag', + 'minibike', 'oven', 'ball', 'food', 'step', 'trade name', 'microwave', + 'pot', 'animal', 'bicycle', 'dishwasher', 'screen', 'sculpture', + 'hood', 'sconce', 'vase', 'traffic light', 'tray', 'ashcan', 'fan', + 'plate', 'monitor', 'bulletin board', 'radiator', 'glass', 'clock', + 'flag'), + 'palette': [(204, 5, 255), (230, 230, 230), (224, 5, 255), + (150, 5, 61), (8, 255, 51), (255, 6, 82), (255, 51, 7), + (204, 70, 3), (0, 102, 200), (255, 6, 51), (11, 102, 255), + (255, 7, 71), (220, 220, 220), (8, 255, 214), + (7, 255, 224), (255, 184, 6), (10, 255, 71), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (0, 255, 20), (255, 8, 41), (255, 5, 153), (6, 51, 255), + (235, 12, 255), (0, 163, 255), (250, 10, 15), (20, 255, 0), + (255, 224, 0), (0, 0, 255), (255, 71, 0), (0, 235, 255), + (0, 173, 255), (0, 255, 245), (0, 255, 112), (0, 255, 133), + (255, 0, 0), (255, 163, 0), (194, 255, 0), (0, 143, 255), + (51, 255, 0), (0, 82, 255), (0, 255, 41), (0, 255, 173), + (10, 0, 255), (173, 255, 0), (255, 92, 0), (255, 0, 245), + (255, 0, 102), (255, 173, 0), (255, 0, 20), (0, 31, 255), + (0, 255, 61), (0, 71, 255), (255, 0, 204), (0, 255, 194), + (0, 255, 82), (0, 112, 255), (51, 0, 255), (0, 122, 255), + (255, 153, 0), (0, 255, 10), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (184, 0, 255), (255, 0, 31), (0, 214, 
255), + (255, 0, 112), (92, 255, 0), (70, 184, 160), (163, 0, 255), + (71, 255, 0), (255, 0, 163), (255, 204, 0), (255, 0, 143), + (133, 255, 0), (255, 0, 235), (245, 0, 255), (255, 0, 122), + (255, 245, 0), (214, 255, 0), (0, 204, 255), (255, 255, 0), + (0, 153, 255), (0, 41, 255), (0, 255, 204), (41, 0, 255), + (41, 255, 0), (173, 0, 255), (0, 245, 255), (0, 255, 184), + (0, 92, 255), (184, 255, 0), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255)], + } + + +@DATASETS.register_module() +class ADE20KSegDataset(BaseSegDataset): + """ADE20K dataset. + + In segmentation map annotation for ADE20K, 0 stands for background, which + is not included in 150 categories. The ``img_suffix`` is fixed to '.jpg', + and ``seg_map_suffix`` is fixed to '.png'. + """ + METAINFO = dict( + classes=('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', + 'bed ', 'windowpane', 'grass', 'cabinet', 'sidewalk', + 'person', 'earth', 'door', 'table', 'mountain', 'plant', + 'curtain', 'chair', 'car', 'water', 'painting', 'sofa', + 'shelf', 'house', 'sea', 'mirror', 'rug', 'field', 'armchair', + 'seat', 'fence', 'desk', 'rock', 'wardrobe', 'lamp', + 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', + 'path', 'stairs', 'runway', 'case', 'pool table', 'pillow', + 'screen door', 'stairway', 'river', 'bridge', 'bookcase', + 'blind', 'coffee table', 'toilet', 'flower', 'book', 'hill', + 'bench', 'countertop', 'stove', 'palm', 'kitchen island', + 'computer', 'swivel chair', 'boat', 'bar', 'arcade machine', + 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', + 'television receiver', 'airplane', 'dirt track', 'apparel', + 'pole', 'land', 'bannister', 'escalator', 'ottoman', 'bottle', + 'buffet', 'poster', 'stage', 'van', 'ship', 'fountain', + 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', + 'tent', 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', + 'step', 'tank', 'trade name', 'microwave', 'pot', 'animal', + 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket', + 'sculpture', 'hood', 'sconce', 'vase', 'traffic light', + 'tray', 'ashcan', 'fan', 'pier', 'crt screen', 'plate', + 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag'), + palette=ADE_PALETTE) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + return_classes=False, + **kwargs) -> None: + self.return_classes = return_classes + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + List[dict]: All data info of dataset. 
+ """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + ann_dir = self.data_prefix.get('seg_map_path', None) + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + data_info = dict(img_path=osp.join(img_dir, img)) + if ann_dir is not None: + seg_map = img.replace(self.img_suffix, self.seg_map_suffix) + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + if self.return_classes: + data_info['text'] = list(self._metainfo['classes']) + data_list.append(data_info) + return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/api_wrappers/__init__.py b/head_extractor/build/lib/mmdet/datasets/api_wrappers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3c41a2f87b14d10339955208e0502aeeeb7082 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/api_wrappers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .coco_api import COCO, COCOeval, COCOPanoptic +from .cocoeval_mp import COCOevalMP + +__all__ = ['COCO', 'COCOeval', 'COCOPanoptic', 'COCOevalMP'] diff --git a/head_extractor/build/lib/mmdet/datasets/api_wrappers/coco_api.py b/head_extractor/build/lib/mmdet/datasets/api_wrappers/coco_api.py new file mode 100644 index 0000000000000000000000000000000000000000..b2d11a122e1860d1b097710ff98adfddc1508c5a --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/api_wrappers/coco_api.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This file add snake case alias for coco api + +import warnings +from collections import defaultdict +from typing import List, Optional, Union + +import pycocotools +from pycocotools.coco import COCO as _COCO +from pycocotools.cocoeval import COCOeval as _COCOeval + + +class COCO(_COCO): + """This class is almost the same as official pycocotools package. + + It implements some snake case function aliases. So that the COCO class has + the same interface as LVIS class. + """ + + def __init__(self, annotation_file=None): + if getattr(pycocotools, '__version__', '0') >= '12.0.2': + warnings.warn( + 'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"', # noqa: E501 + UserWarning) + super().__init__(annotation_file=annotation_file) + self.img_ann_map = self.imgToAnns + self.cat_img_map = self.catToImgs + + def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None): + return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd) + + def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]): + return self.getCatIds(cat_names, sup_names, cat_ids) + + def get_img_ids(self, img_ids=[], cat_ids=[]): + return self.getImgIds(img_ids, cat_ids) + + def load_anns(self, ids): + return self.loadAnns(ids) + + def load_cats(self, ids): + return self.loadCats(ids) + + def load_imgs(self, ids): + return self.loadImgs(ids) + + +# just for the ease of import +COCOeval = _COCOeval + + +class COCOPanoptic(COCO): + """This wrapper is for loading the panoptic style annotation file. + + The format is shown in the CocoPanopticDataset class. + + Args: + annotation_file (str, optional): Path of annotation file. + Defaults to None. 
+ """ + + def __init__(self, annotation_file: Optional[str] = None) -> None: + super(COCOPanoptic, self).__init__(annotation_file) + + def createIndex(self) -> None: + """Create index.""" + # create index + print('creating index...') + # anns stores 'segment_id -> annotation' + anns, cats, imgs = {}, {}, {} + img_to_anns, cat_to_imgs = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + for seg_ann in ann['segments_info']: + # to match with instance.json + seg_ann['image_id'] = ann['image_id'] + img_to_anns[ann['image_id']].append(seg_ann) + # segment_id is not unique in coco dataset orz... + # annotations from different images but + # may have same segment_id + if seg_ann['id'] in anns.keys(): + anns[seg_ann['id']].append(seg_ann) + else: + anns[seg_ann['id']] = [seg_ann] + + # filter out annotations from other images + img_to_anns_ = defaultdict(list) + for k, v in img_to_anns.items(): + img_to_anns_[k] = [x for x in v if x['image_id'] == k] + img_to_anns = img_to_anns_ + + if 'images' in self.dataset: + for img_info in self.dataset['images']: + img_info['segm_file'] = img_info['file_name'].replace( + '.jpg', '.png') + imgs[img_info['id']] = img_info + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + for seg_ann in ann['segments_info']: + cat_to_imgs[seg_ann['category_id']].append(ann['image_id']) + + print('index created!') + + self.anns = anns + self.imgToAnns = img_to_anns + self.catToImgs = cat_to_imgs + self.imgs = imgs + self.cats = cats + + def load_anns(self, + ids: Union[List[int], int] = []) -> Optional[List[dict]]: + """Load anns with the specified ids. + + ``self.anns`` is a list of annotation lists instead of a + list of annotations. + + Args: + ids (Union[List[int], int]): Integer ids specifying anns. + + Returns: + anns (List[dict], optional): Loaded ann objects. + """ + anns = [] + + if hasattr(ids, '__iter__') and hasattr(ids, '__len__'): + # self.anns is a list of annotation lists instead of + # a list of annotations + for id in ids: + anns += self.anns[id] + return anns + elif type(ids) == int: + return self.anns[ids] diff --git a/head_extractor/build/lib/mmdet/datasets/api_wrappers/cocoeval_mp.py b/head_extractor/build/lib/mmdet/datasets/api_wrappers/cocoeval_mp.py new file mode 100644 index 0000000000000000000000000000000000000000..b3673ea7a7edc593cb49fb336f352a20c1b1015b --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/api_wrappers/cocoeval_mp.py @@ -0,0 +1,296 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import itertools +import time +from collections import defaultdict + +import numpy as np +import torch.multiprocessing as mp +from mmengine.logging import MMLogger +from pycocotools.cocoeval import COCOeval +from tqdm import tqdm + + +class COCOevalMP(COCOeval): + + def _prepare(self): + ''' + Prepare ._gts and ._dts for evaluation based on params + :return: None + ''' + + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + rle = coco.annToRLE(ann) + ann['segmentation'] = rle + + p = self.params + if p.useCats: + gts = [] + dts = [] + img_ids = set(p.imgIds) + cat_ids = set(p.catIds) + for gt in self.cocoGt.dataset['annotations']: + if (gt['category_id'] in cat_ids) and (gt['image_id'] + in img_ids): + gts.append(gt) + for dt in self.cocoDt.dataset['annotations']: + if (dt['category_id'] in cat_ids) and (dt['image_id'] + in img_ids): + dts.append(dt) + # gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # noqa + # dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) # noqa + # gts=self.cocoGt.dataset['annotations'] + # dts=self.cocoDt.dataset['annotations'] + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == 'segm': + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + # set ignore flag + for gt in gts: + gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 + gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] + if p.iouType == 'keypoints': + gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + for gt in gts: + self._gts[gt['image_id'], gt['category_id']].append(gt) + for dt in dts: + self._dts[dt['image_id'], dt['category_id']].append(dt) + self.evalImgs = defaultdict( + list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + """Run per image evaluation on given images and store results (a list + of dict) in self.evalImgs. + + :return: None + """ + tic = time.time() + print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'. 
+ format(p.iouType)) + print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + nproc = 8 + split_size = len(catIds) // nproc + mp_params = [] + for i in range(nproc): + begin = i * split_size + end = (i + 1) * split_size + if i == nproc - 1: + end = len(catIds) + mp_params.append((catIds[begin:end], )) + + MMLogger.get_current_instance().info( + 'start multi processing evaluation ...') + with mp.Pool(nproc) as pool: + self.evalImgs = pool.starmap(self._evaluateImg, mp_params) + + self.evalImgs = list(itertools.chain(*self.evalImgs)) + + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def _evaluateImg(self, catids_chunk): + self._prepare() + p = self.params + maxDet = max(p.maxDets) + all_params = [] + for catId in catids_chunk: + for areaRng in p.areaRng: + for imgId in p.imgIds: + all_params.append((catId, areaRng, imgId)) + evalImgs = [ + self.evaluateImg(imgId, catId, areaRng, maxDet) + for catId, areaRng, imgId in tqdm(all_params) + ] + return evalImgs + + def evaluateImg(self, imgId, catId, aRng, maxDet): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return None + + for g in gt: + if g['ignore'] or (g['area'] < aRng[0] or g['area'] > aRng[1]): + g['_ignore'] = 1 + else: + g['_ignore'] = 0 + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in dtind[0:maxDet]] + iscrowd = [int(o['iscrowd']) for o in gt] + # load computed ious + # ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] # noqa + ious = self.computeIoU(imgId, catId) + ious = ious[:, gtind] if len(ious) > 0 else ious + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T, G)) + dtm = np.zeros((T, D)) + gtIg = np.array([g['_ignore'] for g in gt]) + dtIg = np.zeros((T, D)) + if not len(ious) == 0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + for gind, g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + # continue to next gt unless better match made + if ious[dind, gind] < iou: + continue + # if match successful and best so far, + # store appropriately + iou = ious[dind, gind] + m = gind + # if match made store id of match for both dt and gt + if m == -1: + continue + dtIg[tind, dind] = gtIg[m] + dtm[tind, dind] = gt[m]['id'] + gtm[tind, m] = d['id'] + # set unmatched detections outside of area range to ignore + a = np.array([d['area'] < aRng[0] or d['area'] > aRng[1] + for d in dt]).reshape((1, len(dt))) + dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, + 0))) + # store results for given image and category + + 
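+        # ``dtm``/``gtm`` hold the id of the matched counterpart per IoU
+        # threshold (0 means unmatched), while ``dtIg``/``gtIg`` flag
+        # entries that ``accumulate()`` must exclude from the precision
+        # and recall statistics.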
return { + 'image_id': imgId, + 'category_id': catId, + 'aRng': aRng, + 'maxDet': maxDet, + 'dtIds': [d['id'] for d in dt], + 'gtIds': [g['id'] for g in gt], + 'dtMatches': dtm, + 'gtMatches': gtm, + 'dtScores': [d['score'] for d in dt], + 'gtIgnore': gtIg, + 'dtIgnore': dtIg, + } + + def summarize(self): + """Compute and display summary metrics for evaluation results. + + Note this function can *only* be applied on the default parameter + setting + """ + + def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100): + p = self.params + iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' # noqa + titleStr = 'Average Precision' if ap == 1 else 'Average Recall' + typeStr = '(AP)' if ap == 1 else '(AR)' + iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ + if iouThr is None else '{:0.2f}'.format(iouThr) + + aind = [ + i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng + ] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval['precision'] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval['recall'] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + print( + iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, + mean_s)) + return mean_s + + def _summarizeDets(): + stats = [] + stats.append(_summarize(1, maxDets=self.params.maxDets[-1])) + stats.append( + _summarize(1, iouThr=.5, maxDets=self.params.maxDets[-1])) + stats.append( + _summarize(1, iouThr=.75, maxDets=self.params.maxDets[-1])) + for area_rng in ('small', 'medium', 'large'): + stats.append( + _summarize( + 1, areaRng=area_rng, maxDets=self.params.maxDets[-1])) + for max_det in self.params.maxDets: + stats.append(_summarize(0, maxDets=max_det)) + for area_rng in ('small', 'medium', 'large'): + stats.append( + _summarize( + 0, areaRng=area_rng, maxDets=self.params.maxDets[-1])) + stats = np.array(stats) + return stats + + def _summarizeKps(): + stats = np.zeros((10, )) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=.5) + stats[2] = _summarize(1, maxDets=20, iouThr=.75) + stats[3] = _summarize(1, maxDets=20, areaRng='medium') + stats[4] = _summarize(1, maxDets=20, areaRng='large') + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=.5) + stats[7] = _summarize(0, maxDets=20, iouThr=.75) + stats[8] = _summarize(0, maxDets=20, areaRng='medium') + stats[9] = _summarize(0, maxDets=20, areaRng='large') + return stats + + if not self.eval: + raise Exception('Please run accumulate() first') + iouType = self.params.iouType + if iouType == 'segm' or iouType == 'bbox': + summarize = _summarizeDets + elif iouType == 'keypoints': + summarize = _summarizeKps + self.stats = summarize() diff --git a/head_extractor/build/lib/mmdet/datasets/base_det_dataset.py b/head_extractor/build/lib/mmdet/datasets/base_det_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8b3876d5c06eb7d3741a29fe8b0963a7e425ec1b --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/base_det_dataset.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
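+# Concrete subclasses (e.g. ``CocoDataset``) are normally built from a
+# config dict via the registry rather than instantiated directly; a
+# minimal sketch with hypothetical paths:
+#
+#     from mmdet.registry import DATASETS
+#     dataset = DATASETS.build(
+#         dict(
+#             type='CocoDataset',
+#             data_root='data/coco/',
+#             ann_file='annotations/instances_train2017.json',
+#             data_prefix=dict(img='train2017/'),
+#             pipeline=[]))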
+import os.path as osp
+from typing import List, Optional
+
+from mmengine.dataset import BaseDataset
+from mmengine.fileio import load
+from mmengine.utils import is_abs
+
+from ..registry import DATASETS
+
+
+@DATASETS.register_module()
+class BaseDetDataset(BaseDataset):
+    """Base dataset for detection.
+
+    Args:
+        proposal_file (str, optional): Proposals file path. Defaults to None.
+        file_client_args (dict): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        return_classes (bool): Whether to return class information
+            for open vocabulary-based algorithms. Defaults to False.
+        caption_prompt (dict, optional): Prompt for captioning.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 *args,
+                 seg_map_suffix: str = '.png',
+                 proposal_file: Optional[str] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None,
+                 return_classes: bool = False,
+                 caption_prompt: Optional[dict] = None,
+                 **kwargs) -> None:
+        self.seg_map_suffix = seg_map_suffix
+        self.proposal_file = proposal_file
+        self.backend_args = backend_args
+        self.return_classes = return_classes
+        self.caption_prompt = caption_prompt
+        if self.caption_prompt is not None:
+            assert self.return_classes, \
+                'return_classes must be True when using caption_prompt'
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to '
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+        super().__init__(*args, **kwargs)
+
+    def full_init(self) -> None:
+        """Load annotation file and set ``BaseDataset._fully_initialized`` to
+        True.
+
+        If ``lazy_init=False``, ``full_init`` will be called during the
+        instantiation and ``self._fully_initialized`` will be set to True. If
+        ``obj._fully_initialized=False``, the class method decorated by
+        ``force_full_init`` will call ``full_init`` automatically.
+
+        Several steps to initialize annotation:
+
+            - load_data_list: Load annotations from annotation file.
+            - load_proposals: Load proposals from proposal file, if
+              `self.proposal_file` is not None.
+            - filter data information: Filter annotations according to
+              filter_cfg.
+            - slice_data: Slice dataset according to ``self._indices``
+            - serialize_data: Serialize ``self.data_list`` if
+              ``self.serialize_data`` is True.
+        """
+        if self._fully_initialized:
+            return
+        # load data information
+        self.data_list = self.load_data_list()
+        # get proposals from file
+        if self.proposal_file is not None:
+            self.load_proposals()
+        # filter illegal data, such as data that has no annotations.
+        self.data_list = self.filter_data()
+
+        # Get subset data according to indices.
+        if self._indices is not None:
+            self.data_list = self._get_unserialized_subset(self._indices)
+
+        # serialize data_list
+        if self.serialize_data:
+            self.data_bytes, self.data_address = self._serialize_data()
+
+        self._fully_initialized = True
+
+    def load_proposals(self) -> None:
+        """Load proposals from the proposals file.
+
+        The `proposals_list` should be a dict[img_path: proposals]
+        with the same length as `data_list`, and each `proposals` entry
+        should be a `dict` or :obj:`InstanceData` that usually contains
+        the following keys.
+
+        - bboxes (np.ndarray): Has a shape (num_instances, 4),
+          the last dimension 4 arranged as (x1, y1, x2, y2).
+        - scores (np.ndarray): Classification scores, has a shape
+          (num_instance, ).
+        """
+        # TODO: Add Unit Test after Dump-Proposal Metric is fully supported
+        if not is_abs(self.proposal_file):
+            self.proposal_file = osp.join(self.data_root, self.proposal_file)
+        proposals_list = load(
+            self.proposal_file, backend_args=self.backend_args)
+        assert len(self.data_list) == len(proposals_list)
+        for data_info in self.data_list:
+            img_path = data_info['img_path']
+            # `file_name` is the key to obtain the proposals from the
+            # `proposals_list`.
+            file_name = osp.join(
+                osp.split(osp.split(img_path)[0])[-1],
+                osp.split(img_path)[-1])
+            proposals = proposals_list[file_name]
+            data_info['proposals'] = proposals
+
+    def get_cat_ids(self, idx: int) -> List[int]:
+        """Get COCO category ids by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            List[int]: All categories in the image of specified index.
+        """
+        instances = self.get_data_info(idx)['instances']
+        return [instance['bbox_label'] for instance in instances]
diff --git a/head_extractor/build/lib/mmdet/datasets/base_semseg_dataset.py b/head_extractor/build/lib/mmdet/datasets/base_semseg_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d10f762a21a897ab8274fbe9eefab054691a7c60
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/datasets/base_semseg_dataset.py
@@ -0,0 +1,265 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from typing import Callable, Dict, List, Optional, Sequence, Union
+
+import mmengine
+import mmengine.fileio as fileio
+import numpy as np
+from mmengine.dataset import BaseDataset, Compose
+
+from mmdet.registry import DATASETS
+
+
+@DATASETS.register_module()
+class BaseSegDataset(BaseDataset):
+    """Custom dataset for semantic segmentation. An example of the file
+    structure is as follows.
+
+    .. code-block:: none
+
+        ├── data
+        │   ├── my_dataset
+        │   │   ├── img_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{img_suffix}
+        │   │   │   │   ├── yyy{img_suffix}
+        │   │   │   │   ├── zzz{img_suffix}
+        │   │   │   ├── val
+        │   │   ├── ann_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{seg_map_suffix}
+        │   │   │   │   ├── yyy{seg_map_suffix}
+        │   │   │   │   ├── zzz{seg_map_suffix}
+        │   │   │   ├── val
+
+    The img/gt_semantic_seg pair of BaseSegDataset should be the same
+    except for the suffix. A valid img/gt_semantic_seg filename pair should
+    be like ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (the extension
+    is also included in the suffix). If split is given, then ``xxx`` is
+    specified in the txt file. Otherwise, all files in ``img_dir/`` and
+    ``ann_dir`` will be loaded. Please refer to
+    ``docs/en/tutorials/new_dataset.md`` for more details.
+
+
+    Args:
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for dataset, such as
+            specify classes to load. Defaults to None.
+        data_root (str, optional): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to None.
+        data_prefix (dict, optional): Prefix for training data. Defaults to
+            dict(img_path=None, seg_map_path=None).
+        img_suffix (str): Suffix of images. Default: '.jpg'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
+        filter_cfg (dict, optional): Config for filter data. Defaults to None.
+        indices (int or Sequence[int], optional): Support using first few
+            data in annotation file to facilitate training/testing on a
+            smaller dataset. Defaults to None, which means using all
+            ``data_infos``.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects, when enabled, data loader workers can use
+            shared RAM from master process instead of making a copy. Defaults
+            to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotation during
+            instantiation. In some cases, such as visualization, only the
+            meta information of the dataset is needed, so it is not
+            necessary to load the annotation file. ``Basedataset`` can skip
+            loading annotations to save time by setting ``lazy_init=True``.
+            Defaults to False.
+        use_label_map (bool, optional): Whether to use label map.
+            Defaults to False.
+        max_refetch (int, optional): If ``Basedataset.prepare_data`` gets a
+            None img, the maximum number of extra cycles to get a valid
+            image. Defaults to 1000.
+        backend_args (dict, optional): Arguments to instantiate a file
+            backend. See
+            https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to None.
+            Notes: mmcv>=2.0.0rc4 required.
+    """
+    METAINFO: dict = dict()
+
+    def __init__(self,
+                 ann_file: str = '',
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = None,
+                 data_prefix: dict = dict(img_path='', seg_map_path=''),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 use_label_map: bool = False,
+                 max_refetch: int = 1000,
+                 backend_args: Optional[dict] = None) -> None:
+
+        self.img_suffix = img_suffix
+        self.seg_map_suffix = seg_map_suffix
+        self.backend_args = backend_args.copy() if backend_args else None
+
+        self.data_root = data_root
+        self.data_prefix = copy.copy(data_prefix)
+        self.ann_file = ann_file
+        self.filter_cfg = copy.deepcopy(filter_cfg)
+        self._indices = indices
+        self.serialize_data = serialize_data
+        self.test_mode = test_mode
+        self.max_refetch = max_refetch
+        self.data_list: List[dict] = []
+        self.data_bytes: np.ndarray
+
+        # Set meta information.
+        self._metainfo = self._load_metainfo(copy.deepcopy(metainfo))
+
+        # Get label map for custom classes
+        new_classes = self._metainfo.get('classes', None)
+        self.label_map = self.get_label_map(
+            new_classes) if use_label_map else None
+        self._metainfo.update(dict(label_map=self.label_map))
+
+        # Update palette based on label map or generate palette
+        # if it is not defined
+        updated_palette = self._update_palette()
+        self._metainfo.update(dict(palette=updated_palette))
+
+        # Join paths.
+        if self.data_root is not None:
+            self._join_prefix()
+
+        # Build pipeline.
+        self.pipeline = Compose(pipeline)
+        # Full initialize the dataset.
+        if not lazy_init:
+            self.full_init()
+
+        if test_mode:
+            assert self._metainfo.get('classes') is not None, \
+                'dataset metainfo `classes` should be specified when testing'
+
+    @classmethod
+    def get_label_map(cls,
+                      new_classes: Optional[Sequence] = None
+                      ) -> Union[Dict, None]:
+        """Require label mapping.
+
+        The ``label_map`` is a dictionary whose keys are the old label ids
+        and whose values are the new label ids; it is used for changing
+        pixel labels in load_annotations. ``label_map`` is not None if and
+        only if the old classes in cls.METAINFO are not equal to the new
+        classes in self._metainfo and neither of them is None.
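+
+        For example, with ``cls.METAINFO['classes'] = ('person', 'car',
+        'bicycle')`` and ``new_classes = ('car', 'bicycle')``, the method
+        returns ``{0: 0, 1: 0, 2: 1}``: the dropped class 'person' is sent
+        to 0 (background) and each kept class to its index in
+        ``new_classes``.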
+
+        Args:
+            new_classes (list, tuple, optional): The new class names from
+                metainfo. Defaults to None.
+
+
+        Returns:
+            dict, optional: The mapping from old classes in cls.METAINFO to
+            new classes in self._metainfo.
+        """
+        old_classes = cls.METAINFO.get('classes', None)
+        if (new_classes is not None and old_classes is not None
+                and list(new_classes) != list(old_classes)):
+
+            label_map = {}
+            if not set(new_classes).issubset(cls.METAINFO['classes']):
+                raise ValueError(
+                    f'new classes {new_classes} is not a '
+                    f'subset of classes {old_classes} in METAINFO.')
+            for i, c in enumerate(old_classes):
+                if c not in new_classes:
+                    # 0 is background
+                    label_map[i] = 0
+                else:
+                    label_map[i] = new_classes.index(c)
+            return label_map
+        else:
+            return None
+
+    def _update_palette(self) -> list:
+        """Update palette after loading metainfo.
+
+        If the length of the palette equals the number of classes, just
+        return the palette. If the palette is not defined, a random palette
+        will be generated. If the classes are updated by the user, the
+        matching subset of the palette is returned.
+
+        Returns:
+            Sequence: Palette for current dataset.
+        """
+        palette = self._metainfo.get('palette', [])
+        classes = self._metainfo.get('classes', [])
+        # palette matches classes
+        if len(palette) == len(classes):
+            return palette
+
+        if len(palette) == 0:
+            # Get random state before set seed, and restore
+            # random state later.
+            # It will prevent loss of randomness, as the palette
+            # may be different in each iteration if not specified.
+            # See: https://github.com/open-mmlab/mmdetection/issues/5844
+            state = np.random.get_state()
+            np.random.seed(42)
+            # random palette
+            new_palette = np.random.randint(
+                0, 255, size=(len(classes), 3)).tolist()
+            np.random.set_state(state)
+        elif len(palette) >= len(classes) and self.label_map is not None:
+            new_palette = []
+            # return subset of palette
+            for old_id, new_id in sorted(
+                    self.label_map.items(), key=lambda x: x[1]):
+                # 0 is background
+                if new_id != 0:
+                    new_palette.append(palette[old_id])
+            new_palette = type(palette)(new_palette)
+        elif len(palette) >= len(classes):
+            # Allow the palette to be longer than the class list.
+            return palette
+        else:
+            raise ValueError('palette does not match classes '
+                             f'as metainfo is {self._metainfo}.')
+        return new_palette
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from directory or annotation file.
+
+        Returns:
+            list[dict]: All data info of dataset.
+ """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + ann_dir = self.data_prefix.get('seg_map_path', None) + if not osp.isdir(self.ann_file) and self.ann_file: + assert osp.isfile(self.ann_file), \ + f'Failed to load `ann_file` {self.ann_file}' + lines = mmengine.list_from_file( + self.ann_file, backend_args=self.backend_args) + for line in lines: + img_name = line.strip() + data_info = dict( + img_path=osp.join(img_dir, img_name + self.img_suffix)) + if ann_dir is not None: + seg_map = img_name + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_list.append(data_info) + else: + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + data_info = dict(img_path=osp.join(img_dir, img)) + if ann_dir is not None: + seg_map = img.replace(self.img_suffix, self.seg_map_suffix) + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_list.append(data_info) + data_list = sorted(data_list, key=lambda x: x['img_path']) + return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/base_video_dataset.py b/head_extractor/build/lib/mmdet/datasets/base_video_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..0a4a7a25f16206f06c7b64a7ce4c3588efd5455e --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/base_video_dataset.py @@ -0,0 +1,304 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from collections import defaultdict +from typing import Any, List, Tuple + +import mmengine.fileio as fileio +from mmengine.dataset import BaseDataset +from mmengine.logging import print_log + +from mmdet.datasets.api_wrappers import COCO +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class BaseVideoDataset(BaseDataset): + """Base video dataset for VID, MOT and VIS tasks.""" + + META = dict(classes=None) + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def __init__(self, *args, backend_args: dict = None, **kwargs): + self.backend_args = backend_args + super().__init__(*args, **kwargs) + + def load_data_list(self) -> Tuple[List[dict], List]: + """Load annotations from an annotation file named as ``self.ann_file``. + + Returns: + tuple(list[dict], list): A list of annotation and a list of + valid data indices. + """ + with fileio.get_local_path(self.ann_file) as local_path: + self.coco = COCO(local_path) + # The order of returned `cat_ids` will not + # change with the order of the classes + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + # used in `filter_data` + self.img_ids_with_ann = set() + + img_ids = self.coco.get_img_ids() + total_ann_ids = [] + # if ``video_id`` is not in the annotation file, we will assign a big + # unique video_id for this video. 
+        single_video_id = 100000
+        videos = {}
+        for img_id in img_ids:
+            raw_img_info = self.coco.load_imgs([img_id])[0]
+            raw_img_info['img_id'] = img_id
+            if 'video_id' not in raw_img_info:
+                single_video_id = single_video_id + 1
+                video_id = single_video_id
+            else:
+                video_id = raw_img_info['video_id']
+
+            if video_id not in videos:
+                videos[video_id] = {
+                    'video_id': video_id,
+                    'images': [],
+                    'video_length': 0
+                }
+
+            videos[video_id]['video_length'] += 1
+            ann_ids = self.coco.get_ann_ids(
+                img_ids=[img_id], cat_ids=self.cat_ids)
+            raw_ann_info = self.coco.load_anns(ann_ids)
+            total_ann_ids.extend(ann_ids)
+
+            parsed_data_info = self.parse_data_info(
+                dict(raw_img_info=raw_img_info, raw_ann_info=raw_ann_info))
+
+            if len(parsed_data_info['instances']) > 0:
+                self.img_ids_with_ann.add(parsed_data_info['img_id'])
+
+            videos[video_id]['images'].append(parsed_data_info)
+
+        data_list = [v for v in videos.values()]
+
+        if self.ANN_ID_UNIQUE:
+            assert len(set(total_ann_ids)) == len(
+                total_ann_ids
+            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+
+        del self.coco
+
+        return data_list
+
+    def parse_data_info(self, raw_data_info: dict) -> dict:
+        """Parse raw annotation to target format.
+
+        Args:
+            raw_data_info (dict): Raw data information loaded from
+                ``ann_file``.
+
+        Returns:
+            dict: Parsed annotation.
+        """
+        img_info = raw_data_info['raw_img_info']
+        ann_info = raw_data_info['raw_ann_info']
+        data_info = {}
+
+        data_info.update(img_info)
+        if self.data_prefix.get('img_path', None) is not None:
+            img_path = osp.join(self.data_prefix['img_path'],
+                                img_info['file_name'])
+        else:
+            img_path = img_info['file_name']
+        data_info['img_path'] = img_path
+
+        instances = []
+        for i, ann in enumerate(ann_info):
+            instance = {}
+
+            if ann.get('ignore', False):
+                continue
+            x1, y1, w, h = ann['bbox']
+            inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
+            inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
+            if inter_w * inter_h == 0:
+                continue
+            if ann['area'] <= 0 or w < 1 or h < 1:
+                continue
+            if ann['category_id'] not in self.cat_ids:
+                continue
+            bbox = [x1, y1, x1 + w, y1 + h]
+
+            if ann.get('iscrowd', False):
+                instance['ignore_flag'] = 1
+            else:
+                instance['ignore_flag'] = 0
+            instance['bbox'] = bbox
+            instance['bbox_label'] = self.cat2label[ann['category_id']]
+            if ann.get('segmentation', None):
+                instance['mask'] = ann['segmentation']
+            if ann.get('instance_id', None):
+                instance['instance_id'] = ann['instance_id']
+            else:
+                # image dataset usually has no `instance_id`.
+                # Therefore, we set it to `i`.
+                instance['instance_id'] = i
+            instances.append(instance)
+        data_info['instances'] = instances
+        return data_info
+
+    def filter_data(self) -> List[dict]:
+        """Filter image annotations according to filter_cfg.
+
+        Returns:
+            list[dict]: Filtered results.
+ """ + if self.test_mode: + return self.data_list + + num_imgs_before_filter = sum( + [len(info['images']) for info in self.data_list]) + num_imgs_after_filter = 0 + + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= self.img_ids_with_ann + + new_data_list = [] + for video_data_info in self.data_list: + imgs_data_info = video_data_info['images'] + valid_imgs_data_info = [] + + for data_info in imgs_data_info: + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + # TODO: simplify these conditions + if self.filter_cfg is None: + if img_id not in ids_in_cat: + video_data_info['video_length'] -= 1 + continue + if min(width, height) >= 32: + valid_imgs_data_info.append(data_info) + num_imgs_after_filter += 1 + else: + video_data_info['video_length'] -= 1 + else: + if self.filter_cfg.get('filter_empty_gt', + True) and img_id not in ids_in_cat: + video_data_info['video_length'] -= 1 + continue + if min(width, height) >= self.filter_cfg.get( + 'min_size', 32): + valid_imgs_data_info.append(data_info) + num_imgs_after_filter += 1 + else: + video_data_info['video_length'] -= 1 + video_data_info['images'] = valid_imgs_data_info + new_data_list.append(video_data_info) + + print_log( + 'The number of samples before and after filtering: ' + f'{num_imgs_before_filter} / {num_imgs_after_filter}', 'current') + return new_data_list + + def prepare_data(self, idx) -> Any: + """Get date processed by ``self.pipeline``. Note that ``idx`` is a + video index in default since the base element of video dataset is a + video. However, in some cases, we need to specific both the video index + and frame index. For example, in traing mode, we may want to sample the + specific frames and all the frames must be sampled once in a epoch; in + test mode, we may want to output data of a single image rather than the + whole video for saving memory. + + Args: + idx (int): The index of ``data_info``. + + Returns: + Any: Depends on ``self.pipeline``. 
+ """ + if isinstance(idx, tuple): + assert len(idx) == 2, 'The length of idx must be 2: ' + '(video_index, frame_index)' + video_idx, frame_idx = idx[0], idx[1] + else: + video_idx, frame_idx = idx, None + + data_info = self.get_data_info(video_idx) + if self.test_mode: + # Support two test_mode: frame-level and video-level + final_data_info = defaultdict(list) + if frame_idx is None: + frames_idx_list = list(range(data_info['video_length'])) + else: + frames_idx_list = [frame_idx] + for index in frames_idx_list: + frame_ann = data_info['images'][index] + frame_ann['video_id'] = data_info['video_id'] + # Collate data_list (list of dict to dict of list) + for key, value in frame_ann.items(): + final_data_info[key].append(value) + # copy the info in video-level into img-level + # TODO: the value of this key is the same as that of + # `video_length` in test mode + final_data_info['ori_video_length'].append( + data_info['video_length']) + + final_data_info['video_length'] = [len(frames_idx_list) + ] * len(frames_idx_list) + return self.pipeline(final_data_info) + else: + # Specify `key_frame_id` for the frame sampling in the pipeline + if frame_idx is not None: + data_info['key_frame_id'] = frame_idx + return self.pipeline(data_info) + + def get_cat_ids(self, index) -> List[int]: + """Following image detection, we provide this interface function. Get + category ids by video index and frame index. + + Args: + index: The index of the dataset. It support two kinds of inputs: + Tuple: + video_idx (int): Index of video. + frame_idx (int): Index of frame. + Int: Index of video. + + Returns: + List[int]: All categories in the image of specified video index + and frame index. + """ + if isinstance(index, tuple): + assert len( + index + ) == 2, f'Expect the length of index is 2, but got {len(index)}' + video_idx, frame_idx = index + instances = self.get_data_info( + video_idx)['images'][frame_idx]['instances'] + return [instance['bbox_label'] for instance in instances] + else: + cat_ids = [] + for img in self.get_data_info(index)['images']: + for instance in img['instances']: + cat_ids.append(instance['bbox_label']) + return cat_ids + + @property + def num_all_imgs(self): + """Get the number of all the images in this video dataset.""" + return sum( + [len(self.get_data_info(i)['images']) for i in range(len(self))]) + + def get_len_per_video(self, idx): + """Get length of one video. + + Args: + idx (int): Index of video. + + Returns: + int (int): The length of the video. + """ + return len(self.get_data_info(idx)['images']) diff --git a/head_extractor/build/lib/mmdet/datasets/cityscapes.py b/head_extractor/build/lib/mmdet/datasets/cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..09755eb1e8b0f0c278085bd2fafbb7247a3fc946 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/cityscapes.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/datasets/cityscapes.py # noqa +# and https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa + +from typing import List + +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class CityscapesDataset(CocoDataset): + """Dataset for Cityscapes.""" + + METAINFO = { + 'classes': ('person', 'rider', 'car', 'truck', 'bus', 'train', + 'motorcycle', 'bicycle'), + 'palette': [(220, 20, 60), (255, 0, 0), (0, 0, 142), (0, 0, 70), + (0, 60, 100), (0, 80, 100), (0, 0, 230), (119, 11, 32)] + } + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. + """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + all_is_crowd = all([ + instance['ignore_flag'] == 1 + for instance in data_info['instances'] + ]) + if filter_empty_gt and (img_id not in ids_in_cat or all_is_crowd): + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/head_extractor/build/lib/mmdet/datasets/coco.py b/head_extractor/build/lib/mmdet/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..1cf21c4e667e3b565ea01d1eb95bcdbf171b90d0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/coco.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import os.path as osp +from typing import List, Union + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .api_wrappers import COCO +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class CocoDataset(BaseDetDataset): + """Dataset for COCO.""" + + METAINFO = { + 'classes': + ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush'), + # palette is a list of color tuples, which is used for visualization. + 'palette': + [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228), + (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30), + (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30), + (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255), + (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255), + (199, 100, 0), (72, 0, 118), (255, 179, 240), (0, 125, 92), + (209, 0, 151), (188, 208, 182), (0, 220, 176), (255, 99, 164), + (92, 0, 73), (133, 129, 255), (78, 180, 255), (0, 228, 0), + (174, 255, 243), (45, 89, 255), (134, 134, 103), (145, 148, 174), + (255, 208, 186), (197, 226, 255), (171, 134, 1), (109, 63, 54), + (207, 138, 255), (151, 0, 95), (9, 80, 61), (84, 105, 51), + (74, 65, 105), (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208)] + } + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. 
+ """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + + data_info = {} + + # TODO: need to change data_prefix['img'] to data_prefix['img_path'] + img_path = osp.join(self.data_prefix['img'], img_info['file_name']) + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].rsplit('.', 1)[0] + self.seg_map_suffix) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + if self.return_classes: + data_info['text'] = self.metainfo['classes'] + data_info['caption_prompt'] = self.caption_prompt + data_info['custom_entities'] = True + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + + if ann.get('segmentation', None): + instance['mask'] = ann['segmentation'] + + instances.append(instance) + data_info['instances'] = instances + return data_info + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. 
+ """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + # obtain images that contain annotation + ids_with_ann = set(data_info['img_id'] for data_info in self.data_list) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and img_id not in ids_in_cat: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/head_extractor/build/lib/mmdet/datasets/coco_caption.py b/head_extractor/build/lib/mmdet/datasets/coco_caption.py new file mode 100644 index 0000000000000000000000000000000000000000..ee695fe9a768f2be5345c6ad6bafc74177f252c0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/coco_caption.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from pathlib import Path +from typing import List + +import mmengine +from mmengine.dataset import BaseDataset +from mmengine.fileio import get_file_backend + +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class CocoCaptionDataset(BaseDataset): + """COCO2014 Caption dataset.""" + + def load_data_list(self) -> List[dict]: + """Load data list.""" + img_prefix = self.data_prefix['img_path'] + annotations = mmengine.load(self.ann_file) + file_backend = get_file_backend(img_prefix) + + data_list = [] + for ann in annotations: + data_info = { + 'img_id': Path(ann['image']).stem.split('_')[-1], + 'img_path': file_backend.join_path(img_prefix, ann['image']), + 'gt_caption': ann['caption'], + } + + data_list.append(data_info) + + return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/coco_panoptic.py b/head_extractor/build/lib/mmdet/datasets/coco_panoptic.py new file mode 100644 index 0000000000000000000000000000000000000000..b7a200e01d323e998afa782797e1cc92f75c70cf --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/coco_panoptic.py @@ -0,0 +1,292 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Callable, List, Optional, Sequence, Union + +from mmdet.registry import DATASETS +from .api_wrappers import COCOPanoptic +from .coco import CocoDataset + + +@DATASETS.register_module() +class CocoPanopticDataset(CocoDataset): + """Coco dataset for Panoptic segmentation. + + The annotation format is shown as follows. The `ann` field is optional + for testing. + + .. code-block:: none + + [ + { + 'filename': f'{image_id:012}.png', + 'image_id':9 + 'segments_info': + [ + { + 'id': 8345037, (segment_id in panoptic png, + convert from rgb) + 'category_id': 51, + 'iscrowd': 0, + 'bbox': (x1, y1, w, h), + 'area': 24315 + }, + ... + ] + }, + ... + ] + + Args: + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as class + information. Defaults to None. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to None. 
+ data_prefix (dict, optional): Prefix for training data. Defaults to + ``dict(img=None, ann=None, seg=None)``. The prefix ``seg`` which is + for panoptic segmentation map must be not None. + filter_cfg (dict, optional): Config for filter data. Defaults to None. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=False``. Defaults to False. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Defaults to 1000. + """ + + METAINFO = { + 'classes': + ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'bridge', 'cardboard', 'counter', 'curtain', 'door-stuff', + 'floor-wood', 'flower', 'fruit', 'gravel', 'house', 'light', + 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield', + 'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow', + 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile', + 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', + 'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged', + 'paper-merged', 'food-other-merged', 'building-other-merged', + 'rock-merged', 'wall-other-merged', 'rug-merged'), + 'thing_classes': + ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 
'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush'), + 'stuff_classes': + ('banner', 'blanket', 'bridge', 'cardboard', 'counter', 'curtain', + 'door-stuff', 'floor-wood', 'flower', 'fruit', 'gravel', 'house', + 'light', 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield', + 'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow', + 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile', + 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', + 'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged', + 'paper-merged', 'food-other-merged', 'building-other-merged', + 'rock-merged', 'wall-other-merged', 'rug-merged'), + 'palette': + [(220, 20, 60), (119, 11, 32), (0, 0, 142), (0, 0, 230), (106, 0, 228), + (0, 60, 100), (0, 80, 100), (0, 0, 70), (0, 0, 192), (250, 170, 30), + (100, 170, 30), (220, 220, 0), (175, 116, 175), (250, 0, 30), + (165, 42, 42), (255, 77, 255), (0, 226, 252), (182, 182, 255), + (0, 82, 0), (120, 166, 157), (110, 76, 0), (174, 57, 255), + (199, 100, 0), (72, 0, 118), (255, 179, 240), (0, 125, 92), + (209, 0, 151), (188, 208, 182), (0, 220, 176), (255, 99, 164), + (92, 0, 73), (133, 129, 255), (78, 180, 255), (0, 228, 0), + (174, 255, 243), (45, 89, 255), (134, 134, 103), (145, 148, 174), + (255, 208, 186), (197, 226, 255), (171, 134, 1), (109, 63, 54), + (207, 138, 255), (151, 0, 95), (9, 80, 61), (84, 105, 51), + (74, 65, 105), (166, 196, 102), (208, 195, 210), (255, 109, 65), + (0, 143, 149), (179, 0, 194), (209, 99, 106), (5, 121, 0), + (227, 255, 205), (147, 186, 208), (153, 69, 1), (3, 95, 161), + (163, 255, 0), (119, 0, 170), (0, 182, 199), (0, 165, 120), + (183, 130, 88), (95, 32, 0), (130, 114, 135), (110, 129, 133), + (166, 74, 118), (219, 142, 185), (79, 210, 114), (178, 90, 62), + (65, 70, 15), (127, 167, 115), (59, 105, 106), (142, 108, 45), + (196, 172, 0), (95, 54, 80), (128, 76, 255), (201, 57, 1), + (246, 0, 122), (191, 162, 208), (255, 255, 128), (147, 211, 203), + (150, 100, 100), (168, 171, 172), (146, 112, 198), (210, 170, 100), + (92, 136, 89), (218, 88, 184), (241, 129, 0), (217, 17, 255), + (124, 74, 181), (70, 70, 70), (255, 228, 255), (154, 208, 0), + (193, 0, 92), (76, 91, 113), (255, 180, 195), (106, 154, 176), + (230, 150, 140), (60, 143, 255), (128, 64, 128), (92, 82, 55), + (254, 212, 124), (73, 77, 174), (255, 160, 98), (255, 255, 255), + (104, 84, 109), (169, 164, 131), (225, 199, 255), (137, 54, 74), + (135, 158, 223), (7, 246, 231), (107, 255, 200), (58, 41, 149), + (183, 121, 142), (255, 73, 97), (107, 142, 35), (190, 153, 153), + (146, 139, 141), (70, 130, 180), (134, 199, 156), (209, 226, 140), + (96, 36, 108), (96, 96, 96), (64, 170, 64), (152, 251, 152), + (208, 229, 228), (206, 186, 171), (152, 161, 64), (116, 112, 0), + (0, 114, 143), (102, 102, 156), (250, 141, 255)] + } + COCOAPI = COCOPanoptic + # ann_id is not unique in coco panoptic dataset. 
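+    # (segment ids are only unique within each image's panoptic PNG, so
+    # the ann-id uniqueness assertion in ``load_data_list`` is disabled)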
+ ANN_ID_UNIQUE = False + + def __init__(self, + ann_file: str = '', + metainfo: Optional[dict] = None, + data_root: Optional[str] = None, + data_prefix: dict = dict(img=None, ann=None, seg=None), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + max_refetch: int = 1000, + backend_args: dict = None, + **kwargs) -> None: + super().__init__( + ann_file=ann_file, + metainfo=metainfo, + data_root=data_root, + data_prefix=data_prefix, + filter_cfg=filter_cfg, + indices=indices, + serialize_data=serialize_data, + pipeline=pipeline, + test_mode=test_mode, + lazy_init=lazy_init, + max_refetch=max_refetch, + backend_args=backend_args, + **kwargs) + + def parse_data_info(self, raw_data_info: dict) -> dict: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file``. + + Returns: + dict: Parsed annotation. + """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + # filter out unmatched annotations which have + # same segment_id but belong to other image + ann_info = [ + ann for ann in ann_info if ann['image_id'] == img_info['img_id'] + ] + data_info = {} + + img_path = osp.join(self.data_prefix['img'], img_info['file_name']) + if self.data_prefix.get('seg', None): + seg_map_path = osp.join( + self.data_prefix['seg'], + img_info['file_name'].replace('.jpg', '.png')) + else: + seg_map_path = None + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['seg_map_path'] = seg_map_path + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + if self.return_classes: + data_info['text'] = self.metainfo['thing_classes'] + data_info['stuff_text'] = self.metainfo['stuff_classes'] + data_info['custom_entities'] = True # no important + + instances = [] + segments_info = [] + for ann in ann_info: + instance = {} + x1, y1, w, h = ann['bbox'] + if ann['area'] <= 0 or w < 1 or h < 1: + continue + bbox = [x1, y1, x1 + w, y1 + h] + category_id = ann['category_id'] + contiguous_cat_id = self.cat2label[category_id] + + is_thing = self.coco.load_cats(ids=category_id)[0]['isthing'] + if is_thing: + is_crowd = ann.get('iscrowd', False) + instance['bbox'] = bbox + instance['bbox_label'] = contiguous_cat_id + if not is_crowd: + instance['ignore_flag'] = 0 + else: + instance['ignore_flag'] = 1 + is_thing = False + + segment_info = { + 'id': ann['id'], + 'category': contiguous_cat_id, + 'is_thing': is_thing + } + segments_info.append(segment_info) + if len(instance) > 0 and is_thing: + instances.append(instance) + data_info['instances'] = instances + data_info['segments_info'] = segments_info + return data_info + + def filter_data(self) -> List[dict]: + """Filter images too small or without ground truth. + + Returns: + List[dict]: ``self.data_list`` after filtering. + """ + if self.test_mode: + return self.data_list + + if self.filter_cfg is None: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) + min_size = self.filter_cfg.get('min_size', 0) + + ids_with_ann = set() + # check whether images have legal thing annotations. 
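+        # (an image qualifies as soon as one of its segments is a thing;
+        # images with only stuff segments count as empty and may be dropped)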
+ for data_info in self.data_list: + for segment_info in data_info['segments_info']: + if not segment_info['is_thing']: + continue + ids_with_ann.add(data_info['img_id']) + + valid_data_list = [] + for data_info in self.data_list: + img_id = data_info['img_id'] + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and img_id not in ids_with_ann: + continue + if min(width, height) >= min_size: + valid_data_list.append(data_info) + + return valid_data_list diff --git a/head_extractor/build/lib/mmdet/datasets/coco_semantic.py b/head_extractor/build/lib/mmdet/datasets/coco_semantic.py new file mode 100644 index 0000000000000000000000000000000000000000..752568454456c1e5edcb2a24c6c2b46f042cb334 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/coco_semantic.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .ade20k import ADE20KSegDataset + + +@DATASETS.register_module() +class CocoSegDataset(ADE20KSegDataset): + """COCO dataset. + + In segmentation map annotation for COCO. The ``img_suffix`` is fixed to + '.jpg', and ``seg_map_suffix`` is fixed to '.png'. + """ + + METAINFO = dict( + classes=( + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', + 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'branch', 'bridge', 'building-other', 'bush', 'cabinet', + 'cage', 'cardboard', 'carpet', 'ceiling-other', 'ceiling-tile', + 'cloth', 'clothes', 'clouds', 'counter', 'cupboard', 'curtain', + 'desk-stuff', 'dirt', 'door-stuff', 'fence', 'floor-marble', + 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood', 'flower', + 'fog', 'food-other', 'fruit', 'furniture-other', 'grass', 'gravel', + 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat', 'metal', + 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net', + 'paper', 'pavement', 'pillow', 'plant-other', 'plastic', + 'platform', 'playingfield', 'railing', 'railroad', 'river', 'road', + 'rock', 'roof', 'rug', 'salad', 'sand', 'sea', 'shelf', + 'sky-other', 'skyscraper', 'snow', 'solid-other', 'stairs', + 'stone', 'straw', 'structural-other', 'table', 'tent', + 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick', + 'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'waterdrops', + 'window-blind', 'window-other', 'wood'), + palette=[(120, 120, 120), (180, 120, 120), (6, 230, 230), (80, 50, 50), + (4, 200, 3), (120, 120, 80), (140, 140, 140), (204, 5, 255), + (230, 230, 230), (4, 250, 7), (224, 5, 255), (235, 255, 7), + (150, 5, 61), (120, 120, 70), (8, 255, 51), (255, 6, 82), + (143, 255, 140), (204, 255, 4), (255, 51, 7), (204, 70, 3), + (0, 102, 200), (61, 230, 250), (255, 6, 51), (11, 102, 255), 
+ (255, 7, 71), (255, 9, 224), (9, 7, 230), (220, 220, 220), + (255, 9, 92), (112, 9, 255), (8, 255, 214), (7, 255, 224), + (255, 184, 6), (10, 255, 71), (255, 41, 10), (7, 255, 255), + (224, 255, 8), (102, 8, 255), (255, 61, 6), (255, 194, 7), + (255, 122, 8), (0, 255, 20), (255, 8, 41), (255, 5, 153), + (6, 51, 255), (235, 12, 255), (160, 150, 20), (0, 163, 255), + (140, 140, 140), (250, 10, 15), (20, 255, 0), (31, 255, 0), + (255, 31, 0), (255, 224, 0), (153, 255, 0), (0, 0, 255), + (255, 71, 0), (0, 235, 255), (0, 173, 255), (31, 0, 255), + (11, 200, 200), (255, 82, 0), (0, 255, 245), (0, 61, 255), + (0, 255, 112), (0, 255, 133), (255, 0, 0), (255, 163, 0), + (255, 102, 0), (194, 255, 0), (0, 143, 255), (51, 255, 0), + (0, 82, 255), (0, 255, 41), (0, 255, 173), (10, 0, 255), + (173, 255, 0), (0, 255, 153), (255, 92, 0), (255, 0, 255), + (255, 0, 245), (255, 0, 102), (255, 173, 0), (255, 0, 20), + (255, 184, 184), (0, 31, 255), (0, 255, 61), (0, 71, 255), + (255, 0, 204), (0, 255, 194), (0, 255, 82), (0, 10, 255), + (0, 112, 255), (51, 0, 255), (0, 194, 255), (0, 122, 255), + (0, 255, 163), (255, 153, 0), (0, 255, 10), (255, 112, 0), + (143, 255, 0), (82, 0, 255), (163, 255, 0), (255, 235, 0), + (8, 184, 170), (133, 0, 255), (0, 255, 92), (184, 0, 255), + (255, 0, 31), (0, 184, 255), (0, 214, 255), (255, 0, 112), + (92, 255, 0), (0, 224, 255), (112, 224, 255), (70, 184, 160), + (163, 0, 255), (153, 0, 255), (71, 255, 0), (255, 0, 163), + (255, 204, 0), (255, 0, 143), (0, 255, 235), (133, 255, 0), + (255, 0, 235), (245, 0, 255), (255, 0, 122), (255, 245, 0), + (10, 190, 212), (214, 255, 0), (0, 204, 255), (20, 0, 255), + (255, 255, 0), (0, 153, 255), (0, 41, 255), (0, 255, 204), + (41, 0, 255), (41, 255, 0), (173, 0, 255), (0, 245, 255), + (71, 0, 255), (122, 0, 255), (0, 255, 184), (0, 92, 255), + (184, 255, 0), (0, 133, 255), (255, 214, 0), (25, 194, 194), + (102, 255, 0), (92, 0, 255), (107, 255, 200), (58, 41, 149), + (183, 121, 142), (255, 73, 97), (107, 142, 35), + (190, 153, 153), (146, 139, 141), (70, 130, 180), + (134, 199, 156), (209, 226, 140), (96, 36, 108), (96, 96, 96), + (64, 170, 64), (152, 251, 152), (208, 229, 228), + (206, 186, 171), (152, 161, 64), (116, 112, 0), (0, 114, 143), + (102, 102, 156), (250, 141, 255)]) diff --git a/head_extractor/build/lib/mmdet/datasets/crowdhuman.py b/head_extractor/build/lib/mmdet/datasets/crowdhuman.py new file mode 100644 index 0000000000000000000000000000000000000000..650176ee545ba6a10a816517553b3b77718d945b --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/crowdhuman.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import logging +import os.path as osp +import warnings +from typing import List, Union + +import mmcv +from mmengine.dist import get_rank +from mmengine.fileio import dump, get, get_text, load +from mmengine.logging import print_log +from mmengine.utils import ProgressBar + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class CrowdHumanDataset(BaseDetDataset): + r"""Dataset for CrowdHuman. + + Args: + data_root (str): The root directory for + ``data_prefix`` and ``ann_file``. + ann_file (str): Annotation file path. + extra_ann_file (str | optional):The path of extra image metas + for CrowdHuman. It can be created by CrowdHumanDataset + automatically or by tools/misc/get_crowdhuman_id_hw.py + manually. Defaults to None. 
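+
+    Examples:
+        >>> # illustrative sketch; the paths are assumptions
+        >>> dataset = CrowdHumanDataset(
+        ...     data_root='data/CrowdHuman/',
+        ...     ann_file='annotation_train.odgt',
+        ...     data_prefix=dict(img='Images/'))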
+ """ + + METAINFO = { + 'classes': ('person', ), + # palette is a list of color tuples, which is used for visualization. + 'palette': [(220, 20, 60)] + } + + def __init__(self, data_root, ann_file, extra_ann_file=None, **kwargs): + # extra_ann_file record the size of each image. This file is + # automatically created when you first load the CrowdHuman + # dataset by mmdet. + if extra_ann_file is not None: + self.extra_ann_exist = True + self.extra_anns = load(extra_ann_file) + else: + ann_file_name = osp.basename(ann_file) + if 'train' in ann_file_name: + self.extra_ann_file = osp.join(data_root, 'id_hw_train.json') + elif 'val' in ann_file_name: + self.extra_ann_file = osp.join(data_root, 'id_hw_val.json') + self.extra_ann_exist = False + if not osp.isfile(self.extra_ann_file): + print_log( + 'extra_ann_file does not exist, prepare to collect ' + 'image height and width...', + level=logging.INFO) + self.extra_anns = {} + else: + self.extra_ann_exist = True + self.extra_anns = load(self.extra_ann_file) + super().__init__(data_root=data_root, ann_file=ann_file, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + anno_strs = get_text( + self.ann_file, backend_args=self.backend_args).strip().split('\n') + print_log('loading CrowdHuman annotation...', level=logging.INFO) + data_list = [] + prog_bar = ProgressBar(len(anno_strs)) + for i, anno_str in enumerate(anno_strs): + anno_dict = json.loads(anno_str) + parsed_data_info = self.parse_data_info(anno_dict) + data_list.append(parsed_data_info) + prog_bar.update() + if not self.extra_ann_exist and get_rank() == 0: + # TODO: support file client + try: + dump(self.extra_anns, self.extra_ann_file, file_format='json') + except: # noqa + warnings.warn( + 'Cache files can not be saved automatically! To speed up' + 'loading the dataset, please manually generate the cache' + ' file by file tools/misc/get_crowdhuman_id_hw.py') + + print_log( + f'\nsave extra_ann_file in {self.data_root}', + level=logging.INFO) + + del self.extra_anns + print_log('\nDone', level=logging.INFO) + return data_list + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. 
+ """ + data_info = {} + img_path = osp.join(self.data_prefix['img'], + f"{raw_data_info['ID']}.jpg") + data_info['img_path'] = img_path + data_info['img_id'] = raw_data_info['ID'] + + if not self.extra_ann_exist: + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, backend='cv2') + data_info['height'], data_info['width'] = img.shape[:2] + self.extra_anns[raw_data_info['ID']] = img.shape[:2] + del img, img_bytes + else: + data_info['height'], data_info['width'] = self.extra_anns[ + raw_data_info['ID']] + + instances = [] + for i, ann in enumerate(raw_data_info['gtboxes']): + instance = {} + if ann['tag'] not in self.metainfo['classes']: + instance['bbox_label'] = -1 + instance['ignore_flag'] = 1 + else: + instance['bbox_label'] = self.metainfo['classes'].index( + ann['tag']) + instance['ignore_flag'] = 0 + if 'extra' in ann: + if 'ignore' in ann['extra']: + if ann['extra']['ignore'] != 0: + instance['bbox_label'] = -1 + instance['ignore_flag'] = 1 + + x1, y1, w, h = ann['fbox'] + bbox = [x1, y1, x1 + w, y1 + h] + instance['bbox'] = bbox + + # Record the full bbox(fbox), head bbox(hbox) and visible + # bbox(vbox) as additional information. If you need to use + # this information, you just need to design the pipeline + # instead of overriding the CrowdHumanDataset. + instance['fbox'] = bbox + hbox = ann['hbox'] + instance['hbox'] = [ + hbox[0], hbox[1], hbox[0] + hbox[2], hbox[1] + hbox[3] + ] + vbox = ann['vbox'] + instance['vbox'] = [ + vbox[0], vbox[1], vbox[0] + vbox[2], vbox[1] + vbox[3] + ] + + instances.append(instance) + + data_info['instances'] = instances + return data_info diff --git a/head_extractor/build/lib/mmdet/datasets/dataset_wrappers.py b/head_extractor/build/lib/mmdet/datasets/dataset_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..d4e26e07c0f8a9e9f106bcd351f71e7b24d6ccf9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/dataset_wrappers.py @@ -0,0 +1,260 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import copy +from typing import List, Sequence, Union + +from mmengine.dataset import BaseDataset +from mmengine.dataset import ConcatDataset as MMENGINE_ConcatDataset +from mmengine.dataset import force_full_init + +from mmdet.registry import DATASETS, TRANSFORMS + + +@DATASETS.register_module() +class MultiImageMixDataset: + """A wrapper of multiple images mixed dataset. + + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. For the augmentation pipeline of mixed image data, + the `get_indexes` method needs to be provided to obtain the image + indexes, and you can set `skip_flags` to change the pipeline running + process. At the same time, we provide the `dynamic_scale` parameter + to dynamically change the output image size. + + Args: + dataset (:obj:`CustomDataset`): The dataset to be mixed. + pipeline (Sequence[dict]): Sequence of transform object or + config dict to be composed. + dynamic_scale (tuple[int], optional): The image scale can be changed + dynamically. Default to None. It is deprecated. + skip_type_keys (list[str], optional): Sequence of type string to + be skip pipeline. Default to None. + max_refetch (int): The maximum number of retry iterations for getting + valid results from the pipeline. If the number of iterations is + greater than `max_refetch`, but results is still None, then the + iteration is terminated and raise the error. Default: 15. 
+ """ + + def __init__(self, + dataset: Union[BaseDataset, dict], + pipeline: Sequence[str], + skip_type_keys: Union[Sequence[str], None] = None, + max_refetch: int = 15, + lazy_init: bool = False) -> None: + assert isinstance(pipeline, collections.abc.Sequence) + if skip_type_keys is not None: + assert all([ + isinstance(skip_type_key, str) + for skip_type_key in skip_type_keys + ]) + self._skip_type_keys = skip_type_keys + + self.pipeline = [] + self.pipeline_types = [] + for transform in pipeline: + if isinstance(transform, dict): + self.pipeline_types.append(transform['type']) + transform = TRANSFORMS.build(transform) + self.pipeline.append(transform) + else: + raise TypeError('pipeline must be a dict') + + self.dataset: BaseDataset + if isinstance(dataset, dict): + self.dataset = DATASETS.build(dataset) + elif isinstance(dataset, BaseDataset): + self.dataset = dataset + else: + raise TypeError( + 'elements in datasets sequence should be config or ' + f'`BaseDataset` instance, but got {type(dataset)}') + + self._metainfo = self.dataset.metainfo + if hasattr(self.dataset, 'flag'): + self.flag = self.dataset.flag + self.num_samples = len(self.dataset) + self.max_refetch = max_refetch + + self._fully_initialized = False + if not lazy_init: + self.full_init() + + @property + def metainfo(self) -> dict: + """Get the meta information of the multi-image-mixed dataset. + + Returns: + dict: The meta information of multi-image-mixed dataset. + """ + return copy.deepcopy(self._metainfo) + + def full_init(self): + """Loop to ``full_init`` each dataset.""" + if self._fully_initialized: + return + + self.dataset.full_init() + self._ori_len = len(self.dataset) + self._fully_initialized = True + + @force_full_init + def get_data_info(self, idx: int) -> dict: + """Get annotation by index. + + Args: + idx (int): Global index of ``ConcatDataset``. + + Returns: + dict: The idx-th annotation of the datasets. + """ + return self.dataset.get_data_info(idx) + + @force_full_init + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + results = copy.deepcopy(self.dataset[idx]) + for (transform, transform_type) in zip(self.pipeline, + self.pipeline_types): + if self._skip_type_keys is not None and \ + transform_type in self._skip_type_keys: + continue + + if hasattr(transform, 'get_indexes'): + for i in range(self.max_refetch): + # Make sure the results passed the loading pipeline + # of the original dataset is not None. + indexes = transform.get_indexes(self.dataset) + if not isinstance(indexes, collections.abc.Sequence): + indexes = [indexes] + mix_results = [ + copy.deepcopy(self.dataset[index]) for index in indexes + ] + if None not in mix_results: + results['mix_results'] = mix_results + break + else: + raise RuntimeError( + 'The loading pipeline of the original dataset' + ' always return None. Please check the correctness ' + 'of the dataset and its pipeline.') + + for i in range(self.max_refetch): + # To confirm the results passed the training pipeline + # of the wrapper is not None. + updated_results = transform(copy.deepcopy(results)) + if updated_results is not None: + results = updated_results + break + else: + raise RuntimeError( + 'The training pipeline of the dataset wrapper' + ' always return None.Please check the correctness ' + 'of the dataset and its pipeline.') + + if 'mix_results' in results: + results.pop('mix_results') + + return results + + def update_skip_type_keys(self, skip_type_keys): + """Update skip_type_keys. It is called by an external hook. 
+ + Args: + skip_type_keys (list[str], optional): Sequence of type + string to be skip pipeline. + """ + assert all([ + isinstance(skip_type_key, str) for skip_type_key in skip_type_keys + ]) + self._skip_type_keys = skip_type_keys + + +@DATASETS.register_module() +class ConcatDataset(MMENGINE_ConcatDataset): + """A wrapper of concatenated dataset. + + Same as ``torch.utils.data.dataset.ConcatDataset``, support + lazy_init and get_dataset_source. + + Note: + ``ConcatDataset`` should not inherit from ``BaseDataset`` since + ``get_subset`` and ``get_subset_`` could produce ambiguous meaning + sub-dataset which conflicts with original dataset. If you want to use + a sub-dataset of ``ConcatDataset``, you should set ``indices`` + arguments for wrapped dataset which inherit from ``BaseDataset``. + + Args: + datasets (Sequence[BaseDataset] or Sequence[dict]): A list of datasets + which will be concatenated. + lazy_init (bool, optional): Whether to load annotation during + instantiation. Defaults to False. + ignore_keys (List[str] or str): Ignore the keys that can be + unequal in `dataset.metainfo`. Defaults to None. + `New in version 0.3.0.` + """ + + def __init__(self, + datasets: Sequence[Union[BaseDataset, dict]], + lazy_init: bool = False, + ignore_keys: Union[str, List[str], None] = None): + self.datasets: List[BaseDataset] = [] + for i, dataset in enumerate(datasets): + if isinstance(dataset, dict): + self.datasets.append(DATASETS.build(dataset)) + elif isinstance(dataset, BaseDataset): + self.datasets.append(dataset) + else: + raise TypeError( + 'elements in datasets sequence should be config or ' + f'`BaseDataset` instance, but got {type(dataset)}') + if ignore_keys is None: + self.ignore_keys = [] + elif isinstance(ignore_keys, str): + self.ignore_keys = [ignore_keys] + elif isinstance(ignore_keys, list): + self.ignore_keys = ignore_keys + else: + raise TypeError('ignore_keys should be a list or str, ' + f'but got {type(ignore_keys)}') + + meta_keys: set = set() + for dataset in self.datasets: + meta_keys |= dataset.metainfo.keys() + # if the metainfo of multiple datasets are the same, use metainfo + # of the first dataset, else the metainfo is a list with metainfo + # of all the datasets + is_all_same = True + self._metainfo_first = self.datasets[0].metainfo + for i, dataset in enumerate(self.datasets, 1): + for key in meta_keys: + if key in self.ignore_keys: + continue + if key not in dataset.metainfo: + is_all_same = False + break + if self._metainfo_first[key] != dataset.metainfo[key]: + is_all_same = False + break + + if is_all_same: + self._metainfo = self.datasets[0].metainfo + else: + self._metainfo = [dataset.metainfo for dataset in self.datasets] + + self._fully_initialized = False + if not lazy_init: + self.full_init() + + if is_all_same: + self._metainfo.update( + dict(cumulative_sizes=self.cumulative_sizes)) + else: + for i, dataset in enumerate(self.datasets): + self._metainfo[i].update( + dict(cumulative_sizes=self.cumulative_sizes)) + + def get_dataset_source(self, idx: int) -> int: + dataset_idx, _ = self._get_ori_dataset_idx(idx) + return dataset_idx diff --git a/head_extractor/build/lib/mmdet/datasets/deepfashion.py b/head_extractor/build/lib/mmdet/datasets/deepfashion.py new file mode 100644 index 0000000000000000000000000000000000000000..f853fc63398d598b90a88323e660ba6f4d81e2df --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/deepfashion.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class DeepFashionDataset(CocoDataset): + """Dataset for DeepFashion.""" + + METAINFO = { + 'classes': ('top', 'skirt', 'leggings', 'dress', 'outer', 'pants', + 'bag', 'neckwear', 'headwear', 'eyeglass', 'belt', + 'footwear', 'hair', 'skin', 'face'), + # palette is a list of color tuples, which is used for visualization. + 'palette': [(0, 192, 64), (0, 64, 96), (128, 192, 192), (0, 64, 64), + (0, 192, 224), (0, 192, 192), (128, 192, 64), (0, 192, 96), + (128, 32, 192), (0, 0, 224), (0, 0, 64), (0, 160, 192), + (128, 0, 96), (128, 0, 192), (0, 32, 192)] + } diff --git a/head_extractor/build/lib/mmdet/datasets/dod.py b/head_extractor/build/lib/mmdet/datasets/dod.py new file mode 100644 index 0000000000000000000000000000000000000000..152d32aaf70c7fb5e3730d46d26e150fc1204f22 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/dod.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List, Optional + +import numpy as np + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + +try: + from d_cube import D3 +except ImportError: + D3 = None +from .api_wrappers import COCO + + +@DATASETS.register_module() +class DODDataset(BaseDetDataset): + + def __init__(self, + *args, + data_root: Optional[str] = '', + data_prefix: dict = dict(img_path=''), + **kwargs) -> None: + if D3 is None: + raise ImportError( + 'Please install d3 by `pip install ddd-dataset`.') + pkl_anno_path = osp.join(data_root, data_prefix['anno']) + self.img_root = osp.join(data_root, data_prefix['img']) + self.d3 = D3(self.img_root, pkl_anno_path) + + sent_infos = self.d3.load_sents() + classes = tuple([sent_info['raw_sent'] for sent_info in sent_infos]) + super().__init__( + *args, + data_root=data_root, + data_prefix=data_prefix, + metainfo={'classes': classes}, + **kwargs) + + def load_data_list(self) -> List[dict]: + coco = COCO(self.ann_file) + data_list = [] + img_ids = self.d3.get_img_ids() + for img_id in img_ids: + data_info = {} + + img_info = self.d3.load_imgs(img_id)[0] + file_name = img_info['file_name'] + img_path = osp.join(self.img_root, file_name) + data_info['img_path'] = img_path + data_info['img_id'] = img_id + data_info['height'] = img_info['height'] + data_info['width'] = img_info['width'] + + group_ids = self.d3.get_group_ids(img_ids=[img_id]) + sent_ids = self.d3.get_sent_ids(group_ids=group_ids) + sent_list = self.d3.load_sents(sent_ids=sent_ids) + text_list = [sent['raw_sent'] for sent in sent_list] + ann_ids = coco.get_ann_ids(img_ids=[img_id]) + anno = coco.load_anns(ann_ids) + + data_info['text'] = text_list + data_info['sent_ids'] = np.array([s for s in sent_ids]) + data_info['custom_entities'] = True + + instances = [] + for i, ann in enumerate(anno): + instance = {} + x1, y1, w, h = ann['bbox'] + bbox = [x1, y1, x1 + w, y1 + h] + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = ann['category_id'] - 1 + instances.append(instance) + data_info['instances'] = instances + data_list.append(data_info) + return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/dsdl.py b/head_extractor/build/lib/mmdet/datasets/dsdl.py new file mode 100644 index 0000000000000000000000000000000000000000..75570a2a6396e0e7a4ce5cac5dbf2a23cd164629 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/dsdl.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +from typing import List + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + +try: + from dsdl.dataset import DSDLDataset +except ImportError: + DSDLDataset = None + + +@DATASETS.register_module() +class DSDLDetDataset(BaseDetDataset): + """Dataset for dsdl detection. + + Args: + with_bbox(bool): Load bbox or not, defaults to be True. + with_polygon(bool): Load polygon or not, defaults to be False. + with_mask(bool): Load seg map mask or not, defaults to be False. + with_imagelevel_label(bool): Load image level label or not, + defaults to be False. + with_hierarchy(bool): Load hierarchy information or not, + defaults to be False. + specific_key_path(dict): Path of specific key which can not + be loaded by it's field name. + pre_transform(dict): pre-transform functions before loading. + """ + + METAINFO = {} + + def __init__(self, + with_bbox: bool = True, + with_polygon: bool = False, + with_mask: bool = False, + with_imagelevel_label: bool = False, + with_hierarchy: bool = False, + specific_key_path: dict = {}, + pre_transform: dict = {}, + **kwargs) -> None: + + if DSDLDataset is None: + raise RuntimeError( + 'Package dsdl is not installed. Please run "pip install dsdl".' + ) + + self.with_hierarchy = with_hierarchy + self.specific_key_path = specific_key_path + + loc_config = dict(type='LocalFileReader', working_dir='') + if kwargs.get('data_root'): + kwargs['ann_file'] = os.path.join(kwargs['data_root'], + kwargs['ann_file']) + self.required_fields = ['Image', 'ImageShape', 'Label', 'ignore_flag'] + if with_bbox: + self.required_fields.append('Bbox') + if with_polygon: + self.required_fields.append('Polygon') + if with_mask: + self.required_fields.append('LabelMap') + if with_imagelevel_label: + self.required_fields.append('image_level_labels') + assert 'image_level_labels' in specific_key_path.keys( + ), '`image_level_labels` not specified in `specific_key_path` !' + + self.extra_keys = [ + key for key in self.specific_key_path.keys() + if key not in self.required_fields + ] + + self.dsdldataset = DSDLDataset( + dsdl_yaml=kwargs['ann_file'], + location_config=loc_config, + required_fields=self.required_fields, + specific_key_path=specific_key_path, + transform=pre_transform, + ) + + BaseDetDataset.__init__(self, **kwargs) + + def load_data_list(self) -> List[dict]: + """Load data info from an dsdl yaml file named as ``self.ann_file`` + + Returns: + List[dict]: A list of data info. + """ + if self.with_hierarchy: + # get classes_names and relation_matrix + classes_names, relation_matrix = \ + self.dsdldataset.class_dom.get_hierarchy_info() + self._metainfo['classes'] = tuple(classes_names) + self._metainfo['RELATION_MATRIX'] = relation_matrix + + else: + self._metainfo['classes'] = tuple(self.dsdldataset.class_names) + + data_list = [] + + for i, data in enumerate(self.dsdldataset): + # basic image info, including image id, path and size. 
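+            # (each `data` item exposes the dsdl fields requested via
+            # `required_fields` above, e.g. data['Image'], data['ImageShape'])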
+ datainfo = dict( + img_id=i, + img_path=os.path.join(self.data_prefix['img_path'], + data['Image'][0].location), + width=data['ImageShape'][0].width, + height=data['ImageShape'][0].height, + ) + + # get image label info + if 'image_level_labels' in data.keys(): + if self.with_hierarchy: + # get leaf node name when using hierarchy classes + datainfo['image_level_labels'] = [ + self._metainfo['classes'].index(i.leaf_node_name) + for i in data['image_level_labels'] + ] + else: + datainfo['image_level_labels'] = [ + self._metainfo['classes'].index(i.name) + for i in data['image_level_labels'] + ] + + # get semantic segmentation info + if 'LabelMap' in data.keys(): + datainfo['seg_map_path'] = data['LabelMap'] + + # load instance info + instances = [] + if 'Bbox' in data.keys(): + for idx in range(len(data['Bbox'])): + bbox = data['Bbox'][idx] + if self.with_hierarchy: + # get leaf node name when using hierarchy classes + label = data['Label'][idx].leaf_node_name + label_index = self._metainfo['classes'].index(label) + else: + label = data['Label'][idx].name + label_index = self._metainfo['classes'].index(label) + + instance = {} + instance['bbox'] = bbox.xyxy + instance['bbox_label'] = label_index + + if 'ignore_flag' in data.keys(): + # get ignore flag + instance['ignore_flag'] = data['ignore_flag'][idx] + else: + instance['ignore_flag'] = 0 + + if 'Polygon' in data.keys(): + # get polygon info + polygon = data['Polygon'][idx] + instance['mask'] = polygon.openmmlabformat + + for key in self.extra_keys: + # load extra instance info + instance[key] = data[key][idx] + + instances.append(instance) + + datainfo['instances'] = instances + # append a standard sample in data list + if len(datainfo['instances']) > 0: + data_list.append(datainfo) + + return data_list + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. + """ + if self.test_mode: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) \ + if self.filter_cfg is not None else False + min_size = self.filter_cfg.get('min_size', 0) \ + if self.filter_cfg is not None else 0 + + valid_data_list = [] + for i, data_info in enumerate(self.data_list): + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and len(data_info['instances']) == 0: + continue + if min(width, height) >= min_size: + valid_data_list.append(data_info) + + return valid_data_list diff --git a/head_extractor/build/lib/mmdet/datasets/flickr30k.py b/head_extractor/build/lib/mmdet/datasets/flickr30k.py new file mode 100644 index 0000000000000000000000000000000000000000..0c76a41bc965bb0e8348c3d13e77d5c6e8ca08ce --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/flickr30k.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
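+# Note: `convert_phrase_ids` below densely re-labels arbitrary phrase ids,
+# e.g. [7, 3, 7, 9] -> [1, 0, 1, 2] (sorted unique ids mapped to 0..N-1).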
+import os.path as osp +from typing import List + +from pycocotools.coco import COCO + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +def convert_phrase_ids(phrase_ids: list) -> list: + unique_elements = sorted(set(phrase_ids)) + element_to_new_label = { + element: label + for label, element in enumerate(unique_elements) + } + phrase_ids = [element_to_new_label[element] for element in phrase_ids] + return phrase_ids + + +@DATASETS.register_module() +class Flickr30kDataset(BaseDetDataset): + """Flickr30K Dataset.""" + + def load_data_list(self) -> List[dict]: + + self.coco = COCO(self.ann_file) + + self.ids = sorted(list(self.coco.imgs.keys())) + + data_list = [] + for img_id in self.ids: + if isinstance(img_id, str): + ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None) + else: + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) + + coco_img = self.coco.loadImgs(img_id)[0] + + caption = coco_img['caption'] + file_name = coco_img['file_name'] + img_path = osp.join(self.data_prefix['img'], file_name) + width = coco_img['width'] + height = coco_img['height'] + tokens_positive = coco_img['tokens_positive_eval'] + phrases = [caption[i[0][0]:i[0][1]] for i in tokens_positive] + phrase_ids = [] + + instances = [] + annos = self.coco.loadAnns(ann_ids) + for anno in annos: + instance = { + 'bbox': [ + anno['bbox'][0], anno['bbox'][1], + anno['bbox'][0] + anno['bbox'][2], + anno['bbox'][1] + anno['bbox'][3] + ], + 'bbox_label': + anno['category_id'], + 'ignore_flag': + anno['iscrowd'] + } + phrase_ids.append(anno['phrase_ids']) + instances.append(instance) + + phrase_ids = convert_phrase_ids(phrase_ids) + + data_list.append( + dict( + img_path=img_path, + img_id=img_id, + height=height, + width=width, + instances=instances, + text=caption, + phrase_ids=phrase_ids, + tokens_positive=tokens_positive, + phrases=phrases, + )) + + return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/isaid.py b/head_extractor/build/lib/mmdet/datasets/isaid.py new file mode 100644 index 0000000000000000000000000000000000000000..87067d8459c4dd6e80e5f808f613e0bd600b5f2f --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/isaid.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class iSAIDDataset(CocoDataset): + """Dataset for iSAID instance segmentation. + + iSAID: A Large-scale Dataset for Instance Segmentation + in Aerial Images. + + For more detail, please refer to "projects/iSAID/README.md" + """ + + METAINFO = dict( + classes=('background', 'ship', 'store_tank', 'baseball_diamond', + 'tennis_court', 'basketball_court', 'Ground_Track_Field', + 'Bridge', 'Large_Vehicle', 'Small_Vehicle', 'Helicopter', + 'Swimming_pool', 'Roundabout', 'Soccer_ball_field', 'plane', + 'Harbor'), + palette=[(0, 0, 0), (0, 0, 63), (0, 63, 63), (0, 63, 0), (0, 63, 127), + (0, 63, 191), (0, 63, 255), (0, 127, 63), (0, 127, 127), + (0, 0, 127), (0, 0, 191), (0, 0, 255), (0, 191, 127), + (0, 127, 191), (0, 127, 255), (0, 100, 155)]) diff --git a/head_extractor/build/lib/mmdet/datasets/lvis.py b/head_extractor/build/lib/mmdet/datasets/lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..b9629f5d463da183f0b4ab4c5d0f7ff7b07e4348 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/lvis.py @@ -0,0 +1,638 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import warnings +from typing import List + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class LVISV05Dataset(CocoDataset): + """LVIS v0.5 dataset for detection.""" + + METAINFO = { + 'classes': + ('acorn', 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', + 'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', + 'antenna', 'apple', 'apple_juice', 'applesauce', 'apricot', 'apron', + 'aquarium', 'armband', 'armchair', 'armoire', 'armor', 'artichoke', + 'trash_can', 'ashtray', 'asparagus', 'atomizer', 'avocado', 'award', + 'awning', 'ax', 'baby_buggy', 'basketball_backboard', 'backpack', + 'handbag', 'suitcase', 'bagel', 'bagpipe', 'baguet', 'bait', 'ball', + 'ballet_skirt', 'balloon', 'bamboo', 'banana', 'Band_Aid', 'bandage', + 'bandanna', 'banjo', 'banner', 'barbell', 'barge', 'barrel', + 'barrette', 'barrow', 'baseball_base', 'baseball', 'baseball_bat', + 'baseball_cap', 'baseball_glove', 'basket', 'basketball_hoop', + 'basketball', 'bass_horn', 'bat_(animal)', 'bath_mat', 'bath_towel', + 'bathrobe', 'bathtub', 'batter_(food)', 'battery', 'beachball', + 'bead', 'beaker', 'bean_curd', 'beanbag', 'beanie', 'bear', 'bed', + 'bedspread', 'cow', 'beef_(food)', 'beeper', 'beer_bottle', + 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', 'belt_buckle', + 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', 'binder', + 'binoculars', 'bird', 'birdfeeder', 'birdbath', 'birdcage', + 'birdhouse', 'birthday_cake', 'birthday_card', 'biscuit_(bread)', + 'pirate_flag', 'black_sheep', 'blackboard', 'blanket', 'blazer', + 'blender', 'blimp', 'blinker', 'blueberry', 'boar', 'gameboard', + 'boat', 'bobbin', 'bobby_pin', 'boiled_egg', 'bolo_tie', 'deadbolt', + 'bolt', 'bonnet', 'book', 'book_bag', 'bookcase', 'booklet', + 'bookmark', 'boom_microphone', 'boot', 'bottle', 'bottle_opener', + 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)', 'bow-tie', + 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'bowling_pin', + 'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere', + 'bread-bin', 'breechcloth', 'bridal_gown', 'briefcase', + 'bristle_brush', 'broccoli', 'broach', 'broom', 'brownie', + 'brussels_sprouts', 'bubble_gum', 'bucket', 'horse_buggy', 'bull', + 'bulldog', 'bulldozer', 'bullet_train', 'bulletin_board', + 'bulletproof_vest', 'bullhorn', 'corned_beef', 'bun', 'bunk_bed', + 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butcher_knife', + 'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', + 'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf', + 'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', + 'can', 'can_opener', 'candelabrum', 'candle', 'candle_holder', + 'candy_bar', 'candy_cane', 'walking_cane', 'canister', 'cannon', + 'canoe', 'cantaloup', 'canteen', 'cap_(headwear)', 'bottle_cap', + 'cape', 'cappuccino', 'car_(automobile)', 'railcar_(part_of_a_train)', + 'elevator_car', 'car_battery', 'identity_card', 'card', 'cardigan', + 'cargo_ship', 'carnation', 'horse_carriage', 'carrot', 'tote_bag', + 'cart', 'carton', 'cash_register', 'casserole', 'cassette', 'cast', + 'cat', 'cauliflower', 'caviar', 'cayenne_(spice)', 'CD_player', + 'celery', 'cellular_telephone', 'chain_mail', 'chair', + 'chaise_longue', 'champagne', 'chandelier', 'chap', 'checkbook', + 'checkerboard', 'cherry', 'chessboard', + 'chest_of_drawers_(furniture)', 'chicken_(animal)', 'chicken_wire', + 'chickpea', 
'Chihuahua', 'chili_(vegetable)', 'chime', 'chinaware', + 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar', + 'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker', + 'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider', + 'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet', + 'clasp', 'cleansing_agent', 'clementine', 'clip', 'clipboard', + 'clock', 'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', + 'coaster', 'coat', 'coat_hanger', 'coatrack', 'cock', 'coconut', + 'coffee_filter', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', + 'coin', 'colander', 'coleslaw', 'coloring_material', + 'combination_lock', 'pacifier', 'comic_book', 'computer_keyboard', + 'concrete_mixer', 'cone', 'control', 'convertible_(automobile)', + 'sofa_bed', 'cookie', 'cookie_jar', 'cooking_utensil', + 'cooler_(for_food)', 'cork_(bottle_plug)', 'corkboard', 'corkscrew', + 'edible_corn', 'cornbread', 'cornet', 'cornice', 'cornmeal', 'corset', + 'romaine_lettuce', 'costume', 'cougar', 'coverall', 'cowbell', + 'cowboy_hat', 'crab_(animal)', 'cracker', 'crape', 'crate', 'crayon', + 'cream_pitcher', 'credit_card', 'crescent_roll', 'crib', 'crock_pot', + 'crossbar', 'crouton', 'crow', 'crown', 'crucifix', 'cruise_ship', + 'police_cruiser', 'crumb', 'crutch', 'cub_(animal)', 'cube', + 'cucumber', 'cufflink', 'cup', 'trophy_cup', 'cupcake', 'hair_curler', + 'curling_iron', 'curtain', 'cushion', 'custard', 'cutting_tool', + 'cylinder', 'cymbal', 'dachshund', 'dagger', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', + 'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'diskette', 'dispenser', 'Dixie_cup', 'dog', + 'dog_collar', 'doll', 'dollar', 'dolphin', 'domestic_ass', 'eye_mask', + 'doorbell', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly', + 'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit', + 'dresser', 'drill', 'drinking_fountain', 'drone', 'dropper', + 'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', + 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', + 'Dutch_oven', 'eagle', 'earphone', 'earplug', 'earring', 'easel', + 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater', + 'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk', + 'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan', + 'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)', + 'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', + 'fire_alarm', 'fire_engine', 'fire_extinguisher', 'fire_hose', + 'fireplace', 'fireplug', 'fish', 'fish_(food)', 'fishbowl', + 'fishing_boat', 'fishing_rod', 'flag', 'flagpole', 'flamingo', + 'flannel', 'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)', + 'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal', + 'folding_chair', 'food_processor', 'football_(American)', + 'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car', + 'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice', + 'fruit_salad', 'frying_pan', 'fudge', 'funnel', 'futon', 'gag', + 'garbage', 'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', + 'garlic', 'gasmask', 'gazelle', 'gelatin', 'gemstone', 'giant_panda', + 'gift_wrap', 'ginger', 'giraffe', 'cincture', + 'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles', + 'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose', + 'gorilla', 'gourd', 'surgical_gown', 'grape', 
'grasshopper', 'grater', + 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', + 'grillroom', 'grinder_(tool)', 'grits', 'grizzly', 'grocery_bag', + 'guacamole', 'guitar', 'gull', 'gun', 'hair_spray', 'hairbrush', + 'hairnet', 'hairpin', 'ham', 'hamburger', 'hammer', 'hammock', + 'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel', + 'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw', + 'hardback_book', 'harmonium', 'hat', 'hatbox', 'hatch', 'veil', + 'headband', 'headboard', 'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'hearing_aid', 'heart', 'heater', + 'helicopter', 'helmet', 'heron', 'highchair', 'hinge', 'hippopotamus', + 'hockey_stick', 'hog', 'home_plate_(baseball)', 'honey', 'fume_hood', + 'hook', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'ice_tea', 'igniter', 'incense', 'inhaler', 'iPod', + 'iron_(for_clothing)', 'ironing_board', 'jacket', 'jam', 'jean', + 'jeep', 'jelly_bean', 'jersey', 'jet_plane', 'jewelry', 'joystick', + 'jumpsuit', 'kayak', 'keg', 'kennel', 'kettle', 'key', 'keycard', + 'kilt', 'kimono', 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', + 'kiwi_fruit', 'knee_pad', 'knife', 'knight_(chess_piece)', + 'knitting_needle', 'knob', 'knocker_(on_a_door)', 'koala', 'lab_coat', + 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', 'lamb-chop', 'lamp', + 'lamppost', 'lampshade', 'lantern', 'lanyard', 'laptop_computer', + 'lasagna', 'latch', 'lawn_mower', 'leather', 'legging_(clothing)', + 'Lego', 'lemon', 'lemonade', 'lettuce', 'license_plate', 'life_buoy', + 'life_jacket', 'lightbulb', 'lightning_rod', 'lime', 'limousine', + 'linen_paper', 'lion', 'lip_balm', 'lipstick', 'liquor', 'lizard', + 'Loafer_(type_of_shoe)', 'log', 'lollipop', 'lotion', + 'speaker_(stereo_equipment)', 'loveseat', 'machine_gun', 'magazine', + 'magnet', 'mail_slot', 'mailbox_(at_home)', 'mallet', 'mammoth', + 'mandarin_orange', 'manger', 'manhole', 'map', 'marker', 'martini', + 'mascot', 'mashed_potato', 'masher', 'mask', 'mast', + 'mat_(gym_equipment)', 'matchbox', 'mattress', 'measuring_cup', + 'measuring_stick', 'meatball', 'medicine', 'melon', 'microphone', + 'microscope', 'microwave_oven', 'milestone', 'milk', 'minivan', + 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', 'money', + 'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor', + 'motor_scooter', 'motor_vehicle', 'motorboat', 'motorcycle', + 'mound_(baseball)', 'mouse_(animal_rodent)', + 'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom', + 'music_stool', 'musical_instrument', 'nailfile', 'nameplate', + 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', 'nest', + 'newsstand', 'nightshirt', 'nosebag_(for_animals)', + 'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker', + 'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil', + 'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'oregano', + 'ostrich', 'ottoman', 'overalls_(clothing)', 'owl', 'packet', + 'inkpad', 'pad', 'paddle', 'padlock', 'paintbox', 'paintbrush', + 'painting', 'pajamas', 'palette', 'pan_(for_cooking)', + 'pan_(metal_container)', 'pancake', 'pantyhose', 'papaya', + 'paperclip', 'paper_plate', 'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', + 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 
'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'pegboard', + 'pelican', 'pen', 'pencil', 'pencil_box', 'pencil_sharpener', + 'pendulum', 'penguin', 'pennant', 'penny_(coin)', 'pepper', + 'pepper_mill', 'perfume', 'persimmon', 'baby', 'pet', 'petfood', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playing_card', 'playpen', 'pliers', + 'plow_(farm_equipment)', 'pocket_watch', 'pocketknife', + 'poker_(fire_stirring_tool)', 'pole', 'police_van', 'polo_shirt', + 'poncho', 'pony', 'pool_table', 'pop_(soda)', 'portrait', + 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', + 'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', + 'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune', + 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', + 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', + 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish', + 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat', + 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'red_cabbage', 'reflector', + 'remote_control', 'rhinoceros', 'rib_(food)', 'rifle', 'ring', + 'river_boat', 'road_map', 'robe', 'rocking_chair', 'roller_skate', + 'Rollerblade', 'rolling_pin', 'root_beer', + 'router_(computer_equipment)', 'rubber_band', 'runner_(carpet)', + 'plastic_bag', 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', + 'safety_pin', 'sail', 'salad', 'salad_plate', 'salami', + 'salmon_(fish)', 'salmon_(food)', 'salsa', 'saltshaker', + 'sandal_(type_of_shoe)', 'sandwich', 'satchel', 'saucepan', 'saucer', + 'sausage', 'sawhorse', 'saxophone', 'scale_(measuring_instrument)', + 'scarecrow', 'scarf', 'school_bus', 'scissors', 'scoreboard', + 'scrambled_eggs', 'scraper', 'scratcher', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'seedling', 'serving_dish', 'sewing_machine', 'shaker', + 'shampoo', 'shark', 'sharpener', 'Sharpie', 'shaver_(electric)', + 'shaving_cream', 'shawl', 'shears', 'sheep', 'shepherd_dog', + 'sherbert', 'shield', 'shirt', 'shoe', 'shopping_bag', + 'shopping_cart', 'short_pants', 'shot_glass', 'shoulder_bag', + 'shovel', 'shower_head', 'shower_curtain', 'shredder_(for_paper)', + 'sieve', 'signboard', 'silo', 'sink', 'skateboard', 'skewer', 'ski', + 'ski_boot', 'ski_parka', 'ski_pole', 'skirt', 'sled', 'sleeping_bag', + 'sling_(bandage)', 'slipper_(footwear)', 'smoothie', 'snake', + 'snowboard', 'snowman', 'snowmobile', 'soap', 'soccer_ball', 'sock', + 'soda_fountain', 'carbonated_water', 'sofa', 'softball', + 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', + 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', + 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'sponge', + 'spoon', 'sportswear', 'spotlight', 'squirrel', + 'stapler_(stapling_machine)', 'starfish', 'statue_(sculpture)', + 'steak_(food)', 'steak_knife', 'steamer_(kitchen_appliance)', + 'steering_wheel', 'stencil', 'stepladder', 'step_stool', + 'stereo_(sound_system)', 'stew', 'stirrer', 'stirrup', + 
'stockings_(leg_wear)', 'stool', 'stop_sign', 'brake_light', 'stove', + 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', + 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', + 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', + 'sunglasses', 'sunhat', 'sunscreen', 'surfboard', 'sushi', 'mop', + 'sweat_pants', 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', + 'swimsuit', 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', + 'table', 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', + 'taillight', 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', + 'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', + 'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs', + 'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover', + 'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy', + 'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike', + 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray', + 'tree_house', 'trench_coat', 'triangle_(musical_instrument)', + 'tricycle', 'tripod', 'trousers', 'truck', 'truffle_(chocolate)', + 'trunk', 'vat', 'turban', 'turkey_(bird)', 'turkey_(food)', 'turnip', + 'turtle', 'turtleneck_(clothing)', 'typewriter', 'umbrella', + 'underwear', 'unicycle', 'urinal', 'urn', 'vacuum_cleaner', 'valve', + 'vase', 'vending_machine', 'vent', 'videotape', 'vinegar', 'violin', + 'vodka', 'volleyball', 'vulture', 'waffle', 'waffle_iron', 'wagon', + 'wagon_wheel', 'walking_stick', 'wall_clock', 'wall_socket', 'wallet', + 'walrus', 'wardrobe', 'wasabi', 'automatic_washer', 'watch', + 'water_bottle', 'water_cooler', 'water_faucet', 'water_filter', + 'water_heater', 'water_jug', 'water_gun', 'water_scooter', + 'water_ski', 'water_tower', 'watering_can', 'watermelon', + 'weathervane', 'webcam', 'wedding_cake', 'wedding_ring', 'wet_suit', + 'wheel', 'wheelchair', 'whipped_cream', 'whiskey', 'whistle', 'wick', + 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)', + 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', + 'wineglass', 'wing_chair', 'blinder_(for_horses)', 'wok', 'wolf', + 'wooden_spoon', 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', + 'yak', 'yogurt', 'yoke_(animal_equipment)', 'zebra', 'zucchini'), + 'palette': + None + } + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + try: + import lvis + if getattr(lvis, '__version__', '0') >= '10.5.3': + warnings.warn( + 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501 + UserWarning) + from lvis import LVIS + except ImportError: + raise ImportError( + 'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".' 
# noqa: E501 +        ) +        with get_local_path( +                self.ann_file, backend_args=self.backend_args) as local_path: +            self.lvis = LVIS(local_path) +        self.cat_ids = self.lvis.get_cat_ids() +        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} +        self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map) + +        img_ids = self.lvis.get_img_ids() +        data_list = [] +        total_ann_ids = [] +        for img_id in img_ids: +            raw_img_info = self.lvis.load_imgs([img_id])[0] +            raw_img_info['img_id'] = img_id +            if raw_img_info['file_name'].startswith('COCO'): +                # Convert from the COCO 2014 file naming convention of +                # COCO_[train/val/test]2014_000000000000.jpg to the 2017 +                # naming convention of 000000000000.jpg +                # (LVIS v1 will fix this naming issue) +                raw_img_info['file_name'] = raw_img_info['file_name'][-16:] +            ann_ids = self.lvis.get_ann_ids(img_ids=[img_id]) +            raw_ann_info = self.lvis.load_anns(ann_ids) +            total_ann_ids.extend(ann_ids) + +            parsed_data_info = self.parse_data_info({ +                'raw_ann_info': +                raw_ann_info, +                'raw_img_info': +                raw_img_info +            }) +            data_list.append(parsed_data_info) +        if self.ANN_ID_UNIQUE: +            assert len(set(total_ann_ids)) == len( +                total_ann_ids +            ), f"Annotation ids in '{self.ann_file}' are not unique!" + +        del self.lvis + +        return data_list + + +LVISDataset = LVISV05Dataset +DATASETS.register_module(name='LVISDataset', module=LVISDataset) + + +@DATASETS.register_module() +class LVISV1Dataset(LVISDataset): +    """LVIS v1 dataset for detection.""" + +    METAINFO = { +        'classes': +        ('aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', +         'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', +         'antenna', 'apple', 'applesauce', 'apricot', 'apron', 'aquarium', +         'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor', +         'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer', +         'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy', +         'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel', +         'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon', +         'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo', +         'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow', +         'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap', +         'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)', +         'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)', +         'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie', +         'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper', +         'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', +         'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', +         'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath', +         'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card', +         'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket', +         'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry', +         'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg', +         'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase', +         'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle', +         'bottle_opener', 'bouquet', 'bow_(weapon)', +         'bow_(decorative_ribbons)', 'bow-tie', 'bowl', 'pipe_bowl', +         'bowler_hat', 'bowling_ball', 'box', 'boxing_glove', 'suspenders', +         'bracelet', 'brass_plaque', 'brassiere', 'bread-bin', 'bread', +         'breechcloth', 'bridal_gown', 'briefcase', 'broccoli', 'broach', +         'broom', 'brownie', 'brussels_sprouts', 'bubble_gum', 'bucket', +         'horse_buggy', 'bull', 'bulldog', 'bulldozer',
'bullet_train', + 'bulletin_board', 'bulletproof_vest', 'bullhorn', 'bun', 'bunk_bed', + 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butter', + 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', 'cabinet', + 'locker', 'cake', 'calculator', 'calendar', 'calf', 'camcorder', + 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', 'can', + 'can_opener', 'candle', 'candle_holder', 'candy_bar', 'candy_cane', + 'walking_cane', 'canister', 'canoe', 'cantaloup', 'canteen', + 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino', + 'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car', + 'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship', + 'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton', + 'cash_register', 'casserole', 'cassette', 'cast', 'cat', + 'cauliflower', 'cayenne_(spice)', 'CD_player', 'celery', + 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue', + 'chalice', 'chandelier', 'chap', 'checkbook', 'checkerboard', + 'cherry', 'chessboard', 'chicken_(animal)', 'chickpea', + 'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)', + 'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk', + 'chocolate_mousse', 'choker', 'chopping_board', 'chopstick', + 'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette', + 'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent', + 'cleat_(for_securing_rope)', 'clementine', 'clip', 'clipboard', + 'clippers_(for_plants)', 'cloak', 'clock', 'clock_tower', + 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat', + 'coat_hanger', 'coatrack', 'cock', 'cockroach', 'cocoa_(beverage)', + 'coconut', 'coffee_maker', 'coffee_table', 'coffeepot', 'coil', + 'coin', 'colander', 'coleslaw', 'coloring_material', + 'combination_lock', 'pacifier', 'comic_book', 'compass', + 'computer_keyboard', 'condiment', 'cone', 'control', + 'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie', + 'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)', + 'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet', + 'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall', + 'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker', + 'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib', + 'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown', + 'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch', + 'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup', + 'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain', + 'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', + 'tux', 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup', + 'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin', + 'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', + 'dragonfly', 'drawer', 'underdrawers', 'dress', 'dress_hat', + 'dress_suit', 'dresser', 'drill', 'drone', 'dropper', + 'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling', + 'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan', 'eagle', + 'earphone', 'earplug', 'earring', 'easel', 'eclair', 'eel', 'egg', + 'egg_roll', 'egg_yolk', 'eggbeater', 'eggplant', 'electric_chair', + 'refrigerator', 'elephant', 'elk', 'envelope', 'eraser', 'escargot', + 'eyepatch', 'falcon', 'fan', 'faucet', 'fedora', 'ferret', + 
'Ferris_wheel', 'ferry', 'fig_(fruit)', 'fighter_jet', 'figurine', + 'file_cabinet', 'file_(tool)', 'fire_alarm', 'fire_engine', + 'fire_extinguisher', 'fire_hose', 'fireplace', 'fireplug', + 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', 'fishing_rod', + 'flag', 'flagpole', 'flamingo', 'flannel', 'flap', 'flash', + 'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)', + 'flower_arrangement', 'flute_glass', 'foal', 'folding_chair', + 'food_processor', 'football_(American)', 'football_helmet', + 'footstool', 'fork', 'forklift', 'freight_car', 'French_toast', + 'freshener', 'frisbee', 'frog', 'fruit_juice', 'frying_pan', 'fudge', + 'funnel', 'futon', 'gag', 'garbage', 'garbage_truck', 'garden_hose', + 'gargle', 'gargoyle', 'garlic', 'gasmask', 'gazelle', 'gelatin', + 'gemstone', 'generator', 'giant_panda', 'gift_wrap', 'ginger', + 'giraffe', 'cincture', 'glass_(drink_container)', 'globe', 'glove', + 'goat', 'goggles', 'goldfish', 'golf_club', 'golfcart', + 'gondola_(boat)', 'goose', 'gorilla', 'gourd', 'grape', 'grater', + 'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle', + 'grill', 'grits', 'grizzly', 'grocery_bag', 'guitar', 'gull', 'gun', + 'hairbrush', 'hairnet', 'hairpin', 'halter_top', 'ham', 'hamburger', + 'hammer', 'hammock', 'hamper', 'hamster', 'hair_dryer', 'hand_glass', + 'hand_towel', 'handcart', 'handcuff', 'handkerchief', 'handle', + 'handsaw', 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', + 'headband', 'headboard', 'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet', + 'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog', + 'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah', + 'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board', + 'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey', + 'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak', + 'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono', + 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit', + 'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)', + 'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', + 'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard', + 'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather', + 'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', + 'lettuce', 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb', + 'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor', + 'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat', + 'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)', + 'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', + 'manger', 'manhole', 'map', 'marker', 'martini', 'mascot', + 'mashed_potato', 'masher', 'mask', 'mast', 'mat_(gym_equipment)', + 'matchbox', 'mattress', 'measuring_cup', 'measuring_stick', + 'meatball', 'medicine', 'melon', 'microphone', 'microscope', + 'microwave_oven', 'milestone', 'milk', 'milk_can', 'milkshake', + 'minivan', 'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', + 'money', 'monitor_(computer_equipment) computer_monitor', 'monkey', + 'motor', 'motor_scooter', 'motor_vehicle', 'motorcycle', + 'mound_(baseball)', 'mouse_(computer_equipment)', 
'mousepad', + 'muffin', 'mug', 'mushroom', 'music_stool', 'musical_instrument', + 'nailfile', 'napkin', 'neckerchief', 'necklace', 'necktie', 'needle', + 'nest', 'newspaper', 'newsstand', 'nightshirt', + 'nosebag_(for_animals)', 'noseband_(for_animals)', 'notebook', + 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)', + 'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion', + 'orange_(fruit)', 'orange_juice', 'ostrich', 'ottoman', 'oven', + 'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle', + 'padlock', 'paintbrush', 'painting', 'pajamas', 'palette', + 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose', + 'papaya', 'paper_plate', 'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', + 'parasol', 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg', + 'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box', + 'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)', + 'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)', + 'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)', + 'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)', + 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', + 'potato', 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', + 'pretzel', 'printer', 'projectile_(weapon)', 'projector', 'propeller', + 'prune', 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', + 'puncher', 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', + 'rabbit', 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', + 'radish', 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', + 'rat', 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'reflector', 'remote_control', + 'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map', + 'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade', + 'rolling_pin', 'root_beer', 'router_(computer_equipment)', + 'rubber_band', 'runner_(carpet)', 'plastic_bag', + 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin', + 'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)', + 'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)', + 'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse', + 'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf', + 'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark', + 'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl', + 'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt', + 'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass', + 'shoulder_bag', 'shovel', 'shower_head', 'shower_cap', + 'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink', + 
'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole', + 'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)', + 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', + 'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball', + 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', + 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', + 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish', + 'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)', + 'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish', + 'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel', + 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', + 'stirrer', 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', + 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry', + 'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer', + 'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', + 'sunglasses', 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', + 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', + 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table', + 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight', + 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', + 'tinfoil', 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', + 'toaster_oven', 'toilet', 'toilet_tissue', 'tomato', 'tongs', + 'toolbox', 'toothbrush', 'toothpaste', 'toothpick', 'cover', + 'tortilla', 'tow_truck', 'towel', 'towel_rack', 'toy', + 'tractor_(farm_equipment)', 'traffic_light', 'dirt_bike', + 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', 'tray', + 'trench_coat', 'triangle_(musical_instrument)', 'tricycle', 'tripod', + 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', 'turban', + 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)', + 'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn', + 'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest', + 'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture', + 'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick', + 'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe', + 'washbasin', 'automatic_washer', 'watch', 'water_bottle', + 'water_cooler', 'water_faucet', 'water_heater', 'water_jug', + 'water_gun', 'water_scooter', 'water_ski', 'water_tower', + 'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake', + 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream', + 'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)', + 'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket', + 'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', + 'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt', + 'yoke_(animal_equipment)', 'zebra', 'zucchini'), + 'palette': + None + } + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A 
list of annotation. +        """ # noqa: E501 +        try: +            import lvis +            if getattr(lvis, '__version__', '0') >= '10.5.3': +                warnings.warn( +                    'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"',  # noqa: E501 +                    UserWarning) +            from lvis import LVIS +        except ImportError: +            raise ImportError( +                'Package lvis is not installed. Please run "pip install git+https://github.com/lvis-dataset/lvis-api.git".'  # noqa: E501 +            ) +        with get_local_path( +                self.ann_file, backend_args=self.backend_args) as local_path: +            self.lvis = LVIS(local_path) +        self.cat_ids = self.lvis.get_cat_ids() +        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} +        self.cat_img_map = copy.deepcopy(self.lvis.cat_img_map) + +        img_ids = self.lvis.get_img_ids() +        data_list = [] +        total_ann_ids = [] +        for img_id in img_ids: +            raw_img_info = self.lvis.load_imgs([img_id])[0] +            raw_img_info['img_id'] = img_id +            # coco_url is used in LVIS v1 instead of file_name, +            # e.g. http://images.cocodataset.org/train2017/000000391895.jpg +            # the train/val split is specified in the URL +            raw_img_info['file_name'] = raw_img_info['coco_url'].replace( +                'http://images.cocodataset.org/', '') +            ann_ids = self.lvis.get_ann_ids(img_ids=[img_id]) +            raw_ann_info = self.lvis.load_anns(ann_ids) +            total_ann_ids.extend(ann_ids) +            parsed_data_info = self.parse_data_info({ +                'raw_ann_info': +                raw_ann_info, +                'raw_img_info': +                raw_img_info +            }) +            data_list.append(parsed_data_info) +        if self.ANN_ID_UNIQUE: +            assert len(set(total_ann_ids)) == len( +                total_ann_ids +            ), f"Annotation ids in '{self.ann_file}' are not unique!" + +        del self.lvis + +        return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/mdetr_style_refcoco.py b/head_extractor/build/lib/mmdet/datasets/mdetr_style_refcoco.py new file mode 100644 index 0000000000000000000000000000000000000000..cc56dec49db72daddf929bcc65471ffc2ca6fb4d --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/mdetr_style_refcoco.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List + +from mmengine.fileio import get_local_path + +from mmdet.datasets import BaseDetDataset +from mmdet.registry import DATASETS +from .api_wrappers import COCO + + +@DATASETS.register_module() +class MDETRStyleRefCocoDataset(BaseDetDataset): +    """RefCOCO dataset. + +    Only supports evaluation now.
+ """ + + def load_data_list(self) -> List[dict]: + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + coco = COCO(local_path) + + img_ids = coco.get_img_ids() + + data_infos = [] + for img_id in img_ids: + raw_img_info = coco.load_imgs([img_id])[0] + ann_ids = coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = coco.load_anns(ann_ids) + + data_info = {} + img_path = osp.join(self.data_prefix['img'], + raw_img_info['file_name']) + data_info['img_path'] = img_path + data_info['img_id'] = img_id + data_info['height'] = raw_img_info['height'] + data_info['width'] = raw_img_info['width'] + data_info['dataset_mode'] = raw_img_info['dataset_name'] + + data_info['text'] = raw_img_info['caption'] + data_info['custom_entities'] = False + data_info['tokens_positive'] = -1 + + instances = [] + for i, ann in enumerate(raw_ann_info): + instance = {} + x1, y1, w, h = ann['bbox'] + bbox = [x1, y1, x1 + w, y1 + h] + instance['bbox'] = bbox + instance['bbox_label'] = ann['category_id'] + instance['ignore_flag'] = 0 + instances.append(instance) + + data_info['instances'] = instances + data_infos.append(data_info) + return data_infos diff --git a/head_extractor/build/lib/mmdet/datasets/mot_challenge_dataset.py b/head_extractor/build/lib/mmdet/datasets/mot_challenge_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..ffbdc48ebf8d4a4ba11a605c8bc2a479cf2a0c96 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/mot_challenge_dataset.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List, Union + +from mmdet.registry import DATASETS +from .base_video_dataset import BaseVideoDataset + + +@DATASETS.register_module() +class MOTChallengeDataset(BaseVideoDataset): + """Dataset for MOTChallenge. + + Args: + visibility_thr (float, optional): The minimum visibility + for the objects during training. Default to -1. + """ + + METAINFO = { + 'classes': + ('pedestrian', 'person_on_vehicle', 'car', 'bicycle', 'motorbike', + 'non_mot_vehicle', 'static_person', 'distractor', 'occluder', + 'occluder_on_ground', 'occluder_full', 'reflection', 'crowd') + } + + def __init__(self, visibility_thr: float = -1, *args, **kwargs): + self.visibility_thr = visibility_thr + super().__init__(*args, **kwargs) + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. The difference between this + function and the one in ``BaseVideoDataset`` is that the parsing here + adds ``visibility`` and ``mot_conf``. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + Union[dict, List[dict]]: Parsed annotation. 
+ """ + img_info = raw_data_info['raw_img_info'] + ann_info = raw_data_info['raw_ann_info'] + data_info = {} + + data_info.update(img_info) + if self.data_prefix.get('img_path', None) is not None: + img_path = osp.join(self.data_prefix['img_path'], + img_info['file_name']) + else: + img_path = img_info['file_name'] + data_info['img_path'] = img_path + + instances = [] + for i, ann in enumerate(ann_info): + instance = {} + + if (not self.test_mode) and (ann['visibility'] < + self.visibility_thr): + continue + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + + if ann.get('iscrowd', False): + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[ann['category_id']] + instance['instance_id'] = ann['instance_id'] + instance['category_id'] = ann['category_id'] + instance['mot_conf'] = ann['mot_conf'] + instance['visibility'] = ann['visibility'] + if len(instance) > 0: + instances.append(instance) + if not self.test_mode: + assert len(instances) > 0, f'No valid instances found in ' \ + f'image {data_info["img_path"]}!' + data_info['instances'] = instances + return data_info diff --git a/head_extractor/build/lib/mmdet/datasets/objects365.py b/head_extractor/build/lib/mmdet/datasets/objects365.py new file mode 100644 index 0000000000000000000000000000000000000000..e99869bfa309635af3c03cbfa77f732db3f50637 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/objects365.py @@ -0,0 +1,284 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from typing import List + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .api_wrappers import COCO +from .coco import CocoDataset + +# images exist in annotations but not in image folder. 
+objv2_ignore_list = [ + osp.join('patch16', 'objects365_v2_00908726.jpg'), + osp.join('patch6', 'objects365_v1_00320532.jpg'), + osp.join('patch6', 'objects365_v1_00320534.jpg'), +] + + +@DATASETS.register_module() +class Objects365V1Dataset(CocoDataset): + """Objects365 v1 dataset for detection.""" + + METAINFO = { + 'classes': + ('person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', + 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', + 'handbag', 'street lights', 'book', 'plate', 'helmet', + 'leather shoes', 'pillow', 'glove', 'potted plant', 'bracelet', + 'flower', 'tv', 'storage box', 'vase', 'bench', 'wine glass', 'boots', + 'bowl', 'dining table', 'umbrella', 'boat', 'flag', 'speaker', + 'trash bin/can', 'stool', 'backpack', 'couch', 'belt', 'carpet', + 'basket', 'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table', + 'suv', 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', + 'microphone', 'sandals', 'canned', 'necklace', 'mirror', 'faucet', + 'bicycle', 'bread', 'high heels', 'ring', 'van', 'watch', 'sink', + 'horse', 'fish', 'apple', 'camera', 'candle', 'teddy bear', 'cake', + 'motorcycle', 'wild bird', 'laptop', 'knife', 'traffic sign', + 'cell phone', 'paddle', 'truck', 'cow', 'power outlet', 'clock', + 'drum', 'fork', 'bus', 'hanger', 'nightstand', 'pot/pan', 'sheep', + 'guitar', 'traffic cone', 'tea pot', 'keyboard', 'tripod', 'hockey', + 'fan', 'dog', 'spoon', 'blackboard/whiteboard', 'balloon', + 'air conditioner', 'cymbal', 'mouse', 'telephone', 'pickup truck', + 'orange', 'banana', 'airplane', 'luggage', 'skis', 'soccer', + 'trolley', 'oven', 'remote', 'baseball glove', 'paper towel', + 'refrigerator', 'train', 'tomato', 'machinery vehicle', 'tent', + 'shampoo/shower gel', 'head phone', 'lantern', 'donut', + 'cleaning products', 'sailboat', 'tangerine', 'pizza', 'kite', + 'computer box', 'elephant', 'toiletries', 'gas stove', 'broccoli', + 'toilet', 'stroller', 'shovel', 'baseball bat', 'microwave', + 'skateboard', 'surfboard', 'surveillance camera', 'gun', 'life saver', + 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', 'sports car', + 'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator', 'converter', + 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', + 'cutting/chopping board', 'tennis racket', 'candy', + 'skating and skiing shoes', 'scissors', 'folder', 'baseball', + 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', + 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', + 'american football', 'basketball', 'potato', 'paint brush', 'printer', + 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', + 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', + 'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee', + 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', + 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', + 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', + 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', + 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', + 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', + 'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar', + 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', + 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', + 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', + 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', + 'key', 'screwdriver', 
'globe', 'broom', 'pliers', 'volleyball', +         'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice', +         'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', +         'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', +         'antelope', 'shrimp', 'rickshaw', 'trombone', 'pomegranate', +         'coconut', 'jellyfish', 'mushroom', 'calculator', 'treadmill', +         'butterfly', 'egg tart', 'cheese', 'pig', 'pomelo', 'race car', +         'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 'hair drier', +         'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', +         'electric drill', 'spring rolls', 'tortoise/turtle', 'parrot', +         'flute', 'measuring cup', 'shark', 'steak', 'poker card', +         'binoculars', 'llama', 'radish', 'noodles', 'yak', 'mop', 'crab', +         'microscope', 'barbell', 'bread/bun', 'baozi', 'lion', 'red cabbage', +         'polar bear', 'lighter', 'seal', 'mangosteen', 'comb', 'eraser', +         'pitaya', 'scallop', 'pencil case', 'saw', 'table tennis paddle', +         'okra', 'starfish', 'eagle', 'monkey', 'durian', 'game board', +         'rabbit', 'french horn', 'ambulance', 'asparagus', 'hoverboard', +         'pasta', 'target', 'hotair balloon', 'chainsaw', 'lobster', 'iron', +         'flashlight'), +        'palette': +        None +    } + +    COCOAPI = COCO +    # ann_id is unique in coco dataset. +    ANN_ID_UNIQUE = True + +    def load_data_list(self) -> List[dict]: +        """Load annotations from an annotation file named as ``self.ann_file`` + +        Returns: +            List[dict]: A list of annotation. +        """ # noqa: E501 +        with get_local_path( +                self.ann_file, backend_args=self.backend_args) as local_path: +            self.coco = self.COCOAPI(local_path) + +        # The 'categories' lists in objects365_train.json and +        # objects365_val.json are inconsistent, so sort the list (and the +        # cats dict) before calling get_cat_ids. +        cats = self.coco.cats +        sorted_cats = {i: cats[i] for i in sorted(cats)} +        self.coco.cats = sorted_cats +        categories = self.coco.dataset['categories'] +        sorted_categories = sorted(categories, key=lambda i: i['id']) +        self.coco.dataset['categories'] = sorted_categories +        # The order of returned `cat_ids` will not +        # change with the order of the `classes` +        self.cat_ids = self.coco.get_cat_ids( +            cat_names=self.metainfo['classes']) +        self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} +        self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + +        img_ids = self.coco.get_img_ids() +        data_list = [] +        total_ann_ids = [] +        for img_id in img_ids: +            raw_img_info = self.coco.load_imgs([img_id])[0] +            raw_img_info['img_id'] = img_id + +            ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) +            raw_ann_info = self.coco.load_anns(ann_ids) +            total_ann_ids.extend(ann_ids) + +            parsed_data_info = self.parse_data_info({ +                'raw_ann_info': +                raw_ann_info, +                'raw_img_info': +                raw_img_info +            }) +            data_list.append(parsed_data_info) +        if self.ANN_ID_UNIQUE: +            assert len(set(total_ann_ids)) == len( +                total_ann_ids +            ), f"Annotation ids in '{self.ann_file}' are not unique!"
+ + del self.coco + + return data_list + + +@DATASETS.register_module() +class Objects365V2Dataset(CocoDataset): + """Objects365 v2 dataset for detection.""" + METAINFO = { + 'classes': + ('Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp', + 'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf', + 'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet', + 'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower', + 'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots', + 'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt', + 'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker', + 'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool', + 'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum', + 'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle', + 'Guitar', 'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', + 'Truck', 'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', + 'Stuffed Toy', 'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed', + 'Faucet', 'Tent', 'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple', + 'Air Conditioner', 'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', + 'Fork', 'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock', + 'Pot', 'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger', + 'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine', + 'Toiletry', 'Keyboard', 'Tomato', 'Lantern', 'Machinery Vehicle', + 'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', 'Airplane', + 'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage', + 'Nightstand', 'Tea pot', 'Telephone', 'Trolley', 'Head Phone', + 'Sports Car', 'Stop Sign', 'Dessert', 'Scooter', 'Stroller', 'Crane', + 'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', 'Baseball Bat', + 'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza', + 'Elephant', 'Skateboard', 'Surfboard', 'Gun', + 'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot', + 'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper', + 'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks', + 'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board', + 'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder', + 'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball', + 'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', + 'Violin', 'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck', + 'Billards', 'Converter', 'Bathtub', 'Wheelchair', 'Golf Club', + 'Briefcase', 'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear', + 'Heavy Truck', 'Hamburger', 'Extractor', 'Extention Cord', 'Tong', + 'Tennis Racket', 'Folder', 'American Football', 'earphone', 'Mask', + 'Kettle', 'Tennis', 'Ship', 'Swing', 'Coffee Machine', 'Slide', + 'Carriage', 'Onion', 'Green beans', 'Projector', 'Frisbee', + 'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon', + 'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon', + 'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog', + 'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer', + 'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple', + 'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle', + 'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone', + 'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion', + 'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom', + 'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit', 
+ 'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese', + 'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue', + 'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap', + 'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut', + 'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak', + 'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate', + 'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker', + 'Tuba', 'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', + 'Buttefly', 'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', + 'Electric Drill', 'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', + 'Lighter', 'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi', + 'Target', 'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case', + 'Yak', 'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop', + 'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle', + 'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster', + 'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling', + 'Table Tennis '), + 'palette': + None + } + + COCOAPI = COCO + # ann_id is unique in coco dataset. + ANN_ID_UNIQUE = True + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ # noqa: E501 + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + self.coco = self.COCOAPI(local_path) + # The order of returned `cat_ids` will not + # change with the order of the `classes` + self.cat_ids = self.coco.get_cat_ids( + cat_names=self.metainfo['classes']) + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.cat_img_map = copy.deepcopy(self.coco.cat_img_map) + + img_ids = self.coco.get_img_ids() + data_list = [] + total_ann_ids = [] + for img_id in img_ids: + raw_img_info = self.coco.load_imgs([img_id])[0] + raw_img_info['img_id'] = img_id + + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + raw_ann_info = self.coco.load_anns(ann_ids) + total_ann_ids.extend(ann_ids) + + # file_name should be `patchX/xxx.jpg` + file_name = osp.join( + osp.split(osp.split(raw_img_info['file_name'])[0])[-1], + osp.split(raw_img_info['file_name'])[-1]) + + if file_name in objv2_ignore_list: + continue + + raw_img_info['file_name'] = file_name + parsed_data_info = self.parse_data_info({ + 'raw_ann_info': + raw_ann_info, + 'raw_img_info': + raw_img_info + }) + data_list.append(parsed_data_info) + if self.ANN_ID_UNIQUE: + assert len(set(total_ann_ids)) == len( + total_ann_ids + ), f"Annotation ids in '{self.ann_file}' are not unique!" + + del self.coco + + return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/odvg.py b/head_extractor/build/lib/mmdet/datasets/odvg.py new file mode 100644 index 0000000000000000000000000000000000000000..c73865f2ea724205640bea2c701c355bbd9135e3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/odvg.py @@ -0,0 +1,106 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
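+# Editor's note: ODVGDataset below reads one JSON object per line (jsonl).
+# A minimal sketch of the two record shapes it parses (the field names are
+# taken from the parsing code; the concrete values are only illustrative):
+#
+#   OD mode:
+#     {"filename": "a.jpg", "height": 480, "width": 640,
+#      "detection": {"instances": [
+#          {"bbox": [10, 20, 110, 220], "label": 3}]}}
+#
+#   VG mode:
+#     {"filename": "b.jpg", "height": 480, "width": 640,
+#      "grounding": {"caption": "a red car parked outside",
+#                    "regions": [{"bbox": [10, 20, 110, 220],
+#                                 "phrase": "a red car",
+#                                 "tokens_positive": [[0, 9]]}]}}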
+import json +import os.path as osp +from typing import List, Optional + +from mmengine.fileio import get_local_path + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class ODVGDataset(BaseDetDataset): + """object detection and visual grounding dataset.""" + + def __init__(self, + *args, + data_root: str = '', + label_map_file: Optional[str] = None, + need_text: bool = True, + **kwargs) -> None: + self.dataset_mode = 'VG' + self.need_text = need_text + if label_map_file: + label_map_file = osp.join(data_root, label_map_file) + with open(label_map_file, 'r') as file: + self.label_map = json.load(file) + self.dataset_mode = 'OD' + super().__init__(*args, data_root=data_root, **kwargs) + assert self.return_classes is True + + def load_data_list(self) -> List[dict]: + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + data_list = [json.loads(line) for line in f] + + out_data_list = [] + for data in data_list: + data_info = {} + img_path = osp.join(self.data_prefix['img'], data['filename']) + data_info['img_path'] = img_path + data_info['height'] = data['height'] + data_info['width'] = data['width'] + if self.dataset_mode == 'OD': + if self.need_text: + data_info['text'] = self.label_map + anno = data.get('detection', {}) + instances = [obj for obj in anno.get('instances', [])] + bboxes = [obj['bbox'] for obj in instances] + bbox_labels = [str(obj['label']) for obj in instances] + + instances = [] + for bbox, label in zip(bboxes, bbox_labels): + instance = {} + x1, y1, x2, y2 = bbox + inter_w = max(0, min(x2, data['width']) - max(x1, 0)) + inter_h = max(0, min(y2, data['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if (x2 - x1) < 1 or (y2 - y1) < 1: + continue + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = int(label) + instances.append(instance) + data_info['instances'] = instances + data_info['dataset_mode'] = self.dataset_mode + out_data_list.append(data_info) + else: + anno = data['grounding'] + data_info['text'] = anno['caption'] + regions = anno['regions'] + + instances = [] + phrases = {} + for i, region in enumerate(regions): + bbox = region['bbox'] + phrase = region['phrase'] + tokens_positive = region['tokens_positive'] + if not isinstance(bbox[0], list): + bbox = [bbox] + for box in bbox: + instance = {} + x1, y1, x2, y2 = box + inter_w = max(0, min(x2, data['width']) - max(x1, 0)) + inter_h = max(0, min(y2, data['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if (x2 - x1) < 1 or (y2 - y1) < 1: + continue + instance['ignore_flag'] = 0 + instance['bbox'] = box + instance['bbox_label'] = i + phrases[i] = { + 'phrase': phrase, + 'tokens_positive': tokens_positive + } + instances.append(instance) + data_info['instances'] = instances + data_info['phrases'] = phrases + data_info['dataset_mode'] = self.dataset_mode + out_data_list.append(data_info) + + del data_list + return out_data_list diff --git a/head_extractor/build/lib/mmdet/datasets/openimages.py b/head_extractor/build/lib/mmdet/datasets/openimages.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c6c8ec44fdfe86a653fc6a716009836f7d471c --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/openimages.py @@ -0,0 +1,484 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
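+# Editor's note: column indices assumed by the box-annotation reader in
+# ``load_data_list`` below (these are exactly the indices it dereferences;
+# the OID v6 CSV header names are added for orientation only):
+#   line[0] ImageID, line[2] LabelName (a MID such as '/m/0cmf2'),
+#   line[4] XMin, line[5] XMax, line[6] YMin, line[7] YMax (normalized,
+#   denormalized later against the image metas loaded from ``meta_file``),
+#   line[8..12] IsOccluded, IsTruncated, IsGroupOf, IsDepiction, IsInside.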
+import csv +import os.path as osp +from collections import defaultdict +from typing import Dict, List, Optional + +import numpy as np +from mmengine.fileio import get_local_path, load +from mmengine.utils import is_abs + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class OpenImagesDataset(BaseDetDataset): + """Open Images dataset for detection. + + Args: + ann_file (str): Annotation file path. + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + meta_file (str): File path to get image metas. + hierarchy_file (str): The file path of the class hierarchy. + image_level_ann_file (str): Human-verified image level annotation, + which is used in evaluation. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + + METAINFO: dict = dict(dataset_type='oid_v6') + + def __init__(self, + label_file: str, + meta_file: str, + hierarchy_file: str, + image_level_ann_file: Optional[str] = None, + **kwargs) -> None: + self.label_file = label_file + self.meta_file = meta_file + self.hierarchy_file = hierarchy_file + self.image_level_ann_file = image_level_ann_file + super().__init__(**kwargs) + + def load_data_list(self) -> List[dict]: + """Load annotations from an annotation file named as ``self.ann_file`` + + Returns: + List[dict]: A list of annotation. + """ + classes_names, label_id_mapping = self._parse_label_file( + self.label_file) + self._metainfo['classes'] = classes_names + self.label_id_mapping = label_id_mapping + + if self.image_level_ann_file is not None: + img_level_anns = self._parse_img_level_ann( + self.image_level_ann_file) + else: + img_level_anns = None + + # OpenImagesMetric can get the relation matrix from the dataset meta + relation_matrix = self._get_relation_matrix(self.hierarchy_file) + self._metainfo['RELATION_MATRIX'] = relation_matrix + + data_list = [] + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + last_img_id = None + instances = [] + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + if last_img_id is None: + last_img_id = img_id + label_id = line[2] + assert label_id in self.label_id_mapping + label = int(self.label_id_mapping[label_id]) + bbox = [ + float(line[4]), # xmin + float(line[6]), # ymin + float(line[5]), # xmax + float(line[7]) # ymax + ] + is_occluded = True if int(line[8]) == 1 else False + is_truncated = True if int(line[9]) == 1 else False + is_group_of = True if int(line[10]) == 1 else False + is_depiction = True if int(line[11]) == 1 else False + is_inside = True if int(line[12]) == 1 else False + + instance = dict( + bbox=bbox, + bbox_label=label, + ignore_flag=0, + is_occluded=is_occluded, + is_truncated=is_truncated, + is_group_of=is_group_of, + is_depiction=is_depiction, + is_inside=is_inside) + last_img_path = osp.join(self.data_prefix['img'], + f'{last_img_id}.jpg') + if img_id != last_img_id: + # switch to a new image, record previous image's data. 
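+                        # The annotation CSV is assumed to be sorted by
+                        # image id, so the first row carrying a new img_id
+                        # means every box of the previous image has been
+                        # seen: flush the accumulated ``instances`` as one
+                        # ``data_info`` entry and reset the buffer (the
+                        # current row's instance is appended after this
+                        # flush).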
+ data_info = dict( + img_path=last_img_path, + img_id=last_img_id, + instances=instances, + ) + data_list.append(data_info) + instances = [] + instances.append(instance) + last_img_id = img_id + data_list.append( + dict( + img_path=last_img_path, + img_id=last_img_id, + instances=instances, + )) + + # add image metas to data list + img_metas = load( + self.meta_file, file_format='pkl', backend_args=self.backend_args) + assert len(img_metas) == len(data_list) + for i, meta in enumerate(img_metas): + img_id = data_list[i]['img_id'] + assert f'{img_id}.jpg' == osp.split(meta['filename'])[-1] + h, w = meta['ori_shape'][:2] + data_list[i]['height'] = h + data_list[i]['width'] = w + # denormalize bboxes + for j in range(len(data_list[i]['instances'])): + data_list[i]['instances'][j]['bbox'][0] *= w + data_list[i]['instances'][j]['bbox'][2] *= w + data_list[i]['instances'][j]['bbox'][1] *= h + data_list[i]['instances'][j]['bbox'][3] *= h + # add image-level annotation + if img_level_anns is not None: + img_labels = [] + confidences = [] + img_ann_list = img_level_anns.get(img_id, []) + for ann in img_ann_list: + img_labels.append(int(ann['image_level_label'])) + confidences.append(float(ann['confidence'])) + data_list[i]['image_level_labels'] = np.array( + img_labels, dtype=np.int64) + data_list[i]['confidences'] = np.array( + confidences, dtype=np.float32) + return data_list + + def _parse_label_file(self, label_file: str) -> tuple: + """Get classes name and index mapping from cls-label-description file. + + Args: + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + + Returns: + tuple: Class name of OpenImages. + """ + + index_list = [] + classes_names = [] + with get_local_path( + label_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + for line in reader: + # self.cat2label[line[0]] = line[1] + classes_names.append(line[1]) + index_list.append(line[0]) + index_mapping = {index: i for i, index in enumerate(index_list)} + return classes_names, index_mapping + + def _parse_img_level_ann(self, + img_level_ann_file: str) -> Dict[str, List[dict]]: + """Parse image level annotations from csv style ann_file. + + Args: + img_level_ann_file (str): CSV style image level annotation + file path. + + Returns: + Dict[str, List[dict]]: Annotations where item of the defaultdict + indicates an image, each of which has (n) dicts. + Keys of dicts are: + + - `image_level_label` (int): Label id. + - `confidence` (float): Labels that are human-verified to be + present in an image have confidence = 1 (positive labels). + Labels that are human-verified to be absent from an image + have confidence = 0 (negative labels). Machine-generated + labels have fractional confidences, generally >= 0.5. + The higher the confidence, the smaller the chance for + the label to be a false positive. + """ + + item_lists = defaultdict(list) + with get_local_path( + img_level_ann_file, + backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + for i, line in enumerate(reader): + if i == 0: + continue + img_id = line[0] + item_lists[img_id].append( + dict( + image_level_label=int( + self.label_id_mapping[line[2]]), + confidence=float(line[3]))) + return item_lists + + def _get_relation_matrix(self, hierarchy_file: str) -> np.ndarray: + """Get the matrix of class hierarchy from the hierarchy file. 
Hierarchy +        for 600 classes can be found at +        https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html. + +        Args: +            hierarchy_file (str): File path to the hierarchy for classes. + +        Returns: +            np.ndarray: The matrix of the corresponding relationship between +            the parent class and the child class, of shape +            (class_num, class_num). +        """ # noqa + +        hierarchy = load( +            hierarchy_file, file_format='json', backend_args=self.backend_args) +        class_num = len(self._metainfo['classes']) +        relation_matrix = np.eye(class_num, class_num) +        relation_matrix = self._convert_hierarchy_tree(hierarchy, +                                                       relation_matrix) +        return relation_matrix + +    def _convert_hierarchy_tree(self, +                                hierarchy_map: dict, +                                relation_matrix: np.ndarray, +                                parents: list = [], +                                get_all_parents: bool = True) -> np.ndarray: +        """Get matrix of the corresponding relationship between the parent +        class and the child class. + +        Args: +            hierarchy_map (dict): Including label name and corresponding +                subcategory. Keys of dicts are: + +                - `LabelName` (str): Name of the label. +                - `Subcategory` (dict | list): Corresponding subcategory(ies). +            relation_matrix (ndarray): The matrix of the corresponding +                relationship between the parent class and the child class, +                of shape (class_num, class_num). +            parents (list): Corresponding parent class. +            get_all_parents (bool): Whether to get all parent names. +                Default: True + +        Returns: +            ndarray: The matrix of the corresponding relationship between +            the parent class and the child class, of shape +            (class_num, class_num). +        """ + +        if 'Subcategory' in hierarchy_map: +            for node in hierarchy_map['Subcategory']: +                if 'LabelName' in node: +                    children_name = node['LabelName'] +                    children_index = self.label_id_mapping[children_name] +                    children = [children_index] +                else: +                    continue +                if len(parents) > 0: +                    for parent_index in parents: +                        if get_all_parents: +                            children.append(parent_index) +                        relation_matrix[children_index, parent_index] = 1 +                relation_matrix = self._convert_hierarchy_tree( +                    node, relation_matrix, parents=children) +        return relation_matrix + +    def _join_prefix(self): +        """Join ``self.data_root`` with annotation path.""" +        super()._join_prefix() +        if not is_abs(self.label_file) and self.label_file: +            self.label_file = osp.join(self.data_root, self.label_file) +        if not is_abs(self.meta_file) and self.meta_file: +            self.meta_file = osp.join(self.data_root, self.meta_file) +        if not is_abs(self.hierarchy_file) and self.hierarchy_file: +            self.hierarchy_file = osp.join(self.data_root, self.hierarchy_file) +        if self.image_level_ann_file and not is_abs(self.image_level_ann_file): +            self.image_level_ann_file = osp.join(self.data_root, +                                                 self.image_level_ann_file) + + +@DATASETS.register_module() +class OpenImagesChallengeDataset(OpenImagesDataset): +    """Open Images Challenge dataset for detection. + +    Args: +        ann_file (str): Open Images Challenge box annotation in txt format. +    """ + +    METAINFO: dict = dict(dataset_type='oid_challenge') + +    def __init__(self, ann_file: str, **kwargs) -> None: +        if not ann_file.endswith('txt'): +            raise TypeError('The annotation file of Open Images Challenge ' +                            'should be a txt file.') + +        super().__init__(ann_file=ann_file, **kwargs) + +    def load_data_list(self) -> List[dict]: +        """Load annotations from an annotation file named as ``self.ann_file`` + +        Returns: +            List[dict]: A list of annotation.
+ """ + classes_names, label_id_mapping = self._parse_label_file( + self.label_file) + self._metainfo['classes'] = classes_names + self.label_id_mapping = label_id_mapping + + if self.image_level_ann_file is not None: + img_level_anns = self._parse_img_level_ann( + self.image_level_ann_file) + else: + img_level_anns = None + + # OpenImagesMetric can get the relation matrix from the dataset meta + relation_matrix = self._get_relation_matrix(self.hierarchy_file) + self._metainfo['RELATION_MATRIX'] = relation_matrix + + data_list = [] + with get_local_path( + self.ann_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + lines = f.readlines() + i = 0 + while i < len(lines): + instances = [] + filename = lines[i].rstrip() + i += 2 + img_gt_size = int(lines[i]) + i += 1 + for j in range(img_gt_size): + sp = lines[i + j].split() + instances.append( + dict( + bbox=[ + float(sp[1]), + float(sp[2]), + float(sp[3]), + float(sp[4]) + ], + bbox_label=int(sp[0]) - 1, # labels begin from 1 + ignore_flag=0, + is_group_ofs=True if int(sp[5]) == 1 else False)) + i += img_gt_size + data_list.append( + dict( + img_path=osp.join(self.data_prefix['img'], filename), + instances=instances, + )) + + # add image metas to data list + img_metas = load( + self.meta_file, file_format='pkl', backend_args=self.backend_args) + assert len(img_metas) == len(data_list) + for i, meta in enumerate(img_metas): + img_id = osp.split(data_list[i]['img_path'])[-1][:-4] + assert img_id == osp.split(meta['filename'])[-1][:-4] + h, w = meta['ori_shape'][:2] + data_list[i]['height'] = h + data_list[i]['width'] = w + data_list[i]['img_id'] = img_id + # denormalize bboxes + for j in range(len(data_list[i]['instances'])): + data_list[i]['instances'][j]['bbox'][0] *= w + data_list[i]['instances'][j]['bbox'][2] *= w + data_list[i]['instances'][j]['bbox'][1] *= h + data_list[i]['instances'][j]['bbox'][3] *= h + # add image-level annotation + if img_level_anns is not None: + img_labels = [] + confidences = [] + img_ann_list = img_level_anns.get(img_id, []) + for ann in img_ann_list: + img_labels.append(int(ann['image_level_label'])) + confidences.append(float(ann['confidence'])) + data_list[i]['image_level_labels'] = np.array( + img_labels, dtype=np.int64) + data_list[i]['confidences'] = np.array( + confidences, dtype=np.float32) + return data_list + + def _parse_label_file(self, label_file: str) -> tuple: + """Get classes name and index mapping from cls-label-description file. + + Args: + label_file (str): File path of the label description file that + maps the classes names in MID format to their short + descriptions. + + Returns: + tuple: Class name of OpenImages. + """ + label_list = [] + id_list = [] + index_mapping = {} + with get_local_path( + label_file, backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + for line in reader: + label_name = line[0] + label_id = int(line[2]) + label_list.append(line[1]) + id_list.append(label_id) + index_mapping[label_name] = label_id - 1 + indexes = np.argsort(id_list) + classes_names = [] + for index in indexes: + classes_names.append(label_list[index]) + return classes_names, index_mapping + + def _parse_img_level_ann(self, image_level_ann_file): + """Parse image level annotations from csv style ann_file. + + Args: + image_level_ann_file (str): CSV style image level annotation + file path. 
+ + Returns: + defaultdict[list[dict]]: Annotations where item of the defaultdict + indicates an image, each of which has (n) dicts. + Keys of dicts are: + + - `image_level_label` (int): of shape 1. + - `confidence` (float): of shape 1. + """ + + item_lists = defaultdict(list) + with get_local_path( + image_level_ann_file, + backend_args=self.backend_args) as local_path: + with open(local_path, 'r') as f: + reader = csv.reader(f) + i = -1 + for line in reader: + i += 1 + if i == 0: + continue + else: + img_id = line[0] + label_id = line[1] + assert label_id in self.label_id_mapping + image_level_label = int( + self.label_id_mapping[label_id]) + confidence = float(line[2]) + item_lists[img_id].append( + dict( + image_level_label=image_level_label, + confidence=confidence)) + return item_lists + + def _get_relation_matrix(self, hierarchy_file: str) -> np.ndarray: + """Get the matrix of class hierarchy from the hierarchy file. + + Args: + hierarchy_file (str): File path to the hierarchy for classes. + + Returns: + np.ndarray: The matrix of the corresponding + relationship between the parent class and the child class, + of shape (class_num, class_num). + """ + with get_local_path( + hierarchy_file, backend_args=self.backend_args) as local_path: + class_label_tree = np.load(local_path, allow_pickle=True) + return class_label_tree[1:, 1:] diff --git a/head_extractor/build/lib/mmdet/datasets/refcoco.py b/head_extractor/build/lib/mmdet/datasets/refcoco.py new file mode 100644 index 0000000000000000000000000000000000000000..0dae75fd547216a5b69033cc821b93a1d9ac6abc --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/refcoco.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import os.path as osp +import random +from typing import Dict, List + +import mmengine +from mmengine.dataset import BaseDataset + +from mmdet.registry import DATASETS + + +@DATASETS.register_module() +class RefCocoDataset(BaseDataset): + """RefCOCO dataset. + + The `Refcoco` and `Refcoco+` dataset is based on + `ReferItGame: Referring to Objects in Photographs of Natural Scenes + `_. + + The `Refcocog` dataset is based on + `Generation and Comprehension of Unambiguous Object Descriptions + `_. + + Args: + ann_file (str): Annotation file path. + data_root (str): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to ''. + data_prefix (str): Prefix for training data. + split_file (str): Split file path. + split (str): Split name. Defaults to 'train'. + text_mode (str): Text mode. Defaults to 'random'. + **kwargs: Other keyword arguments in :class:`BaseDataset`. 
+ """ + + def __init__(self, + data_root: str, + ann_file: str, + split_file: str, + data_prefix: Dict, + split: str = 'train', + text_mode: str = 'random', + **kwargs): + self.split_file = split_file + self.split = split + + assert text_mode in ['original', 'random', 'concat', 'select_first'] + self.text_mode = text_mode + super().__init__( + data_root=data_root, + data_prefix=data_prefix, + ann_file=ann_file, + **kwargs, + ) + + def _join_prefix(self): + if not mmengine.is_abs(self.split_file) and self.split_file: + self.split_file = osp.join(self.data_root, self.split_file) + + return super()._join_prefix() + + def _init_refs(self): + """Initialize the refs for RefCOCO.""" + anns, imgs = {}, {} + for ann in self.instances['annotations']: + anns[ann['id']] = ann + for img in self.instances['images']: + imgs[img['id']] = img + + refs, ref_to_ann = {}, {} + for ref in self.splits: + # ids + ref_id = ref['ref_id'] + ann_id = ref['ann_id'] + # add mapping related to ref + refs[ref_id] = ref + ref_to_ann[ref_id] = anns[ann_id] + + self.refs = refs + self.ref_to_ann = ref_to_ann + + def load_data_list(self) -> List[dict]: + """Load data list.""" + self.splits = mmengine.load(self.split_file, file_format='pkl') + self.instances = mmengine.load(self.ann_file, file_format='json') + self._init_refs() + img_prefix = self.data_prefix['img_path'] + + ref_ids = [ + ref['ref_id'] for ref in self.splits if ref['split'] == self.split + ] + full_anno = [] + for ref_id in ref_ids: + ref = self.refs[ref_id] + ann = self.ref_to_ann[ref_id] + ann.update(ref) + full_anno.append(ann) + + image_id_list = [] + final_anno = {} + for anno in full_anno: + image_id_list.append(anno['image_id']) + final_anno[anno['ann_id']] = anno + annotations = [value for key, value in final_anno.items()] + + coco_train_id = [] + image_annot = {} + for i in range(len(self.instances['images'])): + coco_train_id.append(self.instances['images'][i]['id']) + image_annot[self.instances['images'][i] + ['id']] = self.instances['images'][i] + + images = [] + for image_id in list(set(image_id_list)): + images += [image_annot[image_id]] + + data_list = [] + + grounding_dict = collections.defaultdict(list) + for anno in annotations: + image_id = int(anno['image_id']) + grounding_dict[image_id].append(anno) + + join_path = mmengine.fileio.get_file_backend(img_prefix).join_path + for image in images: + img_id = image['id'] + instances = [] + sentences = [] + for grounding_anno in grounding_dict[img_id]: + texts = [x['raw'].lower() for x in grounding_anno['sentences']] + # random select one text + if self.text_mode == 'random': + idx = random.randint(0, len(texts) - 1) + text = [texts[idx]] + # concat all texts + elif self.text_mode == 'concat': + text = [''.join(texts)] + # select the first text + elif self.text_mode == 'select_first': + text = [texts[0]] + # use all texts + elif self.text_mode == 'original': + text = texts + else: + raise ValueError(f'Invalid text mode "{self.text_mode}".') + ins = [{ + 'mask': grounding_anno['segmentation'], + 'ignore_flag': 0 + }] * len(text) + instances.extend(ins) + sentences.extend(text) + data_info = { + 'img_path': join_path(img_prefix, image['file_name']), + 'img_id': img_id, + 'instances': instances, + 'text': sentences + } + data_list.append(data_info) + + if len(data_list) == 0: + raise ValueError(f'No sample in split "{self.split}".') + + return data_list diff --git a/head_extractor/build/lib/mmdet/datasets/reid_dataset.py b/head_extractor/build/lib/mmdet/datasets/reid_dataset.py new file mode 100644 
index 0000000000000000000000000000000000000000..1eed3ee4f0358edf59d19695c2b28394336dffd3
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/datasets/reid_dataset.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from collections import defaultdict
+from typing import Any, Dict, List
+
+import numpy as np
+from mmengine.dataset import BaseDataset
+from mmengine.utils import check_file_exist
+
+from mmdet.registry import DATASETS
+
+
+@DATASETS.register_module()
+class ReIDDataset(BaseDataset):
+    """Dataset for ReID.
+
+    Args:
+        triplet_sampler (dict, optional): The sampler config for hard mining
+            triplet loss. Defaults to None. Expected keys are:
+
+            - num_ids (int): The number of person ids.
+            - ins_per_id (int): The number of images for each person.
+    """
+
+    def __init__(self, triplet_sampler: dict = None, *args, **kwargs):
+        self.triplet_sampler = triplet_sampler
+        super().__init__(*args, **kwargs)
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from an annotation file named as ``self.ann_file``.
+
+        Returns:
+            list[dict]: A list of annotations.
+        """
+        assert isinstance(self.ann_file, str)
+        check_file_exist(self.ann_file)
+        data_list = []
+        with open(self.ann_file) as f:
+            samples = [x.strip().split(' ') for x in f.readlines()]
+            for filename, gt_label in samples:
+                info = dict(img_prefix=self.data_prefix)
+                if self.data_prefix['img_path'] is not None:
+                    info['img_path'] = osp.join(self.data_prefix['img_path'],
+                                                filename)
+                else:
+                    info['img_path'] = filename
+                info['gt_label'] = np.array(gt_label, dtype=np.int64)
+                data_list.append(info)
+        self._parse_ann_info(data_list)
+        return data_list
+
+    def _parse_ann_info(self, data_list: List[dict]):
+        """Parse person id annotations."""
+        index_tmp_dic = defaultdict(list)  # pid -> [idx1, ..., idxN]
+        self.index_dic = dict()  # pid -> array([idx1, ..., idxN])
+        for idx, info in enumerate(data_list):
+            pid = info['gt_label']
+            index_tmp_dic[int(pid)].append(idx)
+        for pid, idxs in index_tmp_dic.items():
+            self.index_dic[pid] = np.asarray(idxs, dtype=np.int64)
+        self.pids = np.asarray(list(self.index_dic.keys()), dtype=np.int64)
+
+    def prepare_data(self, idx: int) -> Any:
+        """Get data processed by ``self.pipeline``.
+
+        Args:
+            idx (int): The index of ``data_info``.
+
+        Returns:
+            Any: Depends on ``self.pipeline``.
+        """
+        data_info = self.get_data_info(idx)
+        if self.triplet_sampler is not None:
+            img_info = self.triplet_sampling(data_info['gt_label'],
+                                             **self.triplet_sampler)
+            data_info = copy.deepcopy(img_info)  # triplet -> list
+        else:
+            data_info = copy.deepcopy(data_info)  # no triplet -> dict
+        return self.pipeline(data_info)
+
+    def triplet_sampling(self,
+                         pos_pid,
+                         num_ids: int = 8,
+                         ins_per_id: int = 4) -> Dict:
+        """Triplet sampler for hard mining triplet loss. First, for one
+        pos_pid, randomly sample ins_per_id images with the same person id.
+
+        Then, randomly sample num_ids - 1 negative person ids.
+        Finally, randomly sample ins_per_id images for each negative id.
+
+        Args:
+            pos_pid (ndarray): The person id of the anchor.
+            num_ids (int): The number of person ids.
+            ins_per_id (int): The number of images for each person.
+
+        Returns:
+            Dict: Annotation information of num_ids X ins_per_id images.
+        """
+        assert len(self.pids) >= num_ids, \
+            'The number of person ids in the training set must ' \
+            'be greater than the number of person ids in the sample.'
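+        # With the defaults (num_ids=8, ins_per_id=4), the batch assembled
+        # below holds num_ids * ins_per_id = 32 images: ins_per_id images of
+        # the anchor identity, then ins_per_id images for each of the
+        # num_ids - 1 sampled negative identities.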
+ + pos_idxs = self.index_dic[int( + pos_pid)] # all positive idxs for pos_pid + idxs_list = [] + # select positive samplers + idxs_list.extend(pos_idxs[np.random.choice( + pos_idxs.shape[0], ins_per_id, replace=True)]) + # select negative ids + neg_pids = np.random.choice( + [i for i, _ in enumerate(self.pids) if i != pos_pid], + num_ids - 1, + replace=False) + # select negative samplers for each negative id + for neg_pid in neg_pids: + neg_idxs = self.index_dic[neg_pid] + idxs_list.extend(neg_idxs[np.random.choice( + neg_idxs.shape[0], ins_per_id, replace=True)]) + # return the final triplet batch + triplet_img_infos = [] + for idx in idxs_list: + triplet_img_infos.append(copy.deepcopy(self.get_data_info(idx))) + # Collect data_list scatters (list of dict -> dict of list) + out = dict() + for key in triplet_img_infos[0].keys(): + out[key] = [_info[key] for _info in triplet_img_infos] + return out diff --git a/head_extractor/build/lib/mmdet/datasets/samplers/__init__.py b/head_extractor/build/lib/mmdet/datasets/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9ea0e4cb0628fc23bc034c51e503d8ceca5ee90c --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/samplers/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .batch_sampler import (AspectRatioBatchSampler, + MultiDataAspectRatioBatchSampler, + TrackAspectRatioBatchSampler) +from .class_aware_sampler import ClassAwareSampler +from .custom_sample_size_sampler import CustomSampleSizeSampler +from .multi_data_sampler import MultiDataSampler +from .multi_source_sampler import GroupMultiSourceSampler, MultiSourceSampler +from .track_img_sampler import TrackImgSampler + +__all__ = [ + 'ClassAwareSampler', 'AspectRatioBatchSampler', 'MultiSourceSampler', + 'GroupMultiSourceSampler', 'TrackImgSampler', + 'TrackAspectRatioBatchSampler', 'MultiDataSampler', + 'MultiDataAspectRatioBatchSampler', 'CustomSampleSizeSampler' +] diff --git a/head_extractor/build/lib/mmdet/datasets/samplers/batch_sampler.py b/head_extractor/build/lib/mmdet/datasets/samplers/batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..c17789c4e3ea51f1fa140d039a679f797a7660f6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/samplers/batch_sampler.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +from torch.utils.data import BatchSampler, Sampler + +from mmdet.datasets.samplers.track_img_sampler import TrackImgSampler +from mmdet.registry import DATA_SAMPLERS + + +# TODO: maybe replace with a data_loader wrapper +@DATA_SAMPLERS.register_module() +class AspectRatioBatchSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio (< 1 or. + + >= 1) into a same batch. + + Args: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. 
+ """ + + def __init__(self, + sampler: Sampler, + batch_size: int, + drop_last: bool = False) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError('batch_size should be a positive integer value, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.batch_size = batch_size + self.drop_last = drop_last + # two groups for w < h and w >= h + self._aspect_ratio_buckets = [[] for _ in range(2)] + + def __iter__(self) -> Sequence[int]: + for idx in self.sampler: + data_info = self.sampler.dataset.get_data_info(idx) + width, height = data_info['width'], data_info['height'] + bucket_id = 0 if width < height else 1 + bucket = self._aspect_ratio_buckets[bucket_id] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] + + # yield the rest data and reset the bucket + left_data = self._aspect_ratio_buckets[0] + self._aspect_ratio_buckets[ + 1] + self._aspect_ratio_buckets = [[] for _ in range(2)] + while len(left_data) > 0: + if len(left_data) <= self.batch_size: + if not self.drop_last: + yield left_data[:] + left_data = [] + else: + yield left_data[:self.batch_size] + left_data = left_data[self.batch_size:] + + def __len__(self) -> int: + if self.drop_last: + return len(self.sampler) // self.batch_size + else: + return (len(self.sampler) + self.batch_size - 1) // self.batch_size + + +@DATA_SAMPLERS.register_module() +class TrackAspectRatioBatchSampler(AspectRatioBatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio (< 1 or. + + >= 1) into a same batch. + + Args: + sampler (Sampler): Base sampler. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + """ + + def __iter__(self) -> Sequence[int]: + for idx in self.sampler: + # hard code to solve TrackImgSampler + if isinstance(self.sampler, TrackImgSampler): + video_idx, _ = idx + else: + video_idx = idx + # video_idx + data_info = self.sampler.dataset.get_data_info(video_idx) + # data_info {video_id, images, video_length} + img_data_info = data_info['images'][0] + width, height = img_data_info['width'], img_data_info['height'] + bucket_id = 0 if width < height else 1 + bucket = self._aspect_ratio_buckets[bucket_id] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] + + # yield the rest data and reset the bucket + left_data = self._aspect_ratio_buckets[0] + self._aspect_ratio_buckets[ + 1] + self._aspect_ratio_buckets = [[] for _ in range(2)] + while len(left_data) > 0: + if len(left_data) <= self.batch_size: + if not self.drop_last: + yield left_data[:] + left_data = [] + else: + yield left_data[:self.batch_size] + left_data = left_data[self.batch_size:] + + +@DATA_SAMPLERS.register_module() +class MultiDataAspectRatioBatchSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio (< 1 or. + + >= 1) into a same batch for multi-source datasets. + + Args: + sampler (Sampler): Base sampler. + batch_size (Sequence(int)): Size of mini-batch for multi-source + datasets. + num_datasets(int): Number of multi-source datasets. 
+ drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + """ + + def __init__(self, + sampler: Sampler, + batch_size: Sequence[int], + num_datasets: int, + drop_last: bool = True) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + self.sampler = sampler + self.batch_size = batch_size + self.num_datasets = num_datasets + self.drop_last = drop_last + # two groups for w < h and w >= h for each dataset --> 2 * num_datasets + self._buckets = [[] for _ in range(2 * self.num_datasets)] + + def __iter__(self) -> Sequence[int]: + for idx in self.sampler: + data_info = self.sampler.dataset.get_data_info(idx) + width, height = data_info['width'], data_info['height'] + dataset_source_idx = self.sampler.dataset.get_dataset_source(idx) + aspect_ratio_bucket_id = 0 if width < height else 1 + bucket_id = dataset_source_idx * 2 + aspect_ratio_bucket_id + bucket = self._buckets[bucket_id] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size[dataset_source_idx]: + yield bucket[:] + del bucket[:] + + # yield the rest data and reset the bucket + for i in range(self.num_datasets): + left_data = self._buckets[i * 2 + 0] + self._buckets[i * 2 + 1] + while len(left_data) > 0: + if len(left_data) <= self.batch_size[i]: + if not self.drop_last: + yield left_data[:] + left_data = [] + else: + yield left_data[:self.batch_size[i]] + left_data = left_data[self.batch_size[i]:] + + self._buckets = [[] for _ in range(2 * self.num_datasets)] + + def __len__(self) -> int: + sizes = [0 for _ in range(self.num_datasets)] + for idx in self.sampler: + dataset_source_idx = self.sampler.dataset.get_dataset_source(idx) + sizes[dataset_source_idx] += 1 + + if self.drop_last: + lens = 0 + for i in range(self.num_datasets): + lens += sizes[i] // self.batch_size[i] + return lens + else: + lens = 0 + for i in range(self.num_datasets): + lens += (sizes[i] + self.batch_size[i] - + 1) // self.batch_size[i] + return lens diff --git a/head_extractor/build/lib/mmdet/datasets/samplers/class_aware_sampler.py b/head_extractor/build/lib/mmdet/datasets/samplers/class_aware_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..6ca2f9b3ffb7c780ab25cc3704b67589763259e0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/samplers/class_aware_sampler.py @@ -0,0 +1,192 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Dict, Iterator, Optional, Union + +import numpy as np +import torch +from mmengine.dataset import BaseDataset +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS + + +@DATA_SAMPLERS.register_module() +class ClassAwareSampler(Sampler): + r"""Sampler that restricts data loading to the label of the dataset. + + A class-aware sampling strategy to effectively tackle the + non-uniform class distribution. The length of the training data is + consistent with source data. Simple improvements based on `Relay + Backpropagation for Effective Learning of Deep Convolutional + Neural Networks `_ + + The implementation logic is referred to + https://github.com/Sense-X/TSD/blob/master/mmdet/datasets/samplers/distributed_classaware_sampler.py + + Args: + dataset: Dataset used for sampling. + seed (int, optional): random seed used to shuffle the sampler. 
+ This number should be identical across all + processes in the distributed group. Defaults to None. + num_sample_class (int): The number of samples taken from each + per-label list. Defaults to 1. + """ + + def __init__(self, + dataset: BaseDataset, + seed: Optional[int] = None, + num_sample_class: int = 1) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.epoch = 0 + # Must be the same across all workers. If None, will use a + # random seed shared among workers + # (require synchronization among all workers) + if seed is None: + seed = sync_random_seed() + self.seed = seed + + # The number of samples taken from each per-label list + assert num_sample_class > 0 and isinstance(num_sample_class, int) + self.num_sample_class = num_sample_class + # Get per-label image list from dataset + self.cat_dict = self.get_cat2imgs() + + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / world_size)) + self.total_size = self.num_samples * self.world_size + + # get number of images containing each category + self.num_cat_imgs = [len(x) for x in self.cat_dict.values()] + # filter labels without images + self.valid_cat_inds = [ + i for i, length in enumerate(self.num_cat_imgs) if length != 0 + ] + self.num_classes = len(self.valid_cat_inds) + + def get_cat2imgs(self) -> Dict[int, list]: + """Get a dict with class as key and img_ids as values. + + Returns: + dict[int, list]: A dict of per-label image list, + the item of the dict indicates a label index, + corresponds to the image index that contains the label. + """ + classes = self.dataset.metainfo.get('classes', None) + if classes is None: + raise ValueError('dataset metainfo must contain `classes`') + # sort the label index + cat2imgs = {i: [] for i in range(len(classes))} + for i in range(len(self.dataset)): + cat_ids = set(self.dataset.get_cat_ids(i)) + for cat in cat_ids: + cat2imgs[cat].append(i) + return cat2imgs + + def __iter__(self) -> Iterator[int]: + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + # initialize label list + label_iter_list = RandomCycleIter(self.valid_cat_inds, generator=g) + # initialize each per-label image list + data_iter_dict = dict() + for i in self.valid_cat_inds: + data_iter_dict[i] = RandomCycleIter(self.cat_dict[i], generator=g) + + def gen_cat_img_inds(cls_list, data_dict, num_sample_cls): + """Traverse the categories and extract `num_sample_cls` image + indexes of the corresponding categories one by one.""" + id_indices = [] + for _ in range(len(cls_list)): + cls_idx = next(cls_list) + for _ in range(num_sample_cls): + id = next(data_dict[cls_idx]) + id_indices.append(id) + return id_indices + + # deterministically shuffle based on epoch + num_bins = int( + math.ceil(self.total_size * 1.0 / self.num_classes / + self.num_sample_class)) + indices = [] + for i in range(num_bins): + indices += gen_cat_img_inds(label_iter_list, data_iter_dict, + self.num_sample_class) + + # fix extra samples to make it evenly divisible + if len(indices) >= self.total_size: + indices = indices[:self.total_size] + else: + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return self.num_samples + + 
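+    # Worked example of the arithmetic in ``__iter__`` (illustrative
+    # numbers): with total_size=1000, 50 valid categories and
+    # num_sample_class=1, num_bins = ceil(1000 / (50 * 1)) = 20; each bin
+    # walks all 50 categories once, drawing one image index per category
+    # from its RandomCycleIter, so 20 * 50 = 1000 indices are collected
+    # before the truncate/pad step that makes the list evenly divisible
+    # across ranks.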
def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. + + When :attr:`shuffle=True`, this ensures all replicas use a different + random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch + + +class RandomCycleIter: + """Shuffle the list and do it again after the list have traversed. + + The implementation logic is referred to + https://github.com/wutong16/DistributionBalancedLoss/blob/master/mllt/datasets/loader/sampler.py + + Example: + >>> label_list = [0, 1, 2, 4, 5] + >>> g = torch.Generator() + >>> g.manual_seed(0) + >>> label_iter_list = RandomCycleIter(label_list, generator=g) + >>> index = next(label_iter_list) + Args: + data (list or ndarray): The data that needs to be shuffled. + generator: An torch.Generator object, which is used in setting the seed + for generating random numbers. + """ # noqa: W605 + + def __init__(self, + data: Union[list, np.ndarray], + generator: torch.Generator = None) -> None: + self.data = data + self.length = len(data) + self.index = torch.randperm(self.length, generator=generator).numpy() + self.i = 0 + self.generator = generator + + def __iter__(self) -> Iterator: + return self + + def __len__(self) -> int: + return len(self.data) + + def __next__(self): + if self.i == self.length: + self.index = torch.randperm( + self.length, generator=self.generator).numpy() + self.i = 0 + idx = self.data[self.index[self.i]] + self.i += 1 + return idx diff --git a/head_extractor/build/lib/mmdet/datasets/samplers/custom_sample_size_sampler.py b/head_extractor/build/lib/mmdet/datasets/samplers/custom_sample_size_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..6bedf6c66be81b091a6424bae6788953ba7763a3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/samplers/custom_sample_size_sampler.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
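+# A minimal usage sketch for the ``CustomSampleSizeSampler`` defined below,
+# assuming the MMEngine dataloader config convention and a concatenated
+# dataset of two sources; the numbers are illustrative, not defaults:
+#
+#     train_dataloader = dict(
+#         ...,
+#         sampler=dict(
+#             type='CustomSampleSizeSampler',
+#             # -1 keeps every sample of the first dataset; with
+#             # ratio_mode=True, 0.3 draws 30% of the second dataset per
+#             # epoch through a RandomCycleIter.
+#             dataset_size=[-1, 0.3],
+#             ratio_mode=True))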
+import math +from typing import Iterator, Optional, Sequence, Sized + +import torch +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS +from .class_aware_sampler import RandomCycleIter + + +@DATA_SAMPLERS.register_module() +class CustomSampleSizeSampler(Sampler): + + def __init__(self, + dataset: Sized, + dataset_size: Sequence[int], + ratio_mode: bool = False, + seed: Optional[int] = None, + round_up: bool = True) -> None: + assert len(dataset.datasets) == len(dataset_size) + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + if seed is None: + seed = sync_random_seed() + self.seed = seed + self.epoch = 0 + self.round_up = round_up + + total_size = 0 + total_size_fake = 0 + self.dataset_index = [] + self.dataset_cycle_iter = [] + new_dataset_size = [] + for dataset, size in zip(dataset.datasets, dataset_size): + self.dataset_index.append( + list(range(total_size_fake, + len(dataset) + total_size_fake))) + total_size_fake += len(dataset) + if size == -1: + total_size += len(dataset) + self.dataset_cycle_iter.append(None) + new_dataset_size.append(-1) + else: + if ratio_mode: + size = int(size * len(dataset)) + assert size <= len( + dataset + ), f'dataset size {size} is larger than ' \ + f'dataset length {len(dataset)}' + total_size += size + new_dataset_size.append(size) + + g = torch.Generator() + g.manual_seed(self.seed) + self.dataset_cycle_iter.append( + RandomCycleIter(self.dataset_index[-1], generator=g)) + self.dataset_size = new_dataset_size + + if self.round_up: + self.num_samples = math.ceil(total_size / world_size) + self.total_size = self.num_samples * self.world_size + else: + self.num_samples = math.ceil((total_size - rank) / world_size) + self.total_size = total_size + + def __iter__(self) -> Iterator[int]: + """Iterate the indices.""" + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + + out_index = [] + for data_size, data_index, cycle_iter in zip(self.dataset_size, + self.dataset_index, + self.dataset_cycle_iter): + if data_size == -1: + out_index += data_index + else: + index = [next(cycle_iter) for _ in range(data_size)] + out_index += index + + index = torch.randperm(len(out_index), generator=g).numpy().tolist() + indices = [out_index[i] for i in index] + + if self.round_up: + indices = ( + indices * + int(self.total_size / len(indices) + 1))[:self.total_size] + indices = indices[self.rank:self.total_size:self.world_size] + return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. + + When :attr:`shuffle=True`, this ensures all replicas use a different + random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch diff --git a/head_extractor/build/lib/mmdet/datasets/samplers/multi_data_sampler.py b/head_extractor/build/lib/mmdet/datasets/samplers/multi_data_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..c3a4b60d84122ce9eb2090095e9744c2bd73cc3d --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/samplers/multi_data_sampler.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
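+# Worked example of the per-sample weights built in ``MultiDataSampler``
+# below (illustrative sizes): with two datasets of sizes [100, 400] and
+# dataset_ratio=[1, 1], every sample of dataset 0 gets weight
+# 400/100 * 1/2 = 2.0 and every sample of dataset 1 gets 400/400 * 1/2 = 0.5,
+# so the multinomial draw spends equal probability mass (200 vs 200) on the
+# two datasets, matching the requested 1:1 ratio.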
+import math +from typing import Iterator, Optional, Sequence, Sized + +import torch +from mmengine.dist import get_dist_info, sync_random_seed +from mmengine.registry import DATA_SAMPLERS +from torch.utils.data import Sampler + + +@DATA_SAMPLERS.register_module() +class MultiDataSampler(Sampler): + """The default data sampler for both distributed and non-distributed + environment. + + It has several differences from the PyTorch ``DistributedSampler`` as + below: + + 1. This sampler supports non-distributed environment. + + 2. The round up behaviors are a little different. + + - If ``round_up=True``, this sampler will add extra samples to make the + number of samples is evenly divisible by the world size. And + this behavior is the same as the ``DistributedSampler`` with + ``drop_last=False``. + - If ``round_up=False``, this sampler won't remove or add any samples + while the ``DistributedSampler`` with ``drop_last=True`` will remove + tail samples. + + Args: + dataset (Sized): The dataset. + dataset_ratio (Sequence(int)) The ratios of different datasets. + seed (int, optional): Random seed used to shuffle the sampler if + :attr:`shuffle=True`. This number should be identical across all + processes in the distributed group. Defaults to None. + round_up (bool): Whether to add extra samples to make the number of + samples evenly divisible by the world size. Defaults to True. + """ + + def __init__(self, + dataset: Sized, + dataset_ratio: Sequence[int], + seed: Optional[int] = None, + round_up: bool = True) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.dataset_ratio = dataset_ratio + + if seed is None: + seed = sync_random_seed() + self.seed = seed + self.epoch = 0 + self.round_up = round_up + + if self.round_up: + self.num_samples = math.ceil(len(self.dataset) / world_size) + self.total_size = self.num_samples * self.world_size + else: + self.num_samples = math.ceil( + (len(self.dataset) - rank) / world_size) + self.total_size = len(self.dataset) + + self.sizes = [len(dataset) for dataset in self.dataset.datasets] + + dataset_weight = [ + torch.ones(s) * max(self.sizes) / s * r / sum(self.dataset_ratio) + for i, (r, s) in enumerate(zip(self.dataset_ratio, self.sizes)) + ] + self.weights = torch.cat(dataset_weight) + + def __iter__(self) -> Iterator[int]: + """Iterate the indices.""" + # deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + + indices = torch.multinomial( + self.weights, len(self.weights), generator=g, + replacement=True).tolist() + + # add extra samples to make it evenly divisible + if self.round_up: + indices = ( + indices * + int(self.total_size / len(indices) + 1))[:self.total_size] + + # subsample + indices = indices[self.rank:self.total_size:self.world_size] + + return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. + + When :attr:`shuffle=True`, this ensures all replicas use a different + random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. 
+ """ + self.epoch = epoch diff --git a/head_extractor/build/lib/mmdet/datasets/samplers/multi_source_sampler.py b/head_extractor/build/lib/mmdet/datasets/samplers/multi_source_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..6efcde35e1375547239825a8f78a9e74f7825290 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/samplers/multi_source_sampler.py @@ -0,0 +1,214 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools +from typing import Iterator, List, Optional, Sized, Union + +import numpy as np +import torch +from mmengine.dataset import BaseDataset +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS + + +@DATA_SAMPLERS.register_module() +class MultiSourceSampler(Sampler): + r"""Multi-Source Infinite Sampler. + + According to the sampling ratio, sample data from different + datasets to form batches. + + Args: + dataset (Sized): The dataset. + batch_size (int): Size of mini-batch. + source_ratio (list[int | float]): The sampling ratio of different + source datasets in a mini-batch. + shuffle (bool): Whether shuffle the dataset or not. Defaults to True. + seed (int, optional): Random seed. If None, set a random seed. + Defaults to None. + + Examples: + >>> dataset_type = 'ConcatDataset' + >>> sub_dataset_type = 'CocoDataset' + >>> data_root = 'data/coco/' + >>> sup_ann = '../coco_semi_annos/instances_train2017.1@10.json' + >>> unsup_ann = '../coco_semi_annos/' \ + >>> 'instances_train2017.1@10-unlabeled.json' + >>> dataset = dict(type=dataset_type, + >>> datasets=[ + >>> dict( + >>> type=sub_dataset_type, + >>> data_root=data_root, + >>> ann_file=sup_ann, + >>> data_prefix=dict(img='train2017/'), + >>> filter_cfg=dict(filter_empty_gt=True, min_size=32), + >>> pipeline=sup_pipeline), + >>> dict( + >>> type=sub_dataset_type, + >>> data_root=data_root, + >>> ann_file=unsup_ann, + >>> data_prefix=dict(img='train2017/'), + >>> filter_cfg=dict(filter_empty_gt=True, min_size=32), + >>> pipeline=unsup_pipeline), + >>> ]) + >>> train_dataloader = dict( + >>> batch_size=5, + >>> num_workers=5, + >>> persistent_workers=True, + >>> sampler=dict(type='MultiSourceSampler', + >>> batch_size=5, source_ratio=[1, 4]), + >>> batch_sampler=None, + >>> dataset=dataset) + """ + + def __init__(self, + dataset: Sized, + batch_size: int, + source_ratio: List[Union[int, float]], + shuffle: bool = True, + seed: Optional[int] = None) -> None: + + assert hasattr(dataset, 'cumulative_sizes'),\ + f'The dataset must be ConcatDataset, but get {dataset}' + assert isinstance(batch_size, int) and batch_size > 0, \ + 'batch_size must be a positive integer value, ' \ + f'but got batch_size={batch_size}' + assert isinstance(source_ratio, list), \ + f'source_ratio must be a list, but got source_ratio={source_ratio}' + assert len(source_ratio) == len(dataset.cumulative_sizes), \ + 'The length of source_ratio must be equal to ' \ + f'the number of datasets, but got source_ratio={source_ratio}' + + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.cumulative_sizes = [0] + dataset.cumulative_sizes + self.batch_size = batch_size + self.source_ratio = source_ratio + + self.num_per_source = [ + int(batch_size * sr / sum(source_ratio)) for sr in source_ratio + ] + self.num_per_source[0] = batch_size - sum(self.num_per_source[1:]) + + assert sum(self.num_per_source) == batch_size, \ + 'The sum of num_per_source must be equal to 
' \ + f'batch_size, but get {self.num_per_source}' + + self.seed = sync_random_seed() if seed is None else seed + self.shuffle = shuffle + self.source2inds = { + source: self._indices_of_rank(len(ds)) + for source, ds in enumerate(dataset.datasets) + } + + def _infinite_indices(self, sample_size: int) -> Iterator[int]: + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(sample_size, generator=g).tolist() + else: + yield from torch.arange(sample_size).tolist() + + def _indices_of_rank(self, sample_size: int) -> Iterator[int]: + """Slice the infinite indices by rank.""" + yield from itertools.islice( + self._infinite_indices(sample_size), self.rank, None, + self.world_size) + + def __iter__(self) -> Iterator[int]: + batch_buffer = [] + while True: + for source, num in enumerate(self.num_per_source): + batch_buffer_per_source = [] + for idx in self.source2inds[source]: + idx += self.cumulative_sizes[source] + batch_buffer_per_source.append(idx) + if len(batch_buffer_per_source) == num: + batch_buffer += batch_buffer_per_source + break + yield from batch_buffer + batch_buffer = [] + + def __len__(self) -> int: + return len(self.dataset) + + def set_epoch(self, epoch: int) -> None: + """Not supported in `epoch-based runner.""" + pass + + +@DATA_SAMPLERS.register_module() +class GroupMultiSourceSampler(MultiSourceSampler): + r"""Group Multi-Source Infinite Sampler. + + According to the sampling ratio, sample data from different + datasets but the same group to form batches. + + Args: + dataset (Sized): The dataset. + batch_size (int): Size of mini-batch. + source_ratio (list[int | float]): The sampling ratio of different + source datasets in a mini-batch. + shuffle (bool): Whether shuffle the dataset or not. Defaults to True. + seed (int, optional): Random seed. If None, set a random seed. + Defaults to None. 
+ """ + + def __init__(self, + dataset: BaseDataset, + batch_size: int, + source_ratio: List[Union[int, float]], + shuffle: bool = True, + seed: Optional[int] = None) -> None: + super().__init__( + dataset=dataset, + batch_size=batch_size, + source_ratio=source_ratio, + shuffle=shuffle, + seed=seed) + + self._get_source_group_info() + self.group_source2inds = [{ + source: + self._indices_of_rank(self.group2size_per_source[source][group]) + for source in range(len(dataset.datasets)) + } for group in range(len(self.group_ratio))] + + def _get_source_group_info(self) -> None: + self.group2size_per_source = [{0: 0, 1: 0}, {0: 0, 1: 0}] + self.group2inds_per_source = [{0: [], 1: []}, {0: [], 1: []}] + for source, dataset in enumerate(self.dataset.datasets): + for idx in range(len(dataset)): + data_info = dataset.get_data_info(idx) + width, height = data_info['width'], data_info['height'] + group = 0 if width < height else 1 + self.group2size_per_source[source][group] += 1 + self.group2inds_per_source[source][group].append(idx) + + self.group_sizes = np.zeros(2, dtype=np.int64) + for group2size in self.group2size_per_source: + for group, size in group2size.items(): + self.group_sizes[group] += size + self.group_ratio = self.group_sizes / sum(self.group_sizes) + + def __iter__(self) -> Iterator[int]: + batch_buffer = [] + while True: + group = np.random.choice( + list(range(len(self.group_ratio))), p=self.group_ratio) + for source, num in enumerate(self.num_per_source): + batch_buffer_per_source = [] + for idx in self.group_source2inds[group][source]: + idx = self.group2inds_per_source[source][group][ + idx] + self.cumulative_sizes[source] + batch_buffer_per_source.append(idx) + if len(batch_buffer_per_source) == num: + batch_buffer += batch_buffer_per_source + break + yield from batch_buffer + batch_buffer = [] diff --git a/head_extractor/build/lib/mmdet/datasets/samplers/track_img_sampler.py b/head_extractor/build/lib/mmdet/datasets/samplers/track_img_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..d7db629f40f3f24bdf14cd852ccc4472d1d50f1b --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/samplers/track_img_sampler.py @@ -0,0 +1,146 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import random +from typing import Iterator, Optional, Sized + +import numpy as np +from mmengine.dataset import ClassBalancedDataset, ConcatDataset +from mmengine.dist import get_dist_info, sync_random_seed +from torch.utils.data import Sampler + +from mmdet.registry import DATA_SAMPLERS +from ..base_video_dataset import BaseVideoDataset + + +@DATA_SAMPLERS.register_module() +class TrackImgSampler(Sampler): + """Sampler that providing image-level sampling outputs for video datasets + in tracking tasks. It could be both used in both distributed and + non-distributed environment. + If using the default sampler in pytorch, the subsequent data receiver will + get one video, which is not desired in some cases: + (Take a non-distributed environment as an example) + 1. In test mode, we want only one image is fed into the data pipeline. This + is in consideration of memory usage since feeding the whole video commonly + requires a large amount of memory (>=20G on MOTChallenge17 dataset), which + is not available in some machines. + 2. In training mode, we may want to make sure all the images in one video + are randomly sampled once in one epoch and this can not be guaranteed in + the default sampler in pytorch. + + Args: + dataset (Sized): Dataset used for sampling. 
+        seed (int, optional): random seed used to shuffle the sampler. This
+            number should be identical across all processes in the distributed
+            group. Defaults to None.
+    """
+
+    def __init__(
+        self,
+        dataset: Sized,
+        seed: Optional[int] = None,
+    ) -> None:
+        rank, world_size = get_dist_info()
+        self.rank = rank
+        self.world_size = world_size
+        self.epoch = 0
+        if seed is None:
+            self.seed = sync_random_seed()
+        else:
+            self.seed = seed
+
+        self.dataset = dataset
+        self.indices = []
+        # Hard code here to handle different dataset wrappers.
+        if isinstance(self.dataset, ConcatDataset):
+            cat_datasets = self.dataset.datasets
+            assert isinstance(
+                cat_datasets[0], BaseVideoDataset
+            ), f'expected BaseVideoDataset, but got {type(cat_datasets[0])}'
+            self.test_mode = cat_datasets[0].test_mode
+            assert not self.test_mode, \
+                "'ConcatDataset' should not exist in test mode"
+            for dataset in cat_datasets:
+                num_videos = len(dataset)
+                for video_ind in range(num_videos):
+                    self.indices.extend([
+                        (video_ind, frame_ind) for frame_ind in range(
+                            dataset.get_len_per_video(video_ind))
+                    ])
+        elif isinstance(self.dataset, ClassBalancedDataset):
+            ori_dataset = self.dataset.dataset
+            assert isinstance(
+                ori_dataset, BaseVideoDataset
+            ), f'expected BaseVideoDataset, but got {type(ori_dataset)}'
+            self.test_mode = ori_dataset.test_mode
+            assert not self.test_mode, \
+                "'ClassBalancedDataset' should not exist in test mode"
+            video_indices = self.dataset.repeat_indices
+            for index in video_indices:
+                self.indices.extend([(index, frame_ind) for frame_ind in range(
+                    ori_dataset.get_len_per_video(index))])
+        else:
+            assert isinstance(
+                self.dataset, BaseVideoDataset
+            ), ('TrackImgSampler only supports BaseVideoDataset or the '
+                'dataset wrappers ClassBalancedDataset and ConcatDataset, '
+                f'but got {type(self.dataset)}')
+            self.test_mode = self.dataset.test_mode
+            num_videos = len(self.dataset)
+
+            if self.test_mode:
+                # In test mode, the images belonging to the same video must
+                # be put on the same device.
+                if num_videos < self.world_size:
+                    raise ValueError(f'only {num_videos} videos loaded, '
+                                     f'but {self.world_size} gpus were given.')
+                chunks = np.array_split(
+                    list(range(num_videos)), self.world_size)
+                for videos_inds in chunks:
+                    indices_chunk = []
+                    for video_ind in videos_inds:
+                        indices_chunk.extend([
+                            (video_ind, frame_ind) for frame_ind in range(
+                                self.dataset.get_len_per_video(video_ind))
+                        ])
+                    self.indices.append(indices_chunk)
+            else:
+                for video_ind in range(num_videos):
+                    self.indices.extend([
+                        (video_ind, frame_ind) for frame_ind in range(
+                            self.dataset.get_len_per_video(video_ind))
+                    ])
+
+        if self.test_mode:
+            self.num_samples = len(self.indices[self.rank])
+            self.total_size = sum(
+                [len(index_list) for index_list in self.indices])
+        else:
+            self.num_samples = int(
+                math.ceil(len(self.indices) * 1.0 / self.world_size))
+            self.total_size = self.num_samples * self.world_size
+
+    def __iter__(self) -> Iterator:
+        if self.test_mode:
+            # In test mode, the order of frames cannot be shuffled.
+ indices = self.indices[self.rank] + else: + # deterministically shuffle based on epoch + rng = random.Random(self.epoch + self.seed) + indices = rng.sample(self.indices, len(self.indices)) + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.world_size] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/__init__.py b/head_extractor/build/lib/mmdet/datasets/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ab3478feb008443cb0e56bf5084261370e38327d --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/__init__.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .augment_wrappers import AutoAugment, RandAugment +from .colorspace import (AutoContrast, Brightness, Color, ColorTransform, + Contrast, Equalize, Invert, Posterize, Sharpness, + Solarize, SolarizeAdd) +from .formatting import (ImageToTensor, PackDetInputs, PackReIDInputs, + PackTrackInputs, ToTensor, Transpose) +from .frame_sampling import BaseFrameSample, UniformRefFrameSample +from .geometric import (GeomTransform, Rotate, ShearX, ShearY, TranslateX, + TranslateY) +from .instaboost import InstaBoost +from .loading import (FilterAnnotations, InferencerLoader, LoadAnnotations, + LoadEmptyAnnotations, LoadImageFromNDArray, + LoadMultiChannelImageFromFiles, LoadPanopticAnnotations, + LoadProposals, LoadTrackAnnotations) +from .text_transformers import LoadTextAnnotations, RandomSamplingNegPos +from .transformers_glip import GTBoxSubOne_GLIP, RandomFlip_GLIP +from .transforms import (Albu, CachedMixUp, CachedMosaic, CopyPaste, CutOut, + Expand, FixScaleResize, FixShapeResize, + MinIoURandomCrop, MixUp, Mosaic, Pad, + PhotoMetricDistortion, RandomAffine, + RandomCenterCropPad, RandomCrop, RandomErasing, + RandomFlip, RandomShift, Resize, ResizeShortestEdge, + SegRescale, YOLOXHSVRandomAug) +from .wrappers import MultiBranch, ProposalBroadcaster, RandomOrder + +__all__ = [ + 'PackDetInputs', 'ToTensor', 'ImageToTensor', 'Transpose', + 'LoadImageFromNDArray', 'LoadAnnotations', 'LoadPanopticAnnotations', + 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'Resize', 'RandomFlip', + 'RandomCrop', 'SegRescale', 'MinIoURandomCrop', 'Expand', + 'PhotoMetricDistortion', 'Albu', 'InstaBoost', 'RandomCenterCropPad', + 'AutoAugment', 'CutOut', 'ShearX', 'ShearY', 'Rotate', 'Color', 'Equalize', + 'Brightness', 'Contrast', 'TranslateX', 'TranslateY', 'RandomShift', + 'Mosaic', 'MixUp', 'RandomAffine', 'YOLOXHSVRandomAug', 'CopyPaste', + 'FilterAnnotations', 'Pad', 'GeomTransform', 'ColorTransform', + 'RandAugment', 'Sharpness', 'Solarize', 'SolarizeAdd', 'Posterize', + 'AutoContrast', 'Invert', 'MultiBranch', 'RandomErasing', + 'LoadEmptyAnnotations', 'RandomOrder', 'CachedMosaic', 'CachedMixUp', + 'FixShapeResize', 'ProposalBroadcaster', 'InferencerLoader', + 'LoadTrackAnnotations', 'BaseFrameSample', 'UniformRefFrameSample', + 'PackTrackInputs', 'PackReIDInputs', 'FixScaleResize', + 'ResizeShortestEdge', 'GTBoxSubOne_GLIP', 'RandomFlip_GLIP', + 'RandomSamplingNegPos', 'LoadTextAnnotations' +] diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/augment_wrappers.py 
b/head_extractor/build/lib/mmdet/datasets/transforms/augment_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..19fae6efdf66aa4c26bb85a2f2c96a1e079320b8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/augment_wrappers.py @@ -0,0 +1,264 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import numpy as np +from mmcv.transforms import RandomChoice +from mmcv.transforms.utils import cache_randomness +from mmengine.config import ConfigDict + +from mmdet.registry import TRANSFORMS + +# AutoAugment uses reinforcement learning to search for +# some widely useful data augmentation strategies, +# here we provide AUTOAUG_POLICIES_V0. +# For AUTOAUG_POLICIES_V0, each tuple is an augmentation +# operation of the form (operation, probability, magnitude). +# Each element in policies is a policy that will be applied +# sequentially on the image. + +# RandAugment defines a data augmentation search space, RANDAUG_SPACE, +# sampling 1~3 data augmentations each time, and +# setting the magnitude of each data augmentation randomly, +# which will be applied sequentially on the image. + +_MAX_LEVEL = 10 + +AUTOAUG_POLICIES_V0 = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], +] + + +def policies_v0(): + """Autoaugment policies that was used in AutoAugment Paper.""" + policies = list() + for policy_args in AUTOAUG_POLICIES_V0: + policy = list() + for args in policy_args: + policy.append(dict(type=args[0], prob=args[1], level=args[2])) + policies.append(policy) + return policies + + +RANDAUG_SPACE = [[dict(type='AutoContrast')], [dict(type='Equalize')], + [dict(type='Invert')], [dict(type='Rotate')], + [dict(type='Posterize')], [dict(type='Solarize')], + [dict(type='SolarizeAdd')], [dict(type='Color')], + [dict(type='Contrast')], [dict(type='Brightness')], + [dict(type='Sharpness')], [dict(type='ShearX')], + [dict(type='ShearY')], [dict(type='TranslateX')], + [dict(type='TranslateY')]] + + +def level_to_mag(level: Optional[int], min_mag: float, + max_mag: float) -> float: + """Map from level to magnitude.""" + if level is None: + return round(np.random.rand() * (max_mag - min_mag) + min_mag, 1) + else: + return round(level / _MAX_LEVEL * (max_mag - min_mag) + min_mag, 1) + + +@TRANSFORMS.register_module() +class AutoAugment(RandomChoice): + """Auto augmentation. 
+
+    This data augmentation is proposed in `AutoAugment: Learning
+    Augmentation Policies from Data `_
+    and in `Learning Data Augmentation Strategies for Object Detection
+    `_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        policies (List[List[Union[dict, ConfigDict]]]):
+            The policies of auto augmentation. Each policy in ``policies``
+            is a specific augmentation policy, and is composed of several
+            augmentations. When AutoAugment is called, a random policy in
+            ``policies`` will be selected to augment images.
+            Defaults to policies_v0().
+        prob (list[float], optional): The probabilities associated
+            with each policy. The length should be equal to the policy
+            number and the sum should be 1. If not given, a uniform
+            distribution will be assumed. Defaults to None.
+
+    Examples:
+        >>> policies = [
+        >>>     [
+        >>>         dict(type='Sharpness', prob=0.0, level=8),
+        >>>         dict(type='ShearX', prob=0.4, level=0)
+        >>>     ],
+        >>>     [
+        >>>         dict(type='Rotate', prob=0.6, level=10),
+        >>>         dict(type='Color', prob=1.0, level=6)
+        >>>     ]
+        >>> ]
+        >>> augmentation = AutoAugment(policies)
+        >>> img = np.ones((100, 100, 3))
+        >>> gt_bboxes = np.ones((10, 4))
+        >>> results = dict(img=img, gt_bboxes=gt_bboxes)
+        >>> results = augmentation(results)
+    """
+
+    def __init__(self,
+                 policies: List[List[Union[dict, ConfigDict]]] = policies_v0(),
+                 prob: Optional[List[float]] = None) -> None:
+        assert isinstance(policies, list) and len(policies) > 0, \
+            'Policies must be a non-empty list.'
+        for policy in policies:
+            assert isinstance(policy, list) and len(policy) > 0, \
+                'Each policy in policies must be a non-empty list.'
+            for augment in policy:
+                assert isinstance(augment, dict) and 'type' in augment, \
+                    'Each specific augmentation must be a dict with key' \
+                    ' "type".'
+        super().__init__(transforms=policies, prob=prob)
+        self.policies = policies
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(policies={self.policies}, ' \
+               f'prob={self.prob})'
+
+
+@TRANSFORMS.register_module()
+class RandAugment(RandomChoice):
+    """Rand augmentation.
+
+    This data augmentation is proposed in `RandAugment:
+    Practical automated data augmentation with a reduced
+    search space `_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        aug_space (List[List[Union[dict, ConfigDict]]]): The augmentation
+            space of rand augmentation. Each augmentation transform in
+            ``aug_space`` is a specific transform, and is composed of several
+            augmentations. When RandAugment is called, a random transform in
+            ``aug_space`` will be selected to augment images.
+            Defaults to RANDAUG_SPACE.
+        aug_num (int): Number of augmentations to apply sequentially.
+            Defaults to 2.
+        prob (list[float], optional): The probabilities associated with
+            each augmentation.
+            each augmentation. The length should be equal to the number
+            of augmentations in the augmentation space and the sum should
+            be 1. If not given, a uniform distribution will be assumed.
+            Defaults to None.
+
+    Examples:
+        >>> aug_space = [
+        >>>     [dict(type='Sharpness')],
+        >>>     [dict(type='ShearX')],
+        >>>     [dict(type='Color')],
+        >>> ]
+        >>> augmentation = RandAugment(aug_space)
+        >>> img = np.ones((100, 100, 3))
+        >>> gt_bboxes = np.ones((10, 4))
+        >>> results = dict(img=img, gt_bboxes=gt_bboxes)
+        >>> results = augmentation(results)
+    """
+
+    def __init__(self,
+                 aug_space: List[List[Union[dict, ConfigDict]]] = RANDAUG_SPACE,
+                 aug_num: int = 2,
+                 prob: Optional[List[float]] = None) -> None:
+        assert isinstance(aug_space, list) and len(aug_space) > 0, \
+            'Augmentation space must be a non-empty list.'
+        for aug in aug_space:
+            assert isinstance(aug, list) and len(aug) == 1, \
+                'Each augmentation in aug_space must be a single-element list.'
+            for transform in aug:
+                assert isinstance(transform, dict) and 'type' in transform, \
+                    'Each specific transform must be a dict with key' \
+                    ' "type".'
+        super().__init__(transforms=aug_space, prob=prob)
+        self.aug_space = aug_space
+        self.aug_num = aug_num
+
+    @cache_randomness
+    def random_pipeline_index(self):
+        """Sample the indices of the transforms to apply."""
+        indices = np.arange(len(self.transforms))
+        return np.random.choice(
+            indices, self.aug_num, p=self.prob, replace=False)
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to use RandAugment.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with RandAugment.
+        """
+        for idx in self.random_pipeline_index():
+            results = self.transforms[idx](results)
+        return results
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(' \
+               f'aug_space={self.aug_space}, ' \
+               f'aug_num={self.aug_num}, ' \
+               f'prob={self.prob})'
diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/colorspace.py b/head_extractor/build/lib/mmdet/datasets/transforms/colorspace.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ba2e97c7eedf65df5ab8942ee461f48a785f39
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/datasets/transforms/colorspace.py
@@ -0,0 +1,493 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Optional
+
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+from mmcv.transforms.utils import cache_randomness
+
+from mmdet.registry import TRANSFORMS
+from .augment_wrappers import _MAX_LEVEL, level_to_mag
+
+
+@TRANSFORMS.register_module()
+class ColorTransform(BaseTransform):
+    """Base class for color transformations. All color transformations need
+    to inherit from this base class. ``ColorTransform`` unifies the class
+    attributes and class functions of color transformations (Color,
+    Brightness, Contrast, Sharpness, Solarize, SolarizeAdd, Equalize,
+    AutoContrast, Invert, and Posterize), which only distort color channels,
+    without impacting the locations of the instances.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability for performing the color
+            transformation and should be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum magnitude for color transformation.
+            Defaults to 0.1.
+        max_mag (float): The maximum magnitude for color transformation.
+            Defaults to 1.9.
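+
+    Examples:
+        A minimal sketch of how a concrete subclass is applied; it assumes
+        an HWC, BGR ``uint8`` image, as produced by the loading pipeline.
+
+        >>> import numpy as np
+        >>> transform = Brightness(prob=1.0, level=6)
+        >>> img = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
+        >>> results = transform(dict(img=img))
+        >>> results['img'].shape
+        (64, 64, 3)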
+ """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0 <= prob <= 1.0, f'The probability of the transformation ' \ + f'should be in range [0,1], got {prob}.' + assert level is None or isinstance(level, int), \ + f'The level should be None or type int, got {type(level)}.' + assert level is None or 0 <= level <= _MAX_LEVEL, \ + f'The level should be in range [0,{_MAX_LEVEL}], got {level}.' + assert isinstance(min_mag, float), \ + f'min_mag should be type float, got {type(min_mag)}.' + assert isinstance(max_mag, float), \ + f'max_mag should be type float, got {type(max_mag)}.' + assert min_mag <= max_mag, \ + f'min_mag should smaller than max_mag, ' \ + f'got min_mag={min_mag} and max_mag={max_mag}' + self.prob = prob + self.level = level + self.min_mag = min_mag + self.max_mag = max_mag + + def _transform_img(self, results: dict, mag: float) -> None: + """Transform the image.""" + pass + + @cache_randomness + def _random_disable(self): + """Randomly disable the transform.""" + return np.random.rand() > self.prob + + @cache_randomness + def _get_mag(self): + """Get the magnitude of the transform.""" + return level_to_mag(self.level, self.min_mag, self.max_mag) + + def transform(self, results: dict) -> dict: + """Transform function for images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Transformed results. + """ + + if self._random_disable(): + return results + mag = self._get_mag() + self._transform_img(results, mag) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'level={self.level}, ' + repr_str += f'min_mag={self.min_mag}, ' + repr_str += f'max_mag={self.max_mag})' + return repr_str + + +@TRANSFORMS.register_module() +class Color(ColorTransform): + """Adjust the color balance of the image, in a manner similar to the + controls on a colour TV set. A magnitude=0 gives a black & white image, + whereas magnitude=1 gives the original image. The bboxes, masks and + segmentations are not modified. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Color transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Color transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Color transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. <= min_mag <= 2.0, \ + f'min_mag for Color should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Color should be in range [0,2], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Apply Color transformation to image.""" + # NOTE defaultly the image should be BGR format + img = results['img'] + results['img'] = mmcv.adjust_color(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Brightness(ColorTransform): + """Adjust the brightness of the image. A magnitude=0 gives a black image, + whereas magnitude=1 gives the original image. The bboxes, masks and + segmentations are not modified. 
+ + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Brightness transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Brightness transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Brightness transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. <= min_mag <= 2.0, \ + f'min_mag for Brightness should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Brightness should be in range [0,2], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Adjust the brightness of image.""" + img = results['img'] + results['img'] = mmcv.adjust_brightness(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Contrast(ColorTransform): + """Control the contrast of the image. A magnitude=0 gives a gray image, + whereas magnitude=1 gives the original imageThe bboxes, masks and + segmentations are not modified. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Contrast transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Contrast transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Contrast transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. <= min_mag <= 2.0, \ + f'min_mag for Contrast should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Contrast should be in range [0,2], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Adjust the image contrast.""" + img = results['img'] + results['img'] = mmcv.adjust_contrast(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Sharpness(ColorTransform): + """Adjust images sharpness. A positive magnitude would enhance the + sharpness and a negative magnitude would make the image blurry. A + magnitude=0 gives the origin img. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Sharpness transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Sharpness transformation. + Defaults to 0.1. + max_mag (float): The maximum magnitude for Sharpness transformation. + Defaults to 1.9. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.1, + max_mag: float = 1.9) -> None: + assert 0. <= min_mag <= 2.0, \ + f'min_mag for Sharpness should be in range [0,2], got {min_mag}.' + assert 0. <= max_mag <= 2.0, \ + f'max_mag for Sharpness should be in range [0,2], got {max_mag}.' 
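+        # The [0, 2] magnitude bounds mirror the other color transforms in
+        # this module: magnitudes act as PIL-style enhancement factors,
+        # with 1.0 leaving the image unchanged.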
+ super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Adjust the image sharpness.""" + img = results['img'] + results['img'] = mmcv.adjust_sharpness(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Solarize(ColorTransform): + """Solarize images (Invert all pixels above a threshold value of + magnitude.). + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Solarize transformation. + Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Solarize transformation. + Defaults to 0.0. + max_mag (float): The maximum magnitude for Solarize transformation. + Defaults to 256.0. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 256.0) -> None: + assert 0. <= min_mag <= 256.0, f'min_mag for Solarize should be ' \ + f'in range [0, 256], got {min_mag}.' + assert 0. <= max_mag <= 256.0, f'max_mag for Solarize should be ' \ + f'in range [0, 256], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Invert all pixel values above magnitude.""" + img = results['img'] + results['img'] = mmcv.solarize(img, mag).astype(img.dtype) + + +@TRANSFORMS.register_module() +class SolarizeAdd(ColorTransform): + """SolarizeAdd images. For each pixel in the image that is less than 128, + add an additional amount to it decided by the magnitude. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing SolarizeAdd + transformation. Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for SolarizeAdd transformation. + Defaults to 0.0. + max_mag (float): The maximum magnitude for SolarizeAdd transformation. + Defaults to 110.0. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 110.0) -> None: + assert 0. <= min_mag <= 110.0, f'min_mag for SolarizeAdd should be ' \ + f'in range [0, 110], got {min_mag}.' + assert 0. <= max_mag <= 110.0, f'max_mag for SolarizeAdd should be ' \ + f'in range [0, 110], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """SolarizeAdd the image.""" + img = results['img'] + img_solarized = np.where(img < 128, np.minimum(img + mag, 255), img) + results['img'] = img_solarized.astype(img.dtype) + + +@TRANSFORMS.register_module() +class Posterize(ColorTransform): + """Posterize images (reduce the number of bits for each color channel). + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Posterize + transformation. Defaults to 1.0. + level (int, optional): Should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for Posterize transformation. + Defaults to 0.0. + max_mag (float): The maximum magnitude for Posterize transformation. 
+ Defaults to 4.0. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 4.0) -> None: + assert 0. <= min_mag <= 8.0, f'min_mag for Posterize should be ' \ + f'in range [0, 8], got {min_mag}.' + assert 0. <= max_mag <= 8.0, f'max_mag for Posterize should be ' \ + f'in range [0, 8], got {max_mag}.' + super().__init__( + prob=prob, level=level, min_mag=min_mag, max_mag=max_mag) + + def _transform_img(self, results: dict, mag: float) -> None: + """Posterize the image.""" + img = results['img'] + results['img'] = mmcv.posterize(img, math.ceil(mag)).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Equalize(ColorTransform): + """Equalize the image histogram. The bboxes, masks and segmentations are + not modified. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing Equalize transformation. + Defaults to 1.0. + level (int, optional): No use for Equalize transformation. + Defaults to None. + min_mag (float): No use for Equalize transformation. Defaults to 0.1. + max_mag (float): No use for Equalize transformation. Defaults to 1.9. + """ + + def _transform_img(self, results: dict, mag: float) -> None: + """Equalizes the histogram of one image.""" + img = results['img'] + results['img'] = mmcv.imequalize(img).astype(img.dtype) + + +@TRANSFORMS.register_module() +class AutoContrast(ColorTransform): + """Auto adjust image contrast. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing AutoContrast should + be in range [0, 1]. Defaults to 1.0. + level (int, optional): No use for AutoContrast transformation. + Defaults to None. + min_mag (float): No use for AutoContrast transformation. + Defaults to 0.1. + max_mag (float): No use for AutoContrast transformation. + Defaults to 1.9. + """ + + def _transform_img(self, results: dict, mag: float) -> None: + """Auto adjust image contrast.""" + img = results['img'] + results['img'] = mmcv.auto_contrast(img).astype(img.dtype) + + +@TRANSFORMS.register_module() +class Invert(ColorTransform): + """Invert images. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + prob (float): The probability for performing invert therefore should + be in range [0, 1]. Defaults to 1.0. + level (int, optional): No use for Invert transformation. + Defaults to None. + min_mag (float): No use for Invert transformation. Defaults to 0.1. + max_mag (float): No use for Invert transformation. Defaults to 1.9. + """ + + def _transform_img(self, results: dict, mag: float) -> None: + """Invert the image.""" + img = results['img'] + results['img'] = mmcv.iminvert(img).astype(img.dtype) diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/formatting.py b/head_extractor/build/lib/mmdet/datasets/transforms/formatting.py new file mode 100644 index 0000000000000000000000000000000000000000..05263807c0eab470b0c73f435d327ad8cadb60b3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/formatting.py @@ -0,0 +1,512 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
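+# This module gathers the packing transforms that convert a pipeline
+# `results` dict into the `inputs`/`data_samples` pair consumed by models.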
+from typing import Optional, Sequence + +import numpy as np +from mmcv.transforms import to_tensor +from mmcv.transforms.base import BaseTransform +from mmengine.structures import InstanceData, PixelData + +from mmdet.registry import TRANSFORMS +from mmdet.structures import DetDataSample, ReIDDataSample, TrackDataSample +from mmdet.structures.bbox import BaseBoxes + + +@TRANSFORMS.register_module() +class PackDetInputs(BaseTransform): + """Pack the inputs data for the detection / semantic segmentation / + panoptic segmentation. + + The ``img_meta`` item is always populated. The contents of the + ``img_meta`` dictionary depends on ``meta_keys``. By default this includes: + + - ``img_id``: id of the image + + - ``img_path``: path to the image file + + - ``ori_shape``: original shape of the image as a tuple (h, w) + + - ``img_shape``: shape of the image input to the network as a tuple \ + (h, w). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + + - ``scale_factor``: a float indicating the preprocessing scale + + - ``flip``: a boolean indicating if image flip transform was used + + - ``flip_direction``: the flipping direction + + Args: + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ``('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')`` + """ + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_bboxes_labels': 'labels', + 'gt_masks': 'masks' + } + + def __init__(self, + meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', + 'scale_factor', 'flip', 'flip_direction')): + self.meta_keys = meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + + - 'inputs' (obj:`torch.Tensor`): The forward data of models. + - 'data_sample' (obj:`DetDataSample`): The annotation info of the + sample. 
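+
+        Examples:
+            A minimal sketch; the ``results`` keys below (including the
+            placeholder image path) are assumed to have been produced by
+            the earlier loading transforms.
+
+            >>> import numpy as np
+            >>> pack = PackDetInputs()
+            >>> results = dict(
+            >>>     img=np.zeros((32, 32, 3), dtype=np.uint8),
+            >>>     img_id=0, img_path='demo.jpg', ori_shape=(32, 32),
+            >>>     img_shape=(32, 32), scale_factor=(1.0, 1.0))
+            >>> packed = pack(results)
+            >>> sorted(packed.keys())
+            ['data_samples', 'inputs']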
+ """ + packed_results = dict() + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + # To improve the computational speed by by 3-5 times, apply: + # If image is not contiguous, use + # `numpy.transpose()` followed by `numpy.ascontiguousarray()` + # If image is already contiguous, use + # `torch.permute()` followed by `torch.contiguous()` + # Refer to https://github.com/open-mmlab/mmdetection/pull/9533 + # for more details + if not img.flags.c_contiguous: + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + img = to_tensor(img) + else: + img = to_tensor(img).permute(2, 0, 1).contiguous() + + packed_results['inputs'] = img + + if 'gt_ignore_flags' in results: + valid_idx = np.where(results['gt_ignore_flags'] == 0)[0] + ignore_idx = np.where(results['gt_ignore_flags'] == 1)[0] + + data_sample = DetDataSample() + instance_data = InstanceData() + ignore_instance_data = InstanceData() + + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks' or isinstance(results[key], BaseBoxes): + if 'gt_ignore_flags' in results: + instance_data[ + self.mapping_table[key]] = results[key][valid_idx] + ignore_instance_data[ + self.mapping_table[key]] = results[key][ignore_idx] + else: + instance_data[self.mapping_table[key]] = results[key] + else: + if 'gt_ignore_flags' in results: + instance_data[self.mapping_table[key]] = to_tensor( + results[key][valid_idx]) + ignore_instance_data[self.mapping_table[key]] = to_tensor( + results[key][ignore_idx]) + else: + instance_data[self.mapping_table[key]] = to_tensor( + results[key]) + data_sample.gt_instances = instance_data + data_sample.ignored_instances = ignore_instance_data + + if 'proposals' in results: + proposals = InstanceData( + bboxes=to_tensor(results['proposals']), + scores=to_tensor(results['proposals_scores'])) + data_sample.proposals = proposals + + if 'gt_seg_map' in results: + gt_sem_seg_data = dict( + sem_seg=to_tensor(results['gt_seg_map'][None, ...].copy())) + gt_sem_seg_data = PixelData(**gt_sem_seg_data) + if 'ignore_index' in results: + metainfo = dict(ignore_index=results['ignore_index']) + gt_sem_seg_data.set_metainfo(metainfo) + data_sample.gt_sem_seg = gt_sem_seg_data + + img_meta = {} + for key in self.meta_keys: + if key in results: + img_meta[key] = results[key] + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class ToTensor: + """Convert some results to :obj:`torch.Tensor` by given keys. + + Args: + keys (Sequence[str]): Keys that need to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert data in results to :obj:`torch.Tensor`. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted + to :obj:`torch.Tensor`. + """ + for key in self.keys: + results[key] = to_tensor(results[key]) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@TRANSFORMS.register_module() +class ImageToTensor: + """Convert image to :obj:`torch.Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). 
+ + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`torch.Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`torch.Tensor` and permuted to (C, H, W) order. + """ + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + results[key] = to_tensor(img).permute(2, 0, 1).contiguous() + + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@TRANSFORMS.register_module() +class Transpose: + """Transpose some results by given keys. + + Args: + keys (Sequence[str]): Keys of results to be transposed. + order (Sequence[int]): Order of transpose. + """ + + def __init__(self, keys, order): + self.keys = keys + self.order = order + + def __call__(self, results): + """Call function to transpose the channel order of data in results. + + Args: + results (dict): Result dict contains the data to transpose. + + Returns: + dict: The result dict contains the data transposed to \ + ``self.order``. + """ + for key in self.keys: + results[key] = results[key].transpose(self.order) + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, order={self.order})' + + +@TRANSFORMS.register_module() +class WrapFieldsToLists: + """Wrap fields of the data dictionary into lists for evaluation. + + This class can be used as a last step of a test or validation + pipeline for single image evaluation or inference. + + Example: + >>> test_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + >>> dict(type='Pad', size_divisor=32), + >>> dict(type='ImageToTensor', keys=['img']), + >>> dict(type='Collect', keys=['img']), + >>> dict(type='WrapFieldsToLists') + >>> ] + """ + + def __call__(self, results): + """Call function to wrap fields into lists. + + Args: + results (dict): Result dict contains the data to wrap. + + Returns: + dict: The result dict where value of ``self.keys`` are wrapped \ + into list. + """ + + # Wrap dict fields into lists + for key, val in results.items(): + results[key] = [val] + return results + + def __repr__(self): + return f'{self.__class__.__name__}()' + + +@TRANSFORMS.register_module() +class PackTrackInputs(BaseTransform): + """Pack the inputs data for the multi object tracking and video instance + segmentation. All the information of images are packed to ``inputs``. All + the information except images are packed to ``data_samples``. In order to + get the original annotaiton and meta info, we add `instances` key into meta + keys. + + Args: + meta_keys (Sequence[str]): Meta keys to be collected in + ``data_sample.metainfo``. Defaults to None. + default_meta_keys (tuple): Default meta keys. Defaults to ('img_id', + 'img_path', 'ori_shape', 'img_shape', 'scale_factor', + 'flip', 'flip_direction', 'frame_id', 'is_video_data', + 'video_id', 'video_length', 'instances'). 
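+
+    Examples:
+        A minimal two-frame sketch; every per-frame entry in ``results`` is
+        assumed to be a list with one element per frame.
+
+        >>> import numpy as np
+        >>> pack = PackTrackInputs()
+        >>> results = dict(
+        >>>     img=[np.zeros((16, 16, 3), dtype=np.uint8)] * 2,
+        >>>     img_id=[0, 1], frame_id=[0, 1], video_id=[1, 1])
+        >>> packed = pack(results)
+        >>> packed['inputs'].shape
+        torch.Size([2, 3, 16, 16])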
+ """ + mapping_table = { + 'gt_bboxes': 'bboxes', + 'gt_bboxes_labels': 'labels', + 'gt_masks': 'masks', + 'gt_instances_ids': 'instances_ids' + } + + def __init__(self, + meta_keys: Optional[dict] = None, + default_meta_keys: tuple = ('img_id', 'img_path', 'ori_shape', + 'img_shape', 'scale_factor', + 'flip', 'flip_direction', + 'frame_id', 'video_id', + 'video_length', + 'ori_video_length', 'instances')): + self.meta_keys = default_meta_keys + if meta_keys is not None: + if isinstance(meta_keys, str): + meta_keys = (meta_keys, ) + else: + assert isinstance(meta_keys, tuple), \ + 'meta_keys must be str or tuple' + self.meta_keys += meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + Args: + results (dict): Result dict from the data pipeline. + Returns: + dict: + - 'inputs' (dict[Tensor]): The forward data of models. + - 'data_samples' (obj:`TrackDataSample`): The annotation info of + the samples. + """ + packed_results = dict() + packed_results['inputs'] = dict() + + # 1. Pack images + if 'img' in results: + imgs = results['img'] + imgs = np.stack(imgs, axis=0) + imgs = imgs.transpose(0, 3, 1, 2) + packed_results['inputs'] = to_tensor(imgs) + + # 2. Pack InstanceData + if 'gt_ignore_flags' in results: + gt_ignore_flags_list = results['gt_ignore_flags'] + valid_idx_list, ignore_idx_list = [], [] + for gt_ignore_flags in gt_ignore_flags_list: + valid_idx = np.where(gt_ignore_flags == 0)[0] + ignore_idx = np.where(gt_ignore_flags == 1)[0] + valid_idx_list.append(valid_idx) + ignore_idx_list.append(ignore_idx) + + assert 'img_id' in results, "'img_id' must contained in the results " + 'for counting the number of images' + + num_imgs = len(results['img_id']) + instance_data_list = [InstanceData() for _ in range(num_imgs)] + ignore_instance_data_list = [InstanceData() for _ in range(num_imgs)] + + for key in self.mapping_table.keys(): + if key not in results: + continue + if key == 'gt_masks': + mapped_key = self.mapping_table[key] + gt_masks_list = results[key] + if 'gt_ignore_flags' in results: + for i, gt_mask in enumerate(gt_masks_list): + valid_idx, ignore_idx = valid_idx_list[ + i], ignore_idx_list[i] + instance_data_list[i][mapped_key] = gt_mask[valid_idx] + ignore_instance_data_list[i][mapped_key] = gt_mask[ + ignore_idx] + + else: + for i, gt_mask in enumerate(gt_masks_list): + instance_data_list[i][mapped_key] = gt_mask + + else: + anns_list = results[key] + if 'gt_ignore_flags' in results: + for i, ann in enumerate(anns_list): + valid_idx, ignore_idx = valid_idx_list[ + i], ignore_idx_list[i] + instance_data_list[i][ + self.mapping_table[key]] = to_tensor( + ann[valid_idx]) + ignore_instance_data_list[i][ + self.mapping_table[key]] = to_tensor( + ann[ignore_idx]) + else: + for i, ann in enumerate(anns_list): + instance_data_list[i][ + self.mapping_table[key]] = to_tensor(ann) + + det_data_samples_list = [] + for i in range(num_imgs): + det_data_sample = DetDataSample() + det_data_sample.gt_instances = instance_data_list[i] + det_data_sample.ignored_instances = ignore_instance_data_list[i] + det_data_samples_list.append(det_data_sample) + + # 3. 
+        for key in self.meta_keys:
+            if key not in results:
+                continue
+            img_metas_list = results[key]
+            for i, img_meta in enumerate(img_metas_list):
+                det_data_samples_list[i].set_metainfo({f'{key}': img_meta})
+
+        track_data_sample = TrackDataSample()
+        track_data_sample.video_data_samples = det_data_samples_list
+        if 'key_frame_flags' in results:
+            key_frame_flags = np.asarray(results['key_frame_flags'])
+            key_frames_inds = np.where(key_frame_flags)[0].tolist()
+            ref_frames_inds = np.where(~key_frame_flags)[0].tolist()
+            track_data_sample.set_metainfo(
+                dict(key_frames_inds=key_frames_inds))
+            track_data_sample.set_metainfo(
+                dict(ref_frames_inds=ref_frames_inds))
+
+        packed_results['data_samples'] = track_data_sample
+        return packed_results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(meta_keys={self.meta_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class PackReIDInputs(BaseTransform):
+    """Pack the input data for ReID. The ``meta_info`` item is always
+    populated. The contents of the ``meta_info`` dictionary depend on
+    ``meta_keys``. By default this includes:
+
+    - ``img_path``: path to the image file.
+    - ``ori_shape``: original shape of the image as a tuple (H, W).
+    - ``img_shape``: shape of the image input to the network as a tuple
+      (H, W). Note that images may be zero padded on the bottom/right
+      if the batch tensor is larger than this shape.
+    - ``scale``: scale of the image as a tuple (W, H).
+    - ``scale_factor``: a float indicating the pre-processing scale.
+    - ``flip``: a boolean indicating if image flip transform was used.
+    - ``flip_direction``: the flipping direction.
+
+    Args:
+        meta_keys (Sequence[str], optional): The meta keys to be saved in
+            the ``metainfo`` of the packed ``data_sample``.
+    """
+    default_meta_keys = ('img_path', 'ori_shape', 'img_shape', 'scale',
+                         'scale_factor')
+
+    def __init__(self, meta_keys: Sequence[str] = ()) -> None:
+        self.meta_keys = self.default_meta_keys
+        if meta_keys is not None:
+            if isinstance(meta_keys, str):
+                meta_keys = (meta_keys, )
+            else:
+                assert isinstance(meta_keys, tuple), \
+                    'meta_keys must be str or tuple.'
+            self.meta_keys += meta_keys
+
+    def transform(self, results: dict) -> dict:
+        """Method to pack the input data.
+
+        Args:
+            results (dict): Result dict from the data pipeline.
+
+        Returns:
+            dict:
+
+            - 'inputs' (dict[Tensor]): The forward data of models.
+            - 'data_samples' (obj:`ReIDDataSample`): The meta info of the
+              sample.
+        """
+        packed_results = dict(inputs=dict(), data_samples=None)
+        assert 'img' in results, 'Missing the key ``img``.'
+        _type = type(results['img'])
+        label = results['gt_label']
+
+        if _type == list:
+            img = results['img']
+            label = np.stack(label, axis=0)  # (N,)
+            assert all([type(v) == _type for v in results.values()]), \
+                'All items in the results must have the same type.'
+ else: + img = [results['img']] + + img = np.stack(img, axis=3) # (H, W, C, N) + img = img.transpose(3, 2, 0, 1) # (N, C, H, W) + img = np.ascontiguousarray(img) + + packed_results['inputs'] = to_tensor(img) + + data_sample = ReIDDataSample() + data_sample.set_gt_label(label) + + meta_info = dict() + for key in self.meta_keys: + meta_info[key] = results[key] + data_sample.set_metainfo(meta_info) + packed_results['data_samples'] = data_sample + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/frame_sampling.py b/head_extractor/build/lib/mmdet/datasets/transforms/frame_sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..a91f1e7880f8f061f183dc30a01758d97b7d03da --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/frame_sampling.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +from collections import defaultdict +from typing import Dict, List, Optional, Union + +from mmcv.transforms import BaseTransform + +from mmdet.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class BaseFrameSample(BaseTransform): + """Directly get the key frame, no reference frames. + + Args: + collect_video_keys (list[str]): The keys of video info to be + collected. + """ + + def __init__(self, + collect_video_keys: List[str] = ['video_id', 'video_length']): + self.collect_video_keys = collect_video_keys + + def prepare_data(self, video_infos: dict, + sampled_inds: List[int]) -> Dict[str, List]: + """Prepare data for the subsequent pipeline. + + Args: + video_infos (dict): The whole video information. + sampled_inds (list[int]): The sampled frame indices. + + Returns: + dict: The processed data information. + """ + frames_anns = video_infos['images'] + final_data_info = defaultdict(list) + # for data in frames_anns: + for index in sampled_inds: + data = frames_anns[index] + # copy the info in video-level into img-level + for key in self.collect_video_keys: + if key == 'video_length': + data['ori_video_length'] = video_infos[key] + data['video_length'] = len(sampled_inds) + else: + data[key] = video_infos[key] + # Collate data_list (list of dict to dict of list) + for key, value in data.items(): + final_data_info[key].append(value) + + return final_data_info + + def transform(self, video_infos: dict) -> Optional[Dict[str, List]]: + """Transform the video information. + + Args: + video_infos (dict): The whole video information. + + Returns: + dict: The data information of the key frames. + """ + if 'key_frame_id' in video_infos: + key_frame_id = video_infos['key_frame_id'] + assert isinstance(video_infos['key_frame_id'], int) + else: + key_frame_id = random.sample( + list(range(video_infos['video_length'])), 1)[0] + results = self.prepare_data(video_infos, [key_frame_id]) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(collect_video_keys={self.collect_video_keys})' + return repr_str + + +@TRANSFORMS.register_module() +class UniformRefFrameSample(BaseFrameSample): + """Uniformly sample reference frames. + + Args: + num_ref_imgs (int): Number of reference frames to be sampled. + frame_range (int | list[int]): Range of frames to be sampled around + key frame. If int, the range is [-frame_range, frame_range]. + Defaults to 10. 
+ filter_key_img (bool): Whether to filter the key frame when + sampling reference frames. Defaults to True. + collect_video_keys (list[str]): The keys of video info to be + collected. + """ + + def __init__(self, + num_ref_imgs: int = 1, + frame_range: Union[int, List[int]] = 10, + filter_key_img: bool = True, + collect_video_keys: List[str] = ['video_id', 'video_length']): + self.num_ref_imgs = num_ref_imgs + self.filter_key_img = filter_key_img + if isinstance(frame_range, int): + assert frame_range >= 0, 'frame_range can not be a negative value.' + frame_range = [-frame_range, frame_range] + elif isinstance(frame_range, list): + assert len(frame_range) == 2, 'The length must be 2.' + assert frame_range[0] <= 0 and frame_range[1] >= 0 + for i in frame_range: + assert isinstance(i, int), 'Each element must be int.' + else: + raise TypeError('The type of frame_range must be int or list.') + self.frame_range = frame_range + super().__init__(collect_video_keys=collect_video_keys) + + def sampling_frames(self, video_length: int, key_frame_id: int): + """Sampling frames. + + Args: + video_length (int): The length of the video. + key_frame_id (int): The key frame id. + + Returns: + list[int]: The sampled frame indices. + """ + if video_length > 1: + left = max(0, key_frame_id + self.frame_range[0]) + right = min(key_frame_id + self.frame_range[1], video_length - 1) + frame_ids = list(range(0, video_length)) + + valid_ids = frame_ids[left:right + 1] + if self.filter_key_img and key_frame_id in valid_ids: + valid_ids.remove(key_frame_id) + assert len( + valid_ids + ) > 0, 'After filtering key frame, there are no valid frames' + if len(valid_ids) < self.num_ref_imgs: + valid_ids = valid_ids * self.num_ref_imgs + ref_frame_ids = random.sample(valid_ids, self.num_ref_imgs) + else: + ref_frame_ids = [key_frame_id] * self.num_ref_imgs + + sampled_frames_ids = [key_frame_id] + ref_frame_ids + sampled_frames_ids = sorted(sampled_frames_ids) + + key_frames_ind = sampled_frames_ids.index(key_frame_id) + key_frame_flags = [False] * len(sampled_frames_ids) + key_frame_flags[key_frames_ind] = True + return sampled_frames_ids, key_frame_flags + + def transform(self, video_infos: dict) -> Optional[Dict[str, List]]: + """Transform the video information. + + Args: + video_infos (dict): The whole video information. + + Returns: + dict: The data information of the sampled frames. 
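+
+        Examples:
+            A small sketch; the ``images`` entries are abbreviated and the
+            reference frame that gets sampled is random.
+
+            >>> sampler = UniformRefFrameSample(num_ref_imgs=1, frame_range=2)
+            >>> video_infos = dict(
+            >>>     video_id=0, video_length=4, key_frame_id=1,
+            >>>     images=[dict(frame_id=i) for i in range(4)])
+            >>> out = sampler(video_infos)
+            >>> len(out['key_frame_flags'])
+            2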
+ """ + if 'key_frame_id' in video_infos: + key_frame_id = video_infos['key_frame_id'] + assert isinstance(video_infos['key_frame_id'], int) + else: + key_frame_id = random.sample( + list(range(video_infos['video_length'])), 1)[0] + + (sampled_frames_ids, key_frame_flags) = self.sampling_frames( + video_infos['video_length'], key_frame_id=key_frame_id) + results = self.prepare_data(video_infos, sampled_frames_ids) + results['key_frame_flags'] = key_frame_flags + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(num_ref_imgs={self.num_ref_imgs}, ' + repr_str += f'frame_range={self.frame_range}, ' + repr_str += f'filter_key_img={self.filter_key_img}, ' + repr_str += f'collect_video_keys={self.collect_video_keys})' + return repr_str diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/geometric.py b/head_extractor/build/lib/mmdet/datasets/transforms/geometric.py new file mode 100644 index 0000000000000000000000000000000000000000..d2cd6be258f73a69aa2c2b36fef64c6c4e46a2a4 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/geometric.py @@ -0,0 +1,754 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Optional, Union + +import cv2 +import mmcv +import numpy as np +from mmcv.transforms import BaseTransform +from mmcv.transforms.utils import cache_randomness + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import autocast_box_type +from .augment_wrappers import _MAX_LEVEL, level_to_mag + + +@TRANSFORMS.register_module() +class GeomTransform(BaseTransform): + """Base class for geometric transformations. All geometric transformations + need to inherit from this base class. ``GeomTransform`` unifies the class + attributes and class functions of geometric transformations (ShearX, + ShearY, Rotate, TranslateX, and TranslateY), and records the homography + matrix. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for performing the geometric + transformation and should be in range [0, 1]. Defaults to 1.0. + level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum magnitude for geometric transformation. + Defaults to 0.0. + max_mag (float): The maximum magnitude for geometric transformation. + Defaults to 1.0. + reversal_prob (float): The probability that reverses the geometric + transformation magnitude. Should be in range [0,1]. + Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. 
+ """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 1.0, + reversal_prob: float = 0.5, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + interpolation: str = 'bilinear') -> None: + assert 0 <= prob <= 1.0, f'The probability of the transformation ' \ + f'should be in range [0,1], got {prob}.' + assert level is None or isinstance(level, int), \ + f'The level should be None or type int, got {type(level)}.' + assert level is None or 0 <= level <= _MAX_LEVEL, \ + f'The level should be in range [0,{_MAX_LEVEL}], got {level}.' + assert isinstance(min_mag, float), \ + f'min_mag should be type float, got {type(min_mag)}.' + assert isinstance(max_mag, float), \ + f'max_mag should be type float, got {type(max_mag)}.' + assert min_mag <= max_mag, \ + f'min_mag should smaller than max_mag, ' \ + f'got min_mag={min_mag} and max_mag={max_mag}' + assert isinstance(reversal_prob, float), \ + f'reversal_prob should be type float, got {type(max_mag)}.' + assert 0 <= reversal_prob <= 1.0, \ + f'The reversal probability of the transformation magnitude ' \ + f'should be type float, got {type(reversal_prob)}.' + if isinstance(img_border_value, (float, int)): + img_border_value = tuple([float(img_border_value)] * 3) + elif isinstance(img_border_value, tuple): + assert len(img_border_value) == 3, \ + f'img_border_value as tuple must have 3 elements, ' \ + f'got {len(img_border_value)}.' + img_border_value = tuple([float(val) for val in img_border_value]) + else: + raise ValueError( + 'img_border_value must be float or tuple with 3 elements.') + assert np.all([0 <= val <= 255 for val in img_border_value]), 'all ' \ + 'elements of img_border_value should between range [0,255].' \ + f'got {img_border_value}.' 
+        self.prob = prob
+        self.level = level
+        self.min_mag = min_mag
+        self.max_mag = max_mag
+        self.reversal_prob = reversal_prob
+        self.img_border_value = img_border_value
+        self.mask_border_value = mask_border_value
+        self.seg_ignore_label = seg_ignore_label
+        self.interpolation = interpolation
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Transform the image."""
+        pass
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Transform the masks."""
+        pass
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Transform the segmentation map."""
+        pass
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for the geometric transformation."""
+        return np.eye(3, dtype=np.float32)
+
+    def _transform_bboxes(self, results: dict, mag: float) -> None:
+        """Transform the bboxes."""
+        results['gt_bboxes'].project_(self.homography_matrix)
+        results['gt_bboxes'].clip_(results['img_shape'])
+
+    def _record_homography_matrix(self, results: dict) -> None:
+        """Record the homography matrix for the geometric transformation."""
+        if results.get('homography_matrix', None) is None:
+            results['homography_matrix'] = self.homography_matrix
+        else:
+            results['homography_matrix'] = self.homography_matrix @ results[
+                'homography_matrix']
+
+    @cache_randomness
+    def _random_disable(self):
+        """Randomly disable the transform."""
+        return np.random.rand() > self.prob
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        # Negate the magnitude with probability `reversal_prob`.
+        return -mag if np.random.rand() < self.reversal_prob else mag
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function for images, bounding boxes, masks and semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Transformed results.
+        """
+
+        if self._random_disable():
+            return results
+        mag = self._get_mag()
+        self.homography_matrix = self._get_homography_matrix(results, mag)
+        self._record_homography_matrix(results)
+        self._transform_img(results, mag)
+        if results.get('gt_bboxes', None) is not None:
+            self._transform_bboxes(results, mag)
+        if results.get('gt_masks', None) is not None:
+            self._transform_masks(results, mag)
+        if results.get('gt_seg_map', None) is not None:
+            self._transform_seg(results, mag)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'level={self.level}, '
+        repr_str += f'min_mag={self.min_mag}, '
+        repr_str += f'max_mag={self.max_mag}, '
+        repr_str += f'reversal_prob={self.reversal_prob}, '
+        repr_str += f'img_border_value={self.img_border_value}, '
+        repr_str += f'mask_border_value={self.mask_border_value}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ShearX(GeomTransform):
+    """Shear the images, bboxes, masks and segmentation map horizontally.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing Shear and should be in
+            range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum angle for the horizontal shear.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for the horizontal shear.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the horizontal
+            shear magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should have
+            3 elements. Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equal ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 90., \
+            f'min_mag angle for ShearX should be ' \
+            f'in range [0, 90], got {min_mag}.'
+        assert 0. <= max_mag <= 90., \
+            f'max_mag angle for ShearX should be ' \
+            f'in range [0, 90], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        mag = np.tan(mag * np.pi / 180)
+        return -mag if np.random.rand() < self.reversal_prob else mag
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for ShearX."""
+        return np.array([[1, mag, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Shear the image horizontally."""
+        results['img'] = mmcv.imshear(
+            results['img'],
+            mag,
+            direction='horizontal',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Shear the masks horizontally."""
+        results['gt_masks'] = results['gt_masks'].shear(
+            results['img_shape'],
+            mag,
+            direction='horizontal',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Shear the segmentation map horizontally."""
+        results['gt_seg_map'] = mmcv.imshear(
+            results['gt_seg_map'],
+            mag,
+            direction='horizontal',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class ShearY(GeomTransform):
+    """Shear the images, bboxes, masks and segmentation map vertically.
+ + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for performing ShearY and should be in + range [0, 1]. Defaults to 1.0. + level (int, optional): The level should be in range [0,_MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum angle for the vertical shear. + Defaults to 0.0. + max_mag (float): The maximum angle for the vertical shear. + Defaults to 30.0. + reversal_prob (float): The probability that reverses the vertical + shear magnitude. Should be in range [0,1]. Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. + """ + + def __init__(self, + prob: float = 1.0, + level: Optional[int] = None, + min_mag: float = 0.0, + max_mag: float = 30., + reversal_prob: float = 0.5, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + interpolation: str = 'bilinear') -> None: + assert 0. <= min_mag <= 90., \ + f'min_mag angle for ShearY should be ' \ + f'in range [0, 90], got {min_mag}.' + assert 0. <= max_mag <= 90., \ + f'max_mag angle for ShearY should be ' \ + f'in range [0, 90], got {max_mag}.' 
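+        # Magnitudes are specified as angles in degrees; `_get_mag` below
+        # converts the sampled angle into the tangent slope expected by
+        # `mmcv.imshear`.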
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    @cache_randomness
+    def _get_mag(self):
+        """Get the magnitude of the transform."""
+        mag = level_to_mag(self.level, self.min_mag, self.max_mag)
+        mag = np.tan(mag * np.pi / 180)
+        return -mag if np.random.rand() < self.reversal_prob else mag
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for ShearY."""
+        return np.array([[1, 0, 0], [mag, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Shear the image vertically."""
+        results['img'] = mmcv.imshear(
+            results['img'],
+            mag,
+            direction='vertical',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Shear the masks vertically."""
+        results['gt_masks'] = results['gt_masks'].shear(
+            results['img_shape'],
+            mag,
+            direction='vertical',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Shear the segmentation map vertically."""
+        results['gt_seg_map'] = mmcv.imshear(
+            results['gt_seg_map'],
+            mag,
+            direction='vertical',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class Rotate(GeomTransform):
+    """Rotate the images, bboxes, masks and segmentation map.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability for performing the transformation and
+            should be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum angle for rotation.
+            Defaults to 0.0.
+        max_mag (float): The maximum angle for rotation.
+            Defaults to 30.0.
+        reversal_prob (float): The probability that reverses the rotation
+            magnitude. Should be in range [0,1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should have
+            3 elements. Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for segmentation map.
+            Note this value must equal ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 30.0,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0.
<= min_mag <= 180., \ + f'min_mag for Rotate should be in range [0,180], got {min_mag}.' + assert 0. <= max_mag <= 180., \ + f'max_mag for Rotate should be in range [0,180], got {max_mag}.' + super().__init__( + prob=prob, + level=level, + min_mag=min_mag, + max_mag=max_mag, + reversal_prob=reversal_prob, + img_border_value=img_border_value, + mask_border_value=mask_border_value, + seg_ignore_label=seg_ignore_label, + interpolation=interpolation) + + def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray: + """Get the homography matrix for Rotate.""" + img_shape = results['img_shape'] + center = ((img_shape[1] - 1) * 0.5, (img_shape[0] - 1) * 0.5) + cv2_rotation_matrix = cv2.getRotationMatrix2D(center, -mag, 1.0) + return np.concatenate( + [cv2_rotation_matrix, + np.array([0, 0, 1]).reshape((1, 3))]).astype(np.float32) + + def _transform_img(self, results: dict, mag: float) -> None: + """Rotate the image.""" + results['img'] = mmcv.imrotate( + results['img'], + mag, + border_value=self.img_border_value, + interpolation=self.interpolation) + + def _transform_masks(self, results: dict, mag: float) -> None: + """Rotate the masks.""" + results['gt_masks'] = results['gt_masks'].rotate( + results['img_shape'], + mag, + border_value=self.mask_border_value, + interpolation=self.interpolation) + + def _transform_seg(self, results: dict, mag: float) -> None: + """Rotate the segmentation map.""" + results['gt_seg_map'] = mmcv.imrotate( + results['gt_seg_map'], + mag, + border_value=self.seg_ignore_label, + interpolation='nearest') + + +@TRANSFORMS.register_module() +class TranslateX(GeomTransform): + """Translate the images, bboxes, masks and segmentation map horizontally. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - homography_matrix + + Args: + prob (float): The probability for perform transformation and + should be in range 0 to 1. Defaults to 1.0. + level (int, optional): The level should be in range [0, _MAX_LEVEL]. + If level is None, it will generate from [0, _MAX_LEVEL] randomly. + Defaults to None. + min_mag (float): The minimum pixel's offset ratio for horizontal + translation. Defaults to 0.0. + max_mag (float): The maximum pixel's offset ratio for horizontal + translation. Defaults to 0.1. + reversal_prob (float): The probability that reverses the horizontal + translation magnitude. Should be in range [0,1]. Defaults to 0.5. + img_border_value (int | float | tuple): The filled values for + image border. If float, the same fill value will be used for + all the three channels of image. If tuple, it should be 3 elements. + Defaults to 128. + mask_border_value (int): The fill value used for masks. Defaults to 0. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. 
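+
+    Examples:
+        A minimal sketch; with ``level=10`` and the default ``max_mag``, the
+        image content is shifted by roughly ``0.1 * w`` pixels, in a
+        randomly chosen direction.
+
+        >>> import numpy as np
+        >>> transform = TranslateX(prob=1.0, level=10)
+        >>> results = dict(
+        >>>     img=np.zeros((32, 64, 3), dtype=np.uint8),
+        >>>     img_shape=(32, 64))
+        >>> results = transform(results)
+        >>> results['img'].shape
+        (32, 64, 3)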
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 0.1,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 1., \
+            f'min_mag ratio for TranslateX should be ' \
+            f'in range [0, 1], got {min_mag}.'
+        assert 0. <= max_mag <= 1., \
+            f'max_mag ratio for TranslateX should be ' \
+            f'in range [0, 1], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for TranslateX."""
+        mag = int(results['img_shape'][1] * mag)
+        return np.array([[1, 0, mag], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Translate the image horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['img'] = mmcv.imtranslate(
+            results['img'],
+            mag,
+            direction='horizontal',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Translate the masks horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['gt_masks'] = results['gt_masks'].translate(
+            results['img_shape'],
+            mag,
+            direction='horizontal',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Translate the segmentation map horizontally."""
+        mag = int(results['img_shape'][1] * mag)
+        results['gt_seg_map'] = mmcv.imtranslate(
+            results['gt_seg_map'],
+            mag,
+            direction='horizontal',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
+
+
+@TRANSFORMS.register_module()
+class TranslateY(GeomTransform):
+    """Translate the images, bboxes, masks and segmentation map vertically.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - homography_matrix
+
+    Args:
+        prob (float): The probability of performing the transformation.
+            Should be in range [0, 1]. Defaults to 1.0.
+        level (int, optional): The level should be in range [0, _MAX_LEVEL].
+            If level is None, it will generate from [0, _MAX_LEVEL] randomly.
+            Defaults to None.
+        min_mag (float): The minimum offset for vertical translation,
+            as a ratio of the image height. Defaults to 0.0.
+        max_mag (float): The maximum offset for vertical translation,
+            as a ratio of the image height. Defaults to 0.1.
+        reversal_prob (float): The probability that reverses the vertical
+            translation magnitude. Should be in range [0, 1]. Defaults to 0.5.
+        img_border_value (int | float | tuple): The filled values for
+            image border. If float, the same fill value will be used for
+            all the three channels of image. If tuple, it should have 3
+            elements. Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults to 0.
+        seg_ignore_label (int): The fill value used for the segmentation map.
+            Note this value must equal ``ignore_label`` in ``semantic_head``
+            of the corresponding config. Defaults to 255.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 prob: float = 1.0,
+                 level: Optional[int] = None,
+                 min_mag: float = 0.0,
+                 max_mag: float = 0.1,
+                 reversal_prob: float = 0.5,
+                 img_border_value: Union[int, float, tuple] = 128,
+                 mask_border_value: int = 0,
+                 seg_ignore_label: int = 255,
+                 interpolation: str = 'bilinear') -> None:
+        assert 0. <= min_mag <= 1., \
+            f'min_mag ratio for TranslateY should be ' \
+            f'in range [0, 1], got {min_mag}.'
+        assert 0. <= max_mag <= 1., \
+            f'max_mag ratio for TranslateY should be ' \
+            f'in range [0, 1], got {max_mag}.'
+        super().__init__(
+            prob=prob,
+            level=level,
+            min_mag=min_mag,
+            max_mag=max_mag,
+            reversal_prob=reversal_prob,
+            img_border_value=img_border_value,
+            mask_border_value=mask_border_value,
+            seg_ignore_label=seg_ignore_label,
+            interpolation=interpolation)
+
+    def _get_homography_matrix(self, results: dict, mag: float) -> np.ndarray:
+        """Get the homography matrix for TranslateY."""
+        mag = int(results['img_shape'][0] * mag)
+        return np.array([[1, 0, 0], [0, 1, mag], [0, 0, 1]], dtype=np.float32)
+
+    def _transform_img(self, results: dict, mag: float) -> None:
+        """Translate the image vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['img'] = mmcv.imtranslate(
+            results['img'],
+            mag,
+            direction='vertical',
+            border_value=self.img_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_masks(self, results: dict, mag: float) -> None:
+        """Translate the masks vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['gt_masks'] = results['gt_masks'].translate(
+            results['img_shape'],
+            mag,
+            direction='vertical',
+            border_value=self.mask_border_value,
+            interpolation=self.interpolation)
+
+    def _transform_seg(self, results: dict, mag: float) -> None:
+        """Translate the segmentation map vertically."""
+        mag = int(results['img_shape'][0] * mag)
+        results['gt_seg_map'] = mmcv.imtranslate(
+            results['gt_seg_map'],
+            mag,
+            direction='vertical',
+            border_value=self.seg_ignore_label,
+            interpolation='nearest')
diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/instaboost.py b/head_extractor/build/lib/mmdet/datasets/transforms/instaboost.py
new file mode 100644
index 0000000000000000000000000000000000000000..30dc1603643ec8d398bfade95f5ec1c9b8f89c8d
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/datasets/transforms/instaboost.py
@@ -0,0 +1,150 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import numpy as np
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+
+
+@TRANSFORMS.register_module()
+class InstaBoost(BaseTransform):
+    r"""Data augmentation method in `InstaBoost: Boosting Instance
+    Segmentation Via Probability Map Guided Copy-Pasting
+    <https://arxiv.org/abs/1908.07801>`_.
+
+    Refer to https://github.com/GothicAi/Instaboost for implementation
+    details.
+
+    Required Keys:
+
+    - img (np.uint8)
+    - instances
+
+    Modified Keys:
+
+    - img (np.uint8)
+    - instances
+
+    Args:
+        action_candidate (tuple): Action candidates. "normal", "horizontal", \
+            "vertical", "skip" are supported. Defaults to ('normal', \
+            'horizontal', 'skip').
+        action_prob (tuple): Corresponding action probabilities. Should be \
+            the same length as action_candidate. Defaults to (1, 0, 0).
+        scale (tuple): (min scale, max scale). Defaults to (0.8, 1.2).
+ dx (int): The maximum x-axis shift will be (instance width) / dx. + Defaults to 15. + dy (int): The maximum y-axis shift will be (instance height) / dy. + Defaults to 15. + theta (tuple): (min rotation degree, max rotation degree). \ + Defaults to (-1, 1). + color_prob (float): Probability of images for color augmentation. + Defaults to 0.5. + hflag (bool): Whether to use heatmap guided. Defaults to False. + aug_ratio (float): Probability of applying this transformation. \ + Defaults to 0.5. + """ + + def __init__(self, + action_candidate: tuple = ('normal', 'horizontal', 'skip'), + action_prob: tuple = (1, 0, 0), + scale: tuple = (0.8, 1.2), + dx: int = 15, + dy: int = 15, + theta: tuple = (-1, 1), + color_prob: float = 0.5, + hflag: bool = False, + aug_ratio: float = 0.5) -> None: + + import matplotlib + import matplotlib.pyplot as plt + default_backend = plt.get_backend() + + try: + import instaboostfast as instaboost + except ImportError: + raise ImportError( + 'Please run "pip install instaboostfast" ' + 'to install instaboostfast first for instaboost augmentation.') + + # instaboost will modify the default backend + # and cause visualization to fail. + matplotlib.use(default_backend) + + self.cfg = instaboost.InstaBoostConfig(action_candidate, action_prob, + scale, dx, dy, theta, + color_prob, hflag) + self.aug_ratio = aug_ratio + + def _load_anns(self, results: dict) -> Tuple[list, list]: + """Convert raw anns to instaboost expected input format.""" + anns = [] + ignore_anns = [] + for instance in results['instances']: + label = instance['bbox_label'] + bbox = instance['bbox'] + mask = instance['mask'] + x1, y1, x2, y2 = bbox + # assert (x2 - x1) >= 1 and (y2 - y1) >= 1 + bbox = [x1, y1, x2 - x1, y2 - y1] + + if instance['ignore_flag'] == 0: + anns.append({ + 'category_id': label, + 'segmentation': mask, + 'bbox': bbox + }) + else: + # Ignore instances without data augmentation + ignore_anns.append(instance) + return anns, ignore_anns + + def _parse_anns(self, results: dict, anns: list, ignore_anns: list, + img: np.ndarray) -> dict: + """Restore the result of instaboost processing to the original anns + format.""" + instances = [] + for ann in anns: + x1, y1, w, h = ann['bbox'] + # TODO: more essential bug need to be fixed in instaboost + if w <= 0 or h <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + instances.append( + dict( + bbox=bbox, + bbox_label=ann['category_id'], + mask=ann['segmentation'], + ignore_flag=0)) + + instances.extend(ignore_anns) + results['img'] = img + results['instances'] = instances + return results + + def transform(self, results) -> dict: + """The transform function.""" + img = results['img'] + ori_type = img.dtype + if 'instances' not in results or len(results['instances']) == 0: + return results + + anns, ignore_anns = self._load_anns(results) + if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]): + try: + import instaboostfast as instaboost + except ImportError: + raise ImportError('Please run "pip install instaboostfast" ' + 'to install instaboostfast first.') + anns, img = instaboost.get_new_data( + anns, img.astype(np.uint8), self.cfg, background=None) + + results = self._parse_anns(results, anns, ignore_anns, + img.astype(ori_type)) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(aug_ratio={self.aug_ratio})' + return repr_str diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/loading.py b/head_extractor/build/lib/mmdet/datasets/transforms/loading.py new file 
mode 100644 index 0000000000000000000000000000000000000000..722d4b0e7c830dfde2412746db1258b880167a2f --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/loading.py @@ -0,0 +1,1074 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import mmcv +import numpy as np +import pycocotools.mask as maskUtils +import torch +from mmcv.transforms import BaseTransform +from mmcv.transforms import LoadAnnotations as MMCV_LoadAnnotations +from mmcv.transforms import LoadImageFromFile +from mmengine.fileio import get +from mmengine.structures import BaseDataElement + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import get_box_type +from mmdet.structures.bbox.box_type import autocast_box_type +from mmdet.structures.mask import BitmapMasks, PolygonMasks + + +@TRANSFORMS.register_module() +class LoadImageFromNDArray(LoadImageFromFile): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. + + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def transform(self, results: dict) -> dict: + """Transform function to add image meta information. + + Args: + results (dict): Result dict with Webcam read image in + ``results['img']``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + img = results['img'] + if self.to_float32: + img = img.astype(np.float32) + + results['img_path'] = None + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + +@TRANSFORMS.register_module() +class LoadMultiChannelImageFromFiles(BaseTransform): + """Load multi-channel images from a list of separate channel files. + + Required Keys: + + - img_path + + Modified Keys: + + - img + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:``mmcv.imfrombytes``. + Defaults to 'unchanged'. + imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. + See :func:``mmcv.imfrombytes`` for details. + Defaults to 'cv2'. + file_client_args (dict): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet >= 3.0.0rc7. Defaults to None. 
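+
+    Examples:
+        A usage sketch; the channel file names below are placeholders
+        supplied by the dataset through ``results['img_path']``:
+
+        >>> transform = LoadMultiChannelImageFromFiles(to_float32=True)
+        >>> results = dict(img_path=['ch0.png', 'ch1.png', 'ch2.png'])
+        >>> # The decoded images are stacked along a new trailing axis,
+        >>> # so ``results['img']`` gains one extra dimension.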
+ """ + + def __init__( + self, + to_float32: bool = False, + color_type: str = 'unchanged', + imdecode_backend: str = 'cv2', + file_client_args: dict = None, + backend_args: dict = None, + ) -> None: + self.to_float32 = to_float32 + self.color_type = color_type + self.imdecode_backend = imdecode_backend + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + def transform(self, results: dict) -> dict: + """Transform functions to load multiple images and get images meta + information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded images and meta information. + """ + + assert isinstance(results['img_path'], list) + img = [] + for name in results['img_path']: + img_bytes = get(name, backend_args=self.backend_args) + img.append( + mmcv.imfrombytes( + img_bytes, + flag=self.color_type, + backend=self.imdecode_backend)) + img = np.stack(img, axis=-1) + if self.to_float32: + img = img.astype(np.float32) + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f"imdecode_backend='{self.imdecode_backend}', " + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadAnnotations(MMCV_LoadAnnotations): + """Load and process the ``instances`` and ``seg_map`` annotation provided + by dataset. + + The annotation format is as the following: + + .. code-block:: python + + { + 'instances': + [ + { + # List of 4 numbers representing the bounding box of the + # instance, in (x1, y1, x2, y2) order. + 'bbox': [x1, y1, x2, y2], + + # Label of image classification. + 'bbox_label': 1, + + # Used in instance/panoptic segmentation. The segmentation mask + # of the instance or the information of segments. + # 1. If list[list[float]], it represents a list of polygons, + # one for each connected component of the object. Each + # list[float] is one simple polygon in the format of + # [x1, y1, ..., xn, yn] (n >= 3). The Xs and Ys are absolute + # coordinates in unit of pixels. + # 2. If dict, it represents the per-pixel segmentation mask in + # COCO's compressed RLE format. The dict should have keys + # “size” and “counts”. Can be loaded by pycocotools + 'mask': list[list[float]] or dict, + + } + ] + # Filename of semantic or panoptic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + + .. code-block:: python + + { + # In (x1, y1, x2, y2) order, float type. N is the number of bboxes + # in an image + 'gt_bboxes': BaseBoxes(N, 4) + # In int type. + 'gt_bboxes_labels': np.ndarray(N, ) + # In built-in class + 'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W) + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + # in (x, y, v) order, float type. 
+    }
+
+    Required Keys:
+
+    - height
+    - width
+    - instances
+
+      - bbox (optional)
+      - bbox_label
+      - mask (optional)
+      - ignore_flag
+
+    - seg_map_path (optional)
+
+    Added Keys:
+
+    - gt_bboxes (BaseBoxes[torch.float32])
+    - gt_bboxes_labels (np.int64)
+    - gt_masks (BitmapMasks | PolygonMasks)
+    - gt_seg_map (np.uint8)
+    - gt_ignore_flags (bool)
+
+    Args:
+        with_bbox (bool): Whether to parse and load the bbox annotation.
+            Defaults to True.
+        with_label (bool): Whether to parse and load the label annotation.
+            Defaults to True.
+        with_mask (bool): Whether to parse and load the mask annotation.
+            Defaults to False.
+        with_seg (bool): Whether to parse and load the semantic segmentation
+            annotation. Defaults to False.
+        poly2mask (bool): Whether to convert masks to bitmaps. Defaults to
+            True.
+        box_type (str): The box type used to wrap the bboxes. If ``box_type``
+            is None, gt_bboxes will keep being np.ndarray. Defaults to 'hbox'.
+        reduce_zero_label (bool): Whether to reduce all label values by 1.
+            Usually used for datasets where 0 is the background label.
+            Defaults to False.
+        ignore_index (int): The label index to be ignored.
+            Valid only if reduce_zero_label is True. Defaults to 255.
+        imdecode_backend (str): The image decoding backend type. The backend
+            argument for :func:``mmcv.imfrombytes``.
+            See :func:``mmcv.imfrombytes`` for details.
+            Defaults to 'cv2'.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    def __init__(
+            self,
+            with_mask: bool = False,
+            poly2mask: bool = True,
+            box_type: str = 'hbox',
+            # use for semseg
+            reduce_zero_label: bool = False,
+            ignore_index: int = 255,
+            **kwargs) -> None:
+        super(LoadAnnotations, self).__init__(**kwargs)
+        self.with_mask = with_mask
+        self.poly2mask = poly2mask
+        self.box_type = box_type
+        self.reduce_zero_label = reduce_zero_label
+        self.ignore_index = ignore_index
+
+    def _load_bboxes(self, results: dict) -> None:
+        """Private function to load bounding box annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded bounding box annotations.
+        """
+        gt_bboxes = []
+        gt_ignore_flags = []
+        for instance in results.get('instances', []):
+            gt_bboxes.append(instance['bbox'])
+            gt_ignore_flags.append(instance['ignore_flag'])
+        if self.box_type is None:
+            results['gt_bboxes'] = np.array(
+                gt_bboxes, dtype=np.float32).reshape((-1, 4))
+        else:
+            _, box_type_cls = get_box_type(self.box_type)
+            results['gt_bboxes'] = box_type_cls(gt_bboxes, dtype=torch.float32)
+        results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)
+
+    def _load_labels(self, results: dict) -> None:
+        """Private function to load label annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded label annotations.
+        """
+        gt_bboxes_labels = []
+        for instance in results.get('instances', []):
+            gt_bboxes_labels.append(instance['bbox_label'])
+        # TODO: Inconsistent with mmcv, consider how to deal with it later.
+        results['gt_bboxes_labels'] = np.array(
+            gt_bboxes_labels, dtype=np.int64)
+
+    def _poly2mask(self, mask_ann: Union[list, dict], img_h: int,
+                   img_w: int) -> np.ndarray:
+        """Private function to convert masks represented with polygon to
+        bitmaps.
+
+        Args:
+            mask_ann (list | dict): Polygon mask annotation input.
+            img_h (int): The height of output mask.
+            img_w (int): The width of output mask.
+
+        Returns:
+            np.ndarray: The decoded bitmap mask of shape (img_h, img_w).
+        """
+
+        if isinstance(mask_ann, list):
+            # polygon -- a single object might consist of multiple parts
+            # we merge all parts into one mask rle code
+            rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+            rle = maskUtils.merge(rles)
+        elif isinstance(mask_ann['counts'], list):
+            # uncompressed RLE
+            rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
+        else:
+            # rle
+            rle = mask_ann
+        mask = maskUtils.decode(rle)
+        return mask
+
+    def _process_masks(self, results: dict) -> list:
+        """Process gt_masks and filter invalid polygons.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+
+        Returns:
+            list: Processed gt_masks.
+        """
+        gt_masks = []
+        gt_ignore_flags = []
+        for instance in results.get('instances', []):
+            gt_mask = instance['mask']
+            # If the annotation of segmentation mask is invalid,
+            # ignore the whole instance.
+            if isinstance(gt_mask, list):
+                gt_mask = [
+                    np.array(polygon) for polygon in gt_mask
+                    if len(polygon) % 2 == 0 and len(polygon) >= 6
+                ]
+                if len(gt_mask) == 0:
+                    # ignore this instance and set gt_mask to a fake mask
+                    instance['ignore_flag'] = 1
+                    gt_mask = [np.zeros(6)]
+            elif not self.poly2mask:
+                # `PolygonMasks` requires a polygon of format List[np.array];
+                # other formats are invalid.
+                instance['ignore_flag'] = 1
+                gt_mask = [np.zeros(6)]
+            elif isinstance(gt_mask, dict) and \
+                    not (gt_mask.get('counts') is not None and
+                         gt_mask.get('size') is not None and
+                         isinstance(gt_mask['counts'], (list, str))):
+                # if gt_mask is a dict, it should include `counts` and
+                # `size`, so that `BitmapMasks` can decompress the RLE
+                instance['ignore_flag'] = 1
+                gt_mask = [np.zeros(6)]
+            gt_masks.append(gt_mask)
+            # re-process gt_ignore_flags
+            gt_ignore_flags.append(instance['ignore_flag'])
+        results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool)
+        return gt_masks
+
+    def _load_masks(self, results: dict) -> None:
+        """Private function to load mask annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmengine.BaseDataset``.
+        """
+        h, w = results['ori_shape']
+        gt_masks = self._process_masks(results)
+        if self.poly2mask:
+            gt_masks = BitmapMasks(
+                [self._poly2mask(mask, h, w) for mask in gt_masks], h, w)
+        else:
+            # fake polygon masks will be ignored in `PackDetInputs`
+            gt_masks = PolygonMasks([mask for mask in gt_masks], h, w)
+        results['gt_masks'] = gt_masks
+
+    def _load_seg_map(self, results: dict) -> None:
+        """Private function to load semantic segmentation annotations.
+
+        Args:
+            results (dict): Result dict from :obj:``mmcv.BaseDataset``.
+
+        Returns:
+            dict: The dict contains loaded semantic segmentation annotations.
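+
+        Note:
+            When ``reduce_zero_label`` is True, label ``0`` is first mapped
+            to ``ignore_index`` and every remaining label ``k`` becomes
+            ``k - 1``, so a raw map ``[0, 1, 2]`` ends up as
+            ``[ignore_index, 0, 1]``.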
+ """ + if results.get('seg_map_path', None) is None: + return + + img_bytes = get( + results['seg_map_path'], backend_args=self.backend_args) + gt_semantic_seg = mmcv.imfrombytes( + img_bytes, flag='unchanged', + backend=self.imdecode_backend).squeeze() + + if self.reduce_zero_label: + # avoid using underflow conversion + gt_semantic_seg[gt_semantic_seg == 0] = self.ignore_index + gt_semantic_seg = gt_semantic_seg - 1 + gt_semantic_seg[gt_semantic_seg == self.ignore_index - + 1] = self.ignore_index + + # modify if custom classes + if results.get('label_map', None) is not None: + # Add deep copy to solve bug of repeatedly + # replace `gt_semantic_seg`, which is reported in + # https://github.com/open-mmlab/mmsegmentation/pull/1445/ + gt_semantic_seg_copy = gt_semantic_seg.copy() + for old_id, new_id in results['label_map'].items(): + gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id + results['gt_seg_map'] = gt_semantic_seg + results['ignore_index'] = self.ignore_index + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmengine.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label and + semantic segmentation. + """ + + if self.with_bbox: + self._load_bboxes(results) + if self.with_label: + self._load_labels(results) + if self.with_mask: + self._load_masks(results) + if self.with_seg: + self._load_seg_map(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'backend_args={self.backend_args})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadPanopticAnnotations(LoadAnnotations): + """Load multiple types of panoptic annotations. + + The annotation format is as the following: + + .. code-block:: python + + { + 'instances': + [ + { + # List of 4 numbers representing the bounding box of the + # instance, in (x1, y1, x2, y2) order. + 'bbox': [x1, y1, x2, y2], + + # Label of image classification. + 'bbox_label': 1, + }, + ... + ] + 'segments_info': + [ + { + # id = cls_id + instance_id * INSTANCE_OFFSET + 'id': int, + + # Contiguous category id defined in dataset. + 'category': int + + # Thing flag. + 'is_thing': bool + }, + ... + ] + + # Filename of semantic or panoptic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + + .. code-block:: python + + { + # In (x1, y1, x2, y2) order, float type. N is the number of bboxes + # in an image + 'gt_bboxes': BaseBoxes(N, 4) + # In int type. + 'gt_bboxes_labels': np.ndarray(N, ) + # In built-in class + 'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W) + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + # in (x, y, v) order, float type. + } + + Required Keys: + + - height + - width + - instances + - bbox + - bbox_label + - ignore_flag + - segments_info + - id + - category + - is_thing + - seg_map_path + + Added Keys: + + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (bool) + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. 
+ Defaults to True. + with_label (bool): Whether to parse and load the label annotation. + Defaults to True. + with_mask (bool): Whether to parse and load the mask annotation. + Defaults to True. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Defaults to False. + box_type (str): The box mode used to wrap the bboxes. + imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. + See :fun:``mmcv.imfrombytes`` for details. + Defaults to 'cv2'. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet >= 3.0.0rc7. Defaults to None. + """ + + def __init__(self, + with_bbox: bool = True, + with_label: bool = True, + with_mask: bool = True, + with_seg: bool = True, + box_type: str = 'hbox', + imdecode_backend: str = 'cv2', + backend_args: dict = None) -> None: + try: + from panopticapi import utils + except ImportError: + raise ImportError( + 'panopticapi is not installed, please install it by: ' + 'pip install git+https://github.com/cocodataset/' + 'panopticapi.git.') + self.rgb2id = utils.rgb2id + + super(LoadPanopticAnnotations, self).__init__( + with_bbox=with_bbox, + with_label=with_label, + with_mask=with_mask, + with_seg=with_seg, + with_keypoints=False, + box_type=box_type, + imdecode_backend=imdecode_backend, + backend_args=backend_args) + + def _load_masks_and_semantic_segs(self, results: dict) -> None: + """Private function to load mask and semantic segmentation annotations. + + In gt_semantic_seg, the foreground label is from ``0`` to + ``num_things - 1``, the background label is from ``num_things`` to + ``num_things + num_stuff - 1``, 255 means the ignored label (``VOID``). + + Args: + results (dict): Result dict from :obj:``mmdet.CustomDataset``. + """ + # seg_map_path is None, when inference on the dataset without gts. + if results.get('seg_map_path', None) is None: + return + + img_bytes = get( + results['seg_map_path'], backend_args=self.backend_args) + pan_png = mmcv.imfrombytes( + img_bytes, flag='color', channel_order='rgb').squeeze() + pan_png = self.rgb2id(pan_png) + + gt_masks = [] + gt_seg = np.zeros_like(pan_png) + 255 # 255 as ignore + + for segment_info in results['segments_info']: + mask = (pan_png == segment_info['id']) + gt_seg = np.where(mask, segment_info['category'], gt_seg) + + # The legal thing masks + if segment_info.get('is_thing'): + gt_masks.append(mask.astype(np.uint8)) + + if self.with_mask: + h, w = results['ori_shape'] + gt_masks = BitmapMasks(gt_masks, h, w) + results['gt_masks'] = gt_masks + + if self.with_seg: + results['gt_seg_map'] = gt_seg + + def transform(self, results: dict) -> dict: + """Function to load multiple types panoptic annotations. + + Args: + results (dict): Result dict from :obj:``mmdet.CustomDataset``. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + self._load_bboxes(results) + if self.with_label: + self._load_labels(results) + if self.with_mask or self.with_seg: + # The tasks completed by '_load_masks' and '_load_semantic_segs' + # in LoadAnnotations are merged to one function. + self._load_masks_and_semantic_segs(results) + + return results + + +@TRANSFORMS.register_module() +class LoadProposals(BaseTransform): + """Load proposal pipeline. + + Required Keys: + + - proposals + + Modified Keys: + + - proposals + + Args: + num_max_proposals (int, optional): Maximum number of proposals to load. 
+ If not specified, all proposals will be loaded. + """ + + def __init__(self, num_max_proposals: Optional[int] = None) -> None: + self.num_max_proposals = num_max_proposals + + def transform(self, results: dict) -> dict: + """Transform function to load proposals from file. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded proposal annotations. + """ + + proposals = results['proposals'] + # the type of proposals should be `dict` or `InstanceData` + assert isinstance(proposals, dict) \ + or isinstance(proposals, BaseDataElement) + bboxes = proposals['bboxes'].astype(np.float32) + assert bboxes.shape[1] == 4, \ + f'Proposals should have shapes (n, 4), but found {bboxes.shape}' + + if 'scores' in proposals: + scores = proposals['scores'].astype(np.float32) + assert bboxes.shape[0] == scores.shape[0] + else: + scores = np.zeros(bboxes.shape[0], dtype=np.float32) + + if self.num_max_proposals is not None: + # proposals should sort by scores during dumping the proposals + bboxes = bboxes[:self.num_max_proposals] + scores = scores[:self.num_max_proposals] + + if len(bboxes) == 0: + bboxes = np.zeros((0, 4), dtype=np.float32) + scores = np.zeros(0, dtype=np.float32) + + results['proposals'] = bboxes + results['proposals_scores'] = scores + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(num_max_proposals={self.num_max_proposals})' + + +@TRANSFORMS.register_module() +class FilterAnnotations(BaseTransform): + """Filter invalid annotations. + + Required Keys: + + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_masks (optional) + - gt_ignore_flags (optional) + + Args: + min_gt_bbox_wh (tuple[float]): Minimum width and height of ground truth + boxes. Default: (1., 1.) + min_gt_mask_area (int): Minimum foreground area of ground truth masks. + Default: 1 + by_box (bool): Filter instances with bounding boxes not meeting the + min_gt_bbox_wh threshold. Default: True + by_mask (bool): Filter instances with masks not meeting + min_gt_mask_area threshold. Default: False + keep_empty (bool): Whether to return None when it + becomes an empty bbox after filtering. Defaults to True. + """ + + def __init__(self, + min_gt_bbox_wh: Tuple[int, int] = (1, 1), + min_gt_mask_area: int = 1, + by_box: bool = True, + by_mask: bool = False, + keep_empty: bool = True) -> None: + # TODO: add more filter options + assert by_box or by_mask + self.min_gt_bbox_wh = min_gt_bbox_wh + self.min_gt_mask_area = min_gt_mask_area + self.by_box = by_box + self.by_mask = by_mask + self.keep_empty = keep_empty + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to filter annotations. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. 
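+
+        Examples:
+            A typical pipeline entry (values are illustrative):
+
+            >>> dict(type='FilterAnnotations',
+            ...      min_gt_bbox_wh=(2, 2),
+            ...      by_box=True,
+            ...      keep_empty=False)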
+ """ + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + if gt_bboxes.shape[0] == 0: + return results + + tests = [] + if self.by_box: + tests.append( + ((gt_bboxes.widths > self.min_gt_bbox_wh[0]) & + (gt_bboxes.heights > self.min_gt_bbox_wh[1])).numpy()) + if self.by_mask: + assert 'gt_masks' in results + gt_masks = results['gt_masks'] + tests.append(gt_masks.areas >= self.min_gt_mask_area) + + keep = tests[0] + for t in tests[1:]: + keep = keep & t + + if not keep.any(): + if self.keep_empty: + return None + + keys = ('gt_bboxes', 'gt_bboxes_labels', 'gt_masks', 'gt_ignore_flags') + for key in keys: + if key in results: + results[key] = results[key][keep] + + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(min_gt_bbox_wh={self.min_gt_bbox_wh}, ' \ + f'keep_empty={self.keep_empty})' + + +@TRANSFORMS.register_module() +class LoadEmptyAnnotations(BaseTransform): + """Load Empty Annotations for unlabeled images. + + Added Keys: + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int64) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (bool) + + Args: + with_bbox (bool): Whether to load the pseudo bbox annotation. + Defaults to True. + with_label (bool): Whether to load the pseudo label annotation. + Defaults to True. + with_mask (bool): Whether to load the pseudo mask annotation. + Default: False. + with_seg (bool): Whether to load the pseudo semantic segmentation + annotation. Defaults to False. + seg_ignore_label (int): The fill value used for segmentation map. + Note this value must equals ``ignore_label`` in ``semantic_head`` + of the corresponding config. Defaults to 255. + """ + + def __init__(self, + with_bbox: bool = True, + with_label: bool = True, + with_mask: bool = False, + with_seg: bool = False, + seg_ignore_label: int = 255) -> None: + self.with_bbox = with_bbox + self.with_label = with_label + self.with_mask = with_mask + self.with_seg = with_seg + self.seg_ignore_label = seg_ignore_label + + def transform(self, results: dict) -> dict: + """Transform function to load empty annotations. + + Args: + results (dict): Result dict. + Returns: + dict: Updated result dict. + """ + if self.with_bbox: + results['gt_bboxes'] = np.zeros((0, 4), dtype=np.float32) + results['gt_ignore_flags'] = np.zeros((0, ), dtype=bool) + if self.with_label: + results['gt_bboxes_labels'] = np.zeros((0, ), dtype=np.int64) + if self.with_mask: + # TODO: support PolygonMasks + h, w = results['img_shape'] + gt_masks = np.zeros((0, h, w), dtype=np.uint8) + results['gt_masks'] = BitmapMasks(gt_masks, h, w) + if self.with_seg: + h, w = results['img_shape'] + results['gt_seg_map'] = self.seg_ignore_label * np.ones( + (h, w), dtype=np.uint8) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label})' + return repr_str + + +@TRANSFORMS.register_module() +class InferencerLoader(BaseTransform): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. 
+ + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def __init__(self, **kwargs) -> None: + super().__init__() + self.from_file = TRANSFORMS.build( + dict(type='LoadImageFromFile', **kwargs)) + self.from_ndarray = TRANSFORMS.build( + dict(type='mmdet.LoadImageFromNDArray', **kwargs)) + + def transform(self, results: Union[str, np.ndarray, dict]) -> dict: + """Transform function to add image meta information. + + Args: + results (str, np.ndarray or dict): The result. + + Returns: + dict: The dict contains loaded image and meta information. + """ + if isinstance(results, str): + inputs = dict(img_path=results) + elif isinstance(results, np.ndarray): + inputs = dict(img=results) + elif isinstance(results, dict): + inputs = results + else: + raise NotImplementedError + + if 'img' in inputs: + return self.from_ndarray(inputs) + return self.from_file(inputs) + + +@TRANSFORMS.register_module() +class LoadTrackAnnotations(LoadAnnotations): + """Load and process the ``instances`` and ``seg_map`` annotation provided + by dataset. It must load ``instances_ids`` which is only used in the + tracking tasks. The annotation format is as the following: + + .. code-block:: python + { + 'instances': + [ + { + # List of 4 numbers representing the bounding box of the + # instance, in (x1, y1, x2, y2) order. + 'bbox': [x1, y1, x2, y2], + # Label of image classification. + 'bbox_label': 1, + # Used in tracking. + # Id of instances. + 'instance_id': 100, + # Used in instance/panoptic segmentation. The segmentation mask + # of the instance or the information of segments. + # 1. If list[list[float]], it represents a list of polygons, + # one for each connected component of the object. Each + # list[float] is one simple polygon in the format of + # [x1, y1, ..., xn, yn] (n >= 3). The Xs and Ys are absolute + # coordinates in unit of pixels. + # 2. If dict, it represents the per-pixel segmentation mask in + # COCO's compressed RLE format. The dict should have keys + # “size” and “counts”. Can be loaded by pycocotools + 'mask': list[list[float]] or dict, + } + ] + # Filename of semantic or panoptic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + .. code-block:: python + { + # In (x1, y1, x2, y2) order, float type. N is the number of bboxes + # in an image + 'gt_bboxes': np.ndarray(N, 4) + # In int type. + 'gt_bboxes_labels': np.ndarray(N, ) + # In built-in class + 'gt_masks': PolygonMasks (H, W) or BitmapMasks (H, W) + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + # in (x, y, v) order, float type. + } + + Required Keys: + + - height (optional) + - width (optional) + - instances + - bbox (optional) + - bbox_label + - instance_id (optional) + - mask (optional) + - ignore_flag (optional) + - seg_map_path (optional) + + Added Keys: + + - gt_bboxes (np.float32) + - gt_bboxes_labels (np.int32) + - gt_instances_ids (np.int32) + - gt_masks (BitmapMasks | PolygonMasks) + - gt_seg_map (np.uint8) + - gt_ignore_flags (np.bool) + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def _load_bboxes(self, results: dict) -> None: + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. 
+ + Returns: + dict: The dict contains loaded bounding box annotations. + """ + gt_bboxes = [] + gt_ignore_flags = [] + # TODO: use bbox_type + for instance in results['instances']: + # The datasets which are only format in evaluation don't have + # groundtruth boxes. + if 'bbox' in instance: + gt_bboxes.append(instance['bbox']) + if 'ignore_flag' in instance: + gt_ignore_flags.append(instance['ignore_flag']) + + # TODO: check this case + if len(gt_bboxes) != len(gt_ignore_flags): + # There may be no ``gt_ignore_flags`` in some cases, we treat them + # as all False in order to keep the length of ``gt_bboxes`` and + # ``gt_ignore_flags`` the same + gt_ignore_flags = [False] * len(gt_bboxes) + + results['gt_bboxes'] = np.array( + gt_bboxes, dtype=np.float32).reshape(-1, 4) + results['gt_ignore_flags'] = np.array(gt_ignore_flags, dtype=bool) + + def _load_instances_ids(self, results: dict) -> None: + """Private function to load instances id annotations. + + Args: + results (dict): Result dict from :obj :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict containing instances id annotations. + """ + gt_instances_ids = [] + for instance in results['instances']: + gt_instances_ids.append(instance['instance_id']) + results['gt_instances_ids'] = np.array( + gt_instances_ids, dtype=np.int32) + + def transform(self, results: dict) -> dict: + """Function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded bounding box, label, instances id + and semantic segmentation and keypoints annotations. + """ + results = super().transform(results) + self._load_instances_ids(results) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'file_client_args={self.file_client_args})' + return repr_str diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/text_transformers.py b/head_extractor/build/lib/mmdet/datasets/transforms/text_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..12a0e57db3d41baa6f5b7d1834ba74538ad9ca19 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/text_transformers.py @@ -0,0 +1,255 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
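+# NOTE: ``clean_name`` below normalizes raw label names before they are
+# tokenized; an illustrative input/output pair (not from the source):
+#   clean_name('Traffic_Light') -> 'traffic light'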
+import json
+
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import BaseBoxes
+
+try:
+    from transformers import AutoTokenizer
+    from transformers import BertModel as HFBertModel
+except ImportError:
+    AutoTokenizer = None
+    HFBertModel = None
+
+import random
+import re
+
+import numpy as np
+
+
+def clean_name(name):
+    name = re.sub(r'\(.*\)', '', name)
+    name = re.sub(r'_', ' ', name)
+    name = re.sub(r'  ', ' ', name)  # collapse double spaces
+    name = name.lower()
+    return name
+
+
+def check_for_positive_overflow(gt_bboxes, gt_labels, text, tokenizer,
+                                max_tokens):
+    # Check if we have too many positive labels
+    # generate a caption by appending the positive labels
+    positive_label_list = np.unique(gt_labels).tolist()
+    # random shuffle so we can sample different annotations
+    # at different epochs
+    random.shuffle(positive_label_list)
+
+    kept_labels = []
+    length = 0
+
+    for index, label in enumerate(positive_label_list):
+
+        label_text = clean_name(text[str(label)]) + '. '
+
+        tokenized = tokenizer.tokenize(label_text)
+
+        length += len(tokenized)
+
+        if length > max_tokens:
+            break
+        else:
+            kept_labels.append(label)
+
+    keep_box_index = []
+    keep_gt_labels = []
+    for i in range(len(gt_labels)):
+        if gt_labels[i] in kept_labels:
+            keep_box_index.append(i)
+            keep_gt_labels.append(gt_labels[i])
+
+    # np.long was removed in NumPy 1.24; np.int64 is the equivalent dtype.
+    return gt_bboxes[keep_box_index], np.array(
+        keep_gt_labels, dtype=np.int64), length
+
+
+def generate_sentence_given_labels(positive_label_list, negative_label_list,
+                                   text):
+    label_to_positions = {}
+
+    label_list = negative_label_list + positive_label_list
+
+    random.shuffle(label_list)
+
+    pheso_caption = ''
+
+    label_remap_dict = {}
+    for index, label in enumerate(label_list):
+
+        start_index = len(pheso_caption)
+
+        pheso_caption += clean_name(text[str(label)])
+
+        end_index = len(pheso_caption)
+
+        if label in positive_label_list:
+            label_to_positions[index] = [[start_index, end_index]]
+            label_remap_dict[int(label)] = index
+
+        # if index != len(label_list) - 1:
+        #     pheso_caption += '. '
+        pheso_caption += '. '
+
+    return label_to_positions, pheso_caption, label_remap_dict
+
+
+@TRANSFORMS.register_module()
+class RandomSamplingNegPos(BaseTransform):
+
+    def __init__(self,
+                 tokenizer_name,
+                 num_sample_negative=85,
+                 max_tokens=256,
+                 full_sampling_prob=0.5,
+                 label_map_file=None):
+        if AutoTokenizer is None:
+            raise RuntimeError(
+                'transformers is not installed, please install it by: '
+                'pip install transformers.')
+
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.num_sample_negative = num_sample_negative
+        self.full_sampling_prob = full_sampling_prob
+        self.max_tokens = max_tokens
+        self.label_map = None
+        if label_map_file:
+            with open(label_map_file, 'r') as file:
+                self.label_map = json.load(file)
+
+    def transform(self, results: dict) -> dict:
+        if 'phrases' in results:
+            return self.vg_aug(results)
+        else:
+            return self.od_aug(results)
+
+    def vg_aug(self, results):
+        gt_bboxes = results['gt_bboxes']
+        if isinstance(gt_bboxes, BaseBoxes):
+            gt_bboxes = gt_bboxes.tensor
+        gt_labels = results['gt_bboxes_labels']
+        text = results['text'].lower().strip()
+        if not text.endswith('.'):
+            text = text + '. '
+
+        phrases = results['phrases']
+        # TODO: add neg
+        positive_label_list = np.unique(gt_labels).tolist()
+        label_to_positions = {}
+        for label in positive_label_list:
+            label_to_positions[label] = phrases[label]['tokens_positive']
+
+        results['gt_bboxes'] = gt_bboxes
+        results['gt_bboxes_labels'] = gt_labels
+
+        results['text'] = text
+        results['tokens_positive'] = label_to_positions
+        return results
+
+    def od_aug(self, results):
+        gt_bboxes = results['gt_bboxes']
+        if isinstance(gt_bboxes, BaseBoxes):
+            gt_bboxes = gt_bboxes.tensor
+        gt_labels = results['gt_bboxes_labels']
+
+        if 'text' not in results:
+            assert self.label_map is not None
+            text = self.label_map
+        else:
+            text = results['text']
+
+        original_box_num = len(gt_labels)
+        # If the category name is in the format of 'a/b' (in object365),
+        # we randomly select one of them.
+        for key, value in text.items():
+            if '/' in value:
+                text[key] = random.choice(value.split('/')).strip()
+
+        gt_bboxes, gt_labels, positive_caption_length = \
+            check_for_positive_overflow(gt_bboxes, gt_labels, text,
+                                        self.tokenizer, self.max_tokens)
+
+        if len(gt_bboxes) < original_box_num:
+            print('WARNING: removed {} boxes due to positive caption '
+                  'overflow'.format(original_box_num - len(gt_bboxes)))
+
+        valid_negative_indexes = list(text.keys())
+
+        positive_label_list = np.unique(gt_labels).tolist()
+        full_negative = self.num_sample_negative
+
+        if full_negative > len(valid_negative_indexes):
+            full_negative = len(valid_negative_indexes)
+
+        outer_prob = random.random()
+
+        if outer_prob < self.full_sampling_prob:
+            # c. probability_full: add both all positive and all negatives
+            num_negatives = full_negative
+        else:
+            if random.random() < 1.0:
+                num_negatives = np.random.choice(max(1, full_negative)) + 1
+            else:
+                num_negatives = full_negative
+
+        # Keep some negatives
+        negative_label_list = set()
+        if num_negatives != -1:
+            if num_negatives > len(valid_negative_indexes):
+                num_negatives = len(valid_negative_indexes)
+
+            for i in np.random.choice(
+                    valid_negative_indexes, size=num_negatives,
+                    replace=False):
+                if int(i) not in positive_label_list:
+                    negative_label_list.add(i)
+
+        random.shuffle(positive_label_list)
+
+        negative_label_list = list(negative_label_list)
+        random.shuffle(negative_label_list)
+
+        negative_max_length = self.max_tokens - positive_caption_length
+        screened_negative_label_list = []
+
+        for negative_label in negative_label_list:
+            label_text = clean_name(text[str(negative_label)]) + '. '
+
+            tokenized = self.tokenizer.tokenize(label_text)
+
+            negative_max_length -= len(tokenized)
+
+            if negative_max_length > 0:
+                screened_negative_label_list.append(negative_label)
+            else:
+                break
+        negative_label_list = screened_negative_label_list
+        label_to_positions, pheso_caption, label_remap_dict = \
+            generate_sentence_given_labels(positive_label_list,
+                                           negative_label_list, text)
+
+        # label remap
+        if len(gt_labels) > 0:
+            gt_labels = np.vectorize(lambda x: label_remap_dict[x])(gt_labels)
+
+        results['gt_bboxes'] = gt_bboxes
+        results['gt_bboxes_labels'] = gt_labels
+
+        results['text'] = pheso_caption
+        results['tokens_positive'] = label_to_positions
+
+        return results
+
+
+@TRANSFORMS.register_module()
+class LoadTextAnnotations(BaseTransform):
+
+    def transform(self, results: dict) -> dict:
+        if 'phrases' in results:
+            tokens_positive = [
+                phrase['tokens_positive']
+                for phrase in results['phrases'].values()
+            ]
+            results['tokens_positive'] = tokens_positive
+        else:
+            text = results['text']
+            results['text'] = list(text.values())
+        return results
diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/transformers_glip.py b/head_extractor/build/lib/mmdet/datasets/transforms/transformers_glip.py
new file mode 100644
index 0000000000000000000000000000000000000000..60c4f87d1b86c13f886da27584114b6420b8b8cb
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/datasets/transforms/transformers_glip.py
@@ -0,0 +1,66 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import mmcv
+import numpy as np
+from mmcv.transforms import BaseTransform
+
+from mmdet.registry import TRANSFORMS
+from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type
+from .transforms import RandomFlip
+
+
+@TRANSFORMS.register_module()
+class GTBoxSubOne_GLIP(BaseTransform):
+    """Subtract 1 from the x2 and y2 coordinates of the gt_bboxes."""
+
+    def transform(self, results: dict) -> dict:
+        if 'gt_bboxes' in results:
+            gt_bboxes = results['gt_bboxes']
+            if isinstance(gt_bboxes, np.ndarray):
+                gt_bboxes[:, 2:] -= 1
+                results['gt_bboxes'] = gt_bboxes
+            elif isinstance(gt_bboxes, HorizontalBoxes):
+                gt_bboxes = results['gt_bboxes'].tensor
+                gt_bboxes[:, 2:] -= 1
+                results['gt_bboxes'] = HorizontalBoxes(gt_bboxes)
+            else:
+                raise NotImplementedError
+        return results
+
+
+@TRANSFORMS.register_module()
+class RandomFlip_GLIP(RandomFlip):
+    """Flip the image & bboxes & masks & segs horizontally or vertically.
+
+    When using horizontal flipping, the corresponding bbox x-coordinate
+    needs to be additionally subtracted by one.
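+
+    Note:
+        GLIP treats box coordinates as inclusive pixel indices, so a
+        horizontal flip maps ``x`` to ``w - 1 - x`` instead of ``w - x``;
+        the extra ``translate_([-1, 0])`` in ``_flip`` below compensates
+        for that one-pixel difference (our reading of the code).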
+ """ + + @autocast_box_type() + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes, and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].flip_(img_shape, results['flip_direction']) + # Only change this line + if results['flip_direction'] == 'horizontal': + results['gt_bboxes'].translate_([-1, 0]) + + # TODO: check it + # flip masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].flip( + results['flip_direction']) + + # flip segs + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = mmcv.imflip( + results['gt_seg_map'], direction=results['flip_direction']) + + # record homography matrix for flip + self._record_homography_matrix(results) diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/transforms.py b/head_extractor/build/lib/mmdet/datasets/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..c50b987db33c91f759f6c89580f605631ce4f558 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/transforms.py @@ -0,0 +1,3856 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import inspect +import math +import warnings +from typing import List, Optional, Sequence, Tuple, Union + +import cv2 +import mmcv +import numpy as np +from mmcv.image import imresize +from mmcv.image.geometric import _scale_size +from mmcv.transforms import BaseTransform +from mmcv.transforms import Pad as MMCV_Pad +from mmcv.transforms import RandomFlip as MMCV_RandomFlip +from mmcv.transforms import Resize as MMCV_Resize +from mmcv.transforms.utils import avoid_cache_randomness, cache_randomness +from mmengine.dataset import BaseDataset +from mmengine.utils import is_str +from numpy import random + +from mmdet.registry import TRANSFORMS +from mmdet.structures.bbox import HorizontalBoxes, autocast_box_type +from mmdet.structures.mask import BitmapMasks, PolygonMasks +from mmdet.utils import log_img_scale + +try: + from imagecorruptions import corrupt +except ImportError: + corrupt = None + +try: + import albumentations + from albumentations import Compose +except ImportError: + albumentations = None + Compose = None + +Number = Union[int, float] + + +def _fixed_scale_size( + size: Tuple[int, int], + scale: Union[float, int, tuple], +) -> Tuple[int, int]: + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float | tuple(float)): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + if isinstance(scale, (float, int)): + scale = (scale, scale) + w, h = size + # don't need o.5 offset + return int(w * float(scale[0])), int(h * float(scale[1])) + + +def rescale_size(old_size: tuple, + scale: Union[float, int, tuple], + return_scale: bool = False) -> tuple: + """Calculate the new size to be rescaled to. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. 
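+
+    Examples:
+        >>> # Scale by a fixed ratio.
+        >>> rescale_size((1280, 720), 0.5)
+        (640, 360)
+        >>> # Fit within (1333, 800) while keeping the aspect ratio; sizes
+        >>> # are truncated by ``_fixed_scale_size``.
+        >>> rescale_size((1280, 720), (1333, 800))
+        (1333, 749)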
+ """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + # only change this + new_size = _fixed_scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imrescale( + img: np.ndarray, + scale: Union[float, Tuple[int, int]], + return_scale: bool = False, + interpolation: str = 'bilinear', + backend: Optional[str] = None +) -> Union[np.ndarray, Tuple[np.ndarray, float]]: + """Resize image while keeping the aspect ratio. + + Args: + img (ndarray): The input image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + ndarray: The rescaled image. + """ + h, w = img.shape[:2] + new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) + rescaled_img = imresize( + img, new_size, interpolation=interpolation, backend=backend) + if return_scale: + return rescaled_img, scale_factor + else: + return rescaled_img + + +@TRANSFORMS.register_module() +class Resize(MMCV_Resize): + """Resize images & bbox & seg. + + This transform resizes the input image according to ``scale`` or + ``scale_factor``. Bboxes, masks, and seg map are then resized + with the same scale factor. + if ``scale`` and ``scale_factor`` are both set, it will use ``scale`` to + resize. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_masks + - gt_seg_map + + + Added Keys: + + - scale + - scale_factor + - keep_ratio + - homography_matrix + + Args: + scale (int or tuple): Images scales for resizing. Defaults to None + scale_factor (float or tuple[float]): Scale factors for resizing. + Defaults to None. + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Defaults to False. + clip_object_border (bool): Whether to clip the objects + outside the border of the image. In some dataset like MOT17, the gt + bboxes are allowed to cross the border of images. Therefore, we + don't need to clip the gt bboxes in these cases. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. Defaults + to 'bilinear'. 
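+
+    Examples:
+        A minimal sketch of both modes (values are illustrative):
+
+        >>> # Fit within (1333, 800) while keeping the aspect ratio.
+        >>> fixed = dict(type='Resize', scale=(1333, 800), keep_ratio=True)
+        >>> # Resize by a pure factor; only used when ``scale`` is None.
+        >>> by_factor = dict(type='Resize', scale_factor=0.5)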
+    """
+
+    def _resize_masks(self, results: dict) -> None:
+        """Resize masks with ``results['scale']``."""
+        if results.get('gt_masks', None) is not None:
+            if self.keep_ratio:
+                results['gt_masks'] = results['gt_masks'].rescale(
+                    results['scale'])
+            else:
+                results['gt_masks'] = results['gt_masks'].resize(
+                    results['img_shape'])
+
+    def _resize_bboxes(self, results: dict) -> None:
+        """Resize bounding boxes with ``results['scale_factor']``."""
+        if results.get('gt_bboxes', None) is not None:
+            results['gt_bboxes'].rescale_(results['scale_factor'])
+            if self.clip_object_border:
+                results['gt_bboxes'].clip_(results['img_shape'])
+
+    def _record_homography_matrix(self, results: dict) -> None:
+        """Record the homography matrix for the Resize."""
+        w_scale, h_scale = results['scale_factor']
+        homography_matrix = np.array(
+            [[w_scale, 0, 0], [0, h_scale, 0], [0, 0, 1]], dtype=np.float32)
+        if results.get('homography_matrix', None) is None:
+            results['homography_matrix'] = homography_matrix
+        else:
+            results['homography_matrix'] = homography_matrix @ results[
+                'homography_matrix']
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to resize images, bounding boxes and semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
+            'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys
+            are updated in result dict.
+        """
+        if self.scale:
+            results['scale'] = self.scale
+        else:
+            img_shape = results['img'].shape[:2]
+            results['scale'] = _scale_size(img_shape[::-1], self.scale_factor)
+        self._resize_img(results)
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        self._record_homography_matrix(results)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(scale={self.scale}, '
+        repr_str += f'scale_factor={self.scale_factor}, '
+        repr_str += f'keep_ratio={self.keep_ratio}, '
+        repr_str += f'clip_object_border={self.clip_object_border}, '
+        repr_str += f'backend={self.backend}, '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class FixScaleResize(Resize):
+    """Compared to Resize, FixScaleResize fixes the scaling issue when
+    ``keep_ratio=True``."""
+
+    def _resize_img(self, results):
+        """Resize images with ``results['scale']``."""
+        if results.get('img', None) is not None:
+            if self.keep_ratio:
+                img, scale_factor = imrescale(
+                    results['img'],
+                    results['scale'],
+                    interpolation=self.interpolation,
+                    return_scale=True,
+                    backend=self.backend)
+                new_h, new_w = img.shape[:2]
+                h, w = results['img'].shape[:2]
+                w_scale = new_w / w
+                h_scale = new_h / h
+            else:
+                img, w_scale, h_scale = mmcv.imresize(
+                    results['img'],
+                    results['scale'],
+                    interpolation=self.interpolation,
+                    return_scale=True,
+                    backend=self.backend)
+            results['img'] = img
+            results['img_shape'] = img.shape[:2]
+            results['scale_factor'] = (w_scale, h_scale)
+            results['keep_ratio'] = self.keep_ratio
+
+
+@TRANSFORMS.register_module()
+class ResizeShortestEdge(BaseTransform):
+    """Resize the image and mask while keeping the aspect ratio unchanged.
+
+    Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501
+
+    This transform attempts to scale the shorter edge to the given
+    `scale`, as long as the longer edge does not exceed `max_size`.
+ If `max_size` is reached, then downscale so that the longer + edge does not exceed `max_size`. + + Required Keys: + - img + - gt_seg_map (optional) + Modified Keys: + - img + - img_shape + - gt_seg_map (optional)) + Added Keys: + - scale + - scale_factor + - keep_ratio + + Args: + scale (Union[int, Tuple[int, int]]): The target short edge length. + If it's tuple, will select the min value as the short edge length. + max_size (int): The maximum allowed longest edge length. + """ + + def __init__(self, + scale: Union[int, Tuple[int, int]], + max_size: Optional[int] = None, + resize_type: str = 'Resize', + **resize_kwargs) -> None: + super().__init__() + self.scale = scale + self.max_size = max_size + + self.resize_cfg = dict(type=resize_type, **resize_kwargs) + self.resize = TRANSFORMS.build({'scale': 0, **self.resize_cfg}) + + def _get_output_shape( + self, img: np.ndarray, + short_edge_length: Union[int, Tuple[int, int]]) -> Tuple[int, int]: + """Compute the target image shape with the given `short_edge_length`. + + Args: + img (np.ndarray): The input image. + short_edge_length (Union[int, Tuple[int, int]]): The target short + edge length. If it's tuple, will select the min value as the + short edge length. + """ + h, w = img.shape[:2] + if isinstance(short_edge_length, int): + size = short_edge_length * 1.0 + elif isinstance(short_edge_length, tuple): + size = min(short_edge_length) * 1.0 + scale = size / min(h, w) + if h < w: + new_h, new_w = size, scale * w + else: + new_h, new_w = scale * h, size + + if self.max_size and max(new_h, new_w) > self.max_size: + scale = self.max_size * 1.0 / max(new_h, new_w) + new_h *= scale + new_w *= scale + + new_h = int(new_h + 0.5) + new_w = int(new_w + 0.5) + return new_w, new_h + + def transform(self, results: dict) -> dict: + self.resize.scale = self._get_output_shape(results['img'], self.scale) + return self.resize(results) + + +@TRANSFORMS.register_module() +class FixShapeResize(Resize): + """Resize images & bbox & seg to the specified size. + + This transform resizes the input image according to ``width`` and + ``height``. Bboxes, masks, and seg map are then resized + with the same parameters. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes + - gt_masks + - gt_seg_map + + + Added Keys: + + - scale + - scale_factor + - keep_ratio + - homography_matrix + + Args: + width (int): width for resizing. + height (int): height for resizing. + Defaults to None. + pad_val (Number | dict[str, Number], optional): Padding value for if + the pad_mode is "constant". If it is a single number, the value + to pad the image is the number and to pad the semantic + segmentation map is 255. If it is a dict, it should have the + following keys: + + - img: The value to pad the image. + - seg: The value to pad the semantic segmentation map. + Defaults to dict(img=0, seg=255). + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Defaults to False. + clip_object_border (bool): Whether to clip the objects + outside the border of the image. In some dataset like MOT17, the gt + bboxes are allowed to cross the border of images. Therefore, we + don't need to clip the gt bboxes in these cases. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. 
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def __init__(self,
+                 width: int,
+                 height: int,
+                 pad_val: Union[Number, dict] = dict(img=0, seg=255),
+                 keep_ratio: bool = False,
+                 clip_object_border: bool = True,
+                 backend: str = 'cv2',
+                 interpolation: str = 'bilinear') -> None:
+        assert width is not None and height is not None, (
+            '`width` and `height` cannot be `None`')
+
+        self.width = width
+        self.height = height
+        self.scale = (width, height)
+
+        self.backend = backend
+        self.interpolation = interpolation
+        self.keep_ratio = keep_ratio
+        self.clip_object_border = clip_object_border
+
+        if keep_ratio is True:
+            # padding to the fixed size when keep_ratio=True
+            self.pad_transform = Pad(size=self.scale, pad_val=pad_val)
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to resize images, bounding boxes and semantic
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+        Returns:
+            dict: Resized results, 'img', 'gt_bboxes', 'gt_seg_map',
+            'scale', 'scale_factor', 'height', 'width', and 'keep_ratio' keys
+            are updated in result dict.
+        """
+        img = results['img']
+        h, w = img.shape[:2]
+        if self.keep_ratio:
+            scale_factor = min(self.width / w, self.height / h)
+            results['scale_factor'] = (scale_factor, scale_factor)
+            real_w, real_h = int(w * float(scale_factor) +
+                                 0.5), int(h * float(scale_factor) + 0.5)
+            img, scale_factor = mmcv.imrescale(
+                results['img'], (real_w, real_h),
+                interpolation=self.interpolation,
+                return_scale=True,
+                backend=self.backend)
+            # the w_scale and h_scale have a minor difference;
+            # a real fix should be done in mmcv.imrescale in the future
+            results['img'] = img
+            results['img_shape'] = img.shape[:2]
+            results['keep_ratio'] = self.keep_ratio
+            results['scale'] = (real_w, real_h)
+        else:
+            results['scale'] = (self.width, self.height)
+            results['scale_factor'] = (self.width / w, self.height / h)
+            super()._resize_img(results)
+
+        self._resize_bboxes(results)
+        self._resize_masks(results)
+        self._resize_seg(results)
+        self._record_homography_matrix(results)
+        if self.keep_ratio:
+            self.pad_transform(results)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(width={self.width}, height={self.height}, '
+        repr_str += f'keep_ratio={self.keep_ratio}, '
+        repr_str += f'clip_object_border={self.clip_object_border}, '
+        repr_str += f'backend={self.backend}, '
+        repr_str += f'interpolation={self.interpolation})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomFlip(MMCV_RandomFlip):
+    """Flip the image & bbox & mask & segmentation map. Added or Updated keys:
+    flip, flip_direction, img, gt_bboxes, and gt_seg_map. There are 3 flip
+    modes:
+
+    - ``prob`` is float, ``direction`` is string: the image will be
+      ``direction``ly flipped with probability of ``prob`` .
+      E.g., ``prob=0.5``, ``direction='horizontal'``,
+      then image will be horizontally flipped with probability of 0.5.
+    - ``prob`` is float, ``direction`` is list of string: the image will
+      be ``direction[i]``ly flipped with probability of
+      ``prob/len(direction)``.
+      E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``,
+      then image will be horizontally flipped with probability of 0.25,
+      vertically with probability of 0.25.
+ - ``prob`` is list of float, ``direction`` is list of string: + given ``len(prob) == len(direction)``, the image will + be ``direction[i]``ly flipped with probability of ``prob[i]``. + E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with + probability of 0.3, vertically with probability of 0.5. + + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_masks + - gt_seg_map + + Added Keys: + + - flip + - flip_direction + - homography_matrix + + + Args: + prob (float | list[float], optional): The flipping probability. + Defaults to None. + direction(str | list[str]): The flipping direction. Options + If input is a list, the length must equal ``prob``. Each + element in ``prob`` indicates the flip probability of + corresponding direction. Defaults to 'horizontal'. + """ + + def _record_homography_matrix(self, results: dict) -> None: + """Record the homography matrix for the RandomFlip.""" + cur_dir = results['flip_direction'] + h, w = results['img'].shape[:2] + + if cur_dir == 'horizontal': + homography_matrix = np.array([[-1, 0, w], [0, 1, 0], [0, 0, 1]], + dtype=np.float32) + elif cur_dir == 'vertical': + homography_matrix = np.array([[1, 0, 0], [0, -1, h], [0, 0, 1]], + dtype=np.float32) + elif cur_dir == 'diagonal': + homography_matrix = np.array([[-1, 0, w], [0, -1, h], [0, 0, 1]], + dtype=np.float32) + else: + homography_matrix = np.eye(3, dtype=np.float32) + + if results.get('homography_matrix', None) is None: + results['homography_matrix'] = homography_matrix + else: + results['homography_matrix'] = homography_matrix @ results[ + 'homography_matrix'] + + @autocast_box_type() + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes, and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'].flip_(img_shape, results['flip_direction']) + + # flip masks + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'].flip( + results['flip_direction']) + + # flip segs + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = mmcv.imflip( + results['gt_seg_map'], direction=results['flip_direction']) + + # record homography matrix for flip + self._record_homography_matrix(results) + + +@TRANSFORMS.register_module() +class RandomShift(BaseTransform): + """Shift the image and box given shift pixels and probability. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) + - gt_bboxes_labels (np.int64) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - gt_bboxes + - gt_bboxes_labels + - gt_ignore_flags (bool) (optional) + + Args: + prob (float): Probability of shifts. Defaults to 0.5. + max_shift_px (int): The max pixels for shifting. Defaults to 32. + filter_thr_px (int): The width and height threshold for filtering. + The bbox and the rest of the targets below the width and + height threshold will be filtered. Defaults to 1. 
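+
+    A minimal usage sketch (boxes and shapes are only illustrative):
+
+    Examples:
+        >>> import numpy as np
+        >>> import torch
+        >>> from mmdet.datasets.transforms import RandomShift
+        >>> from mmdet.structures.bbox import HorizontalBoxes
+        >>> results = dict(
+        ...     img=np.zeros((100, 100, 3), dtype=np.uint8),
+        ...     gt_bboxes=HorizontalBoxes(
+        ...         torch.tensor([[10., 10., 60., 60.]])),
+        ...     gt_bboxes_labels=np.array([0], dtype=np.int64))
+        >>> # with prob=1.0 the image and boxes are always shifted by up
+        >>> # to max_shift_px pixels on each axis
+        >>> shift = RandomShift(prob=1.0, max_shift_px=16)
+        >>> results = shift(results)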
+    """
+
+    def __init__(self,
+                 prob: float = 0.5,
+                 max_shift_px: int = 32,
+                 filter_thr_px: int = 1) -> None:
+        assert 0 <= prob <= 1
+        assert max_shift_px >= 0
+        self.prob = prob
+        self.max_shift_px = max_shift_px
+        self.filter_thr_px = int(filter_thr_px)
+
+    @cache_randomness
+    def _random_prob(self) -> float:
+        return random.uniform(0, 1)
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to randomly shift images and bounding boxes.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Shift results.
+        """
+        if self._random_prob() < self.prob:
+            img_shape = results['img'].shape[:2]
+
+            random_shift_x = random.randint(-self.max_shift_px,
+                                            self.max_shift_px)
+            random_shift_y = random.randint(-self.max_shift_px,
+                                            self.max_shift_px)
+            new_x = max(0, random_shift_x)
+            ori_x = max(0, -random_shift_x)
+            new_y = max(0, random_shift_y)
+            ori_y = max(0, -random_shift_y)
+
+            # TODO: support mask and semantic segmentation maps.
+            bboxes = results['gt_bboxes'].clone()
+            bboxes.translate_([random_shift_x, random_shift_y])
+
+            # clip border
+            bboxes.clip_(img_shape)
+
+            # remove invalid bboxes
+            valid_inds = (bboxes.widths > self.filter_thr_px).numpy() & (
+                bboxes.heights > self.filter_thr_px).numpy()
+            # If the shift does not contain any gt-bbox area, skip this
+            # image.
+            if not valid_inds.any():
+                return results
+            bboxes = bboxes[valid_inds]
+            results['gt_bboxes'] = bboxes
+            results['gt_bboxes_labels'] = results['gt_bboxes_labels'][
+                valid_inds]
+
+            if results.get('gt_ignore_flags', None) is not None:
+                results['gt_ignore_flags'] = \
+                    results['gt_ignore_flags'][valid_inds]
+
+            # shift img
+            img = results['img']
+            new_img = np.zeros_like(img)
+            img_h, img_w = img.shape[:2]
+            new_h = img_h - np.abs(random_shift_y)
+            new_w = img_w - np.abs(random_shift_x)
+            new_img[new_y:new_y + new_h, new_x:new_x + new_w] \
+                = img[ori_y:ori_y + new_h, ori_x:ori_x + new_w]
+            results['img'] = new_img
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'max_shift_px={self.max_shift_px}, '
+        repr_str += f'filter_thr_px={self.filter_thr_px})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Pad(MMCV_Pad):
+    """Pad the image & segmentation map.
+
+    There are three padding modes: (1) pad to a fixed size, (2) pad to the
+    minimum size that is divisible by some number, and (3) pad to square.
+    Padding to square and padding to the minimum divisible size can also be
+    used at the same time.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_masks
+    - gt_seg_map
+
+    Added Keys:
+
+    - pad_shape
+    - pad_fixed_size
+    - pad_size_divisor
+
+    Args:
+        size (tuple, optional): Fixed padding size.
+            Expected padding shape (width, height). Defaults to None.
+        size_divisor (int, optional): The divisor of padded size. Defaults to
+            None.
+        pad_to_square (bool): Whether to pad the image into a square.
+            Currently only used for YOLOX. Defaults to False.
+        pad_val (Number | dict[str, Number], optional): Padding value if
+            the pad_mode is "constant". If it is a single number, the value
+            to pad the image is the number and to pad the semantic
+            segmentation map is 255. If it is a dict, it should have the
+            following keys:
+
+            - img: The value to pad the image.
+            - seg: The value to pad the semantic segmentation map.
+ Defaults to dict(img=0, seg=255). + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Defaults to 'constant'. + + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with 2 + elements on both sides in reflect mode will result in + [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last value + on the edge. For example, padding [1, 2, 3, 4] with 2 elements on + both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + """ + + def _pad_masks(self, results: dict) -> None: + """Pad masks according to ``results['pad_shape']``.""" + if results.get('gt_masks', None) is not None: + pad_val = self.pad_val.get('masks', 0) + pad_shape = results['pad_shape'][:2] + results['gt_masks'] = results['gt_masks'].pad( + pad_shape, pad_val=pad_val) + + def transform(self, results: dict) -> dict: + """Call function to pad images, masks, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. + """ + self._pad_img(results) + self._pad_seg(results) + self._pad_masks(results) + return results + + +@TRANSFORMS.register_module() +class RandomCrop(BaseTransform): + """Random crop the image & bboxes & masks. + + The absolute ``crop_size`` is sampled based on ``crop_type`` and + ``image_size``, then the cropped results are generated. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_masks (BitmapMasks | PolygonMasks) (optional) + - gt_ignore_flags (bool) (optional) + - gt_seg_map (np.uint8) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_masks (optional) + - gt_ignore_flags (optional) + - gt_seg_map (optional) + - gt_instances_ids (options, only used in MOT/VIS) + + Added Keys: + + - homography_matrix + + Args: + crop_size (tuple): The relative ratio or absolute pixels of + (width, height). + crop_type (str, optional): One of "relative_range", "relative", + "absolute", "absolute_range". "relative" randomly crops + (h * crop_size[0], w * crop_size[1]) part from an input of size + (h, w). "relative_range" uniformly samples relative crop size from + range [crop_size[0], 1] and [crop_size[1], 1] for height and width + respectively. "absolute" crops from an input with absolute size + (crop_size[0], crop_size[1]). "absolute_range" uniformly samples + crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w + in range [crop_size[0], min(w, crop_size[1])]. + Defaults to "absolute". + allow_negative_crop (bool, optional): Whether to allow a crop that does + not contain any bbox area. Defaults to False. + recompute_bbox (bool, optional): Whether to re-compute the boxes based + on cropped instance masks. Defaults to False. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + + Note: + - If the image is smaller than the absolute crop size, return the + original image. + - The keys for bboxes, labels and masks must be aligned. That is, + ``gt_bboxes`` corresponds to ``gt_labels`` and ``gt_masks``, and + ``gt_bboxes_ignore`` corresponds to ``gt_labels_ignore`` and + ``gt_masks_ignore``. 
+ - If the crop does not contain any gt-bbox region and + ``allow_negative_crop`` is set to False, skip this image. + """ + + def __init__(self, + crop_size: tuple, + crop_type: str = 'absolute', + allow_negative_crop: bool = False, + recompute_bbox: bool = False, + bbox_clip_border: bool = True) -> None: + if crop_type not in [ + 'relative_range', 'relative', 'absolute', 'absolute_range' + ]: + raise ValueError(f'Invalid crop_type {crop_type}.') + if crop_type in ['absolute', 'absolute_range']: + assert crop_size[0] > 0 and crop_size[1] > 0 + assert isinstance(crop_size[0], int) and isinstance( + crop_size[1], int) + if crop_type == 'absolute_range': + assert crop_size[0] <= crop_size[1] + else: + assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1 + self.crop_size = crop_size + self.crop_type = crop_type + self.allow_negative_crop = allow_negative_crop + self.bbox_clip_border = bbox_clip_border + self.recompute_bbox = recompute_bbox + + def _crop_data(self, results: dict, crop_size: Tuple[int, int], + allow_negative_crop: bool) -> Union[dict, None]: + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_size (Tuple[int, int]): Expected absolute size after + cropping, (h, w). + allow_negative_crop (bool): Whether to allow a crop that does not + contain any bbox area. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. + """ + assert crop_size[0] > 0 and crop_size[1] > 0 + img = results['img'] + margin_h = max(img.shape[0] - crop_size[0], 0) + margin_w = max(img.shape[1] - crop_size[1], 0) + offset_h, offset_w = self._rand_offset((margin_h, margin_w)) + crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] + + # Record the homography matrix for the RandomCrop + homography_matrix = np.array( + [[1, 0, -offset_w], [0, 1, -offset_h], [0, 0, 1]], + dtype=np.float32) + if results.get('homography_matrix', None) is None: + results['homography_matrix'] = homography_matrix + else: + results['homography_matrix'] = homography_matrix @ results[ + 'homography_matrix'] + + # crop the image + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + img_shape = img.shape + results['img'] = img + results['img_shape'] = img_shape[:2] + + # crop bboxes accordingly and clip to the image boundary + if results.get('gt_bboxes', None) is not None: + bboxes = results['gt_bboxes'] + bboxes.translate_([-offset_w, -offset_h]) + if self.bbox_clip_border: + bboxes.clip_(img_shape[:2]) + valid_inds = bboxes.is_inside(img_shape[:2]).numpy() + # If the crop does not contain any gt-bbox area and + # allow_negative_crop is False, skip this image. 
+ if (not valid_inds.any() and not allow_negative_crop): + return None + + results['gt_bboxes'] = bboxes[valid_inds] + + if results.get('gt_ignore_flags', None) is not None: + results['gt_ignore_flags'] = \ + results['gt_ignore_flags'][valid_inds] + + if results.get('gt_bboxes_labels', None) is not None: + results['gt_bboxes_labels'] = \ + results['gt_bboxes_labels'][valid_inds] + + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + if self.recompute_bbox: + results['gt_bboxes'] = results['gt_masks'].get_bboxes( + type(results['gt_bboxes'])) + + # We should remove the instance ids corresponding to invalid boxes. + if results.get('gt_instances_ids', None) is not None: + results['gt_instances_ids'] = \ + results['gt_instances_ids'][valid_inds] + + # crop semantic seg + if results.get('gt_seg_map', None) is not None: + results['gt_seg_map'] = results['gt_seg_map'][crop_y1:crop_y2, + crop_x1:crop_x2] + + return results + + @cache_randomness + def _rand_offset(self, margin: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generate crop offset. + + Args: + margin (Tuple[int, int]): The upper bound for the offset generated + randomly. + + Returns: + Tuple[int, int]: The random offset for the crop. + """ + margin_h, margin_w = margin + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + + return offset_h, offset_w + + @cache_randomness + def _get_crop_size(self, image_size: Tuple[int, int]) -> Tuple[int, int]: + """Randomly generates the absolute crop size based on `crop_type` and + `image_size`. + + Args: + image_size (Tuple[int, int]): (h, w). + + Returns: + crop_size (Tuple[int, int]): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + if self.crop_type == 'absolute': + return min(self.crop_size[1], h), min(self.crop_size[0], w) + elif self.crop_type == 'absolute_range': + crop_h = np.random.randint( + min(h, self.crop_size[0]), + min(h, self.crop_size[1]) + 1) + crop_w = np.random.randint( + min(w, self.crop_size[0]), + min(w, self.crop_size[1]) + 1) + return crop_h, crop_w + elif self.crop_type == 'relative': + crop_w, crop_h = self.crop_size + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + else: + # 'relative_range' + crop_size = np.asarray(self.crop_size, dtype=np.float32) + crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size) + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function to randomly crop images, bounding boxes, masks, + semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + results (Union[dict, None]): Randomly cropped results, 'img_shape' + key in result dict is updated according to crop size. None will + be returned when there is no valid bbox after cropping. 
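+
+        A minimal usage sketch (values are only illustrative):
+
+        Examples:
+            >>> import numpy as np
+            >>> import torch
+            >>> from mmdet.datasets.transforms import RandomCrop
+            >>> from mmdet.structures.bbox import HorizontalBoxes
+            >>> results = dict(
+            ...     img=np.zeros((100, 100, 3), dtype=np.uint8),
+            ...     gt_bboxes=HorizontalBoxes(
+            ...         torch.tensor([[10., 10., 90., 90.]])),
+            ...     gt_bboxes_labels=np.array([0], dtype=np.int64),
+            ...     gt_ignore_flags=np.array([False]))
+            >>> crop = RandomCrop(crop_size=(60, 60), crop_type='absolute')
+            >>> results = crop(results)
+            >>> results['img_shape']
+            (60, 60)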
+ """ + image_size = results['img'].shape[:2] + crop_size = self._get_crop_size(image_size) + results = self._crop_data(results, crop_size, self.allow_negative_crop) + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(crop_size={self.crop_size}, ' + repr_str += f'crop_type={self.crop_type}, ' + repr_str += f'allow_negative_crop={self.allow_negative_crop}, ' + repr_str += f'recompute_bbox={self.recompute_bbox}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class SegRescale(BaseTransform): + """Rescale semantic segmentation maps. + + This transform rescale the ``gt_seg_map`` according to ``scale_factor``. + + Required Keys: + + - gt_seg_map + + Modified Keys: + + - gt_seg_map + + Args: + scale_factor (float): The scale factor of the final output. Defaults + to 1. + backend (str): Image rescale backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + """ + + def __init__(self, scale_factor: float = 1, backend: str = 'cv2') -> None: + self.scale_factor = scale_factor + self.backend = backend + + def transform(self, results: dict) -> dict: + """Transform function to scale the semantic segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with semantic segmentation map scaled. + """ + if self.scale_factor != 1: + results['gt_seg_map'] = mmcv.imrescale( + results['gt_seg_map'], + self.scale_factor, + interpolation='nearest', + backend=self.backend) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(scale_factor={self.scale_factor}, ' + repr_str += f'backend={self.backend})' + return repr_str + + +@TRANSFORMS.register_module() +class PhotoMetricDistortion(BaseTransform): + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + + Required Keys: + + - img (np.uint8) + + Modified Keys: + + - img (np.float32) + + Args: + brightness_delta (int): delta of brightness. + contrast_range (sequence): range of contrast. + saturation_range (sequence): range of saturation. + hue_delta (int): delta of hue. 
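+
+    A minimal usage sketch (note that the output image becomes float32):
+
+    Examples:
+        >>> import numpy as np
+        >>> from mmdet.datasets.transforms import PhotoMetricDistortion
+        >>> results = dict(
+        ...     img=np.random.randint(0, 256, (50, 50, 3), dtype=np.uint8))
+        >>> distort = PhotoMetricDistortion()
+        >>> results = distort(results)
+        >>> results['img'].dtype
+        dtype('float32')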
+ """ + + def __init__(self, + brightness_delta: int = 32, + contrast_range: Sequence[Number] = (0.5, 1.5), + saturation_range: Sequence[Number] = (0.5, 1.5), + hue_delta: int = 18) -> None: + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + @cache_randomness + def _random_flags(self) -> Sequence[Number]: + mode = random.randint(2) + brightness_flag = random.randint(2) + contrast_flag = random.randint(2) + saturation_flag = random.randint(2) + hue_flag = random.randint(2) + swap_flag = random.randint(2) + delta_value = random.uniform(-self.brightness_delta, + self.brightness_delta) + alpha_value = random.uniform(self.contrast_lower, self.contrast_upper) + saturation_value = random.uniform(self.saturation_lower, + self.saturation_upper) + hue_value = random.uniform(-self.hue_delta, self.hue_delta) + swap_value = random.permutation(3) + + return (mode, brightness_flag, contrast_flag, saturation_flag, + hue_flag, swap_flag, delta_value, alpha_value, + saturation_value, hue_value, swap_value) + + def transform(self, results: dict) -> dict: + """Transform function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. + """ + assert 'img' in results, '`img` is not found in results' + img = results['img'] + img = img.astype(np.float32) + + (mode, brightness_flag, contrast_flag, saturation_flag, hue_flag, + swap_flag, delta_value, alpha_value, saturation_value, hue_value, + swap_value) = self._random_flags() + + # random brightness + if brightness_flag: + img += delta_value + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + if mode == 1: + if contrast_flag: + img *= alpha_value + + # convert color from BGR to HSV + img = mmcv.bgr2hsv(img) + + # random saturation + if saturation_flag: + img[..., 1] *= saturation_value + # For image(type=float32), after convert bgr to hsv by opencv, + # valid saturation value range is [0, 1] + if saturation_value > 1: + img[..., 1] = img[..., 1].clip(0, 1) + + # random hue + if hue_flag: + img[..., 0] += hue_value + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = mmcv.hsv2bgr(img) + + # random contrast + if mode == 0: + if contrast_flag: + img *= alpha_value + + # randomly swap channels + if swap_flag: + img = img[..., swap_value] + + results['img'] = img + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(brightness_delta={self.brightness_delta}, ' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)}, ' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)}, ' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + +@TRANSFORMS.register_module() +class Expand(BaseTransform): + """Random expand the image & bboxes & masks & segmentation map. + + Randomly place the original image on a canvas of ``ratio`` x original image + size filled with mean values. The ratio is in the range of ratio_range. 
+
+    Required Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_masks
+    - gt_seg_map
+
+    Args:
+        mean (sequence): mean value of dataset.
+        to_rgb (bool): whether to convert the order of ``mean`` to align
+            with RGB.
+        ratio_range (sequence): range of expand ratio.
+        seg_ignore_label (int): label of the ignored region in the
+            segmentation map.
+        prob (float): probability of applying this transformation.
+    """
+
+    def __init__(self,
+                 mean: Sequence[Number] = (0, 0, 0),
+                 to_rgb: bool = True,
+                 ratio_range: Sequence[Number] = (1, 4),
+                 seg_ignore_label: Optional[int] = None,
+                 prob: float = 0.5) -> None:
+        self.to_rgb = to_rgb
+        self.ratio_range = ratio_range
+        if to_rgb:
+            self.mean = mean[::-1]
+        else:
+            self.mean = mean
+        self.min_ratio, self.max_ratio = ratio_range
+        self.seg_ignore_label = seg_ignore_label
+        self.prob = prob
+
+    @cache_randomness
+    def _random_prob(self) -> float:
+        return random.uniform(0, 1)
+
+    @cache_randomness
+    def _random_ratio(self) -> float:
+        return random.uniform(self.min_ratio, self.max_ratio)
+
+    @cache_randomness
+    def _random_left_top(self, ratio: float, h: int,
+                         w: int) -> Tuple[int, int]:
+        left = int(random.uniform(0, w * ratio - w))
+        top = int(random.uniform(0, h * ratio - h))
+        return left, top
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to expand images, bounding boxes, masks,
+        segmentation map.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images, bounding boxes, masks, segmentation
+            map expanded.
+        """
+        if self._random_prob() > self.prob:
+            return results
+        assert 'img' in results, '`img` is not found in results'
+        img = results['img']
+        h, w, c = img.shape
+        ratio = self._random_ratio()
+        # speedup expand when meets large image
+        if np.all(self.mean == self.mean[0]):
+            expand_img = np.empty((int(h * ratio), int(w * ratio), c),
+                                  img.dtype)
+            expand_img.fill(self.mean[0])
+        else:
+            expand_img = np.full((int(h * ratio), int(w * ratio), c),
+                                 self.mean,
+                                 dtype=img.dtype)
+        left, top = self._random_left_top(ratio, h, w)
+        expand_img[top:top + h, left:left + w] = img
+        results['img'] = expand_img
+        results['img_shape'] = expand_img.shape[:2]
+
+        # expand bboxes
+        if results.get('gt_bboxes', None) is not None:
+            results['gt_bboxes'].translate_([left, top])
+
+        # expand masks
+        if results.get('gt_masks', None) is not None:
+            results['gt_masks'] = results['gt_masks'].expand(
+                int(h * ratio), int(w * ratio), top, left)
+
+        # expand segmentation map
+        if results.get('gt_seg_map', None) is not None:
+            gt_seg = results['gt_seg_map']
+            expand_gt_seg = np.full((int(h * ratio), int(w * ratio)),
+                                    self.seg_ignore_label,
+                                    dtype=gt_seg.dtype)
+            expand_gt_seg[top:top + h, left:left + w] = gt_seg
+            results['gt_seg_map'] = expand_gt_seg
+
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, '
+        repr_str += f'ratio_range={self.ratio_range}, '
+        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
+        repr_str += f'prob={self.prob})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class MinIoURandomCrop(BaseTransform):
+    """Random crop the image & bboxes & masks & segmentation map; the
+    cropped patches have minimum IoU requirement with the original image &
+    bboxes & masks & segmentation map, and the IoU threshold is randomly
+    selected from ``min_ious``.
+
+    Required Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_seg_map (np.uint8) (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes
+    - gt_bboxes_labels
+    - gt_masks
+    - gt_ignore_flags
+    - gt_seg_map
+
+    Args:
+        min_ious (Sequence[float]): minimum IoU threshold for all
+            intersections with bounding boxes.
+        min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
+            where a >= min_crop_size).
+        bbox_clip_border (bool, optional): Whether clip the objects outside
+            the border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 min_ious: Sequence[float] = (0.1, 0.3, 0.5, 0.7, 0.9),
+                 min_crop_size: float = 0.3,
+                 bbox_clip_border: bool = True) -> None:
+
+        self.min_ious = min_ious
+        self.sample_mode = (1, *min_ious, 0)
+        self.min_crop_size = min_crop_size
+        self.bbox_clip_border = bbox_clip_border
+
+    @cache_randomness
+    def _random_mode(self) -> Number:
+        return random.choice(self.sample_mode)
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to crop images and bounding boxes with minimum
+        IoU constraint.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images and bounding boxes cropped, \
+            'img_shape' key is updated.
+        """
+        assert 'img' in results, '`img` is not found in results'
+        assert 'gt_bboxes' in results, '`gt_bboxes` is not found in results'
+        img = results['img']
+        boxes = results['gt_bboxes']
+        h, w, c = img.shape
+        while True:
+            mode = self._random_mode()
+            self.mode = mode
+            if mode == 1:
+                return results
+
+            min_iou = self.mode
+            for i in range(50):
+                new_w = random.uniform(self.min_crop_size * w, w)
+                new_h = random.uniform(self.min_crop_size * h, h)
+
+                # h / w in [0.5, 2]
+                if new_h / new_w < 0.5 or new_h / new_w > 2:
+                    continue
+
+                left = random.uniform(w - new_w)
+                top = random.uniform(h - new_h)
+
+                patch = np.array(
+                    (int(left), int(top), int(left + new_w),
+                     int(top + new_h)))
+                # Line or point crop is not allowed
+                if patch[2] == patch[0] or patch[3] == patch[1]:
+                    continue
+                overlaps = boxes.overlaps(
+                    HorizontalBoxes(patch.reshape(-1, 4).astype(np.float32)),
+                    boxes).numpy().reshape(-1)
+                if len(overlaps) > 0 and overlaps.min() < min_iou:
+                    continue
+
+                # the center of each box should be inside the cropped image;
+                # only adjust boxes and instance masks when the gt is not
+                # empty
+                if len(overlaps) > 0:
+                    # adjust boxes
+                    def is_center_of_bboxes_in_patch(boxes, patch):
+                        centers = boxes.centers.numpy()
+                        mask = ((centers[:, 0] > patch[0]) *
+                                (centers[:, 1] > patch[1]) *
+                                (centers[:, 0] < patch[2]) *
+                                (centers[:, 1] < patch[3]))
+                        return mask
+
+                    mask = is_center_of_bboxes_in_patch(boxes, patch)
+                    if not mask.any():
+                        continue
+                    if results.get('gt_bboxes', None) is not None:
+                        boxes = results['gt_bboxes']
+                        mask = is_center_of_bboxes_in_patch(boxes, patch)
+                        boxes = boxes[mask]
+                        boxes.translate_([-patch[0], -patch[1]])
+                        if self.bbox_clip_border:
+                            boxes.clip_(
+                                [patch[3] - patch[1], patch[2] - patch[0]])
+                        results['gt_bboxes'] = boxes
+
+                    # ignore_flags
+                    if results.get('gt_ignore_flags', None) is not None:
+                        results['gt_ignore_flags'] = \
+                            results['gt_ignore_flags'][mask]
+
+                    # labels
+                    if results.get('gt_bboxes_labels', None) is not None:
+                        results['gt_bboxes_labels'] = results[
+                            'gt_bboxes_labels'][mask]
+
+                    # mask fields
+                    if results.get('gt_masks', None) is not None:
+                        results['gt_masks'] = results['gt_masks'][
+                            mask.nonzero()[0]].crop(patch)
+                # adjust the img no matter whether the gt is empty before
+                # the crop
+                img = img[patch[1]:patch[3], patch[0]:patch[2]]
+                results['img'] = img
+                results['img_shape'] = img.shape[:2]
+
+                # seg fields
+                if results.get('gt_seg_map', None) is not None:
+                    results['gt_seg_map'] = results['gt_seg_map'][
+                        patch[1]:patch[3], patch[0]:patch[2]]
+                return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(min_ious={self.min_ious}, '
+        repr_str += f'min_crop_size={self.min_crop_size}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Corrupt(BaseTransform):
+    """Corruption augmentation.
+
+    Corruption transforms implemented based on
+    `imagecorruptions <https://github.com/bethgelab/imagecorruptions>`_.
+
+    Required Keys:
+
+    - img (np.uint8)
+
+    Modified Keys:
+
+    - img (np.uint8)
+
+    Args:
+        corruption (str): Corruption name.
+        severity (int): The severity of corruption. Defaults to 1.
+    """
+
+    def __init__(self, corruption: str, severity: int = 1) -> None:
+        self.corruption = corruption
+        self.severity = severity
+
+    def transform(self, results: dict) -> dict:
+        """Call function to corrupt image.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with images corrupted.
+        """
+
+        if corrupt is None:
+            raise RuntimeError('imagecorruptions is not installed')
+        results['img'] = corrupt(
+            results['img'].astype(np.uint8),
+            corruption_name=self.corruption,
+            severity=self.severity)
+        return results
+
+    def __repr__(self) -> str:
+        repr_str = self.__class__.__name__
+        repr_str += f'(corruption={self.corruption}, '
+        repr_str += f'severity={self.severity})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+@avoid_cache_randomness
+class Albu(BaseTransform):
+    """Albumentation augmentation.
+
+    Adds custom transformations from the Albumentations library.
+    Please visit https://albumentations.readthedocs.io
+    to get more information.
+
+    Required Keys:
+
+    - img (np.uint8)
+    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+
+    Modified Keys:
+
+    - img (np.uint8)
+    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
+    - gt_masks (BitmapMasks | PolygonMasks) (optional)
+    - img_shape (tuple)
+
+    An example of ``transforms`` is as follows:
+
+    .. code-block::
+
+        [
+            dict(
+                type='ShiftScaleRotate',
+                shift_limit=0.0625,
+                scale_limit=0.0,
+                rotate_limit=0,
+                interpolation=1,
+                p=0.5),
+            dict(
+                type='RandomBrightnessContrast',
+                brightness_limit=[0.1, 0.3],
+                contrast_limit=[0.1, 0.3],
+                p=0.2),
+            dict(type='ChannelShuffle', p=0.1),
+            dict(
+                type='OneOf',
+                transforms=[
+                    dict(type='Blur', blur_limit=3, p=1.0),
+                    dict(type='MedianBlur', blur_limit=3, p=1.0)
+                ],
+                p=0.1),
+        ]
+
+    Args:
+        transforms (list[dict]): A list of albu transformations.
+        bbox_params (dict, optional): Bbox_params for albumentation
+            ``Compose``.
+        keymap (dict, optional): Contains
+            {'input key': 'albumentation-style key'}.
+        skip_img_without_anno (bool): Whether to skip the image if no ann left
+            after aug. Defaults to False.
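+
+    A full pipeline entry built on the example above might then look as
+    follows (the ``label_fields`` and keymap values here are illustrative,
+    mirroring the config style used elsewhere in MMDetection):
+
+    .. code-block::
+
+        dict(
+            type='Albu',
+            transforms=[...],
+            bbox_params=dict(
+                type='BboxParams',
+                format='pascal_voc',
+                label_fields=['gt_bboxes_labels', 'gt_ignore_flags'],
+                min_visibility=0.0,
+                filter_lost_elements=True),
+            keymap=dict(img='image', gt_bboxes='bboxes'),
+            skip_img_without_anno=True)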
+ """ + + def __init__(self, + transforms: List[dict], + bbox_params: Optional[dict] = None, + keymap: Optional[dict] = None, + skip_img_without_anno: bool = False) -> None: + if Compose is None: + raise RuntimeError('albumentations is not installed') + + # Args will be modified later, copying it will be safer + transforms = copy.deepcopy(transforms) + if bbox_params is not None: + bbox_params = copy.deepcopy(bbox_params) + if keymap is not None: + keymap = copy.deepcopy(keymap) + self.transforms = transforms + self.filter_lost_elements = False + self.skip_img_without_anno = skip_img_without_anno + + # A simple workaround to remove masks without boxes + if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params + and 'filter_lost_elements' in bbox_params): + self.filter_lost_elements = True + self.origin_label_fields = bbox_params['label_fields'] + bbox_params['label_fields'] = ['idx_mapper'] + del bbox_params['filter_lost_elements'] + + self.bbox_params = ( + self.albu_builder(bbox_params) if bbox_params else None) + self.aug = Compose([self.albu_builder(t) for t in self.transforms], + bbox_params=self.bbox_params) + + if not keymap: + self.keymap_to_albu = { + 'img': 'image', + 'gt_masks': 'masks', + 'gt_bboxes': 'bboxes' + } + else: + self.keymap_to_albu = keymap + self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} + + def albu_builder(self, cfg: dict) -> albumentations: + """Import a module from albumentations. + + It inherits some of :func:`build_from_cfg` logic. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + obj: The constructed object. + """ + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + obj_type = args.pop('type') + if is_str(obj_type): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + obj_cls = getattr(albumentations, obj_type) + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + if 'transforms' in args: + args['transforms'] = [ + self.albu_builder(transform) + for transform in args['transforms'] + ] + + return obj_cls(**args) + + @staticmethod + def mapper(d: dict, keymap: dict) -> dict: + """Dictionary mapper. Renames keys according to keymap provided. + + Args: + d (dict): old dict + keymap (dict): {'old_key':'new_key'} + Returns: + dict: new dict. 
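+
+        A small doctest sketch:
+
+        Examples:
+            >>> Albu.mapper(
+            ...     dict(img=0, gt_bboxes=1),
+            ...     {'img': 'image', 'gt_bboxes': 'bboxes'})
+            {'image': 0, 'bboxes': 1}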
+ """ + updated_dict = {} + for k, v in zip(d.keys(), d.values()): + new_k = keymap.get(k, k) + updated_dict[new_k] = d[k] + return updated_dict + + @autocast_box_type() + def transform(self, results: dict) -> Union[dict, None]: + """Transform function of Albu.""" + # TODO: gt_seg_map is not currently supported + # dict to albumentations format + results = self.mapper(results, self.keymap_to_albu) + results, ori_masks = self._preprocess_results(results) + results = self.aug(**results) + results = self._postprocess_results(results, ori_masks) + if results is None: + return None + # back to the original format + results = self.mapper(results, self.keymap_back) + results['img_shape'] = results['img'].shape[:2] + return results + + def _preprocess_results(self, results: dict) -> tuple: + """Pre-processing results to facilitate the use of Albu.""" + if 'bboxes' in results: + # to list of boxes + if not isinstance(results['bboxes'], HorizontalBoxes): + raise NotImplementedError( + 'Albu only supports horizontal boxes now') + bboxes = results['bboxes'].numpy() + results['bboxes'] = [x for x in bboxes] + # add pseudo-field for filtration + if self.filter_lost_elements: + results['idx_mapper'] = np.arange(len(results['bboxes'])) + + # TODO: Support mask structure in albu + ori_masks = None + if 'masks' in results: + if isinstance(results['masks'], PolygonMasks): + raise NotImplementedError( + 'Albu only supports BitMap masks now') + ori_masks = results['masks'] + if albumentations.__version__ < '0.5': + results['masks'] = results['masks'].masks + else: + results['masks'] = [mask for mask in results['masks'].masks] + + return results, ori_masks + + def _postprocess_results( + self, + results: dict, + ori_masks: Optional[Union[BitmapMasks, + PolygonMasks]] = None) -> dict: + """Post-processing Albu output.""" + # albumentations may return np.array or list on different versions + if 'gt_bboxes_labels' in results and isinstance( + results['gt_bboxes_labels'], list): + results['gt_bboxes_labels'] = np.array( + results['gt_bboxes_labels'], dtype=np.int64) + if 'gt_ignore_flags' in results and isinstance( + results['gt_ignore_flags'], list): + results['gt_ignore_flags'] = np.array( + results['gt_ignore_flags'], dtype=bool) + + if 'bboxes' in results: + if isinstance(results['bboxes'], list): + results['bboxes'] = np.array( + results['bboxes'], dtype=np.float32) + results['bboxes'] = results['bboxes'].reshape(-1, 4) + results['bboxes'] = HorizontalBoxes(results['bboxes']) + + # filter label_fields + if self.filter_lost_elements: + + for label in self.origin_label_fields: + results[label] = np.array( + [results[label][i] for i in results['idx_mapper']]) + if 'masks' in results: + assert ori_masks is not None + results['masks'] = np.array( + [results['masks'][i] for i in results['idx_mapper']]) + results['masks'] = ori_masks.__class__( + results['masks'], + results['masks'][0].shape[0], + results['masks'][0].shape[1], + ) + if (not len(results['idx_mapper']) + and self.skip_img_without_anno): + return None + elif 'masks' in results: + results['masks'] = ori_masks.__class__(results['masks'], + ori_masks.height, + ori_masks.width) + + return results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' + return repr_str + + +@TRANSFORMS.register_module() +@avoid_cache_randomness +class RandomCenterCropPad(BaseTransform): + """Random center crop and random around padding for CornerNet. 
+ + This operation generates randomly cropped image from the original image and + pads it simultaneously. Different from :class:`RandomCrop`, the output + shape may not equal to ``crop_size`` strictly. We choose a random value + from ``ratios`` and the output shape could be larger or smaller than + ``crop_size``. The padding operation is also different from :class:`Pad`, + here we use around padding instead of right-bottom padding. + + The relation between output image (padding image) and original image: + + .. code:: text + + output image + + +----------------------------+ + | padded area | + +------|----------------------------|----------+ + | | cropped area | | + | | +---------------+ | | + | | | . center | | | original image + | | | range | | | + | | +---------------+ | | + +------|----------------------------|----------+ + | padded area | + +----------------------------+ + + There are 5 main areas in the figure: + + - output image: output image of this operation, also called padding + image in following instruction. + - original image: input image of this operation. + - padded area: non-intersect area of output image and original image. + - cropped area: the overlap of output image and original image. + - center range: a smaller area where random center chosen from. + center range is computed by ``border`` and original image's shape + to avoid our random center is too close to original image's border. + + Also this operation act differently in train and test mode, the summary + pipeline is listed below. + + Train pipeline: + + 1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image + will be ``random_ratio * crop_size``. + 2. Choose a ``random_center`` in center range. + 3. Generate padding image with center matches the ``random_center``. + 4. Initialize the padding image with pixel value equals to ``mean``. + 5. Copy the cropped area to padding image. + 6. Refine annotations. + + Test pipeline: + + 1. Compute output shape according to ``test_pad_mode``. + 2. Generate padding image with center matches the original image + center. + 3. Initialize the padding image with pixel value equals to ``mean``. + 4. Copy the ``cropped area`` to padding image. + + Required Keys: + + - img (np.float32) + - img_shape (tuple) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img (np.float32) + - img_shape (tuple) + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Args: + crop_size (tuple, optional): expected size after crop, final size will + computed according to ratio. Requires (width, height) + in train mode, and None in test mode. + ratios (tuple, optional): random select a ratio from tuple and crop + image to (crop_size[0] * ratio) * (crop_size[1] * ratio). + Only available in train mode. Defaults to (0.9, 1.0, 1.1). + border (int, optional): max distance from center select area to image + border. Only available in train mode. Defaults to 128. + mean (sequence, optional): Mean values of 3 channels. + std (sequence, optional): Std values of 3 channels. + to_rgb (bool, optional): Whether to convert the image from BGR to RGB. + test_mode (bool): whether involve random variables in transform. + In train mode, crop_size is fixed, center coords and ratio is + random selected from predefined lists. In test mode, crop_size + is image's original shape, center coords and ratio is fixed. 
+ Defaults to False. + test_pad_mode (tuple, optional): padding method and padding shape + value, only available in test mode. Default is using + 'logical_or' with 127 as padding shape value. + + - 'logical_or': final_shape = input_shape | padding_shape_value + - 'size_divisor': final_shape = int( + ceil(input_shape / padding_shape_value) * padding_shape_value) + + Defaults to ('logical_or', 127). + test_pad_add_pix (int): Extra padding pixel in test mode. + Defaults to 0. + bbox_clip_border (bool): Whether clip the objects outside + the border of the image. Defaults to True. + """ + + def __init__(self, + crop_size: Optional[tuple] = None, + ratios: Optional[tuple] = (0.9, 1.0, 1.1), + border: Optional[int] = 128, + mean: Optional[Sequence] = None, + std: Optional[Sequence] = None, + to_rgb: Optional[bool] = None, + test_mode: bool = False, + test_pad_mode: Optional[tuple] = ('logical_or', 127), + test_pad_add_pix: int = 0, + bbox_clip_border: bool = True) -> None: + if test_mode: + assert crop_size is None, 'crop_size must be None in test mode' + assert ratios is None, 'ratios must be None in test mode' + assert border is None, 'border must be None in test mode' + assert isinstance(test_pad_mode, (list, tuple)) + assert test_pad_mode[0] in ['logical_or', 'size_divisor'] + else: + assert isinstance(crop_size, (list, tuple)) + assert crop_size[0] > 0 and crop_size[1] > 0, ( + 'crop_size must > 0 in train mode') + assert isinstance(ratios, (list, tuple)) + assert test_pad_mode is None, ( + 'test_pad_mode must be None in train mode') + + self.crop_size = crop_size + self.ratios = ratios + self.border = border + # We do not set default value to mean, std and to_rgb because these + # hyper-parameters are easy to forget but could affect the performance. + # Please use the same setting as Normalize for performance assurance. + assert mean is not None and std is not None and to_rgb is not None + self.to_rgb = to_rgb + self.input_mean = mean + self.input_std = std + if to_rgb: + self.mean = mean[::-1] + self.std = std[::-1] + else: + self.mean = mean + self.std = std + self.test_mode = test_mode + self.test_pad_mode = test_pad_mode + self.test_pad_add_pix = test_pad_add_pix + self.bbox_clip_border = bbox_clip_border + + def _get_border(self, border, size): + """Get final border for the target size. + + This function generates a ``final_border`` according to image's shape. + The area between ``final_border`` and ``size - final_border`` is the + ``center range``. We randomly choose center from the ``center range`` + to avoid our random center is too close to original image's border. + Also ``center range`` should be larger than 0. + + Args: + border (int): The initial border, default is 128. + size (int): The width or height of original image. + Returns: + int: The final border. + """ + k = 2 * border / size + i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k))) + return border // i + + def _filter_boxes(self, patch, boxes): + """Check whether the center of each box is in the patch. + + Args: + patch (list[int]): The cropped area, [left, top, right, bottom]. + boxes (numpy array, (N x 4)): Ground truth boxes. + + Returns: + mask (numpy array, (N,)): Each box is inside or outside the patch. 
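+
+        A small doctest sketch (``self`` is unused here, so the method can
+        be exercised directly on the class for illustration):
+
+        Examples:
+            >>> import torch
+            >>> from mmdet.structures.bbox import HorizontalBoxes
+            >>> boxes = HorizontalBoxes(
+            ...     torch.tensor([[0., 0., 8., 8.], [20., 20., 40., 40.]]))
+            >>> # centers are (4, 4) and (30, 30); only the second lies
+            >>> # strictly inside the patch [5, 5, 50, 50]
+            >>> RandomCenterCropPad._filter_boxes(
+            ...     None, [5, 5, 50, 50], boxes).tolist()
+            [False, True]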
+ """ + center = boxes.centers.numpy() + mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * ( + center[:, 0] < patch[2]) * ( + center[:, 1] < patch[3]) + return mask + + def _crop_image_and_paste(self, image, center, size): + """Crop image with a given center and size, then paste the cropped + image to a blank image with two centers align. + + This function is equivalent to generating a blank image with ``size`` + as its shape. Then cover it on the original image with two centers ( + the center of blank image and the random center of original image) + aligned. The overlap area is paste from the original image and the + outside area is filled with ``mean pixel``. + + Args: + image (np array, H x W x C): Original image. + center (list[int]): Target crop center coord. + size (list[int]): Target crop size. [target_h, target_w] + + Returns: + cropped_img (np array, target_h x target_w x C): Cropped image. + border (np array, 4): The distance of four border of + ``cropped_img`` to the original image area, [top, bottom, + left, right] + patch (list[int]): The cropped area, [left, top, right, bottom]. + """ + center_y, center_x = center + target_h, target_w = size + img_h, img_w, img_c = image.shape + + x0 = max(0, center_x - target_w // 2) + x1 = min(center_x + target_w // 2, img_w) + y0 = max(0, center_y - target_h // 2) + y1 = min(center_y + target_h // 2, img_h) + patch = np.array((int(x0), int(y0), int(x1), int(y1))) + + left, right = center_x - x0, x1 - center_x + top, bottom = center_y - y0, y1 - center_y + + cropped_center_y, cropped_center_x = target_h // 2, target_w // 2 + cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype) + for i in range(img_c): + cropped_img[:, :, i] += self.mean[i] + y_slice = slice(cropped_center_y - top, cropped_center_y + bottom) + x_slice = slice(cropped_center_x - left, cropped_center_x + right) + cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :] + + border = np.array([ + cropped_center_y - top, cropped_center_y + bottom, + cropped_center_x - left, cropped_center_x + right + ], + dtype=np.float32) + + return cropped_img, border, patch + + def _train_aug(self, results): + """Random crop and around padding the original image. + + Args: + results (dict): Image infomations in the augment pipeline. + + Returns: + results (dict): The updated dict. + """ + img = results['img'] + h, w, c = img.shape + gt_bboxes = results['gt_bboxes'] + while True: + scale = random.choice(self.ratios) + new_h = int(self.crop_size[1] * scale) + new_w = int(self.crop_size[0] * scale) + h_border = self._get_border(self.border, h) + w_border = self._get_border(self.border, w) + + for i in range(50): + center_x = random.randint(low=w_border, high=w - w_border) + center_y = random.randint(low=h_border, high=h - h_border) + + cropped_img, border, patch = self._crop_image_and_paste( + img, [center_y, center_x], [new_h, new_w]) + + if len(gt_bboxes) == 0: + results['img'] = cropped_img + results['img_shape'] = cropped_img.shape[:2] + return results + + # if image do not have valid bbox, any crop patch is valid. 
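+                # otherwise keep resampling the crop center until at least
+                # one gt-bbox center falls strictly inside the crop patch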
+                mask = self._filter_boxes(patch, gt_bboxes)
+                if not mask.any():
+                    continue
+
+                results['img'] = cropped_img
+                results['img_shape'] = cropped_img.shape[:2]
+
+                x0, y0, x1, y1 = patch
+
+                left_w, top_h = center_x - x0, center_y - y0
+                cropped_center_x, cropped_center_y = new_w // 2, new_h // 2
+
+                # crop bboxes accordingly and clip to the image boundary
+                gt_bboxes = gt_bboxes[mask]
+                gt_bboxes.translate_([
+                    cropped_center_x - left_w - x0,
+                    cropped_center_y - top_h - y0
+                ])
+                if self.bbox_clip_border:
+                    gt_bboxes.clip_([new_h, new_w])
+                keep = gt_bboxes.is_inside([new_h, new_w]).numpy()
+                gt_bboxes = gt_bboxes[keep]
+
+                results['gt_bboxes'] = gt_bboxes
+
+                # ignore_flags
+                if results.get('gt_ignore_flags', None) is not None:
+                    gt_ignore_flags = results['gt_ignore_flags'][mask]
+                    results['gt_ignore_flags'] = \
+                        gt_ignore_flags[keep]
+
+                # labels
+                if results.get('gt_bboxes_labels', None) is not None:
+                    gt_labels = results['gt_bboxes_labels'][mask]
+                    results['gt_bboxes_labels'] = gt_labels[keep]
+
+                if 'gt_masks' in results or 'gt_seg_map' in results:
+                    raise NotImplementedError(
+                        'RandomCenterCropPad only supports bbox.')
+
+                return results
+
+    def _test_aug(self, results):
+        """Pad around the original image without cropping.
+
+        The padding mode and value are from ``test_pad_mode``.
+
+        Args:
+            results (dict): Image information in the augmentation pipeline.
+
+        Returns:
+            results (dict): The updated dict.
+        """
+        img = results['img']
+        h, w, c = img.shape
+        if self.test_pad_mode[0] in ['logical_or']:
+            # self.test_pad_add_pix is only used for centernet
+            target_h = (h | self.test_pad_mode[1]) + self.test_pad_add_pix
+            target_w = (w | self.test_pad_mode[1]) + self.test_pad_add_pix
+        elif self.test_pad_mode[0] in ['size_divisor']:
+            divisor = self.test_pad_mode[1]
+            target_h = int(np.ceil(h / divisor)) * divisor
+            target_w = int(np.ceil(w / divisor)) * divisor
+        else:
+            raise NotImplementedError(
+                'RandomCenterCropPad only supports two testing pad modes: '
+                'logical_or and size_divisor.')
+
+        cropped_img, border, _ = self._crop_image_and_paste(
+            img, [h // 2, w // 2], [target_h, target_w])
+        results['img'] = cropped_img
+        results['img_shape'] = cropped_img.shape[:2]
+        results['border'] = border
+        return results
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        img = results['img']
+        assert img.dtype == np.float32, (
+            'RandomCenterCropPad needs the input image of dtype np.float32,'
+            ' please set "to_float32=True" in "LoadImageFromFile" pipeline')
+        h, w, c = img.shape
+        assert c == len(self.mean)
+        if self.test_mode:
+            return self._test_aug(results)
+        else:
+            return self._train_aug(results)
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(crop_size={self.crop_size}, '
+        repr_str += f'ratios={self.ratios}, '
+        repr_str += f'border={self.border}, '
+        repr_str += f'mean={self.input_mean}, '
+        repr_str += f'std={self.input_std}, '
+        repr_str += f'to_rgb={self.to_rgb}, '
+        repr_str += f'test_mode={self.test_mode}, '
+        repr_str += f'test_pad_mode={self.test_pad_mode}, '
+        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class CutOut(BaseTransform):
+    """CutOut operation.
+
+    Randomly drop some regions of the image, as proposed in
+    `Cutout <https://arxiv.org/abs/1708.04552>`_.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        n_holes (int or tuple[int, int]): Number of regions to be dropped.
+            If it is given as a tuple, the number of holes will be randomly
+            selected from the closed interval [``n_holes[0]``,
+            ``n_holes[1]``].
+        cutout_shape (tuple[int, int] or list[tuple[int, int]], optional):
+            The candidate shape of dropped regions. It can be
+            ``tuple[int, int]`` to use a fixed cutout shape, or
+            ``list[tuple[int, int]]`` to randomly choose a shape
+            from the list. Defaults to None.
+        cutout_ratio (tuple[float, float] or list[tuple[float, float]],
+            optional): The candidate ratio of dropped regions. It can be
+            ``tuple[float, float]`` to use a fixed ratio or
+            ``list[tuple[float, float]]`` to randomly choose a ratio
+            from the list. Please note that ``cutout_shape`` and
+            ``cutout_ratio`` cannot both be given at the same time.
+            Defaults to None.
+        fill_in (tuple[float, float, float] or tuple[int, int, int]): The
+            value of pixel to fill in the dropped regions. Defaults to
+            (0, 0, 0).
+    """
+
+    def __init__(
+        self,
+        n_holes: Union[int, Tuple[int, int]],
+        cutout_shape: Optional[Union[Tuple[int, int],
+                                     List[Tuple[int, int]]]] = None,
+        cutout_ratio: Optional[Union[Tuple[float, float],
+                                     List[Tuple[float, float]]]] = None,
+        fill_in: Union[Tuple[float, float, float], Tuple[int, int,
+                                                         int]] = (0, 0, 0)
+    ) -> None:
+
+        assert (cutout_shape is None) ^ (cutout_ratio is None), \
+            'Either cutout_shape or cutout_ratio should be specified.'
+        assert (isinstance(cutout_shape, (list, tuple))
+                or isinstance(cutout_ratio, (list, tuple)))
+        if isinstance(n_holes, tuple):
+            assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
+        else:
+            n_holes = (n_holes, n_holes)
+        self.n_holes = n_holes
+        self.fill_in = fill_in
+        self.with_ratio = cutout_ratio is not None
+        self.candidates = cutout_ratio if self.with_ratio else cutout_shape
+        if not isinstance(self.candidates, list):
+            self.candidates = [self.candidates]
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Call function to drop some regions of the image."""
+        h, w, c = results['img'].shape
+        n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
+        for _ in range(n_holes):
+            x1 = np.random.randint(0, w)
+            y1 = np.random.randint(0, h)
+            index = np.random.randint(0, len(self.candidates))
+            if not self.with_ratio:
+                cutout_w, cutout_h = self.candidates[index]
+            else:
+                cutout_w = int(self.candidates[index][0] * w)
+                cutout_h = int(self.candidates[index][1] * h)
+
+            x2 = np.clip(x1 + cutout_w, 0, w)
+            y2 = np.clip(y1 + cutout_h, 0, h)
+            results['img'][y1:y2, x1:x2, :] = self.fill_in
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(n_holes={self.n_holes}, '
+        repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
+                     else f'cutout_shape={self.candidates}, ')
+        repr_str += f'fill_in={self.fill_in})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Mosaic(BaseTransform):
+    """Mosaic augmentation.
+
+    Given 4 images, the mosaic transform combines them into one output
+    image. The output image is composed of parts from each sub-image.
+
+    .. code:: text
+
+                        mosaic transform
+                           center_x
+                +------------------------------+
+                |       pad        |  pad      |
+                |      +-----------+           |
+                |      |           |           |
+                |      |  image1   |--------+  |
+                |      |           |        |  |
+                |      |           | image2 |  |
+     center_y   |----+-------------+-----------|
+                |    |   cropped   |           |
+                |pad |   image3    |  image4   |
+                |    |             |           |
+                +----|-------------+-----------+
+                     |             |
+                     +-------------+
+
+    The mosaic transform steps are as follows:
+
+        1. Choose the mosaic center as the intersection of the 4 images.
+        2. Get the top-left image according to the index, and randomly
+           sample another 3 images from the custom dataset.
+        3. A sub-image will be cropped if it is larger than the mosaic
+           patch.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - mix_results (List[dict])
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+
+    Args:
+        img_scale (Sequence[int]): Image size before mosaic pipeline of
+            single image. The shape order should be (width, height).
+            Defaults to (640, 640).
+        center_ratio_range (Sequence[float]): Center ratio range of mosaic
+            output. Defaults to (0.5, 1.5).
+        bbox_clip_border (bool, optional): Whether to clip the objects
+            outside the border of the image. In some datasets like MOT17,
+            the gt bboxes are allowed to cross the border of images.
+            Therefore, we don't need to clip the gt bboxes in these cases.
+            Defaults to True.
+        pad_val (int): Pad value. Defaults to 114.
+        prob (float): Probability of applying this transformation.
+            Defaults to 1.0.
+    """
+
+    def __init__(self,
+                 img_scale: Tuple[int, int] = (640, 640),
+                 center_ratio_range: Tuple[float, float] = (0.5, 1.5),
+                 bbox_clip_border: bool = True,
+                 pad_val: float = 114.0,
+                 prob: float = 1.0) -> None:
+        assert isinstance(img_scale, tuple)
+        assert 0 <= prob <= 1.0, 'The probability should be in range ' \
+                                 f'[0, 1], but got {prob}.'
+
+        log_img_scale(img_scale, skip_square=True, shape_order='wh')
+        self.img_scale = img_scale
+        self.center_ratio_range = center_ratio_range
+        self.bbox_clip_border = bbox_clip_border
+        self.pad_val = pad_val
+        self.prob = prob
+
+    @cache_randomness
+    def get_indexes(self, dataset: BaseDataset) -> List[int]:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+
+        Returns:
+            list: indexes.
+        """
+
+        indexes = [random.randint(0, len(dataset)) for _ in range(3)]
+        return indexes
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Mosaic transform function.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Updated result dict.
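+
+        Examples:
+            >>> # Illustrative config-level usage; in practice ``Mosaic`` is
+            >>> # wrapped by ``MultiImageMixDataset``, which calls
+            >>> # ``get_indexes`` and fills ``results['mix_results']`` with
+            >>> # the 3 extra samples.
+            >>> train_pipeline = [
+            ...     dict(type='Mosaic', img_scale=(640, 640), pad_val=114.0),
+            ...     dict(type='PackDetInputs'),
+            ... ]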
+ """ + if random.uniform(0, 1) > self.prob: + return results + + assert 'mix_results' in results + mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) + center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = copy.deepcopy(results) + else: + results_patch = copy.deepcopy(results['mix_results'][i - 1]) + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(self.img_scale[1] / h_i, + self.img_scale[0] / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]]) + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape[:2] + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + return results + + def _mosaic_combine( + self, loc: str, center_position_xy: Sequence[float], + img_shape_wh: Sequence[int]) -> Tuple[Tuple[int], Tuple[int]]: + """Calculate global coordinate of mosaic image and local coordinate of + cropped sub-image. + + Args: + loc (str): Index for the sub-image, loc in ('top_left', + 'top_right', 'bottom_left', 'bottom_right'). + center_position_xy (Sequence[float]): Mixing center for 4 images, + (x, y). + img_shape_wh (Sequence[int]): Width and height of sub-image + + Returns: + tuple[tuple[float]]: Corresponding coordinate of pasting and + cropping + - paste_coord (tuple): paste corner coordinate in mosaic image. + - crop_coord (tuple): crop corner coordinate in mosaic image. 
+ """ + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + if loc == 'top_left': + # index0 to top left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + center_position_xy[0], \ + center_position_xy[1] + crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( + y2 - y1), img_shape_wh[0], img_shape_wh[1] + + elif loc == 'top_right': + # index1 to top right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + center_position_xy[1] + crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( + img_shape_wh[0], x2 - x1), img_shape_wh[1] + + elif loc == 'bottom_left': + # index2 to bottom left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + center_position_xy[1], \ + center_position_xy[0], \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( + y2 - y1, img_shape_wh[1]) + + else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[0] * 2), \ + min(self.img_scale[1] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob})' + return repr_str + + +@TRANSFORMS.register_module() +class MixUp(BaseTransform): + """MixUp data augmentation. + + .. code:: text + + mixup transform + +------------------------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + |---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | |-----------------+ | + | pad | + +------------------------------+ + + The mixup transform steps are as follows: + + 1. Another random image is picked by dataset and embedded in + the top left patch(after padding and resizing) + 2. The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + max_iters (int): The maximum number of iterations. If the number of + iterations is greater than `max_iters`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. 
Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + max_iters: int = 15, + bbox_clip_border: bool = True) -> None: + assert isinstance(img_scale, tuple) + log_img_scale(img_scale, skip_square=True, shape_order='wh') + self.dynamic_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.max_iters = max_iters + self.bbox_clip_border = bbox_clip_border + + @cache_randomness + def get_indexes(self, dataset: BaseDataset) -> int: + """Call function to collect indexes. + + Args: + dataset (:obj:`MultiImageMixDataset`): The dataset. + + Returns: + list: indexes. + """ + + for i in range(self.max_iters): + index = random.randint(0, len(dataset)) + gt_bboxes_i = dataset[index]['gt_bboxes'] + if len(gt_bboxes_i) != 0: + break + + return index + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. + """ + + assert 'mix_results' in results + assert len( + results['mix_results']) == 1, 'MixUp only support 2 images now !' + + if results['mix_results'][0]['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_results = results['mix_results'][0] + retrieve_img = retrieve_results['img'] + + jit_factor = random.uniform(*self.ratio_range) + is_flip = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones( + (self.dynamic_scale[1], self.dynamic_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.dynamic_scale[::-1], + dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0], + self.dynamic_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_flip: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_flip: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + + # 7. 
filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. mix up + ori_img = ori_img.astype(np.float32) + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32) + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape[:2] + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(dynamic_scale={self.dynamic_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_iters={self.max_iters}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomAffine(BaseTransform): + """Random affine transform data augmentation. + + This operation randomly generates affine transform matrix which including + rotation, translation, shear and scaling transforms. + + Required Keys: + + - img + - gt_bboxes (BaseBoxes[torch.float32]) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + max_rotate_degree (float): Maximum degrees of rotation transform. + Defaults to 10. + max_translate_ratio (float): Maximum ratio of translation. + Defaults to 0.1. + scaling_ratio_range (tuple[float]): Min and max ratio of + scaling transform. Defaults to (0.5, 1.5). + max_shear_degree (float): Maximum degrees of shear + transform. Defaults to 2. + border (tuple[int]): Distance from width and height sides of input + image to adjust output shape. Only used in mosaic dataset. + Defaults to (0, 0). + border_val (tuple[int]): Border padding values of 3 channels. + Defaults to (114, 114, 114). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. 
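+
+    Examples:
+        >>> # Illustrative sketch of the homography composition applied in
+        >>> # ``transform``; the helpers are the static methods of this
+        >>> # class.
+        >>> t = RandomAffine._get_translation_matrix(5., 5.)
+        >>> sh = RandomAffine._get_shear_matrix(2., 2.)
+        >>> r = RandomAffine._get_rotation_matrix(10.)
+        >>> s = RandomAffine._get_scaling_matrix(1.2)
+        >>> warp = t @ sh @ r @ s  # same order as in transform()
+        >>> warp.shape
+        (3, 3)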
+ """ + + def __init__(self, + max_rotate_degree: float = 10.0, + max_translate_ratio: float = 0.1, + scaling_ratio_range: Tuple[float, float] = (0.5, 1.5), + max_shear_degree: float = 2.0, + border: Tuple[int, int] = (0, 0), + border_val: Tuple[int, int, int] = (114, 114, 114), + bbox_clip_border: bool = True) -> None: + assert 0 <= max_translate_ratio <= 1 + assert scaling_ratio_range[0] <= scaling_ratio_range[1] + assert scaling_ratio_range[0] > 0 + self.max_rotate_degree = max_rotate_degree + self.max_translate_ratio = max_translate_ratio + self.scaling_ratio_range = scaling_ratio_range + self.max_shear_degree = max_shear_degree + self.border = border + self.border_val = border_val + self.bbox_clip_border = bbox_clip_border + + @cache_randomness + def _get_random_homography_matrix(self, height, width): + # Rotation + rotation_degree = random.uniform(-self.max_rotate_degree, + self.max_rotate_degree) + rotation_matrix = self._get_rotation_matrix(rotation_degree) + + # Scaling + scaling_ratio = random.uniform(self.scaling_ratio_range[0], + self.scaling_ratio_range[1]) + scaling_matrix = self._get_scaling_matrix(scaling_ratio) + + # Shear + x_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + y_degree = random.uniform(-self.max_shear_degree, + self.max_shear_degree) + shear_matrix = self._get_shear_matrix(x_degree, y_degree) + + # Translation + trans_x = random.uniform(-self.max_translate_ratio, + self.max_translate_ratio) * width + trans_y = random.uniform(-self.max_translate_ratio, + self.max_translate_ratio) * height + translate_matrix = self._get_translation_matrix(trans_x, trans_y) + + warp_matrix = ( + translate_matrix @ shear_matrix @ rotation_matrix @ scaling_matrix) + return warp_matrix + + @autocast_box_type() + def transform(self, results: dict) -> dict: + img = results['img'] + height = img.shape[0] + self.border[1] * 2 + width = img.shape[1] + self.border[0] * 2 + + warp_matrix = self._get_random_homography_matrix(height, width) + + img = cv2.warpPerspective( + img, + warp_matrix, + dsize=(width, height), + borderValue=self.border_val) + results['img'] = img + results['img_shape'] = img.shape[:2] + + bboxes = results['gt_bboxes'] + num_bboxes = len(bboxes) + if num_bboxes: + bboxes.project_(warp_matrix) + if self.bbox_clip_border: + bboxes.clip_([height, width]) + # remove outside bbox + valid_index = bboxes.is_inside([height, width]).numpy() + results['gt_bboxes'] = bboxes[valid_index] + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][ + valid_index] + results['gt_ignore_flags'] = results['gt_ignore_flags'][ + valid_index] + + if 'gt_masks' in results: + raise NotImplementedError('RandomAffine only supports bbox.') + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(max_rotate_degree={self.max_rotate_degree}, ' + repr_str += f'max_translate_ratio={self.max_translate_ratio}, ' + repr_str += f'scaling_ratio_range={self.scaling_ratio_range}, ' + repr_str += f'max_shear_degree={self.max_shear_degree}, ' + repr_str += f'border={self.border}, ' + repr_str += f'border_val={self.border_val}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + @staticmethod + def _get_rotation_matrix(rotate_degrees: float) -> np.ndarray: + radian = math.radians(rotate_degrees) + rotation_matrix = np.array( + [[np.cos(radian), -np.sin(radian), 0.], + [np.sin(radian), np.cos(radian), 0.], [0., 0., 1.]], + dtype=np.float32) + return rotation_matrix + + @staticmethod + def 
_get_scaling_matrix(scale_ratio: float) -> np.ndarray:
+        scaling_matrix = np.array(
+            [[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
+            dtype=np.float32)
+        return scaling_matrix
+
+    @staticmethod
+    def _get_shear_matrix(x_shear_degrees: float,
+                          y_shear_degrees: float) -> np.ndarray:
+        x_radian = math.radians(x_shear_degrees)
+        y_radian = math.radians(y_shear_degrees)
+        shear_matrix = np.array([[1, np.tan(x_radian), 0.],
+                                 [np.tan(y_radian), 1, 0.], [0., 0., 1.]],
+                                dtype=np.float32)
+        return shear_matrix
+
+    @staticmethod
+    def _get_translation_matrix(x: float, y: float) -> np.ndarray:
+        translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
+                                      dtype=np.float32)
+        return translation_matrix
+
+
+@TRANSFORMS.register_module()
+class YOLOXHSVRandomAug(BaseTransform):
+    """Apply HSV augmentation to the image sequentially. Referenced from
+    https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/data_augment.py#L21.
+
+    Required Keys:
+
+    - img
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        hue_delta (int): Delta of hue. Defaults to 5.
+        saturation_delta (int): Delta of saturation. Defaults to 30.
+        value_delta (int): Delta of value. Defaults to 30.
+    """
+
+    def __init__(self,
+                 hue_delta: int = 5,
+                 saturation_delta: int = 30,
+                 value_delta: int = 30) -> None:
+        self.hue_delta = hue_delta
+        self.saturation_delta = saturation_delta
+        self.value_delta = value_delta
+
+    @cache_randomness
+    def _get_hsv_gains(self):
+        hsv_gains = np.random.uniform(-1, 1, 3) * [
+            self.hue_delta, self.saturation_delta, self.value_delta
+        ]
+        # random selection of h, s, v
+        hsv_gains *= np.random.randint(0, 2, 3)
+        # prevent overflow
+        hsv_gains = hsv_gains.astype(np.int16)
+        return hsv_gains
+
+    def transform(self, results: dict) -> dict:
+        img = results['img']
+        hsv_gains = self._get_hsv_gains()
+        img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.int16)
+
+        img_hsv[..., 0] = (img_hsv[..., 0] + hsv_gains[0]) % 180
+        img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_gains[1], 0, 255)
+        img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_gains[2], 0, 255)
+        cv2.cvtColor(img_hsv.astype(img.dtype), cv2.COLOR_HSV2BGR, dst=img)
+
+        results['img'] = img
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(hue_delta={self.hue_delta}, '
+        repr_str += f'saturation_delta={self.saturation_delta}, '
+        repr_str += f'value_delta={self.value_delta})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class CopyPaste(BaseTransform):
+    """Simple Copy-Paste is a Strong Data Augmentation Method for Instance
+    Segmentation.
+
+    The simple copy-paste transform steps are as follows:
+
+    1. The destination image is already resized with aspect ratio kept,
+       cropped and padded.
+    2. Randomly select a source image, which is also already resized
+       with aspect ratio kept, cropped and padded in a similar way
+       as the destination image.
+    3. Randomly select some objects from the source image.
+    4. Paste these source objects to the destination image directly,
+       since the source and destination images have the same size.
+    5. Update object masks of the destination image, as some original
+       objects may be occluded.
+    6. Generate bboxes from the updated destination masks and
+       filter some objects which are totally occluded, and adjust bboxes
+       which are partly occluded.
+    7. Append selected source bboxes, masks, and labels.
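+
+    A config-level sketch (illustrative; like ``Mosaic``, ``CopyPaste``
+    relies on a dataset wrapper such as ``MultiImageMixDataset`` to provide
+    ``results['mix_results']``):
+
+    .. code:: python
+
+        train_pipeline = [
+            dict(type='CopyPaste', max_num_pasted=100),
+            dict(type='PackDetInputs'),
+        ]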
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (BaseBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_masks (BitmapMasks) (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+    - gt_masks (optional)
+
+    Args:
+        max_num_pasted (int): The maximum number of pasted objects.
+            Defaults to 100.
+        bbox_occluded_thr (int): The threshold of occluded bbox.
+            Defaults to 10.
+        mask_occluded_thr (int): The threshold of occluded mask.
+            Defaults to 300.
+        selected (bool): Whether to select objects or not. If False,
+            all objects of the source image will be pasted to the
+            destination image.
+            Defaults to True.
+        paste_by_box (bool): Whether to use boxes as masks when masks are
+            not available.
+            Defaults to False.
+    """
+
+    def __init__(
+        self,
+        max_num_pasted: int = 100,
+        bbox_occluded_thr: int = 10,
+        mask_occluded_thr: int = 300,
+        selected: bool = True,
+        paste_by_box: bool = False,
+    ) -> None:
+        self.max_num_pasted = max_num_pasted
+        self.bbox_occluded_thr = bbox_occluded_thr
+        self.mask_occluded_thr = mask_occluded_thr
+        self.selected = selected
+        self.paste_by_box = paste_by_box
+
+    @cache_randomness
+    def get_indexes(self, dataset: BaseDataset) -> int:
+        """Call function to collect indexes.
+
+        Args:
+            dataset (:obj:`MultiImageMixDataset`): The dataset.
+        Returns:
+            int: Index.
+        """
+        return random.randint(0, len(dataset))
+
+    @autocast_box_type()
+    def transform(self, results: dict) -> dict:
+        """Transform function to make a copy-paste of image.
+
+        Args:
+            results (dict): Result dict.
+        Returns:
+            dict: Result dict with copy-paste transformed.
+        """
+
+        assert 'mix_results' in results
+        num_images = len(results['mix_results'])
+        assert num_images == 1, \
+            f'CopyPaste only supports processing 2 images, got {num_images}'
+        if self.selected:
+            selected_results = self._select_object(results['mix_results'][0])
+        else:
+            selected_results = results['mix_results'][0]
+        return self._copy_paste(results, selected_results)
+
+    @cache_randomness
+    def _get_selected_inds(self, num_bboxes: int) -> np.ndarray:
+        max_num_pasted = min(num_bboxes + 1, self.max_num_pasted)
+        num_pasted = np.random.randint(0, max_num_pasted)
+        return np.random.choice(num_bboxes, size=num_pasted, replace=False)
+
+    def get_gt_masks(self, results: dict) -> BitmapMasks:
+        """Get gt_masks originally or generated based on bboxes.
+
+        If gt_masks is not contained in results,
+        it will be generated based on gt_bboxes.
+        Args:
+            results (dict): Result dict.
+        Returns:
+            BitmapMasks: gt_masks, originally or generated based on bboxes.
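+
+        Examples:
+            >>> # Behavior sketch: if ``results`` already carries masks they
+            >>> # are returned as-is (and ``paste_by_box`` is ignored with a
+            >>> # warning); with ``paste_by_box=True`` and no masks,
+            >>> # rectangular masks are generated via
+            >>> # ``gt_bboxes.create_masks(img_shape)``.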
+ """ + if results.get('gt_masks', None) is not None: + if self.paste_by_box: + warnings.warn('gt_masks is already contained in results, ' + 'so paste_by_box is disabled.') + return results['gt_masks'] + else: + if not self.paste_by_box: + raise RuntimeError('results does not contain masks.') + return results['gt_bboxes'].create_masks(results['img'].shape[:2]) + + def _select_object(self, results: dict) -> dict: + """Select some objects from the source results.""" + bboxes = results['gt_bboxes'] + labels = results['gt_bboxes_labels'] + masks = self.get_gt_masks(results) + ignore_flags = results['gt_ignore_flags'] + + selected_inds = self._get_selected_inds(bboxes.shape[0]) + + selected_bboxes = bboxes[selected_inds] + selected_labels = labels[selected_inds] + selected_masks = masks[selected_inds] + selected_ignore_flags = ignore_flags[selected_inds] + + results['gt_bboxes'] = selected_bboxes + results['gt_bboxes_labels'] = selected_labels + results['gt_masks'] = selected_masks + results['gt_ignore_flags'] = selected_ignore_flags + return results + + def _copy_paste(self, dst_results: dict, src_results: dict) -> dict: + """CopyPaste transform function. + + Args: + dst_results (dict): Result dict of the destination image. + src_results (dict): Result dict of the source image. + Returns: + dict: Updated result dict. + """ + dst_img = dst_results['img'] + dst_bboxes = dst_results['gt_bboxes'] + dst_labels = dst_results['gt_bboxes_labels'] + dst_masks = self.get_gt_masks(dst_results) + dst_ignore_flags = dst_results['gt_ignore_flags'] + + src_img = src_results['img'] + src_bboxes = src_results['gt_bboxes'] + src_labels = src_results['gt_bboxes_labels'] + src_masks = src_results['gt_masks'] + src_ignore_flags = src_results['gt_ignore_flags'] + + if len(src_bboxes) == 0: + return dst_results + + # update masks and generate bboxes from updated masks + composed_mask = np.where(np.any(src_masks.masks, axis=0), 1, 0) + updated_dst_masks = self._get_updated_masks(dst_masks, composed_mask) + updated_dst_bboxes = updated_dst_masks.get_bboxes(type(dst_bboxes)) + assert len(updated_dst_bboxes) == len(updated_dst_masks) + + # filter totally occluded objects + l1_distance = (updated_dst_bboxes.tensor - dst_bboxes.tensor).abs() + bboxes_inds = (l1_distance <= self.bbox_occluded_thr).all( + dim=-1).numpy() + masks_inds = updated_dst_masks.masks.sum( + axis=(1, 2)) > self.mask_occluded_thr + valid_inds = bboxes_inds | masks_inds + + # Paste source objects to destination image directly + img = dst_img * (1 - composed_mask[..., np.newaxis] + ) + src_img * composed_mask[..., np.newaxis] + bboxes = src_bboxes.cat([updated_dst_bboxes[valid_inds], src_bboxes]) + labels = np.concatenate([dst_labels[valid_inds], src_labels]) + masks = np.concatenate( + [updated_dst_masks.masks[valid_inds], src_masks.masks]) + ignore_flags = np.concatenate( + [dst_ignore_flags[valid_inds], src_ignore_flags]) + + dst_results['img'] = img + dst_results['gt_bboxes'] = bboxes + dst_results['gt_bboxes_labels'] = labels + dst_results['gt_masks'] = BitmapMasks(masks, masks.shape[1], + masks.shape[2]) + dst_results['gt_ignore_flags'] = ignore_flags + + return dst_results + + def _get_updated_masks(self, masks: BitmapMasks, + composed_mask: np.ndarray) -> BitmapMasks: + """Update masks with composed mask.""" + assert masks.masks.shape[-2:] == composed_mask.shape[-2:], \ + 'Cannot compare two arrays of different size' + masks.masks = np.where(composed_mask, 0, masks.masks) + return masks + + def __repr__(self): + repr_str = 
self.__class__.__name__
+        repr_str += f'(max_num_pasted={self.max_num_pasted}, '
+        repr_str += f'bbox_occluded_thr={self.bbox_occluded_thr}, '
+        repr_str += f'mask_occluded_thr={self.mask_occluded_thr}, '
+        repr_str += f'selected={self.selected}, '
+        repr_str += f'paste_by_box={self.paste_by_box})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomErasing(BaseTransform):
+    """RandomErasing operation.
+
+    Random Erasing randomly selects a rectangle region
+    in an image and erases its pixels with random values.
+    `RandomErasing <https://arxiv.org/abs/1708.04896>`_.
+
+    Required Keys:
+
+    - img
+    - gt_bboxes (HorizontalBoxes[torch.float32]) (optional)
+    - gt_bboxes_labels (np.int64) (optional)
+    - gt_ignore_flags (bool) (optional)
+    - gt_masks (BitmapMasks) (optional)
+
+    Modified Keys:
+    - img
+    - gt_bboxes (optional)
+    - gt_bboxes_labels (optional)
+    - gt_ignore_flags (optional)
+    - gt_masks (optional)
+
+    Args:
+        n_patches (int or tuple[int, int]): Number of regions to be dropped.
+            If it is given as a tuple, the number of patches will be randomly
+            selected from the closed interval [``n_patches[0]``,
+            ``n_patches[1]``].
+        ratio (float or tuple[float, float]): The ratio of erased regions.
+            It can be ``float`` to use a fixed ratio or
+            ``tuple[float, float]`` to randomly choose a ratio from the
+            interval.
+        squared (bool): Whether to erase square regions. Defaults to True.
+        bbox_erased_thr (float): The threshold for the maximum erased area
+            proportion of a bbox. When the proportion of the erased area of
+            a bbox is greater than the threshold, the bbox will be removed.
+            Defaults to 0.9.
+        img_border_value (int or float or tuple): The filled values for the
+            image border. If float, the same fill value will be used for
+            all three channels of the image. If tuple, it should have 3
+            elements. Defaults to 128.
+        mask_border_value (int): The fill value used for masks. Defaults
+            to 0.
+        seg_ignore_label (int): The fill value used for the segmentation
+            map. Note this value must equal ``ignore_label`` in
+            ``semantic_head`` of the corresponding config. Defaults to 255.
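+
+    Examples:
+        >>> # Illustrative config entry; the values follow the argument
+        >>> # descriptions above and are not tuned recommendations.
+        >>> transform = dict(
+        ...     type='RandomErasing', n_patches=(1, 3), ratio=(0.1, 0.3))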
+ """ + + def __init__( + self, + n_patches: Union[int, Tuple[int, int]], + ratio: Union[float, Tuple[float, float]], + squared: bool = True, + bbox_erased_thr: float = 0.9, + img_border_value: Union[int, float, tuple] = 128, + mask_border_value: int = 0, + seg_ignore_label: int = 255, + ) -> None: + if isinstance(n_patches, tuple): + assert len(n_patches) == 2 and 0 <= n_patches[0] < n_patches[1] + else: + n_patches = (n_patches, n_patches) + if isinstance(ratio, tuple): + assert len(ratio) == 2 and 0 <= ratio[0] < ratio[1] <= 1 + else: + ratio = (ratio, ratio) + + self.n_patches = n_patches + self.ratio = ratio + self.squared = squared + self.bbox_erased_thr = bbox_erased_thr + self.img_border_value = img_border_value + self.mask_border_value = mask_border_value + self.seg_ignore_label = seg_ignore_label + + @cache_randomness + def _get_patches(self, img_shape: Tuple[int, int]) -> List[list]: + """Get patches for random erasing.""" + patches = [] + n_patches = np.random.randint(self.n_patches[0], self.n_patches[1] + 1) + for _ in range(n_patches): + if self.squared: + ratio = np.random.random() * (self.ratio[1] - + self.ratio[0]) + self.ratio[0] + ratio = (ratio, ratio) + else: + ratio = (np.random.random() * (self.ratio[1] - self.ratio[0]) + + self.ratio[0], np.random.random() * + (self.ratio[1] - self.ratio[0]) + self.ratio[0]) + ph, pw = int(img_shape[0] * ratio[0]), int(img_shape[1] * ratio[1]) + px1, py1 = np.random.randint(0, + img_shape[1] - pw), np.random.randint( + 0, img_shape[0] - ph) + px2, py2 = px1 + pw, py1 + ph + patches.append([px1, py1, px2, py2]) + return np.array(patches) + + def _transform_img(self, results: dict, patches: List[list]) -> None: + """Random erasing the image.""" + for patch in patches: + px1, py1, px2, py2 = patch + results['img'][py1:py2, px1:px2, :] = self.img_border_value + + def _transform_bboxes(self, results: dict, patches: List[list]) -> None: + """Random erasing the bboxes.""" + bboxes = results['gt_bboxes'] + # TODO: unify the logic by using operators in BaseBoxes. 
+ assert isinstance(bboxes, HorizontalBoxes) + bboxes = bboxes.numpy() + left_top = np.maximum(bboxes[:, None, :2], patches[:, :2]) + right_bottom = np.minimum(bboxes[:, None, 2:], patches[:, 2:]) + wh = np.maximum(right_bottom - left_top, 0) + inter_areas = wh[:, :, 0] * wh[:, :, 1] + bbox_areas = (bboxes[:, 2] - bboxes[:, 0]) * ( + bboxes[:, 3] - bboxes[:, 1]) + bboxes_erased_ratio = inter_areas.sum(-1) / (bbox_areas + 1e-7) + valid_inds = bboxes_erased_ratio < self.bbox_erased_thr + results['gt_bboxes'] = HorizontalBoxes(bboxes[valid_inds]) + results['gt_bboxes_labels'] = results['gt_bboxes_labels'][valid_inds] + results['gt_ignore_flags'] = results['gt_ignore_flags'][valid_inds] + if results.get('gt_masks', None) is not None: + results['gt_masks'] = results['gt_masks'][valid_inds] + + def _transform_masks(self, results: dict, patches: List[list]) -> None: + """Random erasing the masks.""" + for patch in patches: + px1, py1, px2, py2 = patch + results['gt_masks'].masks[:, py1:py2, + px1:px2] = self.mask_border_value + + def _transform_seg(self, results: dict, patches: List[list]) -> None: + """Random erasing the segmentation map.""" + for patch in patches: + px1, py1, px2, py2 = patch + results['gt_seg_map'][py1:py2, px1:px2] = self.seg_ignore_label + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Transform function to erase some regions of image.""" + patches = self._get_patches(results['img_shape']) + self._transform_img(results, patches) + if results.get('gt_bboxes', None) is not None: + self._transform_bboxes(results, patches) + if results.get('gt_masks', None) is not None: + self._transform_masks(results, patches) + if results.get('gt_seg_map', None) is not None: + self._transform_seg(results, patches) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(n_patches={self.n_patches}, ' + repr_str += f'ratio={self.ratio}, ' + repr_str += f'squared={self.squared}, ' + repr_str += f'bbox_erased_thr={self.bbox_erased_thr}, ' + repr_str += f'img_border_value={self.img_border_value}, ' + repr_str += f'mask_border_value={self.mask_border_value}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label})' + return repr_str + + +@TRANSFORMS.register_module() +class CachedMosaic(Mosaic): + """Cached mosaic augmentation. + + Cached mosaic transform will random select images from the cache + and combine them into one output image. + + .. code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | pad | + | +-----------+ | + | | | | + | | image1 |--------+ | + | | | | | + | | | image2 | | + center_y |----+-------------+-----------| + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The cached mosaic transform steps are as follows: + + 1. Append the results from the last transform into the cache. + 2. Choose the mosaic center as the intersections of 4 images + 3. Get the left top image according to the index, and randomly + sample another 3 images from the result cache. + 4. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_bboxes (np.float32) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + Args: + img_scale (Sequence[int]): Image size before mosaic pipeline of single + image. 
The shape order should be (width, height). + Defaults to (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Defaults to (0.5, 1.5). + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + pad_val (int): Pad value. Defaults to 114. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 40. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + """ + + def __init__(self, + *args, + max_cached_images: int = 40, + random_pop: bool = True, + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.results_cache = [] + self.random_pop = random_pop + assert max_cached_images >= 4, 'The length of cache must >= 4, ' \ + f'but got {max_cached_images}.' + self.max_cached_images = max_cached_images + + @cache_randomness + def get_indexes(self, cache: list) -> list: + """Call function to collect indexes. + + Args: + cache (list): The results cache. + + Returns: + list: indexes. + """ + + indexes = [random.randint(0, len(cache) - 1) for _ in range(3)] + return indexes + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """Mosaic transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. + """ + # cache and pop images + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 4: + return results + + if random.uniform(0, 1) > self.prob: + return results + indices = self.get_indexes(self.results_cache) + mix_results = [copy.deepcopy(self.results_cache[i]) for i in indices] + + # TODO: refactor mosaic to reuse these code. 
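+        # The remainder mirrors ``Mosaic.transform``: paste the current
+        # sample plus the 3 cached samples onto a 2x ``img_scale`` canvas,
+        # then translate, clip and filter the annotations accordingly;
+        # masks are additionally handled here when present.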
+ mosaic_bboxes = [] + mosaic_bboxes_labels = [] + mosaic_ignore_flags = [] + mosaic_masks = [] + with_mask = True if 'gt_masks' in results else False + + if len(results['img'].shape) == 3: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2), 3), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full( + (int(self.img_scale[1] * 2), int(self.img_scale[0] * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) + center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) + center_position = (center_x, center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + results_patch = copy.deepcopy(results) + else: + results_patch = copy.deepcopy(mix_results[i - 1]) + + img_i = results_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(self.img_scale[1] / h_i, + self.img_scale[0] / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + # adjust coordinate + gt_bboxes_i = results_patch['gt_bboxes'] + gt_bboxes_labels_i = results_patch['gt_bboxes_labels'] + gt_ignore_flags_i = results_patch['gt_ignore_flags'] + + padw = x1_p - x1_c + padh = y1_p - y1_c + gt_bboxes_i.rescale_([scale_ratio_i, scale_ratio_i]) + gt_bboxes_i.translate_([padw, padh]) + mosaic_bboxes.append(gt_bboxes_i) + mosaic_bboxes_labels.append(gt_bboxes_labels_i) + mosaic_ignore_flags.append(gt_ignore_flags_i) + if with_mask and results_patch.get('gt_masks', None) is not None: + gt_masks_i = results_patch['gt_masks'] + gt_masks_i = gt_masks_i.rescale(float(scale_ratio_i)) + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padw, + direction='horizontal') + gt_masks_i = gt_masks_i.translate( + out_shape=(int(self.img_scale[0] * 2), + int(self.img_scale[1] * 2)), + offset=padh, + direction='vertical') + mosaic_masks.append(gt_masks_i) + + mosaic_bboxes = mosaic_bboxes[0].cat(mosaic_bboxes, 0) + mosaic_bboxes_labels = np.concatenate(mosaic_bboxes_labels, 0) + mosaic_ignore_flags = np.concatenate(mosaic_ignore_flags, 0) + + if self.bbox_clip_border: + mosaic_bboxes.clip_([2 * self.img_scale[1], 2 * self.img_scale[0]]) + # remove outside bboxes + inside_inds = mosaic_bboxes.is_inside( + [2 * self.img_scale[1], 2 * self.img_scale[0]]).numpy() + mosaic_bboxes = mosaic_bboxes[inside_inds] + mosaic_bboxes_labels = mosaic_bboxes_labels[inside_inds] + mosaic_ignore_flags = mosaic_ignore_flags[inside_inds] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape[:2] + results['gt_bboxes'] = mosaic_bboxes + results['gt_bboxes_labels'] = mosaic_bboxes_labels + results['gt_ignore_flags'] = mosaic_ignore_flags + + if with_mask: + mosaic_masks = mosaic_masks[0].cat(mosaic_masks) + results['gt_masks'] = mosaic_masks[inside_inds] + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += 
f'pad_val={self.pad_val}, ' + repr_str += f'prob={self.prob}, ' + repr_str += f'max_cached_images={self.max_cached_images}, ' + repr_str += f'random_pop={self.random_pop})' + return repr_str + + +@TRANSFORMS.register_module() +class CachedMixUp(BaseTransform): + """Cached mixup data augmentation. + + .. code:: text + + mixup transform + +------------------------------+ + | mixup image | | + | +--------|--------+ | + | | | | | + |---------------+ | | + | | | | + | | image | | + | | | | + | | | | + | |-----------------+ | + | pad | + +------------------------------+ + + The cached mixup transform steps are as follows: + + 1. Append the results from the last transform into the cache. + 2. Another random image is picked from the cache and embedded in + the top left patch(after padding and resizing) + 3. The target of mixup transform is the weighted average of mixup + image and origin image. + + Required Keys: + + - img + - gt_bboxes (np.float32) (optional) + - gt_bboxes_labels (np.int64) (optional) + - gt_ignore_flags (bool) (optional) + - mix_results (List[dict]) + + + Modified Keys: + + - img + - img_shape + - gt_bboxes (optional) + - gt_bboxes_labels (optional) + - gt_ignore_flags (optional) + + + Args: + img_scale (Sequence[int]): Image output size after mixup pipeline. + The shape order should be (width, height). Defaults to (640, 640). + ratio_range (Sequence[float]): Scale ratio of mixup image. + Defaults to (0.5, 1.5). + flip_ratio (float): Horizontal flip ratio of mixup image. + Defaults to 0.5. + pad_val (int): Pad value. Defaults to 114. + max_iters (int): The maximum number of iterations. If the number of + iterations is greater than `max_iters`, but gt_bbox is still + empty, then the iteration is terminated. Defaults to 15. + bbox_clip_border (bool, optional): Whether to clip the objects outside + the border of the image. In some dataset like MOT17, the gt bboxes + are allowed to cross the border of images. Therefore, we don't + need to clip the gt bboxes in these cases. Defaults to True. + max_cached_images (int): The maximum length of the cache. The larger + the cache, the stronger the randomness of this transform. As a + rule of thumb, providing 10 caches for each image suffices for + randomness. Defaults to 20. + random_pop (bool): Whether to randomly pop a result from the cache + when the cache is full. If set to False, use FIFO popping method. + Defaults to True. + prob (float): Probability of applying this transformation. + Defaults to 1.0. + """ + + def __init__(self, + img_scale: Tuple[int, int] = (640, 640), + ratio_range: Tuple[float, float] = (0.5, 1.5), + flip_ratio: float = 0.5, + pad_val: float = 114.0, + max_iters: int = 15, + bbox_clip_border: bool = True, + max_cached_images: int = 20, + random_pop: bool = True, + prob: float = 1.0) -> None: + assert isinstance(img_scale, tuple) + assert max_cached_images >= 2, 'The length of cache must >= 2, ' \ + f'but got {max_cached_images}.' + assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. ' \ + f'got {prob}.' + self.dynamic_scale = img_scale + self.ratio_range = ratio_range + self.flip_ratio = flip_ratio + self.pad_val = pad_val + self.max_iters = max_iters + self.bbox_clip_border = bbox_clip_border + self.results_cache = [] + + self.max_cached_images = max_cached_images + self.random_pop = random_pop + self.prob = prob + + @cache_randomness + def get_indexes(self, cache: list) -> int: + """Call function to collect indexes. + + Args: + cache (list): The result cache. + + Returns: + int: index. 
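+
+        Examples:
+            >>> # Behavior sketch: up to ``max_iters`` cache indices are
+            >>> # sampled until one with at least one gt bbox is found; the
+            >>> # last sampled index is returned either way.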
+ """ + + for i in range(self.max_iters): + index = random.randint(0, len(cache) - 1) + gt_bboxes_i = cache[index]['gt_bboxes'] + if len(gt_bboxes_i) != 0: + break + return index + + @autocast_box_type() + def transform(self, results: dict) -> dict: + """MixUp transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. + """ + # cache and pop images + self.results_cache.append(copy.deepcopy(results)) + if len(self.results_cache) > self.max_cached_images: + if self.random_pop: + index = random.randint(0, len(self.results_cache) - 1) + else: + index = 0 + self.results_cache.pop(index) + + if len(self.results_cache) <= 1: + return results + + if random.uniform(0, 1) > self.prob: + return results + + index = self.get_indexes(self.results_cache) + retrieve_results = copy.deepcopy(self.results_cache[index]) + + # TODO: refactor mixup to reuse these code. + if retrieve_results['gt_bboxes'].shape[0] == 0: + # empty bbox + return results + + retrieve_img = retrieve_results['img'] + with_mask = True if 'gt_masks' in results else False + + jit_factor = random.uniform(*self.ratio_range) + is_flip = random.uniform(0, 1) > self.flip_ratio + + if len(retrieve_img.shape) == 3: + out_img = np.ones( + (self.dynamic_scale[1], self.dynamic_scale[0], 3), + dtype=retrieve_img.dtype) * self.pad_val + else: + out_img = np.ones( + self.dynamic_scale[::-1], + dtype=retrieve_img.dtype) * self.pad_val + + # 1. keep_ratio resize + scale_ratio = min(self.dynamic_scale[1] / retrieve_img.shape[0], + self.dynamic_scale[0] / retrieve_img.shape[1]) + retrieve_img = mmcv.imresize( + retrieve_img, (int(retrieve_img.shape[1] * scale_ratio), + int(retrieve_img.shape[0] * scale_ratio))) + + # 2. paste + out_img[:retrieve_img.shape[0], :retrieve_img.shape[1]] = retrieve_img + + # 3. scale jit + scale_ratio *= jit_factor + out_img = mmcv.imresize(out_img, (int(out_img.shape[1] * jit_factor), + int(out_img.shape[0] * jit_factor))) + + # 4. flip + if is_flip: + out_img = out_img[:, ::-1, :] + + # 5. random crop + ori_img = results['img'] + origin_h, origin_w = out_img.shape[:2] + target_h, target_w = ori_img.shape[:2] + padded_img = np.ones((max(origin_h, target_h), max( + origin_w, target_w), 3)) * self.pad_val + padded_img = padded_img.astype(np.uint8) + padded_img[:origin_h, :origin_w] = out_img + + x_offset, y_offset = 0, 0 + if padded_img.shape[0] > target_h: + y_offset = random.randint(0, padded_img.shape[0] - target_h) + if padded_img.shape[1] > target_w: + x_offset = random.randint(0, padded_img.shape[1] - target_w) + padded_cropped_img = padded_img[y_offset:y_offset + target_h, + x_offset:x_offset + target_w] + + # 6. adjust bbox + retrieve_gt_bboxes = retrieve_results['gt_bboxes'] + retrieve_gt_bboxes.rescale_([scale_ratio, scale_ratio]) + if with_mask: + retrieve_gt_masks = retrieve_results['gt_masks'].rescale( + scale_ratio) + + if self.bbox_clip_border: + retrieve_gt_bboxes.clip_([origin_h, origin_w]) + + if is_flip: + retrieve_gt_bboxes.flip_([origin_h, origin_w], + direction='horizontal') + if with_mask: + retrieve_gt_masks = retrieve_gt_masks.flip() + + # 7. 
filter + cp_retrieve_gt_bboxes = retrieve_gt_bboxes.clone() + cp_retrieve_gt_bboxes.translate_([-x_offset, -y_offset]) + if with_mask: + retrieve_gt_masks = retrieve_gt_masks.translate( + out_shape=(target_h, target_w), + offset=-x_offset, + direction='horizontal') + retrieve_gt_masks = retrieve_gt_masks.translate( + out_shape=(target_h, target_w), + offset=-y_offset, + direction='vertical') + + if self.bbox_clip_border: + cp_retrieve_gt_bboxes.clip_([target_h, target_w]) + + # 8. mix up + ori_img = ori_img.astype(np.float32) + mixup_img = 0.5 * ori_img + 0.5 * padded_cropped_img.astype(np.float32) + + retrieve_gt_bboxes_labels = retrieve_results['gt_bboxes_labels'] + retrieve_gt_ignore_flags = retrieve_results['gt_ignore_flags'] + + mixup_gt_bboxes = cp_retrieve_gt_bboxes.cat( + (results['gt_bboxes'], cp_retrieve_gt_bboxes), dim=0) + mixup_gt_bboxes_labels = np.concatenate( + (results['gt_bboxes_labels'], retrieve_gt_bboxes_labels), axis=0) + mixup_gt_ignore_flags = np.concatenate( + (results['gt_ignore_flags'], retrieve_gt_ignore_flags), axis=0) + if with_mask: + mixup_gt_masks = retrieve_gt_masks.cat( + [results['gt_masks'], retrieve_gt_masks]) + + # remove outside bbox + inside_inds = mixup_gt_bboxes.is_inside([target_h, target_w]).numpy() + mixup_gt_bboxes = mixup_gt_bboxes[inside_inds] + mixup_gt_bboxes_labels = mixup_gt_bboxes_labels[inside_inds] + mixup_gt_ignore_flags = mixup_gt_ignore_flags[inside_inds] + if with_mask: + mixup_gt_masks = mixup_gt_masks[inside_inds] + + results['img'] = mixup_img.astype(np.uint8) + results['img_shape'] = mixup_img.shape[:2] + results['gt_bboxes'] = mixup_gt_bboxes + results['gt_bboxes_labels'] = mixup_gt_bboxes_labels + results['gt_ignore_flags'] = mixup_gt_ignore_flags + if with_mask: + results['gt_masks'] = mixup_gt_masks + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(dynamic_scale={self.dynamic_scale}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'flip_ratio={self.flip_ratio}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'max_iters={self.max_iters}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border}, ' + repr_str += f'max_cached_images={self.max_cached_images}, ' + repr_str += f'random_pop={self.random_pop}, ' + repr_str += f'prob={self.prob})' + return repr_str diff --git a/head_extractor/build/lib/mmdet/datasets/transforms/wrappers.py b/head_extractor/build/lib/mmdet/datasets/transforms/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..3a17711c06bfbd4dc0038dce9ea7796d1476c37e --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/transforms/wrappers.py @@ -0,0 +1,277 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +from mmcv.transforms import BaseTransform, Compose +from mmcv.transforms.utils import cache_random_params, cache_randomness + +from mmdet.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class MultiBranch(BaseTransform): + r"""Multiple branch pipeline wrapper. + + Generate multiple data-augmented versions of the same image. + `MultiBranch` needs to specify the branch names of all + pipelines of the dataset, perform corresponding data augmentation + for the current branch, and return None for other branches, + which ensures the consistency of return format across + different samples. + + Args: + branch_field (list): List of branch names. 
+ branch_pipelines (dict): Dict of different pipeline configs + to be composed. + + Examples: + >>> branch_field = ['sup', 'unsup_teacher', 'unsup_student'] + >>> sup_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict(type='Resize', scale=(1333, 800), keep_ratio=True), + >>> dict(type='RandomFlip', prob=0.5), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> sup=dict(type='PackDetInputs')) + >>> ] + >>> weak_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict(type='Resize', scale=(1333, 800), keep_ratio=True), + >>> dict(type='RandomFlip', prob=0.0), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> sup=dict(type='PackDetInputs')) + >>> ] + >>> strong_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict(type='Resize', scale=(1333, 800), keep_ratio=True), + >>> dict(type='RandomFlip', prob=1.0), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> sup=dict(type='PackDetInputs')) + >>> ] + >>> unsup_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadEmptyAnnotations'), + >>> dict( + >>> type='MultiBranch', + >>> branch_field=branch_field, + >>> unsup_teacher=weak_pipeline, + >>> unsup_student=strong_pipeline) + >>> ] + >>> from mmcv.transforms import Compose + >>> sup_branch = Compose(sup_pipeline) + >>> unsup_branch = Compose(unsup_pipeline) + >>> print(sup_branch) + >>> Compose( + >>> LoadImageFromFile(ignore_empty=False, to_float32=False, color_type='color', imdecode_backend='cv2') # noqa + >>> LoadAnnotations(with_bbox=True, with_label=True, with_mask=False, with_seg=False, poly2mask=True, imdecode_backend='cv2') # noqa + >>> Resize(scale=(1333, 800), scale_factor=None, keep_ratio=True, clip_object_border=True), backend=cv2), interpolation=bilinear) # noqa + >>> RandomFlip(prob=0.5, direction=horizontal) + >>> MultiBranch(branch_pipelines=['sup']) + >>> ) + >>> print(unsup_branch) + >>> Compose( + >>> LoadImageFromFile(ignore_empty=False, to_float32=False, color_type='color', imdecode_backend='cv2') # noqa + >>> LoadEmptyAnnotations(with_bbox=True, with_label=True, with_mask=False, with_seg=False, seg_ignore_label=255) # noqa + >>> MultiBranch(branch_pipelines=['unsup_teacher', 'unsup_student']) + >>> ) + """ + + def __init__(self, branch_field: List[str], + **branch_pipelines: dict) -> None: + self.branch_field = branch_field + self.branch_pipelines = { + branch: Compose(pipeline) + for branch, pipeline in branch_pipelines.items() + } + + def transform(self, results: dict) -> dict: + """Transform function to apply transforms sequentially. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: + + - 'inputs' (Dict[str, obj:`torch.Tensor`]): The forward data of + models from different branches. + - 'data_sample' (Dict[str,obj:`DetDataSample`]): The annotation + info of the sample from different branches. + """ + + multi_results = {} + for branch in self.branch_field: + multi_results[branch] = {'inputs': None, 'data_samples': None} + for branch, pipeline in self.branch_pipelines.items(): + branch_results = pipeline(copy.deepcopy(results)) + # If one branch pipeline returns None, + # it will sample another data from dataset. 
+ if branch_results is None: + return None + multi_results[branch] = branch_results + + format_results = {} + for branch, results in multi_results.items(): + for key in results.keys(): + if format_results.get(key, None) is None: + format_results[key] = {branch: results[key]} + else: + format_results[key][branch] = results[key] + return format_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(branch_pipelines={list(self.branch_pipelines.keys())})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomOrder(Compose): + """Shuffle the transform Sequence.""" + + @cache_randomness + def _random_permutation(self): + return np.random.permutation(len(self.transforms)) + + def transform(self, results: Dict) -> Optional[Dict]: + """Transform function to apply transforms in random order. + + Args: + results (dict): A result dict contains the results to transform. + + Returns: + dict or None: Transformed results. + """ + inds = self._random_permutation() + for idx in inds: + t = self.transforms[idx] + results = t(results) + if results is None: + return None + return results + + def __repr__(self): + """Compute the string representation.""" + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += f'{t.__class__.__name__}, ' + format_string += ')' + return format_string + + +@TRANSFORMS.register_module() +class ProposalBroadcaster(BaseTransform): + """A transform wrapper to apply the wrapped transforms to process both + `gt_bboxes` and `proposals` without adding any codes. It will do the + following steps: + + 1. Scatter the broadcasting targets to a list of inputs of the wrapped + transforms. The type of the list should be list[dict, dict], which + the first is the original inputs, the second is the processing + results that `gt_bboxes` being rewritten by the `proposals`. + 2. Apply ``self.transforms``, with same random parameters, which is + sharing with a context manager. The type of the outputs is a + list[dict, dict]. + 3. Gather the outputs, update the `proposals` in the first item of + the outputs with the `gt_bboxes` in the second . + + Args: + transforms (list, optional): Sequence of transform + object or config dict to be wrapped. Defaults to []. + + Note: The `TransformBroadcaster` in MMCV can achieve the same operation as + `ProposalBroadcaster`, but need to set more complex parameters. + + Examples: + >>> pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='LoadProposals', num_max_proposals=2000), + >>> dict(type='LoadAnnotations', with_bbox=True), + >>> dict( + >>> type='ProposalBroadcaster', + >>> transforms=[ + >>> dict(type='Resize', scale=(1333, 800), + >>> keep_ratio=True), + >>> dict(type='RandomFlip', prob=0.5), + >>> ]), + >>> dict(type='PackDetInputs')] + """ + + def __init__(self, transforms: List[Union[dict, Callable]] = []) -> None: + self.transforms = Compose(transforms) + + def transform(self, results: dict) -> dict: + """Apply wrapped transform functions to process both `gt_bboxes` and + `proposals`. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. + """ + assert results.get('proposals', None) is not None, \ + '`proposals` should be in the results, please delete ' \ + '`ProposalBroadcaster` in your configs, or check whether ' \ + 'you have load proposals successfully.' 
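+
+        # The helpers below implement the scatter -> transform -> gather
+        # steps from the class docstring; `cache_random_params` guarantees
+        # both scattered inputs see identical random parameters.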
+ + inputs = self._process_input(results) + outputs = self._apply_transforms(inputs) + outputs = self._process_output(outputs) + return outputs + + def _process_input(self, data: dict) -> list: + """Scatter the broadcasting targets to a list of inputs of the wrapped + transforms. + + Args: + data (dict): The original input data. + + Returns: + list[dict]: A list of input data. + """ + cp_data = copy.deepcopy(data) + cp_data['gt_bboxes'] = cp_data['proposals'] + scatters = [data, cp_data] + return scatters + + def _apply_transforms(self, inputs: list) -> list: + """Apply ``self.transforms``. + + Args: + inputs (list[dict, dict]): list of input data. + + Returns: + list[dict]: The output of the wrapped pipeline. + """ + assert len(inputs) == 2 + ctx = cache_random_params + with ctx(self.transforms): + output_scatters = [self.transforms(_input) for _input in inputs] + return output_scatters + + def _process_output(self, output_scatters: list) -> dict: + """Gathering and renaming data items. + + Args: + output_scatters (list[dict, dict]): The output of the wrapped + pipeline. + + Returns: + dict: Updated result dict. + """ + assert isinstance(output_scatters, list) and \ + isinstance(output_scatters[0], dict) and \ + len(output_scatters) == 2 + outputs = output_scatters[0] + outputs['proposals'] = output_scatters[1]['gt_bboxes'] + return outputs diff --git a/head_extractor/build/lib/mmdet/datasets/utils.py b/head_extractor/build/lib/mmdet/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d794eb4b06ec9db56ff3a5fc7b817d1d9332a989 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/utils.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmcv.transforms import LoadImageFromFile + +from mmdet.datasets.transforms import LoadAnnotations, LoadPanopticAnnotations +from mmdet.registry import TRANSFORMS + + +def get_loading_pipeline(pipeline): + """Only keep loading image and annotations related configuration. + + Args: + pipeline (list[dict]): Data pipeline configs. + + Returns: + list[dict]: The new pipeline list with only keep + loading image and annotations related configuration. + + Examples: + >>> pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations', with_bbox=True), + ... dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + ... dict(type='RandomFlip', flip_ratio=0.5), + ... dict(type='Normalize', **img_norm_cfg), + ... dict(type='Pad', size_divisor=32), + ... dict(type='DefaultFormatBundle'), + ... dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) + ... ] + >>> expected_pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations', with_bbox=True) + ... ] + >>> assert expected_pipelines ==\ + ... get_loading_pipeline(pipelines) + """ + loading_pipeline_cfg = [] + for cfg in pipeline: + obj_cls = TRANSFORMS.get(cfg['type']) + # TODO:use more elegant way to distinguish loading modules + if obj_cls is not None and obj_cls in (LoadImageFromFile, + LoadAnnotations, + LoadPanopticAnnotations): + loading_pipeline_cfg.append(cfg) + assert len(loading_pipeline_cfg) == 2, \ + 'The data pipeline in your config file must include ' \ + 'loading image and annotations related pipeline.' 
+ return loading_pipeline_cfg diff --git a/head_extractor/build/lib/mmdet/datasets/v3det.py b/head_extractor/build/lib/mmdet/datasets/v3det.py new file mode 100644 index 0000000000000000000000000000000000000000..25bfe3bc718841143653c54954240186c3376955 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/v3det.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path +from typing import Optional + +import mmengine + +from mmdet.registry import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class V3DetDataset(CocoDataset): + """Dataset for V3Det.""" + + METAINFO = { + 'classes': None, + 'palette': None, + } + + def __init__( + self, + *args, + metainfo: Optional[dict] = None, + data_root: str = '', + label_file='annotations/category_name_13204_v3det_2023_v1.txt', # noqa + **kwargs) -> None: + class_names = tuple( + mmengine.list_from_file(os.path.join(data_root, label_file))) + if metainfo is None: + metainfo = {'classes': class_names} + super().__init__( + *args, data_root=data_root, metainfo=metainfo, **kwargs) diff --git a/head_extractor/build/lib/mmdet/datasets/voc.py b/head_extractor/build/lib/mmdet/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..65e73f2f0bd4f2b16d5237cd3b5f342e44cf0438 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/voc.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .xml_style import XMLDataset + + +@DATASETS.register_module() +class VOCDataset(XMLDataset): + """Dataset for PASCAL VOC.""" + + METAINFO = { + 'classes': + ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'), + # palette is a list of color tuples, which is used for visualization. + 'palette': [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192), + (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255), + (153, 69, 1), (120, 166, 157), (0, 182, 199), + (0, 226, 252), (182, 182, 255), (0, 0, 230), (220, 20, 60), + (163, 255, 0), (0, 82, 0), (3, 95, 161), (0, 80, 100), + (183, 130, 88)] + } + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if 'VOC2007' in self.sub_data_root: + self._metainfo['dataset_type'] = 'VOC2007' + elif 'VOC2012' in self.sub_data_root: + self._metainfo['dataset_type'] = 'VOC2012' + else: + self._metainfo['dataset_type'] = None diff --git a/head_extractor/build/lib/mmdet/datasets/wider_face.py b/head_extractor/build/lib/mmdet/datasets/wider_face.py new file mode 100644 index 0000000000000000000000000000000000000000..62c7fff869ab970b6f96908a998ba6feb25ea205 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/wider_face.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import xml.etree.ElementTree as ET + +from mmengine.dist import is_main_process +from mmengine.fileio import get_local_path, list_from_file +from mmengine.utils import ProgressBar + +from mmdet.registry import DATASETS +from mmdet.utils.typing_utils import List, Union +from .xml_style import XMLDataset + + +@DATASETS.register_module() +class WIDERFaceDataset(XMLDataset): + """Reader for the WIDER Face dataset in PASCAL VOC format. 
+ + Conversion scripts can be found in + https://github.com/sovrasov/wider-face-pascal-voc-annotations + """ + METAINFO = {'classes': ('face', ), 'palette': [(0, 255, 0)]} + + def load_data_list(self) -> List[dict]: + """Load annotation from XML style ann_file. + + Returns: + list[dict]: Annotation info from XML file. + """ + assert self._metainfo.get('classes', None) is not None, \ + 'classes in `XMLDataset` can not be None.' + self.cat2label = { + cat: i + for i, cat in enumerate(self._metainfo['classes']) + } + + data_list = [] + img_ids = list_from_file(self.ann_file, backend_args=self.backend_args) + + # loading process takes around 10 mins + if is_main_process(): + prog_bar = ProgressBar(len(img_ids)) + + for img_id in img_ids: + raw_img_info = {} + raw_img_info['img_id'] = img_id + raw_img_info['file_name'] = f'{img_id}.jpg' + parsed_data_info = self.parse_data_info(raw_img_info) + data_list.append(parsed_data_info) + + if is_main_process(): + prog_bar.update() + return data_list + + def parse_data_info(self, img_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + img_info (dict): Raw image information, usually it includes + `img_id`, `file_name`, and `xml_path`. + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + data_info = {} + img_id = img_info['img_id'] + xml_path = osp.join(self.data_prefix['img'], 'Annotations', + f'{img_id}.xml') + data_info['img_id'] = img_id + data_info['xml_path'] = xml_path + + # deal with xml file + with get_local_path( + xml_path, backend_args=self.backend_args) as local_path: + raw_ann_info = ET.parse(local_path) + root = raw_ann_info.getroot() + size = root.find('size') + width = int(size.find('width').text) + height = int(size.find('height').text) + folder = root.find('folder').text + img_path = osp.join(self.data_prefix['img'], folder, + img_info['file_name']) + data_info['img_path'] = img_path + + data_info['height'] = height + data_info['width'] = width + + # Coordinates are in range [0, width - 1 or height - 1] + data_info['instances'] = self._parse_instance_info( + raw_ann_info, minus_one=False) + return data_info diff --git a/head_extractor/build/lib/mmdet/datasets/xml_style.py b/head_extractor/build/lib/mmdet/datasets/xml_style.py new file mode 100644 index 0000000000000000000000000000000000000000..06045ea0092238abdac9622511b336586858f8f5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/xml_style.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import xml.etree.ElementTree as ET +from typing import List, Optional, Union + +import mmcv +from mmengine.fileio import get, get_local_path, list_from_file + +from mmdet.registry import DATASETS +from .base_det_dataset import BaseDetDataset + + +@DATASETS.register_module() +class XMLDataset(BaseDetDataset): + """XML dataset for detection. + + Args: + img_subdir (str): Subdir where images are stored. Default: JPEGImages. + ann_subdir (str): Subdir where annotations are. Default: Annotations. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. 
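+
+    Examples:
+        >>> # a minimal sketch; the paths and the two-class tuple are
+        >>> # illustrative values, not shipped defaults
+        >>> dataset_cfg = dict(
+        >>>     type='XMLDataset',
+        >>>     ann_file='VOC2007/ImageSets/Main/trainval.txt',
+        >>>     data_prefix=dict(sub_data_root='VOC2007/'),
+        >>>     metainfo=dict(classes=('cat', 'dog')))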
+ """ + + def __init__(self, + img_subdir: str = 'JPEGImages', + ann_subdir: str = 'Annotations', + **kwargs) -> None: + self.img_subdir = img_subdir + self.ann_subdir = ann_subdir + super().__init__(**kwargs) + + @property + def sub_data_root(self) -> str: + """Return the sub data root.""" + return self.data_prefix.get('sub_data_root', '') + + def load_data_list(self) -> List[dict]: + """Load annotation from XML style ann_file. + + Returns: + list[dict]: Annotation info from XML file. + """ + assert self._metainfo.get('classes', None) is not None, \ + '`classes` in `XMLDataset` can not be None.' + self.cat2label = { + cat: i + for i, cat in enumerate(self._metainfo['classes']) + } + + data_list = [] + img_ids = list_from_file(self.ann_file, backend_args=self.backend_args) + for img_id in img_ids: + file_name = osp.join(self.img_subdir, f'{img_id}.jpg') + xml_path = osp.join(self.sub_data_root, self.ann_subdir, + f'{img_id}.xml') + + raw_img_info = {} + raw_img_info['img_id'] = img_id + raw_img_info['file_name'] = file_name + raw_img_info['xml_path'] = xml_path + + parsed_data_info = self.parse_data_info(raw_img_info) + data_list.append(parsed_data_info) + return data_list + + @property + def bbox_min_size(self) -> Optional[int]: + """Return the minimum size of bounding boxes in the images.""" + if self.filter_cfg is not None: + return self.filter_cfg.get('bbox_min_size', None) + else: + return None + + def parse_data_info(self, img_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + Args: + img_info (dict): Raw image information, usually it includes + `img_id`, `file_name`, and `xml_path`. + + Returns: + Union[dict, List[dict]]: Parsed annotation. + """ + data_info = {} + img_path = osp.join(self.sub_data_root, img_info['file_name']) + data_info['img_path'] = img_path + data_info['img_id'] = img_info['img_id'] + data_info['xml_path'] = img_info['xml_path'] + + # deal with xml file + with get_local_path( + img_info['xml_path'], + backend_args=self.backend_args) as local_path: + raw_ann_info = ET.parse(local_path) + root = raw_ann_info.getroot() + size = root.find('size') + if size is not None: + width = int(size.find('width').text) + height = int(size.find('height').text) + else: + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, backend='cv2') + height, width = img.shape[:2] + del img, img_bytes + + data_info['height'] = height + data_info['width'] = width + + data_info['instances'] = self._parse_instance_info( + raw_ann_info, minus_one=True) + + return data_info + + def _parse_instance_info(self, + raw_ann_info: ET, + minus_one: bool = True) -> List[dict]: + """parse instance information. + + Args: + raw_ann_info (ElementTree): ElementTree object. + minus_one (bool): Whether to subtract 1 from the coordinates. + Defaults to True. + + Returns: + List[dict]: List of instances. 
+ """ + instances = [] + for obj in raw_ann_info.findall('object'): + instance = {} + name = obj.find('name').text + if name not in self._metainfo['classes']: + continue + difficult = obj.find('difficult') + difficult = 0 if difficult is None else int(difficult.text) + bnd_box = obj.find('bndbox') + bbox = [ + int(float(bnd_box.find('xmin').text)), + int(float(bnd_box.find('ymin').text)), + int(float(bnd_box.find('xmax').text)), + int(float(bnd_box.find('ymax').text)) + ] + + # VOC needs to subtract 1 from the coordinates + if minus_one: + bbox = [x - 1 for x in bbox] + + ignore = False + if self.bbox_min_size is not None: + assert not self.test_mode + w = bbox[2] - bbox[0] + h = bbox[3] - bbox[1] + if w < self.bbox_min_size or h < self.bbox_min_size: + ignore = True + if difficult or ignore: + instance['ignore_flag'] = 1 + else: + instance['ignore_flag'] = 0 + instance['bbox'] = bbox + instance['bbox_label'] = self.cat2label[name] + instances.append(instance) + return instances + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. + + Returns: + List[dict]: Filtered results. + """ + if self.test_mode: + return self.data_list + + filter_empty_gt = self.filter_cfg.get('filter_empty_gt', False) \ + if self.filter_cfg is not None else False + min_size = self.filter_cfg.get('min_size', 0) \ + if self.filter_cfg is not None else 0 + + valid_data_infos = [] + for i, data_info in enumerate(self.data_list): + width = data_info['width'] + height = data_info['height'] + if filter_empty_gt and len(data_info['instances']) == 0: + continue + if min(width, height) >= min_size: + valid_data_infos.append(data_info) + + return valid_data_infos diff --git a/head_extractor/build/lib/mmdet/datasets/youtube_vis_dataset.py b/head_extractor/build/lib/mmdet/datasets/youtube_vis_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..38c3d3909f1b8fd795c181546094056c54c9c4b2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/datasets/youtube_vis_dataset.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import DATASETS +from .base_video_dataset import BaseVideoDataset + + +@DATASETS.register_module() +class YouTubeVISDataset(BaseVideoDataset): + """YouTube VIS dataset for video instance segmentation. + + Args: + dataset_version (str): Select dataset year version. + """ + + def __init__(self, dataset_version: str, *args, **kwargs): + self.set_dataset_classes(dataset_version) + super().__init__(*args, **kwargs) + + @classmethod + def set_dataset_classes(cls, dataset_version: str) -> None: + """Pass the category of the corresponding year to metainfo. + + Args: + dataset_version (str): Select dataset year version. 
+ """ + classes_2019_version = ('person', 'giant_panda', 'lizard', 'parrot', + 'skateboard', 'sedan', 'ape', 'dog', 'snake', + 'monkey', 'hand', 'rabbit', 'duck', 'cat', + 'cow', 'fish', 'train', 'horse', 'turtle', + 'bear', 'motorbike', 'giraffe', 'leopard', + 'fox', 'deer', 'owl', 'surfboard', 'airplane', + 'truck', 'zebra', 'tiger', 'elephant', + 'snowboard', 'boat', 'shark', 'mouse', 'frog', + 'eagle', 'earless_seal', 'tennis_racket') + + classes_2021_version = ('airplane', 'bear', 'bird', 'boat', 'car', + 'cat', 'cow', 'deer', 'dog', 'duck', + 'earless_seal', 'elephant', 'fish', + 'flying_disc', 'fox', 'frog', 'giant_panda', + 'giraffe', 'horse', 'leopard', 'lizard', + 'monkey', 'motorbike', 'mouse', 'parrot', + 'person', 'rabbit', 'shark', 'skateboard', + 'snake', 'snowboard', 'squirrel', 'surfboard', + 'tennis_racket', 'tiger', 'train', 'truck', + 'turtle', 'whale', 'zebra') + + if dataset_version == '2019': + cls.METAINFO = dict(classes=classes_2019_version) + elif dataset_version == '2021': + cls.METAINFO = dict(classes=classes_2021_version) + else: + raise NotImplementedError('Not supported YouTubeVIS dataset' + f'version: {dataset_version}') diff --git a/head_extractor/build/lib/mmdet/engine/__init__.py b/head_extractor/build/lib/mmdet/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c91ace6ffa20948af572d3a0fd594e8a0b091775 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import * # noqa: F401, F403 +from .optimizers import * # noqa: F401, F403 +from .runner import * # noqa: F401, F403 +from .schedulers import * # noqa: F401, F403 diff --git a/head_extractor/build/lib/mmdet/engine/hooks/__init__.py b/head_extractor/build/lib/mmdet/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..889fa557adef87e2251c625a7353503226beb079 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/hooks/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .checkloss_hook import CheckInvalidLossHook +from .mean_teacher_hook import MeanTeacherHook +from .memory_profiler_hook import MemoryProfilerHook +from .num_class_check_hook import NumClassCheckHook +from .pipeline_switch_hook import PipelineSwitchHook +from .set_epoch_info_hook import SetEpochInfoHook +from .sync_norm_hook import SyncNormHook +from .utils import trigger_visualization_hook +from .visualization_hook import (DetVisualizationHook, + GroundingVisualizationHook, + TrackVisualizationHook) +from .yolox_mode_switch_hook import YOLOXModeSwitchHook + +__all__ = [ + 'YOLOXModeSwitchHook', 'SyncNormHook', 'CheckInvalidLossHook', + 'SetEpochInfoHook', 'MemoryProfilerHook', 'DetVisualizationHook', + 'NumClassCheckHook', 'MeanTeacherHook', 'trigger_visualization_hook', + 'PipelineSwitchHook', 'TrackVisualizationHook', + 'GroundingVisualizationHook' +] diff --git a/head_extractor/build/lib/mmdet/engine/hooks/checkloss_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/checkloss_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..3ebfcd5dfcd7ae329399723d3a9c0fc0a0d722ef --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/hooks/checkloss_hook.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional
+
+import torch
+from mmengine.hooks import Hook
+from mmengine.runner import Runner
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class CheckInvalidLossHook(Hook):
+    """Check invalid loss hook.
+
+    This hook will regularly check whether the loss is valid
+    during training.
+
+    Args:
+        interval (int): Checking interval (every k iterations).
+            Default: 50.
+    """
+
+    def __init__(self, interval: int = 50) -> None:
+        self.interval = interval
+
+    def after_train_iter(self,
+                         runner: Runner,
+                         batch_idx: int,
+                         data_batch: Optional[dict] = None,
+                         outputs: Optional[dict] = None) -> None:
+        """Regularly check whether the loss is valid every n iterations.
+
+        Args:
+            runner (:obj:`Runner`): The runner of the training process.
+            batch_idx (int): The index of the current batch in the train loop.
+            data_batch (dict, optional): Data from dataloader.
+                Defaults to None.
+            outputs (dict, optional): Outputs from model. Defaults to None.
+        """
+        if self.every_n_train_iters(runner, self.interval):
+            # `logger.info(...)` returns None, so it must not be used as the
+            # assert message; a plain string keeps the check meaningful.
+            assert torch.isfinite(outputs['loss']), \
+                'loss becomes infinite or NaN!'
diff --git a/head_extractor/build/lib/mmdet/engine/hooks/mean_teacher_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/mean_teacher_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..b924c0a5934248d05e7ce1add50e7574b739b9c7
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/engine/hooks/mean_teacher_hook.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+from mmengine.hooks import Hook
+from mmengine.model import is_model_wrapper
+from mmengine.runner import Runner
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class MeanTeacherHook(Hook):
+    """Mean Teacher Hook.
+
+    Mean Teacher is an efficient semi-supervised learning method in
+    `Mean Teacher <https://arxiv.org/abs/1703.01780>`_.
+    This method requires two models with exactly the same structure,
+    as the student model and the teacher model, respectively.
+    The student model updates its parameters through gradient descent,
+    and the teacher model updates its parameters through an
+    exponential moving average of the student model.
+    Compared with the student model, the teacher model
+    is smoother and accumulates more knowledge.
+
+    Args:
+        momentum (float): The momentum used for updating the teacher's
+            parameters, with the formula:
+            `teacher = (1-momentum) * teacher + momentum * student`.
+            Defaults to 0.001.
+        interval (int): Update the teacher's parameters every interval
+            iterations. Defaults to 1.
+        skip_buffer (bool): Whether to skip the model buffers, such as
+            batchnorm running stats (running_mean, running_var); if True,
+            the EMA update is not performed on the buffers. Defaults to True.
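+
+    Examples:
+        >>> # a minimal sketch, assuming the runner's model exposes
+        >>> # `teacher` and `student` submodules (checked in `before_train`)
+        >>> custom_hooks = [
+        >>>     dict(type='MeanTeacherHook', momentum=0.001, interval=1)
+        >>> ]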
+ """ + + def __init__(self, + momentum: float = 0.001, + interval: int = 1, + skip_buffer=True) -> None: + assert 0 < momentum < 1 + self.momentum = momentum + self.interval = interval + self.skip_buffers = skip_buffer + + def before_train(self, runner: Runner) -> None: + """To check that teacher model and student model exist.""" + model = runner.model + if is_model_wrapper(model): + model = model.module + assert hasattr(model, 'teacher') + assert hasattr(model, 'student') + # only do it at initial stage + if runner.iter == 0: + self.momentum_update(model, 1) + + def after_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[dict] = None) -> None: + """Update teacher's parameter every self.interval iterations.""" + if (runner.iter + 1) % self.interval != 0: + return + model = runner.model + if is_model_wrapper(model): + model = model.module + self.momentum_update(model, self.momentum) + + def momentum_update(self, model: nn.Module, momentum: float) -> None: + """Compute the moving average of the parameters using exponential + moving average.""" + if self.skip_buffers: + for (src_name, src_parm), (dst_name, dst_parm) in zip( + model.student.named_parameters(), + model.teacher.named_parameters()): + dst_parm.data.mul_(1 - momentum).add_( + src_parm.data, alpha=momentum) + else: + for (src_parm, + dst_parm) in zip(model.student.state_dict().values(), + model.teacher.state_dict().values()): + # exclude num_tracking + if dst_parm.dtype.is_floating_point: + dst_parm.data.mul_(1 - momentum).add_( + src_parm.data, alpha=momentum) diff --git a/head_extractor/build/lib/mmdet/engine/hooks/memory_profiler_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/memory_profiler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..3dcdcae0b669ade46026d28c46b35f35d90b504b --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/hooks/memory_profiler_hook.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmdet.registry import HOOKS +from mmdet.structures import DetDataSample + + +@HOOKS.register_module() +class MemoryProfilerHook(Hook): + """Memory profiler hook recording memory information including virtual + memory, swap memory, and the memory of the current process. + + Args: + interval (int): Checking interval (every k iterations). + Default: 50. + """ + + def __init__(self, interval: int = 50) -> None: + try: + from psutil import swap_memory, virtual_memory + self._swap_memory = swap_memory + self._virtual_memory = virtual_memory + except ImportError: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + + try: + from memory_profiler import memory_usage + self._memory_usage = memory_usage + except ImportError: + raise ImportError( + 'memory_profiler is not installed, please install it by: ' + 'pip install memory_profiler') + + self.interval = interval + + def _record_memory_information(self, runner: Runner) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. 
+ """ + # in Byte + virtual_memory = self._virtual_memory() + swap_memory = self._swap_memory() + # in MB + process_memory = self._memory_usage()[0] + factor = 1024 * 1024 + runner.logger.info( + 'Memory information ' + 'available_memory: ' + f'{round(virtual_memory.available / factor)} MB, ' + 'used_memory: ' + f'{round(virtual_memory.used / factor)} MB, ' + f'memory_utilization: {virtual_memory.percent} %, ' + 'available_swap_memory: ' + f'{round((swap_memory.total - swap_memory.used) / factor)}' + ' MB, ' + f'used_swap_memory: {round(swap_memory.used / factor)} MB, ' + f'swap_memory_utilization: {swap_memory.percent} %, ' + 'current_process_memory: ' + f'{round(process_memory)} MB') + + def after_train_iter(self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[dict] = None) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict, optional): Data from dataloader. + Defaults to None. + outputs (dict, optional): Outputs from model. Defaults to None. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + self._record_memory_information(runner) + + def after_val_iter( + self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[Sequence[DetDataSample]] = None) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict, optional): Data from dataloader. + Defaults to None. + outputs (Sequence[:obj:`DetDataSample`], optional): + Outputs from model. Defaults to None. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + self._record_memory_information(runner) + + def after_test_iter( + self, + runner: Runner, + batch_idx: int, + data_batch: Optional[dict] = None, + outputs: Optional[Sequence[DetDataSample]] = None) -> None: + """Regularly record memory information. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict, optional): Data from dataloader. + Defaults to None. + outputs (Sequence[:obj:`DetDataSample`], optional): + Outputs from model. Defaults to None. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + self._record_memory_information(runner) diff --git a/head_extractor/build/lib/mmdet/engine/hooks/num_class_check_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/num_class_check_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..6588473acfbd3ffe8e80eb163aa7ee449332e6b8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/hooks/num_class_check_hook.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import VGG +from mmengine.hooks import Hook +from mmengine.runner import Runner + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class NumClassCheckHook(Hook): + """Check whether the `num_classes` in head matches the length of `classes` + in `dataset.metainfo`.""" + + def _check_head(self, runner: Runner, mode: str) -> None: + """Check whether the `num_classes` in head matches the length of + `classes` in `dataset.metainfo`. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. 
+ """ + assert mode in ['train', 'val'] + model = runner.model + dataset = runner.train_dataloader.dataset if mode == 'train' else \ + runner.val_dataloader.dataset + if dataset.metainfo.get('classes', None) is None: + runner.logger.warning( + f'Please set `classes` ' + f'in the {dataset.__class__.__name__} `metainfo` and' + f'check if it is consistent with the `num_classes` ' + f'of head') + else: + classes = dataset.metainfo['classes'] + assert type(classes) is not str, \ + (f'`classes` in {dataset.__class__.__name__}' + f'should be a tuple of str.' + f'Add comma if number of classes is 1 as ' + f'classes = ({classes},)') + from mmdet.models.roi_heads.mask_heads import FusedSemanticHead + for name, module in model.named_modules(): + if hasattr(module, 'num_classes') and not name.endswith( + 'rpn_head') and not isinstance( + module, (VGG, FusedSemanticHead)): + assert module.num_classes == len(classes), \ + (f'The `num_classes` ({module.num_classes}) in ' + f'{module.__class__.__name__} of ' + f'{model.__class__.__name__} does not matches ' + f'the length of `classes` ' + f'{len(classes)}) in ' + f'{dataset.__class__.__name__}') + + def before_train_epoch(self, runner: Runner) -> None: + """Check whether the training dataset is compatible with head. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. + """ + self._check_head(runner, 'train') + + def before_val_epoch(self, runner: Runner) -> None: + """Check whether the dataset in val epoch is compatible with head. + + Args: + runner (:obj:`Runner`): The runner of the training or evaluation + process. + """ + self._check_head(runner, 'val') diff --git a/head_extractor/build/lib/mmdet/engine/hooks/pipeline_switch_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/pipeline_switch_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..a5abd897803b11793ebace86e45aac8f59938545 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/hooks/pipeline_switch_hook.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms import Compose +from mmengine.hooks import Hook + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class PipelineSwitchHook(Hook): + """Switch data pipeline at switch_epoch. + + Args: + switch_epoch (int): switch pipeline at this epoch. + switch_pipeline (list[dict]): the pipeline to switch to. + """ + + def __init__(self, switch_epoch, switch_pipeline): + self.switch_epoch = switch_epoch + self.switch_pipeline = switch_pipeline + self._restart_dataloader = False + self._has_switched = False + + def before_train_epoch(self, runner): + """switch pipeline.""" + epoch = runner.epoch + train_loader = runner.train_dataloader + if epoch >= self.switch_epoch and not self._has_switched: + runner.logger.info('Switch pipeline now!') + # The dataset pipeline cannot be updated when persistent_workers + # is True, so we need to force the dataloader's multi-process + # restart. This is a very hacky approach. + train_loader.dataset.pipeline = Compose(self.switch_pipeline) + if hasattr(train_loader, 'persistent_workers' + ) and train_loader.persistent_workers is True: + train_loader._DataLoader__initialized = False + train_loader._iterator = None + self._restart_dataloader = True + self._has_switched = True + else: + # Once the restart is complete, we need to restore + # the initialization flag. 
+            if self._restart_dataloader:
+                train_loader._DataLoader__initialized = True
diff --git a/head_extractor/build/lib/mmdet/engine/hooks/set_epoch_info_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/set_epoch_info_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..183f3167445dc0818e4fa37bdd2049d3876ed031
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/engine/hooks/set_epoch_info_hook.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks import Hook
+from mmengine.model.wrappers import is_model_wrapper
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class SetEpochInfoHook(Hook):
+    """Set runner's epoch information to the model."""
+
+    def before_train_epoch(self, runner):
+        epoch = runner.epoch
+        model = runner.model
+        if is_model_wrapper(model):
+            model = model.module
+        model.set_epoch(epoch)
diff --git a/head_extractor/build/lib/mmdet/engine/hooks/sync_norm_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/sync_norm_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1734380c83157c911568098abfce761fb3c9a1f
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/engine/hooks/sync_norm_hook.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+
+from mmengine.dist import get_dist_info
+from mmengine.hooks import Hook
+from torch import nn
+
+from mmdet.registry import HOOKS
+from mmdet.utils import all_reduce_dict
+
+
+def get_norm_states(module: nn.Module) -> OrderedDict:
+    """Get the state_dict of batch norms in the module."""
+    async_norm_states = OrderedDict()
+    for name, child in module.named_modules():
+        if isinstance(child, nn.modules.batchnorm._NormBase):
+            for k, v in child.state_dict().items():
+                async_norm_states['.'.join([name, k])] = v
+    return async_norm_states
+
+
+@HOOKS.register_module()
+class SyncNormHook(Hook):
+    """Synchronize Norm states before validation, currently used in YOLOX."""
+
+    def before_val_epoch(self, runner):
+        """Synchronizing norm."""
+        module = runner.model
+        _, world_size = get_dist_info()
+        if world_size == 1:
+            return
+        norm_states = get_norm_states(module)
+        if len(norm_states) == 0:
+            return
+        # TODO: use `all_reduce_dict` in mmengine
+        norm_states = all_reduce_dict(norm_states, op='mean')
+        module.load_state_dict(norm_states, strict=False)
diff --git a/head_extractor/build/lib/mmdet/engine/hooks/utils.py b/head_extractor/build/lib/mmdet/engine/hooks/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d267cfe77be163c0520568b7b7936f4453914aab
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/engine/hooks/utils.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def trigger_visualization_hook(cfg, args):
+    default_hooks = cfg.default_hooks
+    if 'visualization' in default_hooks:
+        visualization_hook = default_hooks['visualization']
+        # Turn on visualization
+        visualization_hook['draw'] = True
+        if args.show:
+            visualization_hook['show'] = True
+            visualization_hook['wait_time'] = args.wait_time
+        if args.show_dir:
+            visualization_hook['test_out_dir'] = args.show_dir
+    else:
+        raise RuntimeError(
+            'VisualizationHook must be included in default_hooks. '
+ 'refer to usage ' + '"visualization=dict(type=\'VisualizationHook\')"') + + return cfg diff --git a/head_extractor/build/lib/mmdet/engine/hooks/visualization_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/visualization_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..3408186b6ef9c4195745b0c740519541572d27d2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/hooks/visualization_hook.py @@ -0,0 +1,515 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Optional, Sequence + +import mmcv +import numpy as np +from mmengine.fileio import get +from mmengine.hooks import Hook +from mmengine.runner import Runner +from mmengine.utils import mkdir_or_exist +from mmengine.visualization import Visualizer + +from mmdet.datasets.samplers import TrackImgSampler +from mmdet.registry import HOOKS +from mmdet.structures import DetDataSample, TrackDataSample +from mmdet.structures.bbox import BaseBoxes +from mmdet.visualization.palette import _get_adaptive_scales + + +@HOOKS.register_module() +class DetVisualizationHook(Hook): + """Detection Visualization Hook. Used to visualize validation and testing + process prediction results. + + In the testing phase: + + 1. If ``show`` is True, it means that only the prediction results are + visualized without storing data, so ``vis_backends`` needs to + be excluded. + 2. If ``test_out_dir`` is specified, it means that the prediction results + need to be saved to ``test_out_dir``. In order to avoid vis_backends + also storing data, so ``vis_backends`` needs to be excluded. + 3. ``vis_backends`` takes effect if the user does not specify ``show`` + and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or + TensorboardVisBackend to store the prediction result in Wandb or + Tensorboard. + + Args: + draw (bool): whether to draw prediction results. If it is False, + it means that no drawing will be done. Defaults to False. + interval (int): The interval of visualization. Defaults to 50. + score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + test_out_dir (str, optional): directory where painted images + will be saved in testing process. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + """ + + def __init__(self, + draw: bool = False, + interval: int = 50, + score_thr: float = 0.3, + show: bool = False, + wait_time: float = 0., + test_out_dir: Optional[str] = None, + backend_args: dict = None): + self._visualizer: Visualizer = Visualizer.get_current_instance() + self.interval = interval + self.score_thr = score_thr + self.show = show + if self.show: + # No need to think about vis backends. + self._visualizer._vis_backends = {} + warnings.warn('The show is True, it means that only ' + 'the prediction results are visualized ' + 'without storing data, so vis_backends ' + 'needs to be excluded.') + + self.wait_time = wait_time + self.backend_args = backend_args + self.draw = draw + self.test_out_dir = test_out_dir + self._test_index = 0 + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[DetDataSample]) -> None: + """Run after every ``self.interval`` validation iterations. + + Args: + runner (:obj:`Runner`): The runner of the validation process. 
+ batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`DetDataSample`]]): A batch of data samples + that contain annotations and predictions. + """ + if self.draw is False: + return + + # There is no guarantee that the same batch of images + # is visualized for each evaluation. + total_curr_iter = runner.iter + batch_idx + + # Visualize only the first data + img_path = outputs[0].img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + if total_curr_iter % self.interval == 0: + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'val_img', + img, + data_sample=outputs[0], + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + step=total_curr_iter) + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[DetDataSample]) -> None: + """Run after every testing iterations. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples + that contain annotations and predictions. + """ + if self.draw is False: + return + + if self.test_out_dir is not None: + self.test_out_dir = osp.join(runner.work_dir, runner.timestamp, + self.test_out_dir) + mkdir_or_exist(self.test_out_dir) + + for data_sample in outputs: + self._test_index += 1 + + img_path = data_sample.img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + out_file = None + if self.test_out_dir is not None: + out_file = osp.basename(img_path) + out_file = osp.join(self.test_out_dir, out_file) + + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'test_img', + img, + data_sample=data_sample, + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + out_file=out_file, + step=self._test_index) + + +@HOOKS.register_module() +class TrackVisualizationHook(Hook): + """Tracking Visualization Hook. Used to visualize validation and testing + process prediction results. + + In the testing phase: + + 1. If ``show`` is True, it means that only the prediction results are + visualized without storing data, so ``vis_backends`` needs to + be excluded. + 2. If ``test_out_dir`` is specified, it means that the prediction results + need to be saved to ``test_out_dir``. In order to avoid vis_backends + also storing data, so ``vis_backends`` needs to be excluded. + 3. ``vis_backends`` takes effect if the user does not specify ``show`` + and `test_out_dir``. You can set ``vis_backends`` to WandbVisBackend or + TensorboardVisBackend to store the prediction result in Wandb or + Tensorboard. + + Args: + draw (bool): whether to draw prediction results. If it is False, + it means that no drawing will be done. Defaults to False. + frame_interval (int): The interval of visualization. Defaults to 30. + score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + test_out_dir (str, optional): directory where painted images + will be saved in testing process. + backend_args (dict): Arguments to instantiate a file client. + Defaults to ``None``. 
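+
+    Examples:
+        >>> # a minimal sketch; `test_out_dir` is an illustrative path
+        >>> default_hooks = dict(
+        >>>     visualization=dict(
+        >>>         type='TrackVisualizationHook',
+        >>>         draw=True,
+        >>>         frame_interval=30,
+        >>>         test_out_dir='vis_results'))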
+ """ + + def __init__(self, + draw: bool = False, + frame_interval: int = 30, + score_thr: float = 0.3, + show: bool = False, + wait_time: float = 0., + test_out_dir: Optional[str] = None, + backend_args: dict = None) -> None: + self._visualizer: Visualizer = Visualizer.get_current_instance() + self.frame_interval = frame_interval + self.score_thr = score_thr + self.show = show + if self.show: + # No need to think about vis backends. + self._visualizer._vis_backends = {} + warnings.warn('The show is True, it means that only ' + 'the prediction results are visualized ' + 'without storing data, so vis_backends ' + 'needs to be excluded.') + + self.wait_time = wait_time + self.backend_args = backend_args + self.draw = draw + self.test_out_dir = test_out_dir + self.image_idx = 0 + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[TrackDataSample]) -> None: + """Run after every ``self.interval`` validation iteration. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model. + """ + if self.draw is False: + return + + assert len(outputs) == 1, \ + 'only batch_size=1 is supported while validating.' + + sampler = runner.val_dataloader.sampler + if isinstance(sampler, TrackImgSampler): + if self.every_n_inner_iters(batch_idx, self.frame_interval): + total_curr_iter = runner.iter + batch_idx + track_data_sample = outputs[0] + self.visualize_single_image(track_data_sample[0], + total_curr_iter) + else: + # video visualization DefaultSampler + if self.every_n_inner_iters(batch_idx, 1): + track_data_sample = outputs[0] + video_length = len(track_data_sample) + + for frame_id in range(video_length): + if frame_id % self.frame_interval == 0: + total_curr_iter = runner.iter + self.image_idx + \ + frame_id + img_data_sample = track_data_sample[frame_id] + self.visualize_single_image(img_data_sample, + total_curr_iter) + self.image_idx = self.image_idx + video_length + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[TrackDataSample]) -> None: + """Run after every testing iteration. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`TrackDataSample`]): Outputs from model. + """ + if self.draw is False: + return + + assert len(outputs) == 1, \ + 'only batch_size=1 is supported while testing.' 
+ + if self.test_out_dir is not None: + self.test_out_dir = osp.join(runner.work_dir, runner.timestamp, + self.test_out_dir) + mkdir_or_exist(self.test_out_dir) + + sampler = runner.test_dataloader.sampler + if isinstance(sampler, TrackImgSampler): + if self.every_n_inner_iters(batch_idx, self.frame_interval): + track_data_sample = outputs[0] + self.visualize_single_image(track_data_sample[0], batch_idx) + else: + # video visualization DefaultSampler + if self.every_n_inner_iters(batch_idx, 1): + track_data_sample = outputs[0] + video_length = len(track_data_sample) + + for frame_id in range(video_length): + if frame_id % self.frame_interval == 0: + img_data_sample = track_data_sample[frame_id] + self.visualize_single_image(img_data_sample, + self.image_idx + frame_id) + self.image_idx = self.image_idx + video_length + + def visualize_single_image(self, img_data_sample: DetDataSample, + step: int) -> None: + """ + Args: + img_data_sample (DetDataSample): single image output. + step (int): The index of the current image. + """ + img_path = img_data_sample.img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + out_file = None + if self.test_out_dir is not None: + video_name = img_path.split('/')[-3] + mkdir_or_exist(osp.join(self.test_out_dir, video_name)) + out_file = osp.join(self.test_out_dir, video_name, + osp.basename(img_path)) + + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'test_img', + img, + data_sample=img_data_sample, + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + out_file=out_file, + step=step) + + +def draw_all_character(visualizer, characters, w): + start_index = 2 + y_index = 5 + for char in characters: + if isinstance(char, str): + visualizer.draw_texts( + str(char), + positions=np.array([start_index, y_index]), + colors=(0, 0, 0), + font_families='monospace') + start_index += len(char) * 8 + else: + visualizer.draw_texts( + str(char[0]), + positions=np.array([start_index, y_index]), + colors=char[1], + font_families='monospace') + start_index += len(char[0]) * 8 + + if start_index > w - 10: + start_index = 2 + y_index += 15 + + drawn_text = visualizer.get_image() + return drawn_text + + +@HOOKS.register_module() +class GroundingVisualizationHook(DetVisualizationHook): + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[DetDataSample]) -> None: + """Run after every testing iterations. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`DetDataSample`]): A batch of data samples + that contain annotations and predictions. 
+ """ + if self.draw is False: + return + + if self.test_out_dir is not None: + self.test_out_dir = osp.join(runner.work_dir, runner.timestamp, + self.test_out_dir) + mkdir_or_exist(self.test_out_dir) + + for data_sample in outputs: + data_sample = data_sample.cpu() + + self._test_index += 1 + + img_path = data_sample.img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + out_file = None + if self.test_out_dir is not None: + out_file = osp.basename(img_path) + out_file = osp.join(self.test_out_dir, out_file) + + text = data_sample.text + if isinstance(text, str): # VG + gt_instances = data_sample.gt_instances + tokens_positive = data_sample.tokens_positive + if 'phrase_ids' in data_sample: + # flickr30k + gt_labels = data_sample.phrase_ids + else: + gt_labels = gt_instances.labels + gt_bboxes = gt_instances.get('bboxes', None) + if gt_bboxes is not None and isinstance(gt_bboxes, BaseBoxes): + gt_instances.bboxes = gt_bboxes.tensor + print(gt_labels, tokens_positive, gt_bboxes, img_path) + pred_instances = data_sample.pred_instances + pred_instances = pred_instances[ + pred_instances.scores > self.score_thr] + pred_labels = pred_instances.labels + pred_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + + max_label = 0 + if len(gt_labels) > 0: + max_label = max(gt_labels) + if len(pred_labels) > 0: + max_label = max(max(pred_labels), max_label) + + max_label = int(max(max_label, 0)) + palette = np.random.randint(0, 256, size=(max_label + 1, 3)) + bbox_palette = [tuple(c) for c in palette] + # bbox_palette = get_palette('random', max_label + 1) + if len(gt_labels) >= len(pred_labels): + colors = [bbox_palette[label] for label in gt_labels] + else: + colors = [bbox_palette[label] for label in pred_labels] + + self._visualizer.set_image(img) + + for label, bbox, color in zip(gt_labels, gt_bboxes, colors): + self._visualizer.draw_bboxes( + bbox, edge_colors=color, face_colors=color, alpha=0.3) + self._visualizer.draw_bboxes( + bbox, edge_colors=color, alpha=1) + + drawn_img = self._visualizer.get_image() + + new_image = np.ones( + (100, img.shape[1], 3), dtype=np.uint8) * 255 + self._visualizer.set_image(new_image) + + if tokens_positive == -1: # REC + gt_tokens_positive = [[]] + else: # Phrase Grounding + gt_tokens_positive = [ + tokens_positive[label] for label in gt_labels + ] + split_by_character = [char for char in text] + characters = [] + start_index = 0 + end_index = 0 + for w in split_by_character: + end_index += len(w) + is_find = False + for i, positive in enumerate(gt_tokens_positive): + for p in positive: + if start_index >= p[0] and end_index <= p[1]: + characters.append([w, colors[i]]) + is_find = True + break + if is_find: + break + if not is_find: + characters.append([w, (0, 0, 0)]) + start_index = end_index + + drawn_text = draw_all_character(self._visualizer, characters, + img.shape[1]) + drawn_gt_img = np.concatenate((drawn_img, drawn_text), axis=0) + + self._visualizer.set_image(img) + + for label, bbox, color in zip(pred_labels, pred_bboxes, + colors): + self._visualizer.draw_bboxes( + bbox, edge_colors=color, face_colors=color, alpha=0.3) + self._visualizer.draw_bboxes( + bbox, edge_colors=color, alpha=1) + print(pred_labels, pred_bboxes, pred_scores, colors) + areas = (pred_bboxes[:, 3] - pred_bboxes[:, 1]) * ( + pred_bboxes[:, 2] - pred_bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + score = [str(round(s.item(), 2)) for s in pred_scores] + font_sizes = [int(13 * 
scales[i]) for i in range(len(scales))] + self._visualizer.draw_texts( + score, + pred_bboxes[:, :2].int(), + colors=(255, 255, 255), + font_sizes=font_sizes, + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }] * len(pred_bboxes)) + + drawn_img = self._visualizer.get_image() + + new_image = np.ones( + (100, img.shape[1], 3), dtype=np.uint8) * 255 + self._visualizer.set_image(new_image) + drawn_text = draw_all_character(self._visualizer, characters, + img.shape[1]) + drawn_pred_img = np.concatenate((drawn_img, drawn_text), + axis=0) + drawn_img = np.concatenate((drawn_gt_img, drawn_pred_img), + axis=1) + + if self.show: + self._visualizer.show( + drawn_img, + win_name=osp.basename(img_path), + wait_time=self.wait_time) + if out_file is not None: + mmcv.imwrite(drawn_img[..., ::-1], out_file) + else: + self.add_image('test_img', drawn_img, self._test_index) + else: # OD + self._visualizer.add_datasample( + osp.basename(img_path) if self.show else 'test_img', + img, + data_sample=data_sample, + show=self.show, + wait_time=self.wait_time, + pred_score_thr=self.score_thr, + out_file=out_file, + step=self._test_index) diff --git a/head_extractor/build/lib/mmdet/engine/hooks/yolox_mode_switch_hook.py b/head_extractor/build/lib/mmdet/engine/hooks/yolox_mode_switch_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..05a2c69068bedd1c6fb3836e1fc34568e9f6bc83 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/hooks/yolox_mode_switch_hook.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper + +from mmdet.registry import HOOKS + + +@HOOKS.register_module() +class YOLOXModeSwitchHook(Hook): + """Switch the mode of YOLOX during training. + + This hook turns off the mosaic and mixup data augmentation and switches + to use L1 loss in bbox_head. + + Args: + num_last_epochs (int): The number of epochs at the end of training + during which the data augmentation is turned off and L1 loss is + switched on. Defaults to 15. + skip_type_keys (Sequence[str], optional): Sequence of pipeline type + strings to be skipped. Defaults to ('Mosaic', 'RandomAffine', + 'MixUp'). + """ + + def __init__( + self, + num_last_epochs: int = 15, + skip_type_keys: Sequence[str] = ('Mosaic', 'RandomAffine', 'MixUp') + ) -> None: + self.num_last_epochs = num_last_epochs + self.skip_type_keys = skip_type_keys + self._restart_dataloader = False + self._has_switched = False + + def before_train_epoch(self, runner) -> None: + """Close mosaic and mixup augmentation and switch to L1 loss.""" + epoch = runner.epoch + train_loader = runner.train_dataloader + model = runner.model + # TODO: refactor after mmengine using model wrapper + if is_model_wrapper(model): + model = model.module + epoch_to_be_switched = ((epoch + 1) >= + runner.max_epochs - self.num_last_epochs) + if epoch_to_be_switched and not self._has_switched: + runner.logger.info('No mosaic and mixup aug now!') + # The dataset pipeline cannot be updated when persistent_workers + # is True, so we need to force the dataloader's multi-process + # restart. This is a very hacky approach.
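+ # When ``persistent_workers=True`` the worker processes are cached + # across epochs and would keep using the old pipeline, so the + # ``_DataLoader__initialized`` flag and the cached ``_iterator`` are + # reset below to force the workers to be recreated.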
+ train_loader.dataset.update_skip_type_keys(self.skip_type_keys) + if hasattr(train_loader, 'persistent_workers' + ) and train_loader.persistent_workers is True: + train_loader._DataLoader__initialized = False + train_loader._iterator = None + self._restart_dataloader = True + runner.logger.info('Add additional L1 loss now!') + if hasattr(model, 'detector'): + model.detector.bbox_head.use_l1 = True + else: + model.bbox_head.use_l1 = True + self._has_switched = True + else: + # Once the restart is complete, we need to restore + # the initialization flag. + if self._restart_dataloader: + train_loader._DataLoader__initialized = True diff --git a/head_extractor/build/lib/mmdet/engine/optimizers/__init__.py b/head_extractor/build/lib/mmdet/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..83db069ee34cad0888bbf388d3cc7030ba49bbbb --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/optimizers/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .layer_decay_optimizer_constructor import \ + LearningRateDecayOptimizerConstructor + +__all__ = ['LearningRateDecayOptimizerConstructor'] diff --git a/head_extractor/build/lib/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py b/head_extractor/build/lib/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..73028a0aef698d63dcba8c4935d6ef6c577d0f46 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/optimizers/layer_decay_optimizer_constructor.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +from typing import List + +import torch.nn as nn +from mmengine.dist import get_dist_info +from mmengine.logging import MMLogger +from mmengine.optim import DefaultOptimWrapperConstructor + +from mmdet.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +def get_layer_id_for_convnext(var_name, max_layer_id): + """Get the layer id to set the different learning rates in ``layer_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_layer_id (int): Maximum layer id. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + stage_id = int(var_name.split('.')[2]) + if stage_id == 0: + layer_id = 0 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + block_id = int(var_name.split('.')[3]) + if stage_id == 0: + layer_id = 1 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + block_id // 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + else: + return max_layer_id + 1 + + +def get_stage_id_for_convnext(var_name, max_stage_id): + """Get the stage id to set the different learning rates in ``stage_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_stage_id (int): Maximum stage id. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. 
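+ + Examples: + >>> # e.g. a layer-scale parameter in the third ConvNeXt stage + >>> get_stage_id_for_convnext('backbone.stages.2.0.gamma', 12) + 3 + >>> # any non-backbone parameter falls into group ``max_stage_id - 1`` + >>> get_stage_id_for_convnext('bbox_head.fc_cls.weight', 12) + 11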
+ """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + return 0 + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + return stage_id + 1 + else: + return max_stage_id - 1 + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor): + # Different learning rates are set for different layers of backbone. + # Note: Currently, this optimizer constructor is built for ConvNeXt. + + def add_params(self, params: List[dict], module: nn.Module, + **kwargs) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + """ + logger = MMLogger.get_current_instance() + + parameter_groups = {} + logger.info(f'self.paramwise_cfg is {self.paramwise_cfg}') + num_layers = self.paramwise_cfg.get('num_layers') + 2 + decay_rate = self.paramwise_cfg.get('decay_rate') + decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise') + logger.info('Build LearningRateDecayOptimizerConstructor ' + f'{decay_type} {decay_rate} - {num_layers}') + weight_decay = self.base_wd + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith('.bias') or name in ( + 'pos_embed', 'cls_token'): + group_name = 'no_decay' + this_weight_decay = 0. + else: + group_name = 'decay' + this_weight_decay = weight_decay + if 'layer_wise' in decay_type: + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_convnext( + name, self.paramwise_cfg.get('num_layers')) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + elif decay_type == 'stage_wise': + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_stage_id_for_convnext(name, num_layers) + logger.info(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + group_name = f'layer_{layer_id}_{group_name}' + + if group_name not in parameter_groups: + scale = decay_rate**(num_layers - layer_id - 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + logger.info(f'Param groups = {json.dumps(to_display, indent=2)}') + params.extend(parameter_groups.values()) diff --git a/head_extractor/build/lib/mmdet/engine/runner/__init__.py b/head_extractor/build/lib/mmdet/engine/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e8bcce4448e48e2d64354ba6770f9f426fb3d869 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/runner/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .loops import TeacherStudentValLoop + +__all__ = ['TeacherStudentValLoop'] diff --git a/head_extractor/build/lib/mmdet/engine/runner/loops.py b/head_extractor/build/lib/mmdet/engine/runner/loops.py new file mode 100644 index 0000000000000000000000000000000000000000..afe53afa5c80facf3ba6c224bd358e0859dade32 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/runner/loops.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.model import is_model_wrapper +from mmengine.runner import ValLoop + +from mmdet.registry import LOOPS + + +@LOOPS.register_module() +class TeacherStudentValLoop(ValLoop): + """Loop for validation of model teacher and student.""" + + def run(self): + """Launch validation for model teacher and student.""" + self.runner.call_hook('before_val') + self.runner.call_hook('before_val_epoch') + self.runner.model.eval() + + model = self.runner.model + if is_model_wrapper(model): + model = model.module + assert hasattr(model, 'teacher') + assert hasattr(model, 'student') + + predict_on = model.semi_test_cfg.get('predict_on', None) + multi_metrics = dict() + for _predict_on in ['teacher', 'student']: + model.semi_test_cfg['predict_on'] = _predict_on + for idx, data_batch in enumerate(self.dataloader): + self.run_iter(idx, data_batch) + # compute metrics + metrics = self.evaluator.evaluate(len(self.dataloader.dataset)) + multi_metrics.update( + {'/'.join((_predict_on, k)): v + for k, v in metrics.items()}) + model.semi_test_cfg['predict_on'] = predict_on + + self.runner.call_hook('after_val_epoch', metrics=multi_metrics) + self.runner.call_hook('after_val') diff --git a/head_extractor/build/lib/mmdet/engine/schedulers/__init__.py b/head_extractor/build/lib/mmdet/engine/schedulers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..01261646fa8255c643e86ba0517019760a50d387 --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/schedulers/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .quadratic_warmup import (QuadraticWarmupLR, QuadraticWarmupMomentum, + QuadraticWarmupParamScheduler) + +__all__ = [ + 'QuadraticWarmupParamScheduler', 'QuadraticWarmupMomentum', + 'QuadraticWarmupLR' +] diff --git a/head_extractor/build/lib/mmdet/engine/schedulers/quadratic_warmup.py b/head_extractor/build/lib/mmdet/engine/schedulers/quadratic_warmup.py new file mode 100644 index 0000000000000000000000000000000000000000..639b47854887786bf3f81d6d0a375033d190d91e --- /dev/null +++ b/head_extractor/build/lib/mmdet/engine/schedulers/quadratic_warmup.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.optim.scheduler.lr_scheduler import LRSchedulerMixin +from mmengine.optim.scheduler.momentum_scheduler import MomentumSchedulerMixin +from mmengine.optim.scheduler.param_scheduler import INF, _ParamScheduler +from torch.optim import Optimizer + +from mmdet.registry import PARAM_SCHEDULERS + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupParamScheduler(_ParamScheduler): + r"""Warm up the parameter value of each parameter group by quadratic + formula: + + .. math:: + + X_{t} = X_{t-1} + \frac{2t+1}{{(end-begin)}^{2}} \times X_{base} + + Args: + optimizer (Optimizer): Wrapped optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. 
+ last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + def __init__(self, + optimizer: Optimizer, + param_name: str, + begin: int = 0, + end: int = INF, + last_step: int = -1, + by_epoch: bool = True, + verbose: bool = False): + if end >= INF: + raise ValueError('``end`` must be less than infinity. ' + 'Please set the ``end`` parameter of ' + '``QuadraticWarmupParamScheduler`` to ' + 'the step at which warmup ends.') + self.total_iters = end - begin + super().__init__( + optimizer=optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + begin=0, + end=INF, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ f'`epoch_length` must be a positive integer, ' \ f'but got {epoch_length}.' + by_epoch = False + begin = begin * epoch_length + if end != INF: + end = end * epoch_length + return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + if self.last_step == 0: + return [ + base_value * (2 * self.last_step + 1) / self.total_iters**2 + for base_value in self.base_values + ] + + return [ + group[self.param_name] + base_value * + (2 * self.last_step + 1) / self.total_iters**2 + for base_value, group in zip(self.base_values, + self.optimizer.param_groups) + ] + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupLR(LRSchedulerMixin, QuadraticWarmupParamScheduler): + """Warm up the learning rate of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + +@PARAM_SCHEDULERS.register_module() +class QuadraticWarmupMomentum(MomentumSchedulerMixin, + QuadraticWarmupParamScheduler): + """Warm up the momentum value of each parameter group by quadratic formula. + + Args: + optimizer (Optimizer): Wrapped optimizer. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False.
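+ + Note: + This scheduler is the momentum counterpart of + :class:`QuadraticWarmupLR`; both reuse the warmup rule implemented + in ``QuadraticWarmupParamScheduler``.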
+ """ diff --git a/head_extractor/build/lib/mmdet/evaluation/__init__.py b/head_extractor/build/lib/mmdet/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..126dea092eb1a4affab9fbe3fb043f5b373607ee --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .evaluator import * # noqa: F401,F403 +from .functional import * # noqa: F401,F403 +from .metrics import * # noqa: F401,F403 diff --git a/head_extractor/build/lib/mmdet/evaluation/evaluator/__init__.py b/head_extractor/build/lib/mmdet/evaluation/evaluator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b13fe99548e7e2e4c6e196a2da22b9c8cbec8a3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/evaluator/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .multi_datasets_evaluator import MultiDatasetsEvaluator + +__all__ = ['MultiDatasetsEvaluator'] diff --git a/head_extractor/build/lib/mmdet/evaluation/evaluator/multi_datasets_evaluator.py b/head_extractor/build/lib/mmdet/evaluation/evaluator/multi_datasets_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..5cff1cf210e644e11b348f3aa757119ac579170d --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/evaluator/multi_datasets_evaluator.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import OrderedDict +from typing import Sequence, Union + +from mmengine.dist import (broadcast_object_list, collect_results, + is_main_process) +from mmengine.evaluator import BaseMetric, Evaluator +from mmengine.evaluator.metric import _to_cpu +from mmengine.registry import EVALUATOR + +from mmdet.utils import ConfigType + + +@EVALUATOR.register_module() +class MultiDatasetsEvaluator(Evaluator): + """Wrapper class to compose class: `ConcatDataset` and multiple + :class:`BaseMetric` instances. + The metrics will be evaluated on each dataset slice separately. The name of + the each metric is the concatenation of the dataset prefix, the metric + prefix and the key of metric - e.g. + `dataset_prefix/metric_prefix/accuracy`. + + Args: + metrics (dict or BaseMetric or Sequence): The config of metrics. + dataset_prefixes (Sequence[str]): The prefix of each dataset. The + length of this sequence should be the same as the length of the + datasets. + """ + + def __init__(self, metrics: Union[ConfigType, BaseMetric, Sequence], + dataset_prefixes: Sequence[str]) -> None: + super().__init__(metrics) + self.dataset_prefixes = dataset_prefixes + self._setups = False + + def _get_cumulative_sizes(self): + # ConcatDataset have a property `cumulative_sizes` + if isinstance(self.dataset_meta, Sequence): + dataset_slices = self.dataset_meta[0]['cumulative_sizes'] + if not self._setups: + self._setups = True + for dataset_meta, metric in zip(self.dataset_meta, + self.metrics): + metric.dataset_meta = dataset_meta + else: + dataset_slices = self.dataset_meta['cumulative_sizes'] + return dataset_slices + + def evaluate(self, size: int) -> dict: + """Invoke ``evaluate`` method of each metric and collect the metrics + dictionary. + + Args: + size (int): Length of the entire validation dataset. When batch + size > 1, the dataloader may pad some data samples to make + sure all ranks have the same length of dataset slice. The + ``collect_results`` function will drop the padded data based on + this size. 
+ + Returns: + dict: Evaluation results of all metrics. The keys are the names + of the metrics, and the values are corresponding results. + """ + metrics_results = OrderedDict() + dataset_slices = self._get_cumulative_sizes() + assert len(dataset_slices) == len(self.dataset_prefixes) + + for dataset_prefix, start, end, metric in zip( + self.dataset_prefixes, [0] + dataset_slices[:-1], + dataset_slices, self.metrics): + if len(metric.results) == 0: + warnings.warn( + f'{metric.__class__.__name__} got empty `self.results`. ' + 'Please ensure that the processed results are properly ' + 'added into `self.results` in the `process` method.') + + results = collect_results(metric.results, size, + metric.collect_device) + + if is_main_process(): + # cast all tensors in results list to cpu + results = _to_cpu(results) + _metrics = metric.compute_metrics( + results[start:end]) # type: ignore + + if metric.prefix: + final_prefix = '/'.join((dataset_prefix, metric.prefix)) + else: + final_prefix = dataset_prefix + metric_results = { + '/'.join((final_prefix, k)): v + for k, v in _metrics.items() + } + + # Check metric name conflicts + for name in metric_results.keys(): + if name in metrics_results: + raise ValueError( + 'There are multiple evaluation results with ' + f'the same metric name {name}. Please make ' + 'sure all metrics have different prefixes.') + metrics_results.update(metric_results) + metric.results.clear() + if is_main_process(): + metrics_results = [metrics_results] + else: + metrics_results = [None] # type: ignore + broadcast_object_list(metrics_results) + return metrics_results[0] diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/__init__.py b/head_extractor/build/lib/mmdet/evaluation/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..96d58ebd3ab0dd714a6f361622a7faf2a09486cb --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/functional/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved.
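+# Functional-style evaluation helpers: IoU, recall and mAP computation, +# dataset class-name utilities, panoptic tools and the YouTube-VIS API +# are re-exported here for convenient access.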
+from .bbox_overlaps import bbox_overlaps +from .cityscapes_utils import evaluateImgLists +from .class_names import (cityscapes_classes, coco_classes, + coco_panoptic_classes, dataset_aliases, get_classes, + imagenet_det_classes, imagenet_vid_classes, + objects365v1_classes, objects365v2_classes, + oid_challenge_classes, oid_v6_classes, voc_classes) +from .mean_ap import average_precision, eval_map, print_map_summary +from .panoptic_utils import (INSTANCE_OFFSET, pq_compute_multi_core, + pq_compute_single_core) +from .recall import (eval_recalls, plot_iou_recall, plot_num_recall, + print_recall_summary) +from .ytvis import YTVIS +from .ytviseval import YTVISeval + +__all__ = [ + 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', + 'coco_classes', 'cityscapes_classes', 'dataset_aliases', 'get_classes', + 'average_precision', 'eval_map', 'print_map_summary', 'eval_recalls', + 'print_recall_summary', 'plot_num_recall', 'plot_iou_recall', + 'oid_v6_classes', 'oid_challenge_classes', 'INSTANCE_OFFSET', + 'pq_compute_single_core', 'pq_compute_multi_core', 'bbox_overlaps', + 'objects365v1_classes', 'objects365v2_classes', 'coco_panoptic_classes', + 'evaluateImgLists', 'YTVIS', 'YTVISeval' +] diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/bbox_overlaps.py b/head_extractor/build/lib/mmdet/evaluation/functional/bbox_overlaps.py new file mode 100644 index 0000000000000000000000000000000000000000..5d6eb82fcfc8d5444dd2a13b7d95b978f8206a55 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/functional/bbox_overlaps.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +def bbox_overlaps(bboxes1, + bboxes2, + mode='iou', + eps=1e-6, + use_legacy_coordinate=False): + """Calculate the IoUs between each bbox of bboxes1 and bboxes2. + + Args: + bboxes1 (ndarray): Shape (n, 4) + bboxes2 (ndarray): Shape (k, 4) + mode (str): IOU (intersection over union) or IOF (intersection + over foreground). + eps (float): The minimum value of the union area, used to avoid + division by zero. Default: 1e-6. + use_legacy_coordinate (bool): Whether to use the coordinate system of + mmdet v1.x, in which width and height are + calculated as `x2 - x1 + 1` and `y2 - y1 + 1` respectively. + Note that when this function is used in `VOCDataset`, it should be + True to align with the official implementation + `http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar` + Default: False. + + Returns: + ious (ndarray): Shape (n, k) + """ + + assert mode in ['iou', 'iof'] + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1.
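+ # In the legacy (mmdet v1.x / VOC) convention boxes are inclusive of + # both endpoints, so widths and heights are `x2 - x1 + 1` and + # `y2 - y1 + 1`; ``extra_length`` carries that one-pixel offset into + # the area and overlap computations below.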
+ bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + extra_length) * ( + bboxes1[:, 3] - bboxes1[:, 1] + extra_length) + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + extra_length) * ( + bboxes2[:, 3] - bboxes2[:, 1] + extra_length) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start + extra_length, 0) * np.maximum( + y_end - y_start + extra_length, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + union = np.maximum(union, eps) + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/cityscapes_utils.py b/head_extractor/build/lib/mmdet/evaluation/functional/cityscapes_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5ced3680deefe333af7cca3675a6359c02dd96f8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/functional/cityscapes_utils.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) https://github.com/mcordts/cityscapesScripts +# A wrapper of `cityscapesscripts` which supports loading groundtruth +# image from `backend_args`. +import json +import os +import sys +from pathlib import Path +from typing import Optional, Union + +import mmcv +import numpy as np +from mmengine.fileio import get + +try: + import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval # noqa: E501 + from cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling import \ CArgs # noqa: E501 + from cityscapesscripts.evaluation.instance import Instance + from cityscapesscripts.helpers.csHelpers import (id2label, labels, + writeDict2JSON) + HAS_CITYSCAPESAPI = True +except ImportError: + CArgs = object + HAS_CITYSCAPESAPI = False + + +def evaluateImgLists(prediction_list: list, + groundtruth_list: list, + args: CArgs, + backend_args: Optional[dict] = None, + dump_matches: bool = False) -> dict: + """A wrapper of :obj:`cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.evaluateImgLists`. Supports loading + the groundtruth image from a file backend. + Args: + prediction_list (list): A list of prediction txt files. + groundtruth_list (list): A list of groundtruth image files. + args (CArgs): A global object setting in + :obj:`cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling` + backend_args (dict, optional): Arguments to instantiate the + file backend corresponding to the URI prefix. Defaults to None. + dump_matches (bool): Whether to dump matches.json. Defaults to False. + Returns: + dict: The computed metric. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`. '
+ 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + # determine labels of interest + CSEval.setInstanceLabels(args) + # get dictionary of all ground truth instances + gt_instances = getGtInstances( + groundtruth_list, args, backend_args=backend_args) + # match predictions and ground truth + matches = matchGtWithPreds(prediction_list, groundtruth_list, gt_instances, + args, backend_args) + if dump_matches: + CSEval.writeDict2JSON(matches, 'matches.json') + # evaluate matches + apScores = CSEval.evaluateMatches(matches, args) + # averages + avgDict = CSEval.computeAverages(apScores, args) + # result dict + resDict = CSEval.prepareJSONDataForResults(avgDict, apScores, args) + if args.JSONOutput: + # create output folder if necessary + path = os.path.dirname(args.exportFile) + CSEval.ensurePath(path) + # Write APs to JSON + CSEval.writeDict2JSON(resDict, args.exportFile) + + CSEval.printResults(avgDict, args) + + return resDict + + +def matchGtWithPreds(prediction_list: list, + groundtruth_list: list, + gt_instances: dict, + args: CArgs, + backend_args=None): + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.matchGtWithPreds``. Support loading + groundtruth image from file backend. + Args: + prediction_list (list): A list of prediction txt file. + groundtruth_list (list): A list of groundtruth image file. + gt_instances (dict): Groundtruth dict. + args (CArgs): A global object setting in + obj:``cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling`` + backend_args (dict, optional): Arguments to instantiate the + preifx of uri corresponding backend. Defaults to None. + Returns: + dict: The processed prediction and groundtruth result. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + matches: dict = dict() + if not args.quiet: + print(f'Matching {len(prediction_list)} pairs of images...') + + count = 0 + for (pred, gt) in zip(prediction_list, groundtruth_list): + # Read input files + gt_image = readGTImage(gt, backend_args) + pred_info = readPredInfo(pred) + # Get and filter ground truth instances + unfiltered_instances = gt_instances[gt] + cur_gt_instances_orig = CSEval.filterGtInstances( + unfiltered_instances, args) + + # Try to assign all predictions + (cur_gt_instances, + cur_pred_instances) = CSEval.assignGt2Preds(cur_gt_instances_orig, + gt_image, pred_info, args) + + # append to global dict + matches[gt] = {} + matches[gt]['groundTruth'] = cur_gt_instances + matches[gt]['prediction'] = cur_pred_instances + + count += 1 + if not args.quiet: + print(f'\rImages Processed: {count}', end=' ') + sys.stdout.flush() + + if not args.quiet: + print('') + + return matches + + +def readGTImage(image_file: Union[str, Path], + backend_args: Optional[dict] = None) -> np.ndarray: + """Read an image from path. + + Same as obj:``cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling.readGTImage``, but support loading + groundtruth image from file backend. + Args: + image_file (str or Path): Either a str or pathlib.Path. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. 
If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + Returns: + np.ndarray: The groundtruth image. + """ + img_bytes = get(image_file, backend_args=backend_args) + img = mmcv.imfrombytes(img_bytes, flag='unchanged', backend='pillow') + return img + + +def readPredInfo(prediction_file: str) -> dict: + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.readPredInfo``. + Args: + prediction_file (str): The prediction txt file. + Returns: + dict: The processed prediction results. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + printError = CSEval.printError + + predInfo = {} + if (not os.path.isfile(prediction_file)): + printError(f"Infofile '{prediction_file}' " + 'for the predictions not found.') + with open(prediction_file) as f: + for line in f: + splittedLine = line.split(' ') + if len(splittedLine) != 3: + printError('Invalid prediction file. Expected content: ' + 'relPathPrediction1 labelIDPrediction1 ' + 'confidencePrediction1') + if os.path.isabs(splittedLine[0]): + printError('Invalid prediction file. First entry in each ' + 'line must be a relative path.') + + filename = os.path.join( + os.path.dirname(prediction_file), splittedLine[0]) + + imageInfo = {} + imageInfo['labelID'] = int(float(splittedLine[1])) + imageInfo['conf'] = float(splittedLine[2]) # type: ignore + predInfo[filename] = imageInfo + + return predInfo + + +def getGtInstances(groundtruth_list: list, + args: CArgs, + backend_args: Optional[dict] = None) -> dict: + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.getGtInstances``. Support loading + groundtruth image from file backend. + Args: + groundtruth_list (list): A list of groundtruth image file. + args (CArgs): A global object setting in + obj:``cityscapesscripts.evaluation. + evalInstanceLevelSemanticLabeling`` + backend_args (dict, optional): Arguments to instantiate the + preifx of uri corresponding backend. Defaults to None. + Returns: + dict: The computed metric. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + # if there is a global statistics json, then load it + if (os.path.isfile(args.gtInstancesFile)): + if not args.quiet: + print('Loading ground truth instances from JSON.') + with open(args.gtInstancesFile) as json_file: + gt_instances = json.load(json_file) + # otherwise create it + else: + if (not args.quiet): + print('Creating ground truth instances from png files.') + gt_instances = instances2dict( + groundtruth_list, args, backend_args=backend_args) + writeDict2JSON(gt_instances, args.gtInstancesFile) + + return gt_instances + + +def instances2dict(image_list: list, + args: CArgs, + backend_args: Optional[dict] = None) -> dict: + """A wrapper of obj:``cityscapesscripts.evaluation. + + evalInstanceLevelSemanticLabeling.instances2dict``. Support loading + groundtruth image from file backend. + Args: + image_list (list): A list of image file. + args (CArgs): A global object setting in + obj:``cityscapesscripts.evaluation. 
+ evalInstanceLevelSemanticLabeling`` + backend_args (dict, optional): Arguments to instantiate the + preifx of uri corresponding backend. Defaults to None. + Returns: + dict: The processed groundtruth results. + """ + if not HAS_CITYSCAPESAPI: + raise RuntimeError('Failed to import `cityscapesscripts`.' + 'Please try to install official ' + 'cityscapesscripts by ' + '"pip install cityscapesscripts"') + imgCount = 0 + instanceDict = {} + + if not isinstance(image_list, list): + image_list = [image_list] + + if not args.quiet: + print(f'Processing {len(image_list)} images...') + + for image_name in image_list: + # Load image + img_bytes = get(image_name, backend_args=backend_args) + imgNp = mmcv.imfrombytes(img_bytes, flag='unchanged', backend='pillow') + + # Initialize label categories + instances: dict = {} + for label in labels: + instances[label.name] = [] + + # Loop through all instance ids in instance image + for instanceId in np.unique(imgNp): + instanceObj = Instance(imgNp, instanceId) + + instances[id2label[instanceObj.labelID].name].append( + instanceObj.toDict()) + + instanceDict[image_name] = instances + imgCount += 1 + + if not args.quiet: + print(f'\rImages Processed: {imgCount}', end=' ') + sys.stdout.flush() + + return instanceDict diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/class_names.py b/head_extractor/build/lib/mmdet/evaluation/functional/class_names.py new file mode 100644 index 0000000000000000000000000000000000000000..623a89cfdc06ab04831afd3423d5f725acc881f0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/functional/class_names.py @@ -0,0 +1,762 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.utils import is_str + + +def wider_face_classes() -> list: + """Class names of WIDERFace.""" + return ['face'] + + +def voc_classes() -> list: + """Class names of PASCAL VOC.""" + return [ + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' + ] + + +def imagenet_det_classes() -> list: + """Class names of ImageNet Det.""" + return [ + 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', + 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', + 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', + 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', + 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', + 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', + 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', + 'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew', + 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', + 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', + 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig', + 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', + 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', + 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', + 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', + 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', + 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', + 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', + 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', + 'miniskirt', 'monkey', 
'motorcycle', 'mushroom', 'nail', 'neck_brace', + 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', + 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', + 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', + 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse', + 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator', + 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler', + 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver', + 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile', + 'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula', + 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer', + 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine', + 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie', + 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet', + 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin', + 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft', + 'whale', 'wine_bottle', 'zebra' + ] + + +def imagenet_vid_classes() -> list: + """Class names of ImageNet VID.""" + return [ + 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', + 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', + 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit', + 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle', + 'watercraft', 'whale', 'zebra' + ] + + +def coco_classes() -> list: + """Class names of COCO.""" + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', + 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', + 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush' + ] + + +def coco_panoptic_classes() -> list: + """Class names of COCO panoptic.""" + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'bridge', 'cardboard', 'counter', 'curtain', 'door-stuff', + 'floor-wood', 'flower', 
'fruit', 'gravel', 'house', 'light', + 'mirror-stuff', 'net', 'pillow', 'platform', 'playingfield', + 'railroad', 'river', 'road', 'roof', 'sand', 'sea', 'shelf', 'snow', + 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', 'wall-tile', + 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', + 'pavement-merged', 'mountain-merged', 'grass-merged', 'dirt-merged', + 'paper-merged', 'food-other-merged', 'building-other-merged', + 'rock-merged', 'wall-other-merged', 'rug-merged' + ] + + +def cityscapes_classes() -> list: + """Class names of Cityscapes.""" + return [ + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +def oid_challenge_classes() -> list: + """Class names of Open Images Challenge.""" + return [ + 'Footwear', 'Jeans', 'House', 'Tree', 'Woman', 'Man', 'Land vehicle', + 'Person', 'Wheel', 'Bus', 'Human face', 'Bird', 'Dress', 'Girl', + 'Vehicle', 'Building', 'Cat', 'Car', 'Belt', 'Elephant', 'Dessert', + 'Butterfly', 'Train', 'Guitar', 'Poster', 'Book', 'Boy', 'Bee', + 'Flower', 'Window', 'Hat', 'Human head', 'Dog', 'Human arm', 'Drink', + 'Human mouth', 'Human hair', 'Human nose', 'Human hand', 'Table', + 'Marine invertebrates', 'Fish', 'Sculpture', 'Rose', 'Street light', + 'Glasses', 'Fountain', 'Skyscraper', 'Swimwear', 'Brassiere', 'Drum', + 'Duck', 'Countertop', 'Furniture', 'Ball', 'Human leg', 'Boat', + 'Balloon', 'Bicycle helmet', 'Goggles', 'Door', 'Human eye', 'Shirt', + 'Toy', 'Teddy bear', 'Pasta', 'Tomato', 'Human ear', + 'Vehicle registration plate', 'Microphone', 'Musical keyboard', + 'Tower', 'Houseplant', 'Flowerpot', 'Fruit', 'Vegetable', + 'Musical instrument', 'Suit', 'Motorcycle', 'Bagel', 'French fries', + 'Hamburger', 'Chair', 'Salt and pepper shakers', 'Snail', 'Airplane', + 'Horse', 'Laptop', 'Computer keyboard', 'Football helmet', 'Cocktail', + 'Juice', 'Tie', 'Computer monitor', 'Human beard', 'Bottle', + 'Saxophone', 'Lemon', 'Mouse', 'Sock', 'Cowboy hat', 'Sun hat', + 'Football', 'Porch', 'Sunglasses', 'Lobster', 'Crab', 'Picture frame', + 'Van', 'Crocodile', 'Surfboard', 'Shorts', 'Helicopter', 'Helmet', + 'Sports uniform', 'Taxi', 'Swan', 'Goose', 'Coat', 'Jacket', 'Handbag', + 'Flag', 'Skateboard', 'Television', 'Tire', 'Spoon', 'Palm tree', + 'Stairs', 'Salad', 'Castle', 'Oven', 'Microwave oven', 'Wine', + 'Ceiling fan', 'Mechanical fan', 'Cattle', 'Truck', 'Box', 'Ambulance', + 'Desk', 'Wine glass', 'Reptile', 'Tank', 'Traffic light', 'Billboard', + 'Tent', 'Insect', 'Spider', 'Treadmill', 'Cupboard', 'Shelf', + 'Seat belt', 'Human foot', 'Bicycle', 'Bicycle wheel', 'Couch', + 'Bookcase', 'Fedora', 'Backpack', 'Bench', 'Oyster', + 'Moths and butterflies', 'Lavender', 'Waffle', 'Fork', 'Animal', + 'Accordion', 'Mobile phone', 'Plate', 'Coffee cup', 'Saucer', + 'Platter', 'Dagger', 'Knife', 'Bull', 'Tortoise', 'Sea turtle', 'Deer', + 'Weapon', 'Apple', 'Ski', 'Taco', 'Traffic sign', 'Beer', 'Necklace', + 'Sunflower', 'Piano', 'Organ', 'Harpsichord', 'Bed', 'Cabinetry', + 'Nightstand', 'Curtain', 'Chest of drawers', 'Drawer', 'Parrot', + 'Sandal', 'High heels', 'Tableware', 'Cart', 'Mushroom', 'Kite', + 'Missile', 'Seafood', 'Camera', 'Paper towel', 'Toilet paper', + 'Sombrero', 'Radish', 'Lighthouse', 'Segway', 'Pig', 'Watercraft', + 'Golf cart', 'studio couch', 'Dolphin', 'Whale', 'Earrings', 'Otter', + 'Sea lion', 'Whiteboard', 'Monkey', 'Gondola', 'Zebra', + 'Baseball glove', 
'Scarf', 'Adhesive tape', 'Trousers', 'Scoreboard', + 'Lily', 'Carnivore', 'Power plugs and sockets', 'Office building', + 'Sandwich', 'Swimming pool', 'Headphones', 'Tin can', 'Crown', 'Doll', + 'Cake', 'Frog', 'Beetle', 'Ant', 'Gas stove', 'Canoe', 'Falcon', + 'Blue jay', 'Egg', 'Fire hydrant', 'Raccoon', 'Muffin', 'Wall clock', + 'Coffee', 'Mug', 'Tea', 'Bear', 'Waste container', 'Home appliance', + 'Candle', 'Lion', 'Mirror', 'Starfish', 'Marine mammal', 'Wheelchair', + 'Umbrella', 'Alpaca', 'Violin', 'Cello', 'Brown bear', 'Canary', 'Bat', + 'Ruler', 'Plastic bag', 'Penguin', 'Watermelon', 'Harbor seal', 'Pen', + 'Pumpkin', 'Harp', 'Kitchen appliance', 'Roller skates', 'Bust', + 'Coffee table', 'Tennis ball', 'Tennis racket', 'Ladder', 'Boot', + 'Bowl', 'Stop sign', 'Volleyball', 'Eagle', 'Paddle', 'Chicken', + 'Skull', 'Lamp', 'Beehive', 'Maple', 'Sink', 'Goldfish', 'Tripod', + 'Coconut', 'Bidet', 'Tap', 'Bathroom cabinet', 'Toilet', + 'Filing cabinet', 'Pretzel', 'Table tennis racket', 'Bronze sculpture', + 'Rocket', 'Mouse', 'Hamster', 'Lizard', 'Lifejacket', 'Goat', + 'Washing machine', 'Trumpet', 'Horn', 'Trombone', 'Sheep', + 'Tablet computer', 'Pillow', 'Kitchen & dining room table', + 'Parachute', 'Raven', 'Glove', 'Loveseat', 'Christmas tree', + 'Shellfish', 'Rifle', 'Shotgun', 'Sushi', 'Sparrow', 'Bread', + 'Toaster', 'Watch', 'Asparagus', 'Artichoke', 'Suitcase', 'Antelope', + 'Broccoli', 'Ice cream', 'Racket', 'Banana', 'Cookie', 'Cucumber', + 'Dragonfly', 'Lynx', 'Caterpillar', 'Light bulb', 'Office supplies', + 'Miniskirt', 'Skirt', 'Fireplace', 'Potato', 'Light switch', + 'Croissant', 'Cabbage', 'Ladybug', 'Handgun', 'Luggage and bags', + 'Window blind', 'Snowboard', 'Baseball bat', 'Digital clock', + 'Serving tray', 'Infant bed', 'Sofa bed', 'Guacamole', 'Fox', 'Pizza', + 'Snowplow', 'Jet ski', 'Refrigerator', 'Lantern', 'Convenience store', + 'Sword', 'Rugby ball', 'Owl', 'Ostrich', 'Pancake', 'Strawberry', + 'Carrot', 'Tart', 'Dice', 'Turkey', 'Rabbit', 'Invertebrate', 'Vase', + 'Stool', 'Swim cap', 'Shower', 'Clock', 'Jellyfish', 'Aircraft', + 'Chopsticks', 'Orange', 'Snake', 'Sewing machine', 'Kangaroo', 'Mixer', + 'Food processor', 'Shrimp', 'Towel', 'Porcupine', 'Jaguar', 'Cannon', + 'Limousine', 'Mule', 'Squirrel', 'Kitchen knife', 'Tiara', 'Tiger', + 'Bow and arrow', 'Candy', 'Rhinoceros', 'Shark', 'Cricket ball', + 'Doughnut', 'Plumbing fixture', 'Camel', 'Polar bear', 'Coin', + 'Printer', 'Blender', 'Giraffe', 'Billiard table', 'Kettle', + 'Dinosaur', 'Pineapple', 'Zucchini', 'Jug', 'Barge', 'Teapot', + 'Golf ball', 'Binoculars', 'Scissors', 'Hot dog', 'Door handle', + 'Seahorse', 'Bathtub', 'Leopard', 'Centipede', 'Grapefruit', 'Snowman', + 'Cheetah', 'Alarm clock', 'Grape', 'Wrench', 'Wok', 'Bell pepper', + 'Cake stand', 'Barrel', 'Woodpecker', 'Flute', 'Corded phone', + 'Willow', 'Punching bag', 'Pomegranate', 'Telephone', 'Pear', + 'Common fig', 'Bench', 'Wood-burning stove', 'Burrito', 'Nail', + 'Turtle', 'Submarine sandwich', 'Drinking straw', 'Peach', 'Popcorn', + 'Frying pan', 'Picnic basket', 'Honeycomb', 'Envelope', 'Mango', + 'Cutting board', 'Pitcher', 'Stationary bicycle', 'Dumbbell', + 'Personal care', 'Dog bed', 'Snowmobile', 'Oboe', 'Briefcase', + 'Squash', 'Tick', 'Slow cooker', 'Coffeemaker', 'Measuring cup', + 'Crutch', 'Stretcher', 'Screwdriver', 'Flashlight', 'Spatula', + 'Pressure cooker', 'Ring binder', 'Beaker', 'Torch', 'Winter melon' + ] + + +def oid_v6_classes() -> list: + """Class names of Open Images V6.""" + return [ + 
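+ # The order of the names below defines the label indices, so it must + # stay aligned with the official Open Images V6 annotations.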
'Tortoise', 'Container', 'Magpie', 'Sea turtle', 'Football', + 'Ambulance', 'Ladder', 'Toothbrush', 'Syringe', 'Sink', 'Toy', + 'Organ (Musical Instrument)', 'Cassette deck', 'Apple', 'Human eye', + 'Cosmetics', 'Paddle', 'Snowman', 'Beer', 'Chopsticks', 'Human beard', + 'Bird', 'Parking meter', 'Traffic light', 'Croissant', 'Cucumber', + 'Radish', 'Towel', 'Doll', 'Skull', 'Washing machine', 'Glove', 'Tick', + 'Belt', 'Sunglasses', 'Banjo', 'Cart', 'Ball', 'Backpack', 'Bicycle', + 'Home appliance', 'Centipede', 'Boat', 'Surfboard', 'Boot', + 'Headphones', 'Hot dog', 'Shorts', 'Fast food', 'Bus', 'Boy', + 'Screwdriver', 'Bicycle wheel', 'Barge', 'Laptop', 'Miniskirt', + 'Drill (Tool)', 'Dress', 'Bear', 'Waffle', 'Pancake', 'Brown bear', + 'Woodpecker', 'Blue jay', 'Pretzel', 'Bagel', 'Tower', 'Teapot', + 'Person', 'Bow and arrow', 'Swimwear', 'Beehive', 'Brassiere', 'Bee', + 'Bat (Animal)', 'Starfish', 'Popcorn', 'Burrito', 'Chainsaw', + 'Balloon', 'Wrench', 'Tent', 'Vehicle registration plate', 'Lantern', + 'Toaster', 'Flashlight', 'Billboard', 'Tiara', 'Limousine', 'Necklace', + 'Carnivore', 'Scissors', 'Stairs', 'Computer keyboard', 'Printer', + 'Traffic sign', 'Chair', 'Shirt', 'Poster', 'Cheese', 'Sock', + 'Fire hydrant', 'Land vehicle', 'Earrings', 'Tie', 'Watercraft', + 'Cabinetry', 'Suitcase', 'Muffin', 'Bidet', 'Snack', 'Snowmobile', + 'Clock', 'Medical equipment', 'Cattle', 'Cello', 'Jet ski', 'Camel', + 'Coat', 'Suit', 'Desk', 'Cat', 'Bronze sculpture', 'Juice', 'Gondola', + 'Beetle', 'Cannon', 'Computer mouse', 'Cookie', 'Office building', + 'Fountain', 'Coin', 'Calculator', 'Cocktail', 'Computer monitor', + 'Box', 'Stapler', 'Christmas tree', 'Cowboy hat', 'Hiking equipment', + 'Studio couch', 'Drum', 'Dessert', 'Wine rack', 'Drink', 'Zucchini', + 'Ladle', 'Human mouth', 'Dairy Product', 'Dice', 'Oven', 'Dinosaur', + 'Ratchet (Device)', 'Couch', 'Cricket ball', 'Winter melon', 'Spatula', + 'Whiteboard', 'Pencil sharpener', 'Door', 'Hat', 'Shower', 'Eraser', + 'Fedora', 'Guacamole', 'Dagger', 'Scarf', 'Dolphin', 'Sombrero', + 'Tin can', 'Mug', 'Tap', 'Harbor seal', 'Stretcher', 'Can opener', + 'Goggles', 'Human body', 'Roller skates', 'Coffee cup', + 'Cutting board', 'Blender', 'Plumbing fixture', 'Stop sign', + 'Office supplies', 'Volleyball (Ball)', 'Vase', 'Slow cooker', + 'Wardrobe', 'Coffee', 'Whisk', 'Paper towel', 'Personal care', 'Food', + 'Sun hat', 'Tree house', 'Flying disc', 'Skirt', 'Gas stove', + 'Salt and pepper shakers', 'Mechanical fan', 'Face powder', 'Fax', + 'Fruit', 'French fries', 'Nightstand', 'Barrel', 'Kite', 'Tart', + 'Treadmill', 'Fox', 'Flag', 'French horn', 'Window blind', + 'Human foot', 'Golf cart', 'Jacket', 'Egg (Food)', 'Street light', + 'Guitar', 'Pillow', 'Human leg', 'Isopod', 'Grape', 'Human ear', + 'Power plugs and sockets', 'Panda', 'Giraffe', 'Woman', 'Door handle', + 'Rhinoceros', 'Bathtub', 'Goldfish', 'Houseplant', 'Goat', + 'Baseball bat', 'Baseball glove', 'Mixing bowl', + 'Marine invertebrates', 'Kitchen utensil', 'Light switch', 'House', + 'Horse', 'Stationary bicycle', 'Hammer', 'Ceiling fan', 'Sofa bed', + 'Adhesive tape', 'Harp', 'Sandal', 'Bicycle helmet', 'Saucer', + 'Harpsichord', 'Human hair', 'Heater', 'Harmonica', 'Hamster', + 'Curtain', 'Bed', 'Kettle', 'Fireplace', 'Scale', 'Drinking straw', + 'Insect', 'Hair dryer', 'Kitchenware', 'Indoor rower', 'Invertebrate', + 'Food processor', 'Bookcase', 'Refrigerator', 'Wood-burning stove', + 'Punching bag', 'Common fig', 'Cocktail shaker', 'Jaguar (Animal)', + 'Golf ball', 
'Fashion accessory', 'Alarm clock', 'Filing cabinet', + 'Artichoke', 'Table', 'Tableware', 'Kangaroo', 'Koala', 'Knife', + 'Bottle', 'Bottle opener', 'Lynx', 'Lavender (Plant)', 'Lighthouse', + 'Dumbbell', 'Human head', 'Bowl', 'Humidifier', 'Porch', 'Lizard', + 'Billiard table', 'Mammal', 'Mouse', 'Motorcycle', + 'Musical instrument', 'Swim cap', 'Frying pan', 'Snowplow', + 'Bathroom cabinet', 'Missile', 'Bust', 'Man', 'Waffle iron', 'Milk', + 'Ring binder', 'Plate', 'Mobile phone', 'Baked goods', 'Mushroom', + 'Crutch', 'Pitcher (Container)', 'Mirror', 'Personal flotation device', + 'Table tennis racket', 'Pencil case', 'Musical keyboard', 'Scoreboard', + 'Briefcase', 'Kitchen knife', 'Nail (Construction)', 'Tennis ball', + 'Plastic bag', 'Oboe', 'Chest of drawers', 'Ostrich', 'Piano', 'Girl', + 'Plant', 'Potato', 'Hair spray', 'Sports equipment', 'Pasta', + 'Penguin', 'Pumpkin', 'Pear', 'Infant bed', 'Polar bear', 'Mixer', + 'Cupboard', 'Jacuzzi', 'Pizza', 'Digital clock', 'Pig', 'Reptile', + 'Rifle', 'Lipstick', 'Skateboard', 'Raven', 'High heels', 'Red panda', + 'Rose', 'Rabbit', 'Sculpture', 'Saxophone', 'Shotgun', 'Seafood', + 'Submarine sandwich', 'Snowboard', 'Sword', 'Picture frame', 'Sushi', + 'Loveseat', 'Ski', 'Squirrel', 'Tripod', 'Stethoscope', 'Submarine', + 'Scorpion', 'Segway', 'Training bench', 'Snake', 'Coffee table', + 'Skyscraper', 'Sheep', 'Television', 'Trombone', 'Tea', 'Tank', 'Taco', + 'Telephone', 'Torch', 'Tiger', 'Strawberry', 'Trumpet', 'Tree', + 'Tomato', 'Train', 'Tool', 'Picnic basket', 'Cooking spray', + 'Trousers', 'Bowling equipment', 'Football helmet', 'Truck', + 'Measuring cup', 'Coffeemaker', 'Violin', 'Vehicle', 'Handbag', + 'Paper cutter', 'Wine', 'Weapon', 'Wheel', 'Worm', 'Wok', 'Whale', + 'Zebra', 'Auto part', 'Jug', 'Pizza cutter', 'Cream', 'Monkey', 'Lion', + 'Bread', 'Platter', 'Chicken', 'Eagle', 'Helicopter', 'Owl', 'Duck', + 'Turtle', 'Hippopotamus', 'Crocodile', 'Toilet', 'Toilet paper', + 'Squid', 'Clothing', 'Footwear', 'Lemon', 'Spider', 'Deer', 'Frog', + 'Banana', 'Rocket', 'Wine glass', 'Countertop', 'Tablet computer', + 'Waste container', 'Swimming pool', 'Dog', 'Book', 'Elephant', 'Shark', + 'Candle', 'Leopard', 'Axe', 'Hand dryer', 'Soap dispenser', + 'Porcupine', 'Flower', 'Canary', 'Cheetah', 'Palm tree', 'Hamburger', + 'Maple', 'Building', 'Fish', 'Lobster', 'Garden Asparagus', + 'Furniture', 'Hedgehog', 'Airplane', 'Spoon', 'Otter', 'Bull', + 'Oyster', 'Horizontal bar', 'Convenience store', 'Bomb', 'Bench', + 'Ice cream', 'Caterpillar', 'Butterfly', 'Parachute', 'Orange', + 'Antelope', 'Beaker', 'Moths and butterflies', 'Window', 'Closet', + 'Castle', 'Jellyfish', 'Goose', 'Mule', 'Swan', 'Peach', 'Coconut', + 'Seat belt', 'Raccoon', 'Chisel', 'Fork', 'Lamp', 'Camera', + 'Squash (Plant)', 'Racket', 'Human face', 'Human arm', 'Vegetable', + 'Diaper', 'Unicycle', 'Falcon', 'Chime', 'Snail', 'Shellfish', + 'Cabbage', 'Carrot', 'Mango', 'Jeans', 'Flowerpot', 'Pineapple', + 'Drawer', 'Stool', 'Envelope', 'Cake', 'Dragonfly', 'Common sunflower', + 'Microwave oven', 'Honeycomb', 'Marine mammal', 'Sea lion', 'Ladybug', + 'Shelf', 'Watch', 'Candy', 'Salad', 'Parrot', 'Handgun', 'Sparrow', + 'Van', 'Grinder', 'Spice rack', 'Light bulb', 'Corded phone', + 'Sports uniform', 'Tennis racket', 'Wall clock', 'Serving tray', + 'Kitchen & dining room table', 'Dog bed', 'Cake stand', + 'Cat furniture', 'Bathroom accessory', 'Facial tissue holder', + 'Pressure cooker', 'Kitchen appliance', 'Tire', 'Ruler', + 'Luggage and bags', 'Microphone', 
'Broccoli', 'Umbrella', 'Pastry', + 'Grapefruit', 'Band-aid', 'Animal', 'Bell pepper', 'Turkey', 'Lily', + 'Pomegranate', 'Doughnut', 'Glasses', 'Human nose', 'Pen', 'Ant', + 'Car', 'Aircraft', 'Human hand', 'Skunk', 'Teddy bear', 'Watermelon', + 'Cantaloupe', 'Dishwasher', 'Flute', 'Balance beam', 'Sandwich', + 'Shrimp', 'Sewing machine', 'Binoculars', 'Rays and skates', 'Ipod', + 'Accordion', 'Willow', 'Crab', 'Crown', 'Seahorse', 'Perfume', + 'Alpaca', 'Taxi', 'Canoe', 'Remote control', 'Wheelchair', + 'Rugby ball', 'Armadillo', 'Maracas', 'Helmet' + ] + + +def objects365v1_classes() -> list: + """Class names of Objects365 V1.""" + return [ + 'person', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', + 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', + 'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes', + 'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'tv', + 'storage box', 'vase', 'bench', 'wine glass', 'boots', 'bowl', + 'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can', + 'stool', 'backpack', 'couch', 'belt', 'carpet', 'basket', + 'towel/napkin', 'slippers', 'barrel/bucket', 'coffee table', 'suv', + 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone', + 'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle', + 'bread', 'high heels', 'ring', 'van', 'watch', 'sink', 'horse', 'fish', + 'apple', 'camera', 'candle', 'teddy bear', 'cake', 'motorcycle', + 'wild bird', 'laptop', 'knife', 'traffic sign', 'cell phone', 'paddle', + 'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus', + 'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone', + 'tea pot', 'keyboard', 'tripod', 'hockey', 'fan', 'dog', 'spoon', + 'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal', + 'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane', + 'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote', + 'baseball glove', 'paper towel', 'refrigerator', 'train', 'tomato', + 'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone', + 'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine', + 'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove', + 'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat', + 'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun', + 'life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', + 'sports car', 'giraffe', 'pumpkin', 'piano', 'stop sign', 'radiator', + 'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', + 'cutting/chopping board', 'tennis racket', 'candy', + 'skating and skiing shoes', 'scissors', 'folder', 'baseball', + 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', + 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', + 'american football', 'basketball', 'potato', 'paint brush', 'printer', + 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', + 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', + 'chopsticks', 'electronic stove and gas stove', 'pie', 'frisbee', + 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', + 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', + 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', + 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', + 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', + 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', + 'toaster', 'helicopter', 'green beans', 'brush', 
'carriage', 'cigar', + 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', + 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', + 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', + 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', + 'key', 'screwdriver', 'globe', 'broom', 'pliers', 'volleyball', + 'hammer', 'eggplant', 'trophy', 'dates', 'board eraser', 'rice', + 'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', + 'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope', + 'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut', + 'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly', + 'egg tart', 'cheese', 'pig', 'pomelo', 'race car', 'rice cooker', + 'tuba', 'crosswalk sign', 'papaya', 'hair drier', 'green onion', + 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', + 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', + 'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish', + 'noodles', 'yak', 'mop', 'crab', 'microscope', 'barbell', 'bread/bun', + 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'seal', + 'mangosteen', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case', + 'saw', 'table tennis paddle', 'okra', 'starfish', 'eagle', 'monkey', + 'durian', 'game board', 'rabbit', 'french horn', 'ambulance', + 'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon', + 'chainsaw', 'lobster', 'iron', 'flashlight' + ] + + +def objects365v2_classes() -> list: + """Class names of Objects365 V2.""" + return [ + 'Person', 'Sneakers', 'Chair', 'Other Shoes', 'Hat', 'Car', 'Lamp', + 'Glasses', 'Bottle', 'Desk', 'Cup', 'Street Lights', 'Cabinet/shelf', + 'Handbag/Satchel', 'Bracelet', 'Plate', 'Picture/Frame', 'Helmet', + 'Book', 'Gloves', 'Storage box', 'Boat', 'Leather Shoes', 'Flower', + 'Bench', 'Potted Plant', 'Bowl/Basin', 'Flag', 'Pillow', 'Boots', + 'Vase', 'Microphone', 'Necklace', 'Ring', 'SUV', 'Wine Glass', 'Belt', + 'Moniter/TV', 'Backpack', 'Umbrella', 'Traffic Light', 'Speaker', + 'Watch', 'Tie', 'Trash bin Can', 'Slippers', 'Bicycle', 'Stool', + 'Barrel/bucket', 'Van', 'Couch', 'Sandals', 'Bakset', 'Drum', + 'Pen/Pencil', 'Bus', 'Wild Bird', 'High Heels', 'Motorcycle', 'Guitar', + 'Carpet', 'Cell Phone', 'Bread', 'Camera', 'Canned', 'Truck', + 'Traffic cone', 'Cymbal', 'Lifesaver', 'Towel', 'Stuffed Toy', + 'Candle', 'Sailboat', 'Laptop', 'Awning', 'Bed', 'Faucet', 'Tent', + 'Horse', 'Mirror', 'Power outlet', 'Sink', 'Apple', 'Air Conditioner', + 'Knife', 'Hockey Stick', 'Paddle', 'Pickup Truck', 'Fork', + 'Traffic Sign', 'Ballon', 'Tripod', 'Dog', 'Spoon', 'Clock', 'Pot', + 'Cow', 'Cake', 'Dinning Table', 'Sheep', 'Hanger', + 'Blackboard/Whiteboard', 'Napkin', 'Other Fish', 'Orange/Tangerine', + 'Toiletry', 'Keyboard', 'Tomato', 'Lantern', 'Machinery Vehicle', + 'Fan', 'Green Vegetables', 'Banana', 'Baseball Glove', 'Airplane', + 'Mouse', 'Train', 'Pumpkin', 'Soccer', 'Skiboard', 'Luggage', + 'Nightstand', 'Tea pot', 'Telephone', 'Trolley', 'Head Phone', + 'Sports Car', 'Stop Sign', 'Dessert', 'Scooter', 'Stroller', 'Crane', + 'Remote', 'Refrigerator', 'Oven', 'Lemon', 'Duck', 'Baseball Bat', + 'Surveillance Camera', 'Cat', 'Jug', 'Broccoli', 'Piano', 'Pizza', + 'Elephant', 'Skateboard', 'Surfboard', 'Gun', + 'Skating and Skiing shoes', 'Gas stove', 'Donut', 'Bow Tie', 'Carrot', + 'Toilet', 'Kite', 'Strawberry', 'Other Balls', 'Shovel', 'Pepper', + 'Computer Box', 'Toilet Paper', 'Cleaning Products', 'Chopsticks', + 
'Microwave', 'Pigeon', 'Baseball', 'Cutting/chopping Board', + 'Coffee Table', 'Side Table', 'Scissors', 'Marker', 'Pie', 'Ladder', + 'Snowboard', 'Cookies', 'Radiator', 'Fire Hydrant', 'Basketball', + 'Zebra', 'Grape', 'Giraffe', 'Potato', 'Sausage', 'Tricycle', 'Violin', + 'Egg', 'Fire Extinguisher', 'Candy', 'Fire Truck', 'Billards', + 'Converter', 'Bathtub', 'Wheelchair', 'Golf Club', 'Briefcase', + 'Cucumber', 'Cigar/Cigarette ', 'Paint Brush', 'Pear', 'Heavy Truck', + 'Hamburger', 'Extractor', 'Extention Cord', 'Tong', 'Tennis Racket', + 'Folder', 'American Football', 'earphone', 'Mask', 'Kettle', 'Tennis', + 'Ship', 'Swing', 'Coffee Machine', 'Slide', 'Carriage', 'Onion', + 'Green beans', 'Projector', 'Frisbee', + 'Washing Machine/Drying Machine', 'Chicken', 'Printer', 'Watermelon', + 'Saxophone', 'Tissue', 'Toothbrush', 'Ice cream', 'Hotair ballon', + 'Cello', 'French Fries', 'Scale', 'Trophy', 'Cabbage', 'Hot dog', + 'Blender', 'Peach', 'Rice', 'Wallet/Purse', 'Volleyball', 'Deer', + 'Goose', 'Tape', 'Tablet', 'Cosmetics', 'Trumpet', 'Pineapple', + 'Golf Ball', 'Ambulance', 'Parking meter', 'Mango', 'Key', 'Hurdle', + 'Fishing Rod', 'Medal', 'Flute', 'Brush', 'Penguin', 'Megaphone', + 'Corn', 'Lettuce', 'Garlic', 'Swan', 'Helicopter', 'Green Onion', + 'Sandwich', 'Nuts', 'Speed Limit Sign', 'Induction Cooker', 'Broom', + 'Trombone', 'Plum', 'Rickshaw', 'Goldfish', 'Kiwi fruit', + 'Router/modem', 'Poker Card', 'Toaster', 'Shrimp', 'Sushi', 'Cheese', + 'Notepaper', 'Cherry', 'Pliers', 'CD', 'Pasta', 'Hammer', 'Cue', + 'Avocado', 'Hamimelon', 'Flask', 'Mushroon', 'Screwdriver', 'Soap', + 'Recorder', 'Bear', 'Eggplant', 'Board Eraser', 'Coconut', + 'Tape Measur/ Ruler', 'Pig', 'Showerhead', 'Globe', 'Chips', 'Steak', + 'Crosswalk Sign', 'Stapler', 'Campel', 'Formula 1 ', 'Pomegranate', + 'Dishwasher', 'Crab', 'Hoverboard', 'Meat ball', 'Rice Cooker', 'Tuba', + 'Calculator', 'Papaya', 'Antelope', 'Parrot', 'Seal', 'Buttefly', + 'Dumbbell', 'Donkey', 'Lion', 'Urinal', 'Dolphin', 'Electric Drill', + 'Hair Dryer', 'Egg tart', 'Jellyfish', 'Treadmill', 'Lighter', + 'Grapefruit', 'Game board', 'Mop', 'Radish', 'Baozi', 'Target', + 'French', 'Spring Rolls', 'Monkey', 'Rabbit', 'Pencil Case', 'Yak', + 'Red Cabbage', 'Binoculars', 'Asparagus', 'Barbell', 'Scallop', + 'Noddles', 'Comb', 'Dumpling', 'Oyster', 'Table Teniis paddle', + 'Cosmetics Brush/Eyeliner Pencil', 'Chainsaw', 'Eraser', 'Lobster', + 'Durian', 'Okra', 'Lipstick', 'Cosmetics Mirror', 'Curling', + 'Table Tennis ' + ] + + +def lvis_classes() -> list: + """Class names of LVIS.""" + return [ + 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', 'alcohol', + 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', 'antenna', + 'apple', 'applesauce', 'apricot', 'apron', 'aquarium', + 'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor', + 'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer', + 'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy', + 'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel', + 'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon', + 'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo', + 'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow', + 'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap', + 'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)', + 'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)', + 'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie', + 'bear', 'bed', 
'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper', + 'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt', + 'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor', + 'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath', + 'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card', + 'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket', + 'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry', + 'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg', + 'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase', + 'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle', + 'bottle_opener', 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)', + 'bow-tie', 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'box', + 'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere', + 'bread-bin', 'bread', 'breechcloth', 'bridal_gown', 'briefcase', + 'broccoli', 'broach', 'broom', 'brownie', 'brussels_sprouts', + 'bubble_gum', 'bucket', 'horse_buggy', 'bull', 'bulldog', 'bulldozer', + 'bullet_train', 'bulletin_board', 'bulletproof_vest', 'bullhorn', + 'bun', 'bunk_bed', 'buoy', 'burrito', 'bus_(vehicle)', 'business_card', + 'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car', + 'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf', + 'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)', + 'can', 'can_opener', 'candle', 'candle_holder', 'candy_bar', + 'candy_cane', 'walking_cane', 'canister', 'canoe', 'cantaloup', + 'canteen', 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino', + 'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car', + 'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship', + 'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton', + 'cash_register', 'casserole', 'cassette', 'cast', 'cat', 'cauliflower', + 'cayenne_(spice)', 'CD_player', 'celery', 'cellular_telephone', + 'chain_mail', 'chair', 'chaise_longue', 'chalice', 'chandelier', + 'chap', 'checkbook', 'checkerboard', 'cherry', 'chessboard', + 'chicken_(animal)', 'chickpea', 'chili_(vegetable)', 'chime', + 'chinaware', 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar', + 'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker', + 'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider', + 'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet', + 'clasp', 'cleansing_agent', 'cleat_(for_securing_rope)', 'clementine', + 'clip', 'clipboard', 'clippers_(for_plants)', 'cloak', 'clock', + 'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', + 'coat', 'coat_hanger', 'coatrack', 'cock', 'cockroach', + 'cocoa_(beverage)', 'coconut', 'coffee_maker', 'coffee_table', + 'coffeepot', 'coil', 'coin', 'colander', 'coleslaw', + 'coloring_material', 'combination_lock', 'pacifier', 'comic_book', + 'compass', 'computer_keyboard', 'condiment', 'cone', 'control', + 'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie', + 'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)', + 'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet', + 'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall', + 'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker', + 'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib', + 'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown', + 'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch', + 'cub_(animal)', 'cube', 'cucumber', 'cufflink', 
'cup', 'trophy_cup', + 'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain', + 'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard', + 'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk', + 'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux', + 'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher', + 'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup', + 'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin', + 'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly', + 'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit', + 'dresser', 'drill', 'drone', 'dropper', 'drum_(musical_instrument)', + 'drumstick', 'duck', 'duckling', 'duct_tape', 'duffel_bag', 'dumbbell', + 'dumpster', 'dustpan', 'eagle', 'earphone', 'earplug', 'earring', + 'easel', 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater', + 'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk', + 'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan', + 'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)', + 'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm', + 'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace', + 'fireplug', 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl', + 'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flap', + 'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)', + 'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal', + 'folding_chair', 'food_processor', 'football_(American)', + 'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car', + 'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice', + 'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage', + 'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic', + 'gasmask', 'gazelle', 'gelatin', 'gemstone', 'generator', + 'giant_panda', 'gift_wrap', 'ginger', 'giraffe', 'cincture', + 'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles', + 'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose', + 'gorilla', 'gourd', 'grape', 'grater', 'gravestone', 'gravy_boat', + 'green_bean', 'green_onion', 'griddle', 'grill', 'grits', 'grizzly', + 'grocery_bag', 'guitar', 'gull', 'gun', 'hairbrush', 'hairnet', + 'hairpin', 'halter_top', 'ham', 'hamburger', 'hammer', 'hammock', + 'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel', + 'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw', + 'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', 'headband', + 'headboard', 'headlight', 'headscarf', 'headset', + 'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet', + 'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog', + 'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah', + 'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce', + 'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear', + 'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate', + 'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board', + 'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey', + 'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak', + 'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono', + 'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit', + 'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)', + 'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)', + 'lamb-chop', 'lamp', 'lamppost', 
'lampshade', 'lantern', 'lanyard', + 'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather', + 'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', 'lettuce', + 'license_plate', 'life_buoy', 'life_jacket', 'lightbulb', + 'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor', + 'lizard', 'log', 'lollipop', 'speaker_(stereo_equipment)', 'loveseat', + 'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)', + 'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', 'manger', + 'manhole', 'map', 'marker', 'martini', 'mascot', 'mashed_potato', + 'masher', 'mask', 'mast', 'mat_(gym_equipment)', 'matchbox', + 'mattress', 'measuring_cup', 'measuring_stick', 'meatball', 'medicine', + 'melon', 'microphone', 'microscope', 'microwave_oven', 'milestone', + 'milk', 'milk_can', 'milkshake', 'minivan', 'mint_candy', 'mirror', + 'mitten', 'mixer_(kitchen_tool)', 'money', + 'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor', + 'motor_scooter', 'motor_vehicle', 'motorcycle', 'mound_(baseball)', + 'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom', + 'music_stool', 'musical_instrument', 'nailfile', 'napkin', + 'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newspaper', + 'newsstand', 'nightshirt', 'nosebag_(for_animals)', + 'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker', + 'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil', + 'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'ostrich', + 'ottoman', 'oven', 'overalls_(clothing)', 'owl', 'packet', 'inkpad', + 'pad', 'paddle', 'padlock', 'paintbrush', 'painting', 'pajamas', + 'palette', 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', + 'pantyhose', 'papaya', 'paper_plate', 'paper_towel', 'paperback_book', + 'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', 'parasol', + 'parchment', 'parka', 'parking_meter', 'parrot', + 'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport', + 'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter', + 'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg', + 'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box', + 'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)', + 'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet', + 'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano', + 'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow', + 'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball', + 'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)', + 'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat', + 'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)', + 'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)', + 'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)', + 'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', 'potato', + 'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'pretzel', + 'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune', + 'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', + 'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', + 'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish', + 'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat', + 'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt', + 'recliner', 'record_player', 'reflector', 'remote_control', + 'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 
'road_map', + 'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade', + 'rolling_pin', 'root_beer', 'router_(computer_equipment)', + 'rubber_band', 'runner_(carpet)', 'plastic_bag', + 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin', + 'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)', + 'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)', + 'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse', + 'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf', + 'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver', + 'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane', + 'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark', + 'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl', + 'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt', + 'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass', + 'shoulder_bag', 'shovel', 'shower_head', 'shower_cap', + 'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink', + 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole', + 'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)', + 'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman', + 'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball', + 'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon', + 'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)', + 'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish', + 'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)', + 'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish', + 'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel', + 'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer', + 'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', 'strainer', + 'strap', 'straw_(for_drinking)', 'strawberry', 'street_sign', + 'streetlight', 'string_cheese', 'stylus', 'subwoofer', 'sugar_bowl', + 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', 'sunglasses', + 'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', 'sweatband', + 'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', 'sword', + 'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table', + 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight', + 'tambourine', 'army_tank', 'tank_(storage_vessel)', + 'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure', + 'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup', + 'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth', + 'telephone_pole', 'telephoto_lens', 'television_camera', + 'television_set', 'tennis_ball', 'tennis_racket', 'tequila', + 'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread', + 'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil', + 'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven', + 'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush', + 'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel', + 'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light', + 'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline', + 'tray', 'trench_coat', 'triangle_(musical_instrument)', 'tricycle', + 'tripod', 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat', + 'turban', 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)', + 'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn', + 
'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest',
+        'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture',
+        'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick',
+        'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe',
+        'washbasin', 'automatic_washer', 'watch', 'water_bottle',
+        'water_cooler', 'water_faucet', 'water_heater', 'water_jug',
+        'water_gun', 'water_scooter', 'water_ski', 'water_tower',
+        'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake',
+        'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream',
+        'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)',
+        'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket',
+        'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon',
+        'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt',
+        'yoke_(animal_equipment)', 'zebra', 'zucchini'
+    ]
+
+
+dataset_aliases = {
+    'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
+    'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
+    'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
+    'coco': ['coco', 'mscoco', 'ms_coco'],
+    'coco_panoptic': ['coco_panoptic', 'panoptic'],
+    'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'],
+    'cityscapes': ['cityscapes'],
+    'oid_challenge': ['oid_challenge', 'openimages_challenge'],
+    'oid_v6': ['oid_v6', 'openimages_v6'],
+    'objects365v1': ['objects365v1', 'obj365v1'],
+    'objects365v2': ['objects365v2', 'obj365v2'],
+    'lvis': ['lvis', 'lvis_v1'],
+}
+
+
+def get_classes(dataset) -> list:
+    """Get class names of a dataset."""
+    alias2name = {}
+    for name, aliases in dataset_aliases.items():
+        for alias in aliases:
+            alias2name[alias] = name
+
+    if is_str(dataset):
+        if dataset in alias2name:
+            labels = eval(alias2name[dataset] + '_classes()')
+        else:
+            raise ValueError(f'Unrecognized dataset: {dataset}')
+    else:
+        raise TypeError(f'dataset must be a str, but got {type(dataset)}')
+    return labels
diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/mean_ap.py b/head_extractor/build/lib/mmdet/evaluation/functional/mean_ap.py
new file mode 100644
index 0000000000000000000000000000000000000000..989972a48467f74fa915fa6f3807d0db3becdba2
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/functional/mean_ap.py
@@ -0,0 +1,792 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from multiprocessing import Pool
+
+import numpy as np
+from mmengine.logging import print_log
+from mmengine.utils import is_str
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+from .class_names import get_classes
+
+
+def average_precision(recalls, precisions, mode='area'):
+    """Calculate average precision (for single or multiple scales).
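+
+    For example, in 'area' mode the precision envelope is taken first
+    (each precision is replaced by the maximum precision attained at any
+    equal-or-higher recall), so recalls ``[0.5, 1.0]`` with precisions
+    ``[1.0, 0.5]`` give an AP of ``0.5 * 1.0 + 0.5 * 0.5 = 0.75``.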
+ + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + + +def tpfp_imagenet(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + default_iou_thr=0.5, + area_ranges=None, + use_legacy_coordinate=False, + **kwargs): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Defaults to None + default_iou_thr (float): IoU threshold to be considered as matched for + medium and large bboxes (small ones have special rules). + Defaults to 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. Defaults to None. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Defaults to False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. + + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], + dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp + # of a certain scale. + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps( + det_bboxes, gt_bboxes - 1, use_legacy_coordinate=use_legacy_coordinate) + gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length + gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length + iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), + default_iou_thr) + # sort all detections by scores in descending order + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = gt_w * gt_h + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + max_iou = -1 + matched_gt = -1 + # find best overlapped available gt + for j in range(num_gts): + # different from PASCAL VOC: allow finding other gts if the + # best overlapped ones are already matched by other det bboxes + if gt_covered[j]: + continue + elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: + max_iou = ious[i, j] + matched_gt = j + # there are 4 cases for a det bbox: + # 1. it matches a gt, tp = 1, fp = 0 + # 2. it matches an ignored gt, tp = 0, fp = 0 + # 3. it matches no gt and within area range, tp = 0, fp = 1 + # 4. it matches no gt but is beyond area range, tp = 0, fp = 0 + if matched_gt >= 0: + gt_covered[matched_gt] = 1 + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + tp[k, i] = 1 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + extra_length) * ( + bbox[3] - bbox[1] + extra_length) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_default(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + iou_thr=0.5, + area_ranges=None, + use_legacy_coordinate=False, + **kwargs): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Defaults to None + iou_thr (float): IoU threshold to be considered as matched. + Defaults to 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be + evaluated, in the format [(min1, max1), (min2, max2), ...]. + Defaults to None. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Defaults to False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. 
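+    # E.g. with use_legacy_coordinate=True a box [0., 0., 9., 9.] has
+    # width 9. - 0. + 1. = 10., following the inclusive pixel convention
+    # of mmdet v1.x; with the default (False) its width is 9.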
+ + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], + dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] = 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + + ious = bbox_overlaps( + det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate) + # for each det, the max iou with all gts + ious_max = ious.max(axis=1) + # for each det, which gt overlaps most with it + ious_argmax = ious.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + extra_length) * ( + bbox[3] - bbox[1] + extra_length) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_openimages(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + iou_thr=0.5, + area_ranges=None, + use_legacy_coordinate=False, + gt_bboxes_group_of=None, + use_group_of=True, + ioa_thr=0.5, + **kwargs): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Defaults to None + iou_thr (float): IoU threshold to be considered as matched. + Defaults to 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be + evaluated, in the format [(min1, max1), (min2, max2), ...]. + Defaults to None. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Defaults to False. + gt_bboxes_group_of (ndarray): GT group_of of this image, of shape + (k, 1). 
Defaults to None.
+        use_group_of (bool): Whether to use group-of boxes when calculating
+            TP and FP, which is only used in OpenImages evaluation.
+            Defaults to True.
+        ioa_thr (float | None): IoA threshold to be considered as matched,
+            which is only used in OpenImages evaluation. Defaults to 0.5.
+
+    Returns:
+        tuple[np.ndarray]: A tuple (tp, fp, det_bboxes). tp and fp have
+        elements 0 and 1 and each has shape (num_scales, m). det_bboxes is
+        the detection array after group-of processing in Open Images
+        evaluation: detections matched to a group-of gt box are replaced
+        by a single representative box per group.
+    """
+
+    if not use_legacy_coordinate:
+        extra_length = 0.
+    else:
+        extra_length = 1.
+
+    # an indicator of ignored gts
+    gt_ignore_inds = np.concatenate(
+        (np.zeros(gt_bboxes.shape[0],
+                  dtype=bool), np.ones(gt_bboxes_ignore.shape[0], dtype=bool)))
+    # stack gt_bboxes and gt_bboxes_ignore for convenience
+    gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore))
+
+    num_dets = det_bboxes.shape[0]
+    num_gts = gt_bboxes.shape[0]
+    if area_ranges is None:
+        area_ranges = [(None, None)]
+    num_scales = len(area_ranges)
+    # tp and fp are of shape (num_scales, num_dets), each row is tp or fp of
+    # a certain scale
+    tp = np.zeros((num_scales, num_dets), dtype=np.float32)
+    fp = np.zeros((num_scales, num_dets), dtype=np.float32)
+
+    # if there is no gt bboxes in this image, then all det bboxes
+    # within area range are false positives
+    if gt_bboxes.shape[0] == 0:
+        if area_ranges == [(None, None)]:
+            fp[...] = 1
+        else:
+            det_areas = (
+                det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * (
+                    det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length)
+            for i, (min_area, max_area) in enumerate(area_ranges):
+                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
+        return tp, fp, det_bboxes
+
+    if gt_bboxes_group_of is not None and use_group_of:
+        # When handling group-of boxes, divide the gt boxes into two parts,
+        # non-group-of and group-of, then compute IoUs against the
+        # non-group-of gts and IoAs against the group-of gts respectively.
+        # This is only used in OpenImages evaluation.
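+        # Here IoA is the intersection over the detection's own area
+        # (bbox_overlaps mode 'iof'), so a detection lying fully inside a
+        # large group-of box scores 1.0 even when its IoU with that box
+        # is small.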
+ assert gt_bboxes_group_of.shape[0] == gt_bboxes.shape[0] + non_group_gt_bboxes = gt_bboxes[~gt_bboxes_group_of] + group_gt_bboxes = gt_bboxes[gt_bboxes_group_of] + num_gts_group = group_gt_bboxes.shape[0] + ious = bbox_overlaps(det_bboxes, non_group_gt_bboxes) + ioas = bbox_overlaps(det_bboxes, group_gt_bboxes, mode='iof') + else: + # if not consider group-of boxes, only calculate ious through gt boxes + ious = bbox_overlaps( + det_bboxes, gt_bboxes, use_legacy_coordinate=use_legacy_coordinate) + ioas = None + + if ious.shape[1] > 0: + # for each det, the max iou with all gts + ious_max = ious.max(axis=1) + # for each det, which gt overlaps most with it + ious_argmax = ious.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = ( + gt_bboxes[:, 2] - gt_bboxes[:, 0] + extra_length) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1] + extra_length) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + extra_length) * ( + bbox[3] - bbox[1] + extra_length) + if area >= min_area and area < max_area: + fp[k, i] = 1 + else: + # if there is no no-group-of gt bboxes in this image, + # then all det bboxes within area range are false positives. + # Only used in OpenImages evaluation. + if area_ranges == [(None, None)]: + fp[...] = 1 + else: + det_areas = ( + det_bboxes[:, 2] - det_bboxes[:, 0] + extra_length) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + extra_length) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + + if ioas is None or ioas.shape[1] <= 0: + return tp, fp, det_bboxes + else: + # The evaluation of group-of TP and FP are done in two stages: + # 1. All detections are first matched to non group-of boxes; true + # positives are determined. + # 2. Detections that are determined as false positives are matched + # against group-of boxes and calculated group-of TP and FP. + # Only used in OpenImages evaluation. 
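+        # Each group-of box can therefore yield at most one TP, and only
+        # the highest-scoring detection matched to it is kept in
+        # det_bboxes_group below.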
+ det_bboxes_group = np.zeros( + (num_scales, ioas.shape[1], det_bboxes.shape[1]), dtype=float) + match_group_of = np.zeros((num_scales, num_dets), dtype=bool) + tp_group = np.zeros((num_scales, num_gts_group), dtype=np.float32) + ioas_max = ioas.max(axis=1) + # for each det, which gt overlaps most with it + ioas_argmax = ioas.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + box_is_covered = tp[k] + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + matched_gt = ioas_argmax[i] + if not box_is_covered[i]: + if ioas_max[i] >= ioa_thr: + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not tp_group[k, matched_gt]: + tp_group[k, matched_gt] = 1 + match_group_of[k, i] = True + else: + match_group_of[k, i] = True + + if det_bboxes_group[k, matched_gt, -1] < \ + det_bboxes[i, -1]: + det_bboxes_group[k, matched_gt] = \ + det_bboxes[i] + + fp_group = (tp_group <= 0).astype(float) + tps = [] + fps = [] + # concatenate tp, fp, and det-boxes which not matched group of + # gt boxes and tp_group, fp_group, and det_bboxes_group which + # matched group of boxes respectively. + for i in range(num_scales): + tps.append( + np.concatenate((tp[i][~match_group_of[i]], tp_group[i]))) + fps.append( + np.concatenate((fp[i][~match_group_of[i]], fp_group[i]))) + det_bboxes = np.concatenate( + (det_bboxes[~match_group_of[i]], det_bboxes_group[i])) + + tp = np.vstack(tps) + fp = np.vstack(fps) + return tp, fp, det_bboxes + + +def get_cls_results(det_results, annotations, class_id): + """Get det results and gt information of a certain class. + + Args: + det_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. + + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes, ignored gt bboxes + """ + cls_dets = [img_res[class_id] for img_res in det_results] + cls_gts = [] + cls_gts_ignore = [] + for ann in annotations: + gt_inds = ann['labels'] == class_id + cls_gts.append(ann['bboxes'][gt_inds, :]) + + if ann.get('labels_ignore', None) is not None: + ignore_inds = ann['labels_ignore'] == class_id + cls_gts_ignore.append(ann['bboxes_ignore'][ignore_inds, :]) + else: + cls_gts_ignore.append(np.empty((0, 4), dtype=np.float32)) + + return cls_dets, cls_gts, cls_gts_ignore + + +def get_cls_group_ofs(annotations, class_id): + """Get `gt_group_of` of a certain class, which is used in Open Images. + + Args: + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. + + Returns: + list[np.ndarray]: `gt_group_of` of a certain class. + """ + gt_group_ofs = [] + for ann in annotations: + gt_inds = ann['labels'] == class_id + if ann.get('gt_is_group_ofs', None) is not None: + gt_group_ofs.append(ann['gt_is_group_ofs'][gt_inds]) + else: + gt_group_ofs.append(np.empty((0, 1), dtype=bool)) + + return gt_group_ofs + + +def eval_map(det_results, + annotations, + scale_ranges=None, + iou_thr=0.5, + ioa_thr=None, + dataset=None, + logger=None, + tpfp_fn=None, + nproc=4, + use_legacy_coordinate=False, + use_group_of=False, + eval_mode='area'): + """Evaluate mAP of a dataset. 
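+
+    Classes that have no ground-truth boxes are skipped when the per-class
+    APs are averaged into the final mAP.
+
+    Example (a minimal, illustrative call with a single image and a single
+    class; the boxes are made up):
+
+        >>> det_results = [[np.array([[10., 10., 50., 50., 0.9]])]]
+        >>> annotations = [dict(bboxes=np.array([[10., 10., 50., 50.]]),
+        ...                     labels=np.array([0]))]
+        >>> mean_ap, _ = eval_map(det_results, annotations, logger='silent')
+        >>> mean_ap
+        1.0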
+ + Args: + det_results (list[list]): [[cls1_det, cls2_det, ...], ...]. + The outer list indicates images, and the inner list indicates + per-class detected bboxes. + annotations (list[dict]): Ground truth annotations where each item of + the list indicates an image. Keys of annotations are: + + - `bboxes`: numpy array of shape (n, 4) + - `labels`: numpy array of shape (n, ) + - `bboxes_ignore` (optional): numpy array of shape (k, 4) + - `labels_ignore` (optional): numpy array of shape (k, ) + scale_ranges (list[tuple] | None): Range of scales to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. A range of + (32, 64) means the area range between (32**2, 64**2). + Defaults to None. + iou_thr (float): IoU threshold to be considered as matched. + Defaults to 0.5. + ioa_thr (float | None): IoA threshold to be considered as matched, + which only used in OpenImages evaluation. Defaults to None. + dataset (list[str] | str | None): Dataset name or dataset classes, + there are minor differences in metrics for different datasets, e.g. + "voc", "imagenet_det", etc. Defaults to None. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmengine.logging.print_log()` for details. + Defaults to None. + tpfp_fn (callable | None): The function used to determine true/ + false positives. If None, :func:`tpfp_default` is used as default + unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this + case). If it is given as a function, then this function is used + to evaluate tp & fp. Default None. + nproc (int): Processes used for computing TP and FP. + Defaults to 4. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Defaults to False. + use_group_of (bool): Whether to use group of when calculate TP and FP, + which only used in OpenImages evaluation. Defaults to False. + eval_mode (str): 'area' or '11points', 'area' means calculating the + area under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1], + PASCAL VOC2007 uses `11points` as default evaluate mode, while + others are 'area'. Defaults to 'area'. + + Returns: + tuple: (mAP, [dict, dict, ...]) + """ + assert len(det_results) == len(annotations) + assert eval_mode in ['area', '11points'], \ + f'Unrecognized {eval_mode} mode, only "area" and "11points" ' \ + 'are supported' + if not use_legacy_coordinate: + extra_length = 0. + else: + extra_length = 1. + + num_imgs = len(det_results) + num_scales = len(scale_ranges) if scale_ranges is not None else 1 + num_classes = len(det_results[0]) # positive class num + area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] + if scale_ranges is not None else None) + + # There is no need to use multi processes to process + # when num_imgs = 1 . + if num_imgs > 1: + assert nproc > 0, 'nproc must be at least one.' 
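+        # Cap the worker count at the number of images so that no process
+        # in the pool sits idle.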
+ nproc = min(nproc, num_imgs) + pool = Pool(nproc) + + eval_results = [] + for i in range(num_classes): + # get gt and det bboxes of this class + cls_dets, cls_gts, cls_gts_ignore = get_cls_results( + det_results, annotations, i) + # choose proper function according to datasets to compute tp and fp + if tpfp_fn is None: + if dataset in ['det', 'vid']: + tpfp_fn = tpfp_imagenet + elif dataset in ['oid_challenge', 'oid_v6'] \ + or use_group_of is True: + tpfp_fn = tpfp_openimages + else: + tpfp_fn = tpfp_default + if not callable(tpfp_fn): + raise ValueError( + f'tpfp_fn has to be a function or None, but got {tpfp_fn}') + + if num_imgs > 1: + # compute tp and fp for each image with multiple processes + args = [] + if use_group_of: + # used in Open Images Dataset evaluation + gt_group_ofs = get_cls_group_ofs(annotations, i) + args.append(gt_group_ofs) + args.append([use_group_of for _ in range(num_imgs)]) + if ioa_thr is not None: + args.append([ioa_thr for _ in range(num_imgs)]) + + tpfp = pool.starmap( + tpfp_fn, + zip(cls_dets, cls_gts, cls_gts_ignore, + [iou_thr for _ in range(num_imgs)], + [area_ranges for _ in range(num_imgs)], + [use_legacy_coordinate for _ in range(num_imgs)], *args)) + else: + tpfp = tpfp_fn( + cls_dets[0], + cls_gts[0], + cls_gts_ignore[0], + iou_thr, + area_ranges, + use_legacy_coordinate, + gt_bboxes_group_of=(get_cls_group_ofs(annotations, i)[0] + if use_group_of else None), + use_group_of=use_group_of, + ioa_thr=ioa_thr) + tpfp = [tpfp] + + if use_group_of: + tp, fp, cls_dets = tuple(zip(*tpfp)) + else: + tp, fp = tuple(zip(*tpfp)) + # calculate gt number of each scale + # ignored gts or gts beyond the specific scale are not counted + num_gts = np.zeros(num_scales, dtype=int) + for j, bbox in enumerate(cls_gts): + if area_ranges is None: + num_gts[0] += bbox.shape[0] + else: + gt_areas = (bbox[:, 2] - bbox[:, 0] + extra_length) * ( + bbox[:, 3] - bbox[:, 1] + extra_length) + for k, (min_area, max_area) in enumerate(area_ranges): + num_gts[k] += np.sum((gt_areas >= min_area) + & (gt_areas < max_area)) + # sort all det bboxes by score, also sort tp and fp + cls_dets = np.vstack(cls_dets) + num_dets = cls_dets.shape[0] + sort_inds = np.argsort(-cls_dets[:, -1]) + tp = np.hstack(tp)[:, sort_inds] + fp = np.hstack(fp)[:, sort_inds] + # calculate recall and precision with tp and fp + tp = np.cumsum(tp, axis=1) + fp = np.cumsum(fp, axis=1) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) + precisions = tp / np.maximum((tp + fp), eps) + # calculate AP + if scale_ranges is None: + recalls = recalls[0, :] + precisions = precisions[0, :] + num_gts = num_gts.item() + ap = average_precision(recalls, precisions, eval_mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + + if num_imgs > 1: + pool.close() + + if scale_ranges is not None: + # shape (num_classes, num_scales) + all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) + all_num_gts = np.vstack( + [cls_result['num_gts'] for cls_result in eval_results]) + mean_ap = [] + for i in range(num_scales): + if np.any(all_num_gts[:, i] > 0): + mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean()) + else: + mean_ap.append(0.0) + else: + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if aps else 0.0 + + print_map_summary( + mean_ap, eval_results, dataset, area_ranges, logger=logger) + + 
return mean_ap, eval_results + + +def print_map_summary(mean_ap, + results, + dataset=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmengine.logging.print_log()` for details. + Defaults to None. + """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + if dataset is None: + label_names = [str(i) for i in range(num_classes)] + elif is_str(dataset): + label_names = get_classes(dataset) + else: + label_names = dataset + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/panoptic_utils.py b/head_extractor/build/lib/mmdet/evaluation/functional/panoptic_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6faa8ed52bc46c2cb74b1974b8daa521e616e996 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/functional/panoptic_utils.py @@ -0,0 +1,228 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Copyright (c) 2018, Alexander Kirillov +# This file supports `backend_args` for `panopticapi`, +# the source code is copied from `panopticapi`, +# only the way to load the gt images is modified. +import multiprocessing +import os + +import mmcv +import numpy as np +from mmengine.fileio import get + +# A custom value to distinguish instance ID and category ID; need to +# be greater than the number of categories. +# For a pixel in the panoptic result map: +# pan_id = ins_id * INSTANCE_OFFSET + cat_id +INSTANCE_OFFSET = 1000 + +try: + from panopticapi.evaluation import OFFSET, VOID, PQStat + from panopticapi.utils import rgb2id +except ImportError: + PQStat = None + rgb2id = None + VOID = 0 + OFFSET = 256 * 256 * 256 + + +def pq_compute_single_core(proc_id, + annotation_set, + gt_folder, + pred_folder, + categories, + backend_args=None, + print_log=False): + """The single core function to evaluate the metric of Panoptic + Segmentation. + + Same as the function with the same name in `panopticapi`. 
Only the function
+    to load the images is changed to use the file client.
+
+    Args:
+        proc_id (int): The id of the mini process.
+        annotation_set (list): The matched gt/prediction annotation pairs
+            assigned to this process.
+        gt_folder (str): The path of the ground truth images.
+        pred_folder (str): The path of the prediction images.
+        categories (dict): The categories of the dataset.
+        backend_args (object): The backend args of the dataset. If None,
+            the backend will be set to `local`.
+        print_log (bool): Whether to print the log. Defaults to False.
+    """
+    if PQStat is None:
+        raise RuntimeError(
+            'panopticapi is not installed, please install it by: '
+            'pip install git+https://github.com/cocodataset/'
+            'panopticapi.git.')
+
+    pq_stat = PQStat()
+
+    idx = 0
+    for gt_ann, pred_ann in annotation_set:
+        if print_log and idx % 100 == 0:
+            print('Core: {}, {} from {} images processed'.format(
+                proc_id, idx, len(annotation_set)))
+        idx += 1
+        # The gt images can be on the local disk or `ceph`, so we use
+        # backend here.
+        img_bytes = get(
+            os.path.join(gt_folder, gt_ann['file_name']),
+            backend_args=backend_args)
+        pan_gt = mmcv.imfrombytes(img_bytes, flag='color', channel_order='rgb')
+        pan_gt = rgb2id(pan_gt)
+
+        # The predictions can only be on the local disk now.
+        pan_pred = mmcv.imread(
+            os.path.join(pred_folder, pred_ann['file_name']),
+            flag='color',
+            channel_order='rgb')
+        pan_pred = rgb2id(pan_pred)
+
+        gt_segms = {el['id']: el for el in gt_ann['segments_info']}
+        pred_segms = {el['id']: el for el in pred_ann['segments_info']}
+
+        # predicted segments area calculation + prediction sanity checks
+        pred_labels_set = set(el['id'] for el in pred_ann['segments_info'])
+        labels, labels_cnt = np.unique(pan_pred, return_counts=True)
+        for label, label_cnt in zip(labels, labels_cnt):
+            if label not in pred_segms:
+                if label == VOID:
+                    continue
+                raise KeyError(
+                    'In the image with ID {} segment with ID {} is '
+                    'presented in PNG and not presented in JSON.'.format(
+                        gt_ann['image_id'], label))
+            pred_segms[label]['area'] = label_cnt
+            pred_labels_set.remove(label)
+            if pred_segms[label]['category_id'] not in categories:
+                raise KeyError(
+                    'In the image with ID {} segment with ID {} has '
+                    'unknown category_id {}.'.format(
+                        gt_ann['image_id'], label,
+                        pred_segms[label]['category_id']))
+        if len(pred_labels_set) != 0:
+            raise KeyError(
+                'In the image with ID {} the following segment IDs {} '
+                'are presented in JSON and not presented in PNG.'.format(
+                    gt_ann['image_id'], list(pred_labels_set)))
+
+        # confusion matrix calculation
+        pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(
+            np.uint64)
+        gt_pred_map = {}
+        labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True)
+        for label, intersection in zip(labels, labels_cnt):
+            gt_id = label // OFFSET
+            pred_id = label % OFFSET
+            gt_pred_map[(gt_id, pred_id)] = intersection
+
+        # count all matched pairs
+        gt_matched = set()
+        pred_matched = set()
+        for label_tuple, intersection in gt_pred_map.items():
+            gt_label, pred_label = label_tuple
+            if gt_label not in gt_segms:
+                continue
+            if pred_label not in pred_segms:
+                continue
+            if gt_segms[gt_label]['iscrowd'] == 1:
+                continue
+            if gt_segms[gt_label]['category_id'] != pred_segms[pred_label][
+                    'category_id']:
+                continue
+
+            union = pred_segms[pred_label]['area'] + gt_segms[gt_label][
+                'area'] - intersection - gt_pred_map.get((VOID, pred_label), 0)
+            iou = intersection / union
+            if iou > 0.5:
+                pq_stat[gt_segms[gt_label]['category_id']].tp += 1
+                pq_stat[gt_segms[gt_label]['category_id']].iou += iou
+                gt_matched.add(gt_label)
+                pred_matched.add(pred_label)
+
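+        # Since a match requires IoU > 0.5 and predicted segments are
+        # disjoint, each ground-truth segment can overlap at most one
+        # prediction that strongly, so this single greedy pass already
+        # yields the unique matching used by the PQ metric.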
+        # count false negatives
+        crowd_labels_dict = {}
+        for gt_label, gt_info in gt_segms.items():
+            if gt_label in gt_matched:
+                continue
+            # crowd segments are ignored
+            if gt_info['iscrowd'] == 1:
+                crowd_labels_dict[gt_info['category_id']] = gt_label
+                continue
+            pq_stat[gt_info['category_id']].fn += 1
+
+        # count false positives
+        for pred_label, pred_info in pred_segms.items():
+            if pred_label in pred_matched:
+                continue
+            # intersection of the segment with VOID
+            intersection = gt_pred_map.get((VOID, pred_label), 0)
+            # plus intersection with corresponding CROWD region if it exists
+            if pred_info['category_id'] in crowd_labels_dict:
+                intersection += gt_pred_map.get(
+                    (crowd_labels_dict[pred_info['category_id']], pred_label),
+                    0)
+            # predicted segment is ignored if more than half of
+            # the segment corresponds to VOID and CROWD regions
+            if intersection / pred_info['area'] > 0.5:
+                continue
+            pq_stat[pred_info['category_id']].fp += 1
+
+    if print_log:
+        print('Core: {}, all {} images processed'.format(
+            proc_id, len(annotation_set)))
+    return pq_stat
+
+
+def pq_compute_multi_core(matched_annotations_list,
+                          gt_folder,
+                          pred_folder,
+                          categories,
+                          backend_args=None,
+                          nproc=32):
+    """Evaluate the metrics of Panoptic Segmentation with multiple processes.
+
+    Same as the function with the same name in `panopticapi`.
+
+    Args:
+        matched_annotations_list (list): The matched annotation list. Each
+            element is a tuple of annotations of the same image with the
+            format (gt_anns, pred_anns).
+        gt_folder (str): The path of the ground truth images.
+        pred_folder (str): The path of the prediction images.
+        categories (dict): The categories of the dataset.
+        backend_args (object): The file client of the dataset. If None,
+            the backend will be set to `local`.
+        nproc (int): Number of processes for panoptic quality computing.
+            Defaults to 32. When `nproc` exceeds the number of cpu cores,
+            the number of cpu cores is used.
+    """
+    if PQStat is None:
+        raise RuntimeError(
+            'panopticapi is not installed, please install it by: '
+            'pip install git+https://github.com/cocodataset/'
+            'panopticapi.git.')
+
+    cpu_num = min(nproc, multiprocessing.cpu_count())
+
+    annotations_split = np.array_split(matched_annotations_list, cpu_num)
+    print('Number of cores: {}, images per core: {}'.format(
+        cpu_num, len(annotations_split[0])))
+    workers = multiprocessing.Pool(processes=cpu_num)
+    processes = []
+    for proc_id, annotation_set in enumerate(annotations_split):
+        p = workers.apply_async(pq_compute_single_core,
+                                (proc_id, annotation_set, gt_folder,
+                                 pred_folder, categories, backend_args))
+        processes.append(p)
+
+    # Close the process pool, otherwise it will lead to memory
+    # leaking problems.
+    workers.close()
+    workers.join()
+
+    pq_stat = PQStat()
+    for p in processes:
+        pq_stat += p.get()
+
+    return pq_stat
diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/recall.py b/head_extractor/build/lib/mmdet/evaluation/functional/recall.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bce2bf3614ab454dbbdf48efc4650018cc71b13
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/functional/recall.py
@@ -0,0 +1,199 @@
+# Copyright (c) OpenMMLab. All rights reserved.
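+# Utilities for evaluating the recall of region proposals at several
+# proposal budgets and IoU thresholds.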
+from collections.abc import Sequence
+
+import numpy as np
+from mmengine.logging import print_log
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+
+
+def _recalls(all_ious, proposal_nums, thrs):
+
+    img_num = all_ious.shape[0]
+    total_gt_num = sum([ious.shape[0] for ious in all_ious])
+
+    _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32)
+    for k, proposal_num in enumerate(proposal_nums):
+        tmp_ious = np.zeros(0)
+        for i in range(img_num):
+            ious = all_ious[i][:, :proposal_num].copy()
+            gt_ious = np.zeros((ious.shape[0]))
+            if ious.size == 0:
+                tmp_ious = np.hstack((tmp_ious, gt_ious))
+                continue
+            for j in range(ious.shape[0]):
+                gt_max_overlaps = ious.argmax(axis=1)
+                max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps]
+                gt_idx = max_ious.argmax()
+                gt_ious[j] = max_ious[gt_idx]
+                box_idx = gt_max_overlaps[gt_idx]
+                ious[gt_idx, :] = -1
+                ious[:, box_idx] = -1
+            tmp_ious = np.hstack((tmp_ious, gt_ious))
+        _ious[k, :] = tmp_ious
+
+    _ious = np.fliplr(np.sort(_ious, axis=1))
+    recalls = np.zeros((proposal_nums.size, thrs.size))
+    for i, thr in enumerate(thrs):
+        recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num)
+
+    return recalls
+
+
+def set_recall_param(proposal_nums, iou_thrs):
+    """Check proposal_nums and iou_thrs and set correct format."""
+    if isinstance(proposal_nums, Sequence):
+        _proposal_nums = np.array(proposal_nums)
+    elif isinstance(proposal_nums, int):
+        _proposal_nums = np.array([proposal_nums])
+    else:
+        _proposal_nums = proposal_nums
+
+    if iou_thrs is None:
+        _iou_thrs = np.array([0.5])
+    elif isinstance(iou_thrs, Sequence):
+        _iou_thrs = np.array(iou_thrs)
+    elif isinstance(iou_thrs, float):
+        _iou_thrs = np.array([iou_thrs])
+    else:
+        _iou_thrs = iou_thrs
+
+    return _proposal_nums, _iou_thrs
+
+
+def eval_recalls(gts,
+                 proposals,
+                 proposal_nums=None,
+                 iou_thrs=0.5,
+                 logger=None,
+                 use_legacy_coordinate=False):
+    """Calculate recalls.
+
+    Args:
+        gts (list[ndarray]): a list of arrays of shape (n, 4)
+        proposals (list[ndarray]): a list of arrays of shape (k, 4) or (k, 5)
+        proposal_nums (int | Sequence[int]): Top N proposals to be evaluated.
+        iou_thrs (float | Sequence[float]): IoU thresholds. Default: 0.5.
+        logger (logging.Logger | str | None): The way to print the recall
+            summary. See `mmengine.logging.print_log()` for details.
+            Default: None.
+        use_legacy_coordinate (bool): Whether to use the coordinate system
+            in mmdet v1.x, where "1" was added to both height and width,
+            i.e. w and h are computed as `x2 - x1 + 1` and `y2 - y1 + 1`.
+            Default: False.
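+
+    Example:
+        A minimal, illustrative call (the boxes and score are made up):
+
+        >>> import numpy as np
+        >>> gts = [np.array([[10, 10, 50, 50]], dtype=np.float32)]
+        >>> proposals = [np.array([[12, 12, 48, 48, 0.9]], dtype=np.float32)]
+        >>> eval_recalls(gts, proposals, proposal_nums=[1], iou_thrs=[0.5])
+
+        This prints a recall table and returns an array of shape (1, 1).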
+ + + Returns: + ndarray: recalls of different ious and proposal nums + """ + + img_num = len(gts) + assert img_num == len(proposals) + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps( + gts[i], + img_proposal[:prop_num, :4], + use_legacy_coordinate=use_legacy_coordinate) + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + + print_recall_summary(recalls, proposal_nums, iou_thrs, logger=logger) + return recalls + + +def print_recall_summary(recalls, + proposal_nums, + iou_thrs, + row_idxs=None, + col_idxs=None, + logger=None): + """Print recalls in a table. + + Args: + recalls (ndarray): calculated from `bbox_recalls` + proposal_nums (ndarray or list): top N proposals + iou_thrs (ndarray or list): iou thresholds + row_idxs (ndarray): which rows(proposal nums) to print + col_idxs (ndarray): which cols(iou thresholds) to print + logger (logging.Logger | str | None): The way to print the recall + summary. See `mmengine.logging.print_log()` for details. + Default: None. + """ + proposal_nums = np.array(proposal_nums, dtype=np.int32) + iou_thrs = np.array(iou_thrs) + if row_idxs is None: + row_idxs = np.arange(proposal_nums.size) + if col_idxs is None: + col_idxs = np.arange(iou_thrs.size) + row_header = [''] + iou_thrs[col_idxs].tolist() + table_data = [row_header] + for i, num in enumerate(proposal_nums[row_idxs]): + row = [f'{val:.3f}' for val in recalls[row_idxs[i], col_idxs].tolist()] + row.insert(0, num) + table_data.append(row) + table = AsciiTable(table_data) + print_log('\n' + table.table, logger=logger) + + +def plot_num_recall(recalls, proposal_nums): + """Plot Proposal_num-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + proposal_nums(ndarray or list): same shape as `recalls` + """ + if isinstance(proposal_nums, np.ndarray): + _proposal_nums = proposal_nums.tolist() + else: + _proposal_nums = proposal_nums + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot([0] + _proposal_nums, [0] + _recalls) + plt.xlabel('Proposal num') + plt.ylabel('Recall') + plt.axis([0, proposal_nums.max(), 0, 1]) + f.show() + + +def plot_iou_recall(recalls, iou_thrs): + """Plot IoU-Recalls curve. 
+ + Args: + recalls(ndarray or list): shape (k,) + iou_thrs(ndarray or list): same shape as `recalls` + """ + if isinstance(iou_thrs, np.ndarray): + _iou_thrs = iou_thrs.tolist() + else: + _iou_thrs = iou_thrs + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(_iou_thrs + [1.0], _recalls + [0.]) + plt.xlabel('IoU') + plt.ylabel('Recall') + plt.axis([iou_thrs.min(), 1, 0, 1]) + f.show() diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/ytvis.py b/head_extractor/build/lib/mmdet/evaluation/functional/ytvis.py new file mode 100644 index 0000000000000000000000000000000000000000..c65a7e9bc956c7de42e0d6e511dabb3d7325782d --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/functional/ytvis.py @@ -0,0 +1,305 @@ +# Copyright (c) Github URL +# Copied from +# https://github.com/youtubevos/cocoapi/blob/master/PythonAPI/pycocotools/ytvos.py +__author__ = 'ychfan' +# Interface for accessing the YouTubeVIS dataset. + +# The following API functions are defined: +# YTVIS - YTVIS api class that loads YouTubeVIS annotation file +# and prepare data structures. +# decodeMask - Decode binary mask M encoded via run-length encoding. +# encodeMask - Encode binary mask M using run-length encoding. +# getAnnIds - Get ann ids that satisfy given filter conditions. +# getCatIds - Get cat ids that satisfy given filter conditions. +# getImgIds - Get img ids that satisfy given filter conditions. +# loadAnns - Load anns with the specified ids. +# loadCats - Load cats with the specified ids. +# loadImgs - Load imgs with the specified ids. +# annToMask - Convert segmentation in an annotation to binary mask. +# loadRes - Load algorithm results and create API for accessing them. + +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. +# Licensed under the Simplified BSD License [see bsd.txt] + +import copy +import itertools +import json +import sys +import time +from collections import defaultdict + +import numpy as np +from pycocotools import mask as maskUtils + +PYTHON_VERSION = sys.version_info[0] + + +def _isArrayLike(obj): + return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + + +class YTVIS: + + def __init__(self, annotation_file=None): + """Constructor of Microsoft COCO helper class for reading and + visualizing annotations. + + :param annotation_file (str | dict): location of annotation file or + dict results. + :param image_folder (str): location to the folder that hosts images. 
+ :return: + """ + # load dataset + self.dataset, self.anns, self.cats, self.vids = dict(), dict(), dict( + ), dict() + self.vidToAnns, self.catToVids = defaultdict(list), defaultdict(list) + if annotation_file is not None: + print('loading annotations into memory...') + tic = time.time() + if type(annotation_file) == str: + dataset = json.load(open(annotation_file, 'r')) + else: + dataset = annotation_file + assert type( + dataset + ) == dict, 'annotation file format {} not supported'.format( + type(dataset)) + print('Done (t={:0.2f}s)'.format(time.time() - tic)) + self.dataset = dataset + self.createIndex() + + def createIndex(self): + # create index + print('creating index...') + anns, cats, vids = {}, {}, {} + vidToAnns, catToVids = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + vidToAnns[ann['video_id']].append(ann) + anns[ann['id']] = ann + + if 'videos' in self.dataset: + for vid in self.dataset['videos']: + vids[vid['id']] = vid + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToVids[ann['category_id']].append(ann['video_id']) + + print('index created!') + + # create class members + self.anns = anns + self.vidToAnns = vidToAnns + self.catToVids = catToVids + self.vids = vids + self.cats = cats + + def getAnnIds(self, vidIds=[], catIds=[], areaRng=[], iscrowd=None): + """Get ann ids that satisfy given filter conditions. default skips that + filter. + + :param vidIds (int array) : get anns for given vids + catIds (int array) : get anns for given cats + areaRng (float array) : get anns for given area range + iscrowd (boolean) : get anns for given crowd label + :return: ids (int array) : integer array of ann ids + """ + vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(vidIds) == len(catIds) == len(areaRng) == 0: + anns = self.dataset['annotations'] + else: + if not len(vidIds) == 0: + lists = [ + self.vidToAnns[vidId] for vidId in vidIds + if vidId in self.vidToAnns + ] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(catIds) == 0 else [ + ann for ann in anns if ann['category_id'] in catIds + ] + anns = anns if len(areaRng) == 0 else [ + ann for ann in anns if ann['avg_area'] > areaRng[0] + and ann['avg_area'] < areaRng[1] + ] + if iscrowd is not None: + ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def getCatIds(self, catNms=[], supNms=[], catIds=[]): + """filtering parameters. default skips that filter. 
+ + :param catNms (str array) : get cats for given cat names + :param supNms (str array) : get cats for given supercategory names + :param catIds (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + catNms = catNms if _isArrayLike(catNms) else [catNms] + supNms = supNms if _isArrayLike(supNms) else [supNms] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(catNms) == len(supNms) == len(catIds) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(catNms) == 0 else [ + cat for cat in cats if cat['name'] in catNms + ] + cats = cats if len(supNms) == 0 else [ + cat for cat in cats if cat['supercategory'] in supNms + ] + cats = cats if len(catIds) == 0 else [ + cat for cat in cats if cat['id'] in catIds + ] + ids = [cat['id'] for cat in cats] + return ids + + def getVidIds(self, vidIds=[], catIds=[]): + """Get vid ids that satisfy given filter conditions. + + :param vidIds (int array) : get vids for given ids + :param catIds (int array) : get vids with all given cats + :return: ids (int array) : integer array of vid ids + """ + vidIds = vidIds if _isArrayLike(vidIds) else [vidIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(vidIds) == len(catIds) == 0: + ids = self.vids.keys() + else: + ids = set(vidIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(self.catToVids[catId]) + else: + ids &= set(self.catToVids[catId]) + return list(ids) + + def loadAnns(self, ids=[]): + """Load anns with the specified ids. + + :param ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if _isArrayLike(ids): + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def loadCats(self, ids=[]): + """Load cats with the specified ids. + + :param ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if _isArrayLike(ids): + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def loadVids(self, ids=[]): + """Load anns with the specified ids. + + :param ids (int array) : integer ids specifying vid + :return: vids (object array) : loaded vid objects + """ + if _isArrayLike(ids): + return [self.vids[id] for id in ids] + elif type(ids) == int: + return [self.vids[ids]] + + def loadRes(self, resFile): + """Load result file and return a result api object. 
+
+        :param resFile (str) : file name of result file
+        :return: res (obj) : result api object
+        """
+        res = YTVIS()
+        res.dataset['videos'] = [img for img in self.dataset['videos']]
+
+        print('Loading and preparing results...')
+        tic = time.time()
+        if type(resFile) == str or (PYTHON_VERSION == 2
+                                    and type(resFile) == str):
+            anns = json.load(open(resFile))
+        elif type(resFile) == np.ndarray:
+            anns = self.loadNumpyAnnotations(resFile)
+        else:
+            anns = resFile
+        assert type(anns) == list, 'results is not an array of objects'
+        annsVidIds = [ann['video_id'] for ann in anns]
+        assert set(annsVidIds) == (set(annsVidIds) & set(self.getVidIds())), \
+            'Results do not correspond to current coco set'
+        if 'segmentations' in anns[0]:
+            res.dataset['categories'] = copy.deepcopy(
+                self.dataset['categories'])
+            for id, ann in enumerate(anns):
+                ann['areas'] = []
+                if 'bboxes' not in ann:
+                    ann['bboxes'] = []
+                for seg in ann['segmentations']:
+                    # now only support compressed RLE format
+                    # as segmentation results
+                    if seg:
+                        ann['areas'].append(maskUtils.area(seg))
+                        if len(ann['bboxes']) < len(ann['areas']):
+                            ann['bboxes'].append(maskUtils.toBbox(seg))
+                    else:
+                        ann['areas'].append(None)
+                        if len(ann['bboxes']) < len(ann['areas']):
+                            ann['bboxes'].append(None)
+                ann['id'] = id + 1
+                l_ori = [a for a in ann['areas'] if a]
+                if len(l_ori) == 0:
+                    ann['avg_area'] = 0
+                else:
+                    ann['avg_area'] = np.array(l_ori).mean()
+                ann['iscrowd'] = 0
+        print('DONE (t={:0.2f}s)'.format(time.time() - tic))
+
+        res.dataset['annotations'] = anns
+        res.createIndex()
+        return res
+
+    def annToRLE(self, ann, frameId):
+        """Convert annotation which can be polygons, uncompressed RLE to RLE.
+
+        :return: binary mask (numpy 2D array)
+        """
+        t = self.vids[ann['video_id']]
+        h, w = t['height'], t['width']
+        segm = ann['segmentations'][frameId]
+        if type(segm) == list:
+            # polygon -- a single object might consist of multiple parts
+            # we merge all parts into one mask rle code
+            rles = maskUtils.frPyObjects(segm, h, w)
+            rle = maskUtils.merge(rles)
+        elif type(segm['counts']) == list:
+            # uncompressed RLE
+            rle = maskUtils.frPyObjects(segm, h, w)
+        else:
+            # rle
+            rle = segm
+        return rle
+
+    def annToMask(self, ann, frameId):
+        """Convert annotation which can be polygons, uncompressed RLE, or RLE
+        to binary mask.
+
+        :return: binary mask (numpy 2D array)
+        """
+        rle = self.annToRLE(ann, frameId)
+        m = maskUtils.decode(rle)
+        return m
diff --git a/head_extractor/build/lib/mmdet/evaluation/functional/ytviseval.py b/head_extractor/build/lib/mmdet/evaluation/functional/ytviseval.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdaf110d37c61b4e02873a4dc83e1722a70a29f1
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/functional/ytviseval.py
@@ -0,0 +1,623 @@
+# Copyright (c) Github URL
+# Copied from
+# https://github.com/youtubevos/cocoapi/blob/master/PythonAPI/pycocotools/ytvoseval.py
+__author__ = 'ychfan'
+
+import copy
+import datetime
+import time
+from collections import defaultdict
+
+import numpy as np
+from pycocotools import mask as maskUtils
+
+
+class YTVISeval:
+    # Interface for evaluating video instance segmentation on
+    # the YouTubeVIS dataset.
+    #
+    # The usage for YTVISeval is as follows:
+    #  cocoGt=..., cocoDt=...
# load dataset and results + # E = YTVISeval(cocoGt,cocoDt); # initialize YTVISeval object + # E.params.recThrs = ...; # set parameters as desired + # E.evaluate(); # run per image evaluation + # E.accumulate(); # accumulate per image results + # E.summarize(); # display summary metrics of results + # For example usage see evalDemo.m and http://mscoco.org/. + # + # The evaluation parameters are as follows (defaults in brackets): + # imgIds - [all] N img ids to use for evaluation + # catIds - [all] K cat ids to use for evaluation + # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation + # recThrs - [0:.01:1] R=101 recall thresholds for evaluation + # areaRng - [...] A=4 object area ranges for evaluation + # maxDets - [1 10 100] M=3 thresholds on max detections per image + # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' + # iouType replaced the now DEPRECATED useSegm parameter. + # useCats - [1] if true use category labels for evaluation + # Note: if useCats=0 category labels are ignored as in proposal scoring. + # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. + # + # evaluate(): evaluates detections on every image and every category and + # concats the results into the "evalImgs" with fields: + # dtIds - [1xD] id for each of the D detections (dt) + # gtIds - [1xG] id for each of the G ground truths (gt) + # dtMatches - [TxD] matching gt id at each IoU or 0 + # gtMatches - [TxG] matching dt id at each IoU or 0 + # dtScores - [1xD] confidence of each dt + # gtIgnore - [1xG] ignore flag for each gt + # dtIgnore - [TxD] ignore flag for each dt at each IoU + # + # accumulate(): accumulates the per-image, per-category evaluation + # results in "evalImgs" into the dictionary "eval" with fields: + # params - parameters used for evaluation + # date - date evaluation was performed + # counts - [T,R,K,A,M] parameter dimensions (see above) + # precision - [TxRxKxAxM] precision for every evaluation setting + # recall - [TxKxAxM] max recall for every evaluation setting + # Note: precision and recall==-1 for settings with no gt objects. + # + # See also coco, mask, pycocoDemo, pycocoEvalDemo + # + # Microsoft COCO Toolbox. version 2.0 + # Data, paper, and tutorials available at: http://mscoco.org/ + # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. + # Licensed under the Simplified BSD License [see coco/license.txt] + def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): + """Initialize CocoEval using coco APIs for gt and dt. + + :param cocoGt: coco object with ground truth annotations + :param cocoDt: coco object with detection results + :return: None + """ + if not iouType: + print('iouType not specified. 
use default iouType segm') + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + self.params = {} # evaluation parameters + self.evalVids = defaultdict( + list) # per-image per-category evaluation results [KxAxI] elements + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Params(iouType=iouType) # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + if cocoGt is not None: + self.params.vidIds = sorted(cocoGt.getVidIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + + def _prepare(self): + ''' + Prepare ._gts and ._dts for evaluation based on params + :return: None + ''' + + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + for i, a in enumerate(ann['segmentations']): + if a: + rle = coco.annToRLE(ann, i) + ann['segmentations'][i] = rle + l_ori = [a for a in ann['areas'] if a] + if len(l_ori) == 0: + ann['avg_area'] = 0 + else: + ann['avg_area'] = np.array(l_ori).mean() + + p = self.params + if p.useCats: + gts = self.cocoGt.loadAnns( + self.cocoGt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) + dts = self.cocoDt.loadAnns( + self.cocoDt.getAnnIds(vidIds=p.vidIds, catIds=p.catIds)) + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(vidIds=p.vidIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(vidIds=p.vidIds)) + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == 'segm': + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + # set ignore flag + for gt in gts: + gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 + gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] + if p.iouType == 'keypoints': + gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + for gt in gts: + self._gts[gt['video_id'], gt['category_id']].append(gt) + for dt in dts: + self._dts[dt['video_id'], dt['category_id']].append(dt) + self.evalVids = defaultdict( + list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + ''' + Run per image evaluation on given images and store + results (a list of dict) in self.evalVids + :return: None + ''' + tic = time.time() + print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'. 
+ format(p.iouType)) + print('Evaluate annotation type *{}*'.format(p.iouType)) + p.vidIds = list(np.unique(p.vidIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = {(vidId, catId): computeIoU(vidId, catId) + for vidId in p.vidIds for catId in catIds} + + evaluateVid = self.evaluateVid + maxDet = p.maxDets[-1] + + self.evalImgs = [ + evaluateVid(vidId, catId, areaRng, maxDet) for catId in catIds + for areaRng in p.areaRng for vidId in p.vidIds + ] + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def computeIoU(self, vidId, catId): + p = self.params + if p.useCats: + gt = self._gts[vidId, catId] + dt = self._dts[vidId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[vidId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[vidId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0:p.maxDets[-1]] + + if p.iouType == 'segm': + g = [g['segmentations'] for g in gt] + d = [d['segmentations'] for d in dt] + elif p.iouType == 'bbox': + g = [g['bboxes'] for g in gt] + d = [d['bboxes'] for d in dt] + else: + raise Exception('unknown iouType for iou computation') + + # compute iou between each dt and gt region + + def iou_seq(d_seq, g_seq): + i = .0 + u = .0 + for d, g in zip(d_seq, g_seq): + if d and g: + i += maskUtils.area(maskUtils.merge([d, g], True)) + u += maskUtils.area(maskUtils.merge([d, g], False)) + elif not d and g: + u += maskUtils.area(g) + elif d and not g: + u += maskUtils.area(d) + if not u > .0: + print('Mask sizes in video {} and category {} may not match!'. 
+ format(vidId, catId)) + iou = i / u if u > .0 else .0 + return iou + + ious = np.zeros([len(d), len(g)]) + for i, j in np.ndindex(ious.shape): + ious[i, j] = iou_seq(d[i], g[j]) + + return ious + + def computeOks(self, imgId, catId): + p = self.params + + gts = self._gts[imgId, catId] + dts = self._dts[imgId, catId] + inds = np.argsort([-d['score'] for d in dts], kind='mergesort') + dts = [dts[i] for i in inds] + if len(dts) > p.maxDets[-1]: + dts = dts[0:p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(gts) == 0 or len(dts) == 0: + return [] + ious = np.zeros((len(dts), len(gts))) + sigmas = np.array([ + .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, + .87, .87, .89, .89 + ]) / 10.0 + vars = (sigmas * 2)**2 + k = len(sigmas) + # compute oks between each detection and ground truth object + for j, gt in enumerate(gts): + # create bounds for ignore regions(double the gt bbox) + g = np.array(gt['keypoints']) + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + k1 = np.count_nonzero(vg > 0) + bb = gt['bbox'] + x0 = bb[0] - bb[2] + x1 = bb[0] + bb[2] * 2 + y0 = bb[1] - bb[3] + y1 = bb[1] + bb[3] * 2 + for i, dt in enumerate(dts): + d = np.array(dt['keypoints']) + xd = d[0::3] + yd = d[1::3] + if k1 > 0: + # measure the per-keypoint distance if keypoints visible + dx = xd - xg + dy = yd - yg + else: + # measure minimum distance to keypoints + z = np.zeros((k)) + dx = np.max((z, x0 - xd), axis=0) + np.max( + (z, xd - x1), axis=0) + dy = np.max((z, y0 - yd), axis=0) + np.max( + (z, yd - y1), axis=0) + e = (dx**2 + dy**2) / vars / (gt['avg_area'] + + np.spacing(1)) / 2 + if k1 > 0: + e = e[vg > 0] + ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] + return ious + + def evaluateVid(self, vidId, catId, aRng, maxDet): + ''' + perform evaluation for single category and image + :return: dict (single image results) + ''' + p = self.params + if p.useCats: + gt = self._gts[vidId, catId] + dt = self._dts[vidId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[vidId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[vidId, cId]] + if len(gt) == 0 and len(dt) == 0: + return None + + for g in gt: + if g['ignore'] or (g['avg_area'] < aRng[0] + or g['avg_area'] > aRng[1]): + g['_ignore'] = 1 + else: + g['_ignore'] = 0 + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in dtind[0:maxDet]] + iscrowd = [int(o['iscrowd']) for o in gt] + # load computed ious + ious = self.ious[vidId, catId][:, gtind] if len( + self.ious[vidId, catId]) > 0 else self.ious[vidId, catId] + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T, G)) + dtm = np.zeros((T, D)) + gtIg = np.array([g['_ignore'] for g in gt]) + dtIg = np.zeros((T, D)) + if not len(ious) == 0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + for gind, g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + # continue to next gt unless better match made + if ious[dind, gind] < iou: + continue + # if match successful and best so far, + # store appropriately + iou = ious[dind, gind] + m = gind + # if match made 
+                    # store id of match for both dt and gt
+                    if m == -1:
+                        continue
+                    dtIg[tind, dind] = gtIg[m]
+                    dtm[tind, dind] = gt[m]['id']
+                    gtm[tind, m] = d['id']
+        # set unmatched detections outside of area range to ignore
+        a = np.array([
+            d['avg_area'] < aRng[0] or d['avg_area'] > aRng[1] for d in dt
+        ]).reshape((1, len(dt)))
+        dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T,
+                                                                      0)))
+        # store results for given image and category
+        return {
+            'video_id': vidId,
+            'category_id': catId,
+            'aRng': aRng,
+            'maxDet': maxDet,
+            'dtIds': [d['id'] for d in dt],
+            'gtIds': [g['id'] for g in gt],
+            'dtMatches': dtm,
+            'gtMatches': gtm,
+            'dtScores': [d['score'] for d in dt],
+            'gtIgnore': gtIg,
+            'dtIgnore': dtIg,
+        }
+
+    def accumulate(self, p=None):
+        """Accumulate per image evaluation results and store the result in
+        self.eval.
+
+        :param p: input params for evaluation
+        :return: None
+        """
+        print('Accumulating evaluation results...')
+        tic = time.time()
+        if not self.evalImgs:
+            print('Please run evaluate() first')
+        # allows input customized parameters
+        if p is None:
+            p = self.params
+        p.catIds = p.catIds if p.useCats == 1 else [-1]
+        T = len(p.iouThrs)
+        R = len(p.recThrs)
+        K = len(p.catIds) if p.useCats else 1
+        A = len(p.areaRng)
+        M = len(p.maxDets)
+        precision = -np.ones(
+            (T, R, K, A, M))  # -1 for the precision of absent categories
+        recall = -np.ones((T, K, A, M))
+        scores = -np.ones((T, R, K, A, M))
+
+        # create dictionary for future indexing
+        _pe = self._paramsEval
+        catIds = _pe.catIds if _pe.useCats else [-1]
+        setK = set(catIds)
+        setA = set(map(tuple, _pe.areaRng))
+        setM = set(_pe.maxDets)
+        setI = set(_pe.vidIds)
+        # get inds to evaluate
+        k_list = [n for n, k in enumerate(p.catIds) if k in setK]
+        m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
+        a_list = [
+            n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng))
+            if a in setA
+        ]
+        i_list = [n for n, i in enumerate(p.vidIds) if i in setI]
+        I0 = len(_pe.vidIds)
+        A0 = len(_pe.areaRng)
+        # retrieve E at each category, area range, and max number of detections
+        for k, k0 in enumerate(k_list):
+            Nk = k0 * A0 * I0
+            for a, a0 in enumerate(a_list):
+                Na = a0 * I0
+                for m, maxDet in enumerate(m_list):
+                    E = [self.evalImgs[Nk + Na + i] for i in i_list]
+                    E = [e for e in E if e is not None]
+                    if len(E) == 0:
+                        continue
+                    dtScores = np.concatenate(
+                        [e['dtScores'][0:maxDet] for e in E])
+
+                    inds = np.argsort(-dtScores, kind='mergesort')
+                    dtScoresSorted = dtScores[inds]
+
+                    dtm = np.concatenate(
+                        [e['dtMatches'][:, 0:maxDet] for e in E], axis=1)[:,
+                                                                          inds]
+                    dtIg = np.concatenate(
+                        [e['dtIgnore'][:, 0:maxDet] for e in E], axis=1)[:,
+                                                                         inds]
+                    gtIg = np.concatenate([e['gtIgnore'] for e in E])
+                    npig = np.count_nonzero(gtIg == 0)
+                    if npig == 0:
+                        continue
+                    tps = np.logical_and(dtm, np.logical_not(dtIg))
+                    fps = np.logical_and(
+                        np.logical_not(dtm), np.logical_not(dtIg))
+
+                    tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float64)
+                    fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float64)
+                    for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
+                        tp = np.array(tp)
+                        fp = np.array(fp)
+                        nd_ori = len(tp)
+                        rc = tp / npig
+                        pr = tp / (fp + tp + np.spacing(1))
+                        q = np.zeros((R, ))
+                        ss = np.zeros((R, ))
+
+                        if nd_ori:
+                            recall[t, k, a, m] = rc[-1]
+                        else:
+                            recall[t, k, a, m] = 0
+
+                        # use python array gets significant speed improvement
+                        pr = pr.tolist()
+                        q = q.tolist()
+
+                        for i in range(nd_ori - 1, 0, -1):
+                            if pr[i] > pr[i - 1]:
+                                pr[i - 1] = pr[i]
+
+                        inds = np.searchsorted(rc, p.recThrs, side='left')
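+                        # look up the interpolated (monotone) precision at
+                        # each recall threshold; thresholds above the highest
+                        # achieved recall index past the end of `pr`, which
+                        # the except below swallows, leaving those entries 0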
try: + for ri, pi in enumerate(inds): + q[ri] = pr[pi] + ss[ri] = dtScoresSorted[pi] + except Exception: + pass + precision[t, :, k, a, m] = np.array(q) + scores[t, :, k, a, m] = np.array(ss) + self.eval = { + 'params': p, + 'counts': [T, R, K, A, M], + 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'precision': precision, + 'recall': recall, + 'scores': scores, + } + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc - tic)) + + def summarize(self): + """Compute and display summary metrics for evaluation results. + + Note this function can *only* be applied on the default parameter + setting + """ + + def _summarize(ap=1, iouThr=None, areaRng='all', maxDets=100): + p = self.params + iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | ' \ + 'maxDets={:>3d} ] = {:0.3f}' + titleStr = 'Average Precision' if ap == 1 else 'Average Recall' + typeStr = '(AP)' if ap == 1 else '(AR)' + iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ + if iouThr is None else '{:0.2f}'.format(iouThr) + + aind = [ + i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng + ] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval['precision'] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval['recall'] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + print( + iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, + mean_s)) + return mean_s + + def _summarizeDets(): + stats = np.zeros((12, )) + stats[0] = _summarize(1) + stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize( + 1, iouThr=.75, maxDets=self.params.maxDets[2]) + stats[3] = _summarize( + 1, areaRng='small', maxDets=self.params.maxDets[2]) + stats[4] = _summarize( + 1, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[5] = _summarize( + 1, areaRng='large', maxDets=self.params.maxDets[2]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize( + 0, areaRng='small', maxDets=self.params.maxDets[2]) + stats[10] = _summarize( + 0, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[11] = _summarize( + 0, areaRng='large', maxDets=self.params.maxDets[2]) + return stats + + def _summarizeKps(): + stats = np.zeros((10, )) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=.5) + stats[2] = _summarize(1, maxDets=20, iouThr=.75) + stats[3] = _summarize(1, maxDets=20, areaRng='medium') + stats[4] = _summarize(1, maxDets=20, areaRng='large') + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=.5) + stats[7] = _summarize(0, maxDets=20, iouThr=.75) + stats[8] = _summarize(0, maxDets=20, areaRng='medium') + stats[9] = _summarize(0, maxDets=20, areaRng='large') + return stats + + if not self.eval: + raise Exception('Please run accumulate() first') + iouType = self.params.iouType + if iouType == 'segm' or iouType == 'bbox': + summarize = _summarizeDets + elif iouType == 'keypoints': + summarize = _summarizeKps + self.stats = summarize() + + def __str__(self): + self.summarize() + + +class Params: + """Params for coco evaluation api.""" + + def 
setDetParams(self): + self.vidIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange + # is slightly larger than the true value + self.iouThrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.recThrs = np.linspace( + .0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) + self.maxDets = [1, 10, 100] + self.areaRng = [[0**2, 1e5**2], [0**2, 128**2], [128**2, 256**2], + [256**2, 1e5**2]] + self.areaRngLbl = ['all', 'small', 'medium', 'large'] + self.useCats = 1 + + def setKpParams(self): + self.vidIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange + # is slightly larger than the true value + self.iouThrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.recThrs = np.linspace( + .0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) + self.maxDets = [20] + self.areaRng = [[0**2, 1e5**2], [32**2, 96**2], [96**2, 1e5**2]] + self.areaRngLbl = ['all', 'medium', 'large'] + self.useCats = 1 + + def __init__(self, iouType='segm'): + if iouType == 'segm' or iouType == 'bbox': + self.setDetParams() + elif iouType == 'keypoints': + self.setKpParams() + else: + raise Exception('iouType not supported') + self.iouType = iouType + # useSegm is deprecated + self.useSegm = None diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/__init__.py b/head_extractor/build/lib/mmdet/evaluation/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ad040cf6ffe3ada4b77e6a6b9caee3ad7afdf1d --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_video_metric import BaseVideoMetric +from .cityscapes_metric import CityScapesMetric +from .coco_caption_metric import COCOCaptionMetric +from .coco_metric import CocoMetric +from .coco_occluded_metric import CocoOccludedSeparatedMetric +from .coco_panoptic_metric import CocoPanopticMetric +from .coco_video_metric import CocoVideoMetric +from .crowdhuman_metric import CrowdHumanMetric +from .dod_metric import DODCocoMetric +from .dump_det_results import DumpDetResults +from .dump_odvg_results import DumpODVGResults +from .dump_proposals_metric import DumpProposals +from .flickr30k_metric import Flickr30kMetric +from .grefcoco_metric import gRefCOCOMetric +from .lvis_metric import LVISMetric +from .mot_challenge_metric import MOTChallengeMetric +from .openimages_metric import OpenImagesMetric +from .ov_coco_metric import OVCocoMetric +from .refexp_metric import RefExpMetric +from .refseg_metric import RefSegMetric +from .reid_metric import ReIDMetrics +from .semseg_metric import SemSegMetric +from .voc_metric import VOCMetric +from .youtube_vis_metric import YouTubeVISMetric + +__all__ = [ + 'CityScapesMetric', 'CocoMetric', 'CocoPanopticMetric', 'OpenImagesMetric', + 'VOCMetric', 'LVISMetric', 'CrowdHumanMetric', 'DumpProposals', + 'CocoOccludedSeparatedMetric', 'DumpDetResults', 'BaseVideoMetric', + 'MOTChallengeMetric', 'CocoVideoMetric', 'ReIDMetrics', 'YouTubeVISMetric', + 'COCOCaptionMetric', 'SemSegMetric', 'RefSegMetric', 'RefExpMetric', + 'gRefCOCOMetric', 'DODCocoMetric', 'DumpODVGResults', 'Flickr30kMetric', + 'OVCocoMetric' +] diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/base_video_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/base_video_metric.py new file mode 100644 index 
0000000000000000000000000000000000000000..90c7cdcbed5f12b59b6978ccba7576d6d2c25c5e --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/base_video_metric.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile +import warnings +from typing import Optional, Sequence + +import torch +from mmengine.dist import (barrier, broadcast, broadcast_object_list, + get_dist_info, is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.utils import mkdir_or_exist + + +class BaseVideoMetric(BaseMetric): + """Base class for a metric in video task. + + The metric first processes each batch of data_samples and predictions, + and appends the processed results to the results list. Then it + collects all results together from all ranks if distributed training + is used. Finally, it computes the metrics of the entire dataset. + + A subclass of class:`BaseVideoMetric` should assign a meaningful value + to the class attribute `default_prefix`. See the argument `prefix` for + details. + """ + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for track_data_sample in data_samples: + video_data_samples = track_data_sample['video_data_samples'] + ori_video_len = video_data_samples[0].ori_video_length + if ori_video_len == len(video_data_samples): + # video process + self.process_video(video_data_samples) + else: + # image process + self.process_image(video_data_samples, ori_video_len) + + def evaluate(self, size: int = 1) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] + + +def collect_tracking_results(results: list, + device: str = 'cpu', + tmpdir: Optional[str] = None) -> Optional[list]: + """Collected results in distributed environments. different from the + function mmengine.dist.collect_results, tracking compute metrics don't use + paramenter size, which means length of the entire validation dataset. + because it's equal to video num, but compute metrics need image num. + + Args: + results (list): Result list containing result parts to be + collected. Each item of ``result_part`` should be a picklable + object. + device (str): Device name. 
Optional values are 'cpu' and 'gpu'. + tmpdir (str | None): Temporal directory for collected results to + store. If set to None, it will create a temporal directory for it. + ``tmpdir`` should be None when device is 'gpu'. Defaults to None. + + Returns: + list or None: The collected results. + """ + if device not in ['gpu', 'cpu']: + raise NotImplementedError( + f"device must be 'cpu' or 'gpu', but got {device}") + + if device == 'gpu': + assert tmpdir is None, 'tmpdir should be None when device is "gpu"' + raise NotImplementedError('GPU collecting has not been supported yet') + else: + return collect_tracking_results_cpu(results, tmpdir) + + +def collect_tracking_results_cpu(result_part: list, + tmpdir: Optional[str] = None + ) -> Optional[list]: + """Collect results on cpu mode. + + Saves the results on different gpus to 'tmpdir' and collects them by the + rank 0 worker. + + Args: + result_part (list): The part of prediction results. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. If is None, use `tempfile.mkdtemp()` + to make a temporary path. Defaults to None. + + Returns: + list or None: The collected results. + """ + rank, world_size = get_dist_info() + if world_size == 1: + return result_part + + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8) + if rank == 0: + mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8) + dir_tensor[:len(tmpdir)] = tmpdir + broadcast(dir_tensor, 0) + tmpdir = dir_tensor.numpy().tobytes().decode().rstrip() + else: + mkdir_or_exist(tmpdir) + + # dump the part result to the dir + with open(osp.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f: # type: ignore + pickle.dump(result_part, f, protocol=2) + + barrier() + + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + path = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore + with open(path, 'rb') as f: + part_list.extend(pickle.load(f)) + shutil.rmtree(tmpdir) + return part_list diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/cityscapes_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/cityscapes_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..e5cdc179a3c76ef3742dd3ee6692c7deb9905459 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/cityscapes_metric.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import tempfile +from collections import OrderedDict +from typing import Dict, Optional, Sequence + +import mmcv +import numpy as np +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS + +try: + import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as CSEval # noqa: E501 + import cityscapesscripts.helpers.labels as CSLabels + + from mmdet.evaluation.functional import evaluateImgLists + HAS_CITYSCAPESAPI = True +except ImportError: + HAS_CITYSCAPESAPI = False + + +@METRICS.register_module() +class CityScapesMetric(BaseMetric): + """CityScapes metric for instance segmentation. + + Args: + outfile_prefix (str): The prefix of txt and png files. 
+            The txt and
+            png file will be saved in a directory whose path is
+            "outfile_prefix.results/".
+        seg_prefix (str, optional): Path to the directory which contains the
+            cityscapes instance segmentation masks. It's necessary for
+            training and validation. It could be None when inferring on the
+            test dataset. Defaults to None.
+        format_only (bool): Format the output results without performing
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        dump_matches (bool): Whether to dump the matches.json file during
+            evaluation. Defaults to False.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+    default_prefix: Optional[str] = 'cityscapes'
+
+    def __init__(self,
+                 outfile_prefix: str,
+                 seg_prefix: Optional[str] = None,
+                 format_only: bool = False,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 dump_matches: bool = False,
+                 file_client_args: dict = None,
+                 backend_args: dict = None) -> None:
+
+        if not HAS_CITYSCAPESAPI:
+            raise RuntimeError('Failed to import `cityscapesscripts`. '
+                               'Please try to install official '
+                               'cityscapesscripts by '
+                               '"pip install cityscapesscripts"')
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.tmp_dir = None
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, (
+                'outfile_prefix must be not None when format_only is True, '
+                'otherwise the result files will be saved to a temp '
+                'directory which will be cleaned up at the end.')
+        else:
+            assert seg_prefix is not None, (
+                '`seg_prefix` is necessary when computing the CityScapes '
+                'metrics')
+
+        if outfile_prefix is None:
+            self.tmp_dir = tempfile.TemporaryDirectory()
+            self.outfile_prefix = osp.join(self.tmp_dir.name, 'results')
+        else:
+            # the directory to save predicted panoptic segmentation mask
+            self.outfile_prefix = osp.join(outfile_prefix, 'results')  # type: ignore # yapf: disable # noqa: E501
+
+        dir_name = osp.expanduser(self.outfile_prefix)
+
+        if osp.exists(dir_name) and is_main_process():
+            logger: MMLogger = MMLogger.get_current_instance()
+            logger.info('remove previous results.')
+            shutil.rmtree(dir_name)
+        os.makedirs(dir_name, exist_ok=True)
+
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to'
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        self.seg_prefix = seg_prefix
+        self.dump_matches = dump_matches
+
+    def __del__(self) -> None:
+        """Clean up the results if necessary."""
+        if self.tmp_dir is not None:
+            self.tmp_dir.cleanup()
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    # parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + # parse pred + result = dict() + pred = data_sample['pred_instances'] + filename = data_sample['img_path'] + basename = osp.splitext(osp.basename(filename))[0] + pred_txt = osp.join(self.outfile_prefix, basename + '_pred.txt') + result['pred_txt'] = pred_txt + labels = pred['labels'].cpu().numpy() + masks = pred['masks'].cpu().numpy().astype(np.uint8) + if 'mask_scores' in pred: + # some detectors use different scores for bbox and mask + mask_scores = pred['mask_scores'].cpu().numpy() + else: + mask_scores = pred['scores'].cpu().numpy() + + with open(pred_txt, 'w') as f: + for i, (label, mask, mask_score) in enumerate( + zip(labels, masks, mask_scores)): + class_name = self.dataset_meta['classes'][label] + class_id = CSLabels.name2label[class_name].id + png_filename = osp.join( + self.outfile_prefix, + basename + f'_{i}_{class_name}.png') + mmcv.imwrite(mask, png_filename) + f.write(f'{osp.basename(png_filename)} ' + f'{class_id} {mask_score}\n') + + # parse gt + gt = dict() + img_path = filename.replace('leftImg8bit.png', + 'gtFine_instanceIds.png') + gt['file_name'] = img_path.replace('leftImg8bit', 'gtFine') + + self.results.append((gt, result)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + if self.format_only: + logger.info( + f'results are saved to {osp.dirname(self.outfile_prefix)}') + return OrderedDict() + logger.info('starts to compute metric') + + gts, preds = zip(*results) + # set global states in cityscapes evaluation API + gt_instances_file = osp.join(self.outfile_prefix, 'gtInstances.json') # type: ignore # yapf: disable # noqa: E501 + # split gt and prediction list + gts, preds = zip(*results) + CSEval.args.JSONOutput = False + CSEval.args.colorized = False + CSEval.args.gtInstancesFile = gt_instances_file + + groundTruthImgList = [gt['file_name'] for gt in gts] + predictionImgList = [pred['pred_txt'] for pred in preds] + CSEval_results = evaluateImgLists( + predictionImgList, + groundTruthImgList, + CSEval.args, + self.backend_args, + dump_matches=self.dump_matches)['averages'] + + eval_results = OrderedDict() + eval_results['mAP'] = CSEval_results['allAp'] + eval_results['AP@50'] = CSEval_results['allAp50%'] + + return eval_results diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/coco_caption_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_caption_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..d8c7350150f73d8d568597b352e33ad2a202c609 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_caption_metric.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
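+#
+# Illustrative evaluator config for this metric (the annotation path below
+# is a placeholder, not a file shipped with the repo):
+#
+#     val_evaluator = dict(
+#         type='COCOCaptionMetric',
+#         ann_file='data/coco/annotations/coco_karpathy_val_gt.json')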
+import json +import os +import tempfile +from typing import List, Optional + +from mmengine.evaluator import BaseMetric +from mmengine.utils import track_iter_progress +from pycocotools.coco import COCO + +from mmdet.registry import METRICS + +try: + from pycocoevalcap.eval import COCOEvalCap +except ImportError: + COCOEvalCap = None + + +@METRICS.register_module() +class COCOCaptionMetric(BaseMetric): + """Coco Caption evaluation wrapper. + + Save the generated captions and transform into coco format. + Calling COCO API for caption metrics. + + Args: + ann_file (str): the path for the COCO format caption ground truth + json file, load for evaluations. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Should be modified according to the + `retrieval_type` for unambiguous results. Defaults to TR. + """ + + def __init__(self, + ann_file: str, + collect_device: str = 'cpu', + prefix: Optional[str] = None): + if COCOEvalCap is None: + raise RuntimeError( + 'COCOEvalCap is not installed, please install it by: ' + 'pip install pycocoevalcap') + + super().__init__(collect_device=collect_device, prefix=prefix) + self.ann_file = ann_file + + def process(self, data_batch, data_samples): + """Process one batch of data samples. + + The processed results should be stored in ``self.results``, which will + be used to computed the metrics when all batches have been processed. + + Args: + data_batch: A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + + for data_sample in data_samples: + result = dict() + + result['caption'] = data_sample['pred_caption'] + result['image_id'] = int(data_sample['img_id']) + + # Save the result to `self.results`. + self.results.append(result) + + def compute_metrics(self, results: List): + """Compute the metrics from processed results. + + Args: + results (dict): The processed results of each batch. + + Returns: + Dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + # NOTICE: don't access `self.results` from the method. 
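+        # (`results` is the list already gathered across ranks by the
+        # evaluator; under distributed evaluation `self.results` may hold
+        # only this process's share, so index `results` instead.)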
+ + with tempfile.TemporaryDirectory() as temp_dir: + + eval_result_file = save_result( + result=results, + result_dir=temp_dir, + filename='caption_pred', + remove_duplicate='image_id', + ) + + coco_val = coco_caption_eval(eval_result_file, self.ann_file) + + return coco_val + + +def save_result(result, result_dir, filename, remove_duplicate=''): + """Saving predictions as json file for evaluation.""" + # combine results from all processes + if remove_duplicate: + result_new = [] + id_list = [] + for res in track_iter_progress(result): + if res[remove_duplicate] not in id_list: + id_list.append(res[remove_duplicate]) + result_new.append(res) + result = result_new + + final_result_file_url = os.path.join(result_dir, '%s.json' % filename) + print(f'result file saved to {final_result_file_url}') + json.dump(result, open(final_result_file_url, 'w')) + + return final_result_file_url + + +def coco_caption_eval(results_file, ann_file): + """Evaluation between gt json and prediction json files.""" + # create coco object and coco_result object + coco = COCO(ann_file) + coco_result = coco.loadRes(results_file) + + # create coco_eval object by taking coco and coco_result + coco_eval = COCOEvalCap(coco, coco_result) + + # make sure the image ids are the same + coco_eval.params['image_id'] = coco_result.getImgIds() + + # This will take some times at the first run + coco_eval.evaluate() + + # print output evaluation scores + for metric, score in coco_eval.eval.items(): + print(f'{metric}: {score:.3f}') + + return coco_eval.eval diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/coco_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..cfdc66e03b96e62366a921c137fc5a5727e26302 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_metric.py @@ -0,0 +1,597 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import itertools +import os.path as osp +import tempfile +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +import torch +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump, get_local_path, load +from mmengine.logging import MMLogger +from terminaltables import AsciiTable + +from mmdet.datasets.api_wrappers import COCO, COCOeval, COCOevalMP +from mmdet.registry import METRICS +from mmdet.structures.mask import encode_mask_results +from ..functional import eval_recalls + + +@METRICS.register_module() +class CocoMetric(BaseMetric): + """COCO evaluation metric. + + Evaluate AR, AP, and mAP for detection tasks including proposal/box + detection and instance segmentation. Please refer to + https://cocodataset.org/#detection-eval for more details. + + Args: + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'bbox', 'segm', 'proposal', and 'proposal_fast'. + Defaults to 'bbox'. + classwise (bool): Whether to evaluate the metric class-wise. + Defaults to False. + proposal_nums (Sequence[int]): Numbers of proposals to be evaluated. + Defaults to (100, 300, 1000). + iou_thrs (float | List[float], optional): IoU threshold to compute AP + and AR. If not specified, IoUs from 0.5 to 0.95 will be used. + Defaults to None. 
+        metric_items (List[str], optional): Metric result names to be
+            recorded in the evaluation result. Defaults to None.
+        format_only (bool): Format the output results without performing
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        outfile_prefix (str, optional): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        sort_categories (bool): Whether to sort categories in annotations.
+            Only used for `Objects365V1Dataset`. Defaults to False.
+        use_mp_eval (bool): Whether to use multi-processing evaluation.
+            Defaults to False.
+    """
+    default_prefix: Optional[str] = 'coco'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 metric: Union[str, List[str]] = 'bbox',
+                 classwise: bool = False,
+                 proposal_nums: Sequence[int] = (100, 300, 1000),
+                 iou_thrs: Optional[Union[float, Sequence[float]]] = None,
+                 metric_items: Optional[Sequence[str]] = None,
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 file_client_args: Optional[dict] = None,
+                 backend_args: Optional[dict] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 sort_categories: bool = False,
+                 use_mp_eval: bool = False) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        # coco evaluation metrics
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(
+                    "metric should be one of 'bbox', 'segm', 'proposal', "
+                    f"'proposal_fast', but got {metric}.")
+
+        # do classwise evaluation, default False
+        self.classwise = classwise
+        # whether to use multi-processing evaluation, default False
+        self.use_mp_eval = use_mp_eval
+
+        # proposal_nums used to compute recall or precision.
+        self.proposal_nums = list(proposal_nums)
+
+        # iou_thrs used to compute recall or precision.
+        if iou_thrs is None:
+            iou_thrs = np.linspace(
+                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+        self.iou_thrs = iou_thrs
+        self.metric_items = metric_items
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, \
+                'outfile_prefix must not be None when format_only is True, ' \
+                'otherwise the result files will be saved to a temp ' \
+                'directory which will be cleaned up at the end.'
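+        # Typical config-style usage (illustrative; the annotation path is
+        # a placeholder):
+        #   val_evaluator = dict(
+        #       type='CocoMetric',
+        #       ann_file='data/coco/annotations/instances_val2017.json',
+        #       metric=['bbox', 'segm'])
+        #   coco_metric = METRICS.build(val_evaluator)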
+ + self.outfile_prefix = outfile_prefix + + self.backend_args = backend_args + if file_client_args is not None: + raise RuntimeError( + 'The `file_client_args` is deprecated, ' + 'please use `backend_args` instead, please refer to' + 'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py' # noqa: E501 + ) + + # if ann_file is not specified, + # initialize coco api with the converted dataset + if ann_file is not None: + with get_local_path( + ann_file, backend_args=self.backend_args) as local_path: + self._coco_api = COCO(local_path) + if sort_categories: + # 'categories' list in objects365_train.json and + # objects365_val.json is inconsistent, need sort + # list(or dict) before get cat_ids. + cats = self._coco_api.cats + sorted_cats = {i: cats[i] for i in sorted(cats)} + self._coco_api.cats = sorted_cats + categories = self._coco_api.dataset['categories'] + sorted_categories = sorted( + categories, key=lambda i: i['id']) + self._coco_api.dataset['categories'] = sorted_categories + else: + self._coco_api = None + + # handle dataset lazy init + self.cat_ids = None + self.img_ids = None + + def fast_eval_recall(self, + results: List[dict], + proposal_nums: Sequence[int], + iou_thrs: Sequence[float], + logger: Optional[MMLogger] = None) -> np.ndarray: + """Evaluate proposal recall with COCO's fast_eval_recall. + + Args: + results (List[dict]): Results of the dataset. + proposal_nums (Sequence[int]): Proposal numbers used for + evaluation. + iou_thrs (Sequence[float]): IoU thresholds used for evaluation. + logger (MMLogger, optional): Logger used for logging the recall + summary. + Returns: + np.ndarray: Averaged recall results. + """ + gt_bboxes = [] + pred_bboxes = [result['bboxes'] for result in results] + for i in range(len(self.img_ids)): + ann_ids = self._coco_api.get_ann_ids(img_ids=self.img_ids[i]) + ann_info = self._coco_api.load_anns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger) + ar = recalls.mean(axis=1) + return ar + + def xyxy2xywh(self, bbox: np.ndarray) -> list: + """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO + evaluation. + + Args: + bbox (numpy.ndarray): The bounding boxes, shape (4, ), in + ``xyxy`` order. + + Returns: + list[float]: The converted bounding boxes, in ``xywh`` order. + """ + + _bbox: List = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + def results2json(self, results: Sequence[dict], + outfile_prefix: str) -> dict: + """Dump the detection results to a COCO style json file. + + There are 3 types of results: proposals, bbox predictions, mask + predictions, and they have different data types. This method will + automatically recognize the type, and dump them to json files. + + Args: + results (Sequence[dict]): Testing results of the + dataset. + outfile_prefix (str): The filename prefix of the json files. If the + prefix is "somepath/xxx", the json files will be named + "somepath/xxx.bbox.json", "somepath/xxx.segm.json", + "somepath/xxx.proposal.json". 
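+
+        A single dumped bbox entry looks like (illustrative values; x, y, w,
+        h stand for the converted coordinates)::
+
+            {"image_id": 42, "bbox": [x, y, w, h], "score": 0.98,
+             "category_id": 18}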
+ + Returns: + dict: Possible keys are "bbox", "segm", "proposal", and + values are corresponding filenames. + """ + bbox_json_results = [] + segm_json_results = [] if 'masks' in results[0] else None + for idx, result in enumerate(results): + image_id = result.get('img_id', idx) + labels = result['labels'] + bboxes = result['bboxes'] + scores = result['scores'] + # bbox results + for i, label in enumerate(labels): + data = dict() + data['image_id'] = image_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(scores[i]) + data['category_id'] = self.cat_ids[label] + bbox_json_results.append(data) + + if segm_json_results is None: + continue + + # segm results + masks = result['masks'] + mask_scores = result.get('mask_scores', scores) + for i, label in enumerate(labels): + data = dict() + data['image_id'] = image_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(mask_scores[i]) + data['category_id'] = self.cat_ids[label] + if isinstance(masks[i]['counts'], bytes): + masks[i]['counts'] = masks[i]['counts'].decode() + data['segmentation'] = masks[i] + segm_json_results.append(data) + + result_files = dict() + result_files['bbox'] = f'{outfile_prefix}.bbox.json' + result_files['proposal'] = f'{outfile_prefix}.bbox.json' + dump(bbox_json_results, result_files['bbox']) + + if segm_json_results is not None: + result_files['segm'] = f'{outfile_prefix}.segm.json' + dump(segm_json_results, result_files['segm']) + + return result_files + + def gt_to_coco_json(self, gt_dicts: Sequence[dict], + outfile_prefix: str) -> str: + """Convert ground truth to coco format json file. + + Args: + gt_dicts (Sequence[dict]): Ground truth of the dataset. + outfile_prefix (str): The filename prefix of the json files. If the + prefix is "somepath/xxx", the json file will be named + "somepath/xxx.gt.json". + Returns: + str: The filename of the json file. 
+ """ + categories = [ + dict(id=id, name=name) + for id, name in enumerate(self.dataset_meta['classes']) + ] + image_infos = [] + annotations = [] + + for idx, gt_dict in enumerate(gt_dicts): + img_id = gt_dict.get('img_id', idx) + image_info = dict( + id=img_id, + width=gt_dict['width'], + height=gt_dict['height'], + file_name='') + image_infos.append(image_info) + for ann in gt_dict['anns']: + label = ann['bbox_label'] + bbox = ann['bbox'] + coco_bbox = [ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ] + + annotation = dict( + id=len(annotations) + + 1, # coco api requires id starts with 1 + image_id=img_id, + bbox=coco_bbox, + iscrowd=ann.get('ignore_flag', 0), + category_id=int(label), + area=coco_bbox[2] * coco_bbox[3]) + if ann.get('mask', None): + mask = ann['mask'] + # area = mask_util.area(mask) + if isinstance(mask, dict) and isinstance( + mask['counts'], bytes): + mask['counts'] = mask['counts'].decode() + annotation['segmentation'] = mask + # annotation['area'] = float(area) + annotations.append(annotation) + + info = dict( + date_created=str(datetime.datetime.now()), + description='Coco json file converted by mmdet CocoMetric.') + coco_json = dict( + info=info, + images=image_infos, + categories=categories, + licenses=None, + ) + if len(annotations) > 0: + coco_json['annotations'] = annotations + converted_json_path = f'{outfile_prefix}.gt.json' + dump(coco_json, converted_json_path) + return converted_json_path + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + # encode mask to RLE + if 'masks' in pred: + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) if isinstance( + pred['masks'], torch.Tensor) else pred['masks'] + # some detectors use different scores for bbox and mask + if 'mask_scores' in pred: + result['mask_scores'] = pred['mask_scores'].cpu().numpy() + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + if self._coco_api is None: + # TODO: Need to refactor to support LoadAnnotations + assert 'instances' in data_sample, \ + 'ground truth is required for evaluation when ' \ + '`ann_file` is not provided' + gt['anns'] = data_sample['instances'] + # add converted result to the results list + self.results.append((gt, result)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + # split gt and prediction list + gts, preds = zip(*results) + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + + if self._coco_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=outfile_prefix) + self._coco_api = COCO(coco_json_path) + + # handle lazy init + if self.cat_ids is None: + self.cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + if self.img_ids is None: + self.img_ids = self._coco_api.get_img_ids() + + # convert predictions to coco format and dump to json file + result_files = self.results2json(preds, outfile_prefix) + + eval_results = OrderedDict() + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(outfile_prefix)}') + return eval_results + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + + # TODO: May refactor fast_eval_recall to an independent metric? + # fast eval recall + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + preds, self.proposal_nums, self.iou_thrs, logger=logger) + log_msg = [] + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + logger.info(log_msg) + continue + + # evaluate proposal, bbox and segm + iou_type = 'bbox' if metric == 'proposal' else metric + if metric not in result_files: + raise KeyError(f'{metric} is not in results') + try: + predictions = load(result_files[metric]) + if iou_type == 'segm': + # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa + # When evaluating mask AP, if the results contain bbox, + # cocoapi will use the box area instead of the mask area + # for calculating the instance area. Though the overall AP + # is not affected, this leads to different + # small/medium/large mask AP results. 
+ for x in predictions: + x.pop('bbox') + coco_dt = self._coco_api.loadRes(predictions) + + except IndexError: + logger.error( + 'The testing results of the whole dataset is empty.') + break + + if self.use_mp_eval: + coco_eval = COCOevalMP(self._coco_api, coco_dt, iou_type) + else: + coco_eval = COCOeval(self._coco_api, coco_dt, iou_type) + + coco_eval.params.catIds = self.cat_ids + coco_eval.params.imgIds = self.img_ids + coco_eval.params.maxDets = list(self.proposal_nums) + coco_eval.params.iouThrs = self.iou_thrs + + # mapping of cocoEval.stats + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@100': 6, + 'AR@300': 7, + 'AR@1000': 8, + 'AR_s@1000': 9, + 'AR_m@1000': 10, + 'AR_l@1000': 11 + } + metric_items = self.metric_items + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item "{metric_item}" is not supported') + + if metric == 'proposal': + coco_eval.params.useCats = 0 + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if metric_items is None: + metric_items = [ + 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', + 'AR_m@1000', 'AR_l@1000' + ] + + for item in metric_items: + val = float( + f'{coco_eval.stats[coco_metric_names[item]]:.3f}') + eval_results[item] = val + else: + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + if self.classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = coco_eval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, cat_id in enumerate(self.cat_ids): + t = [] + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = self._coco_api.loadCats(cat_id)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + t.append(f'{nm["name"]}') + t.append(f'{round(ap, 3)}') + eval_results[f'{nm["name"]}_precision'] = round(ap, 3) + + # indexes of IoU @50 and @75 + for iou in [0, 5]: + precision = precisions[iou, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + t.append(f'{round(ap, 3)}') + + # indexes of area of small, median and large + for area in [1, 2, 3]: + precision = precisions[:, :, idx, area, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + t.append(f'{round(ap, 3)}') + results_per_category.append(tuple(t)) + + num_columns = len(results_per_category[0]) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = [ + 'category', 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', + 'mAP_m', 'mAP_l' + ] + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('\n' + table.table) + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = coco_eval.stats[coco_metric_names[metric_item]] + eval_results[key] = float(f'{round(val, 3)}') + + ap = coco_eval.stats[:6] + 
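+                # stats[:6] above is [mAP, mAP_50, mAP_75, mAP_s, mAP_m,
+                # mAP_l] (see `coco_metric_names`), logged below as one
+                # copy-paste friendly line.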
logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} ' + f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + f'{ap[4]:.3f} {ap[5]:.3f}') + + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/coco_occluded_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_occluded_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..81235a04e6ee1929cfd6b5cdc284d239765b0d69 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_occluded_metric.py @@ -0,0 +1,204 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Union + +import mmengine +import numpy as np +from mmengine.fileio import load +from mmengine.logging import print_log +from pycocotools import mask as coco_mask +from terminaltables import AsciiTable + +from mmdet.registry import METRICS +from .coco_metric import CocoMetric + + +@METRICS.register_module() +class CocoOccludedSeparatedMetric(CocoMetric): + """Metric of separated and occluded masks which presented in paper `A Tri- + Layer Plugin to Improve Occluded Detection. + + `_. + + Separated COCO and Occluded COCO are automatically generated subsets of + COCO val dataset, collecting separated objects and partially occluded + objects for a large variety of categories. In this way, we define + occlusion into two major categories: separated and partially occluded. + + - Separation: target object segmentation mask is separated into distinct + regions by the occluder. + - Partial Occlusion: target object is partially occluded but the + segmentation mask is connected. + + These two new scalable real-image datasets are to benchmark a model's + capability to detect occluded objects of 80 common categories. + + Please cite the paper if you use this dataset: + + @article{zhan2022triocc, + title={A Tri-Layer Plugin to Improve Occluded Detection}, + author={Zhan, Guanqi and Xie, Weidi and Zisserman, Andrew}, + journal={British Machine Vision Conference}, + year={2022} + } + + Args: + occluded_ann (str): Path to the occluded coco annotation file. + separated_ann (str): Path to the separated coco annotation file. + score_thr (float): Score threshold of the detection masks. + Defaults to 0.3. + iou_thr (float): IoU threshold for the recall calculation. + Defaults to 0.75. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'bbox', 'segm', 'proposal', and 'proposal_fast'. + Defaults to 'bbox'. + """ + default_prefix: Optional[str] = 'coco' + + def __init__( + self, + *args, + occluded_ann: + str = 'https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/occluded_coco.pkl', # noqa + separated_ann: + str = 'https://www.robots.ox.ac.uk/~vgg/research/tpod/datasets/separated_coco.pkl', # noqa + score_thr: float = 0.3, + iou_thr: float = 0.75, + metric: Union[str, List[str]] = ['bbox', 'segm'], + **kwargs) -> None: + super().__init__(*args, metric=metric, **kwargs) + self.occluded_ann = load(occluded_ann) + self.separated_ann = load(separated_ann) + self.score_thr = score_thr + self.iou_thr = iou_thr + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + coco_metric_res = super().compute_metrics(results) + eval_res = self.evaluate_occluded_separated(results) + coco_metric_res.update(eval_res) + return coco_metric_res + + def evaluate_occluded_separated(self, results: List[tuple]) -> dict: + """Compute the recall of occluded and separated masks. + + Args: + results (list[tuple]): Testing results of the dataset. + + Returns: + dict[str, float]: The recall of occluded and separated masks. + """ + dict_det = {} + print_log('processing detection results...') + prog_bar = mmengine.ProgressBar(len(results)) + for i in range(len(results)): + gt, dt = results[i] + img_id = dt['img_id'] + cur_img_name = self._coco_api.imgs[img_id]['file_name'] + if cur_img_name not in dict_det.keys(): + dict_det[cur_img_name] = [] + + for bbox, score, label, mask in zip(dt['bboxes'], dt['scores'], + dt['labels'], dt['masks']): + cur_binary_mask = coco_mask.decode(mask) + dict_det[cur_img_name].append([ + score, self.dataset_meta['classes'][label], + cur_binary_mask, bbox + ]) + dict_det[cur_img_name].sort( + key=lambda x: (-x[0], x[3][0], x[3][1]) + ) # rank by confidence from high to low, avoid same confidence + prog_bar.update() + print_log('\ncomputing occluded mask recall...', logger='current') + occluded_correct_num, occluded_recall = self.compute_recall( + dict_det, gt_ann=self.occluded_ann, is_occ=True) + print_log( + f'\nCOCO occluded mask recall: {occluded_recall:.2f}%', + logger='current') + print_log( + f'COCO occluded mask success num: {occluded_correct_num}', + logger='current') + print_log('computing separated mask recall...', logger='current') + separated_correct_num, separated_recall = self.compute_recall( + dict_det, gt_ann=self.separated_ann, is_occ=False) + print_log( + f'\nCOCO separated mask recall: {separated_recall:.2f}%', + logger='current') + print_log( + f'COCO separated mask success num: {separated_correct_num}', + logger='current') + table_data = [ + ['mask type', 'recall', 'num correct'], + ['occluded', f'{occluded_recall:.2f}%', occluded_correct_num], + ['separated', f'{separated_recall:.2f}%', separated_correct_num] + ] + table = AsciiTable(table_data) + print_log('\n' + table.table, logger='current') + return dict( + occluded_recall=occluded_recall, separated_recall=separated_recall) + + def compute_recall(self, + result_dict: dict, + gt_ann: list, + is_occ: bool = True) -> tuple: + """Compute the recall of occluded or separated masks. + + Args: + result_dict (dict): Processed mask results. + gt_ann (list): Occluded or separated coco annotations. + is_occ (bool): Whether the annotation is occluded mask. + Defaults to True. + Returns: + tuple: number of correct masks and the recall. 
+ """ + correct = 0 + prog_bar = mmengine.ProgressBar(len(gt_ann)) + for iter_i in range(len(gt_ann)): + cur_item = gt_ann[iter_i] + cur_img_name = cur_item[0] + cur_gt_bbox = cur_item[3] + if is_occ: + cur_gt_bbox = [ + cur_gt_bbox[0], cur_gt_bbox[1], + cur_gt_bbox[0] + cur_gt_bbox[2], + cur_gt_bbox[1] + cur_gt_bbox[3] + ] + cur_gt_class = cur_item[1] + cur_gt_mask = coco_mask.decode(cur_item[4]) + + assert cur_img_name in result_dict.keys() + cur_detections = result_dict[cur_img_name] + + correct_flag = False + for i in range(len(cur_detections)): + cur_det_confidence = cur_detections[i][0] + if cur_det_confidence < self.score_thr: + break + cur_det_class = cur_detections[i][1] + if cur_det_class != cur_gt_class: + continue + cur_det_mask = cur_detections[i][2] + cur_iou = self.mask_iou(cur_det_mask, cur_gt_mask) + if cur_iou >= self.iou_thr: + correct_flag = True + break + if correct_flag: + correct += 1 + prog_bar.update() + recall = correct / len(gt_ann) * 100 + return correct, recall + + def mask_iou(self, mask1: np.ndarray, mask2: np.ndarray) -> np.ndarray: + """Compute IoU between two masks.""" + mask1_area = np.count_nonzero(mask1 == 1) + mask2_area = np.count_nonzero(mask2 == 1) + intersection = np.count_nonzero(np.logical_and(mask1 == 1, mask2 == 1)) + iou = intersection / (mask1_area + mask2_area - intersection) + return iou diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/coco_panoptic_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_panoptic_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..f86be916f9cacbdd1160d0fdb3dd6b5d43399299 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_panoptic_metric.py @@ -0,0 +1,618 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import itertools +import os.path as osp +import tempfile +from typing import Dict, Optional, Sequence, Tuple, Union + +import mmcv +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump, get_local_path, load +from mmengine.logging import MMLogger, print_log +from terminaltables import AsciiTable + +from mmdet.datasets.api_wrappers import COCOPanoptic +from mmdet.registry import METRICS +from ..functional import (INSTANCE_OFFSET, pq_compute_multi_core, + pq_compute_single_core) + +try: + import panopticapi + from panopticapi.evaluation import VOID, PQStat + from panopticapi.utils import id2rgb, rgb2id +except ImportError: + panopticapi = None + id2rgb = None + rgb2id = None + VOID = None + PQStat = None + + +@METRICS.register_module() +class CocoPanopticMetric(BaseMetric): + """COCO panoptic segmentation evaluation metric. + + Evaluate PQ, SQ RQ for panoptic segmentation tasks. Please refer to + https://cocodataset.org/#panoptic-eval for more details. + + Args: + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None. + seg_prefix (str, optional): Path to the directory which contains the + coco panoptic segmentation mask. It should be specified when + evaluate. Defaults to None. + classwise (bool): Whether to evaluate the metric class-wise. + Defaults to False. + outfile_prefix (str, optional): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. + It should be specified when format_only is True. Defaults to None. 
+        format_only (bool): Format the output results without performing
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        nproc (int): Number of processes for panoptic quality computing.
+            Defaults to 32. When ``nproc`` exceeds the number of cpu cores,
+            the number of cpu cores is used.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+    default_prefix: Optional[str] = 'coco_panoptic'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 seg_prefix: Optional[str] = None,
+                 classwise: bool = False,
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 nproc: int = 32,
+                 file_client_args: Optional[dict] = None,
+                 backend_args: Optional[dict] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        if panopticapi is None:
+            raise RuntimeError(
+                'panopticapi is not installed, please install it by: '
+                'pip install git+https://github.com/cocodataset/'
+                'panopticapi.git.')
+
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.classwise = classwise
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, \
+                'outfile_prefix must not be None when format_only is True, ' \
+                'otherwise the result files will be saved to a temp ' \
+                'directory which will be cleaned up at the end.'
+
+        self.tmp_dir = None
+        # outfile_prefix should be a prefix of a path which points to a
+        # shared storage when training or testing with multiple nodes.
+        self.outfile_prefix = outfile_prefix
+        if outfile_prefix is None:
+            self.tmp_dir = tempfile.TemporaryDirectory()
+            self.outfile_prefix = osp.join(self.tmp_dir.name, 'results')
+        # the directory to save predicted panoptic segmentation mask
+        self.seg_out_dir = f'{self.outfile_prefix}.panoptic'
+        self.nproc = nproc
+        self.seg_prefix = seg_prefix
+
+        self.cat_ids = None
+        self.cat2label = None
+
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to '
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        if ann_file:
+            with get_local_path(
+                    ann_file, backend_args=self.backend_args) as local_path:
+                self._coco_api = COCOPanoptic(local_path)
+            self.categories = self._coco_api.cats
+        else:
+            self._coco_api = None
+            self.categories = None
+
+    def __del__(self) -> None:
+        """Clean up."""
+        if self.tmp_dir is not None:
+            self.tmp_dir.cleanup()
+
+    def gt_to_coco_json(self, gt_dicts: Sequence[dict],
+                        outfile_prefix: str) -> Tuple[str, str]:
+        """Convert ground truth to coco panoptic segmentation format json
+        file.
+
+        Args:
+            gt_dicts (Sequence[dict]): Ground truth of the dataset.
+            outfile_prefix (str): The filename prefix of the json file. If the
+                prefix is "somepath/xxx", the json file will be named
+                "somepath/xxx.gt.json".
+ + Returns: + Tuple[str, str]: The filename of the json file and the name of the\ + directory which contains panoptic segmentation masks. + """ + assert len(gt_dicts) > 0, 'gt_dicts is empty.' + gt_folder = osp.dirname(gt_dicts[0]['seg_map_path']) + converted_json_path = f'{outfile_prefix}.gt.json' + + categories = [] + for id, name in enumerate(self.dataset_meta['classes']): + isthing = 1 if name in self.dataset_meta['thing_classes'] else 0 + categories.append({'id': id, 'name': name, 'isthing': isthing}) + + image_infos = [] + annotations = [] + for gt_dict in gt_dicts: + img_id = gt_dict['image_id'] + image_info = { + 'id': img_id, + 'width': gt_dict['width'], + 'height': gt_dict['height'], + 'file_name': osp.split(gt_dict['seg_map_path'])[-1] + } + image_infos.append(image_info) + + pan_png = mmcv.imread(gt_dict['seg_map_path']).squeeze() + pan_png = pan_png[:, :, ::-1] + pan_png = rgb2id(pan_png) + segments_info = [] + for segment_info in gt_dict['segments_info']: + id = segment_info['id'] + label = segment_info['category'] + mask = pan_png == id + isthing = categories[label]['isthing'] + if isthing: + iscrowd = 1 if not segment_info['is_thing'] else 0 + else: + iscrowd = 0 + + new_segment_info = { + 'id': id, + 'category_id': label, + 'isthing': isthing, + 'iscrowd': iscrowd, + 'area': mask.sum() + } + segments_info.append(new_segment_info) + + segm_file = image_info['file_name'].replace('.jpg', '.png') + annotation = dict( + image_id=img_id, + segments_info=segments_info, + file_name=segm_file) + annotations.append(annotation) + pan_png = id2rgb(pan_png) + + info = dict( + date_created=str(datetime.datetime.now()), + description='Coco json file converted by mmdet CocoPanopticMetric.' + ) + coco_json = dict( + info=info, + images=image_infos, + categories=categories, + licenses=None, + ) + if len(annotations) > 0: + coco_json['annotations'] = annotations + dump(coco_json, converted_json_path) + return converted_json_path, gt_folder + + def result2json(self, results: Sequence[dict], + outfile_prefix: str) -> Tuple[str, str]: + """Dump the panoptic results to a COCO style json file and a directory. + + Args: + results (Sequence[dict]): Testing results of the dataset. + outfile_prefix (str): The filename prefix of the json files and the + directory. + + Returns: + Tuple[str, str]: The json file and the directory which contains \ + panoptic segmentation masks. The filename of the json is + "somepath/xxx.panoptic.json" and name of the directory is + "somepath/xxx.panoptic". + """ + label2cat = dict((v, k) for (k, v) in self.cat2label.items()) + pred_annotations = [] + for idx in range(len(results)): + result = results[idx] + for segment_info in result['segments_info']: + sem_label = segment_info['category_id'] + # convert sem_label to json label + cat_id = label2cat[sem_label] + segment_info['category_id'] = label2cat[sem_label] + is_thing = self.categories[cat_id]['isthing'] + segment_info['isthing'] = is_thing + pred_annotations.append(result) + pan_json_results = dict(annotations=pred_annotations) + json_filename = f'{outfile_prefix}.panoptic.json' + dump(pan_json_results, json_filename) + return json_filename, ( + self.seg_out_dir + if self.tmp_dir is None else tempfile.gettempdir()) + + def _parse_predictions(self, + pred: dict, + img_id: int, + segm_file: str, + label2cat=None) -> dict: + """Parse panoptic segmentation predictions. + + Args: + pred (dict): Panoptic segmentation predictions. + img_id (int): Image id. + segm_file (str): Segmentation file name. 
+ label2cat (dict): Mapping from label to category id. + Defaults to None. + + Returns: + dict: Parsed predictions. + """ + result = dict() + result['img_id'] = img_id + # shape (1, H, W) -> (H, W) + pan = pred['pred_panoptic_seg']['sem_seg'].cpu().numpy()[0] + ignore_index = pred['pred_panoptic_seg'].get( + 'ignore_index', len(self.dataset_meta['classes'])) + pan_labels = np.unique(pan) + segments_info = [] + for pan_label in pan_labels: + sem_label = pan_label % INSTANCE_OFFSET + # We reserve the length of dataset_meta['classes'] + # and ignore_index for VOID label + if sem_label == len( + self.dataset_meta['classes']) or sem_label == ignore_index: + continue + mask = pan == pan_label + area = mask.sum() + segments_info.append({ + 'id': + int(pan_label), + # when ann_file provided, sem_label should be cat_id, otherwise + # sem_label should be a continuous id, not the cat_id + # defined in dataset + 'category_id': + label2cat[sem_label] if label2cat else sem_label, + 'area': + int(area) + }) + # evaluation script uses 0 for VOID label. + pan[pan % INSTANCE_OFFSET == len(self.dataset_meta['classes'])] = VOID + pan[pan % INSTANCE_OFFSET == ignore_index] = VOID + + pan = id2rgb(pan).astype(np.uint8) + mmcv.imwrite(pan[:, :, ::-1], osp.join(self.seg_out_dir, segm_file)) + result = { + 'image_id': img_id, + 'segments_info': segments_info, + 'file_name': segm_file + } + + return result + + def _compute_batch_pq_stats(self, data_samples: Sequence[dict]): + """Process gts and predictions when ``outfile_prefix`` is not set, gts + are from dataset or a json file which is defined by ``ann_file``. + + Intermediate results, ``pq_stats``, are computed here and put into + ``self.results``. + """ + if self._coco_api is None: + categories = dict() + for id, name in enumerate(self.dataset_meta['classes']): + isthing = 1 if name in self.dataset_meta['thing_classes']\ + else 0 + categories[id] = {'id': id, 'name': name, 'isthing': isthing} + label2cat = None + else: + categories = self.categories + cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + label2cat = {i: cat_id for i, cat_id in enumerate(cat_ids)} + + for data_sample in data_samples: + # parse pred + img_id = data_sample['img_id'] + segm_file = osp.basename(data_sample['img_path']).replace( + '.jpg', '.png') + result = self._parse_predictions( + pred=data_sample, + img_id=img_id, + segm_file=segm_file, + label2cat=label2cat) + + # parse gt + gt = dict() + gt['image_id'] = img_id + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['file_name'] = segm_file + + if self._coco_api is None: + # get segments_info from data_sample + seg_map_path = osp.join(self.seg_prefix, segm_file) + pan_png = mmcv.imread(seg_map_path).squeeze() + pan_png = pan_png[:, :, ::-1] + pan_png = rgb2id(pan_png) + segments_info = [] + + for segment_info in data_sample['segments_info']: + id = segment_info['id'] + label = segment_info['category'] + mask = pan_png == id + isthing = categories[label]['isthing'] + if isthing: + iscrowd = 1 if not segment_info['is_thing'] else 0 + else: + iscrowd = 0 + + new_segment_info = { + 'id': id, + 'category_id': label, + 'isthing': isthing, + 'iscrowd': iscrowd, + 'area': mask.sum() + } + segments_info.append(new_segment_info) + else: + # get segments_info from annotation file + segments_info = self._coco_api.imgToAnns[img_id] + + gt['segments_info'] = segments_info + + pq_stats = pq_compute_single_core( + proc_id=0, + annotation_set=[(gt, result)], + 
gt_folder=self.seg_prefix, + pred_folder=self.seg_out_dir, + categories=categories, + backend_args=self.backend_args) + + self.results.append(pq_stats) + + def _process_gt_and_predictions(self, data_samples: Sequence[dict]): + """Process gts and predictions when ``outfile_prefix`` is set. + + The predictions will be saved to directory specified by + ``outfile_predfix``. The matched pair (gt, result) will be put into + ``self.results``. + """ + for data_sample in data_samples: + # parse pred + img_id = data_sample['img_id'] + segm_file = osp.basename(data_sample['img_path']).replace( + '.jpg', '.png') + result = self._parse_predictions( + pred=data_sample, img_id=img_id, segm_file=segm_file) + + # parse gt + gt = dict() + gt['image_id'] = img_id + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + + if self._coco_api is None: + # get segments_info from dataset + gt['segments_info'] = data_sample['segments_info'] + gt['seg_map_path'] = data_sample['seg_map_path'] + + self.results.append((gt, result)) + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + # If ``self.tmp_dir`` is none, it will save gt and predictions to + # self.results, otherwise, it will compute pq_stats here. + if self.tmp_dir is None: + self._process_gt_and_predictions(data_samples) + else: + self._compute_batch_pq_stats(data_samples) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. There + are two cases: + + - When ``outfile_prefix`` is not provided, the elements in + results are pq_stats which can be summed directly to get PQ. + - When ``outfile_prefix`` is provided, the elements in + results are tuples like (gt, pred). + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + if self.tmp_dir is None: + # do evaluation after collect all the results + + # split gt and prediction list + gts, preds = zip(*results) + + if self._coco_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path, gt_folder = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=self.outfile_prefix) + self._coco_api = COCOPanoptic(coco_json_path) + else: + gt_folder = self.seg_prefix + + self.cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + self.cat2label = { + cat_id: i + for i, cat_id in enumerate(self.cat_ids) + } + self.img_ids = self._coco_api.get_img_ids() + self.categories = self._coco_api.cats + + # convert predictions to coco format and dump to json file + json_filename, pred_folder = self.result2json( + results=preds, outfile_prefix=self.outfile_prefix) + + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(self.outfile_prefix)}') + return dict() + + imgs = self._coco_api.imgs + gt_json = self._coco_api.img_ann_map + gt_json = [{ + 'image_id': k, + 'segments_info': v, + 'file_name': imgs[k]['segm_file'] + } for k, v in gt_json.items()] + pred_json = load(json_filename) + pred_json = dict( + (el['image_id'], el) for el in pred_json['annotations']) + + # match the gt_anns and pred_anns in the same image + matched_annotations_list = [] + for gt_ann in gt_json: + img_id = gt_ann['image_id'] + if img_id not in pred_json.keys(): + raise Exception('no prediction for the image' + ' with id: {}'.format(img_id)) + matched_annotations_list.append((gt_ann, pred_json[img_id])) + + pq_stat = pq_compute_multi_core( + matched_annotations_list, + gt_folder, + pred_folder, + self.categories, + backend_args=self.backend_args, + nproc=self.nproc) + + else: + # aggregate the results generated in process + if self._coco_api is None: + categories = dict() + for id, name in enumerate(self.dataset_meta['classes']): + isthing = 1 if name in self.dataset_meta[ + 'thing_classes'] else 0 + categories[id] = { + 'id': id, + 'name': name, + 'isthing': isthing + } + self.categories = categories + + pq_stat = PQStat() + for result in results: + pq_stat += result + + metrics = [('All', None), ('Things', True), ('Stuff', False)] + pq_results = {} + + for name, isthing in metrics: + pq_results[name], classwise_results = pq_stat.pq_average( + self.categories, isthing=isthing) + if name == 'All': + pq_results['classwise'] = classwise_results + + classwise_results = None + if self.classwise: + classwise_results = { + k: v + for k, v in zip(self.dataset_meta['classes'], + pq_results['classwise'].values()) + } + + print_panoptic_table(pq_results, classwise_results, logger=logger) + results = parse_pq_results(pq_results) + + return results + + +def parse_pq_results(pq_results: dict) -> dict: + """Parse the Panoptic Quality results. + + Args: + pq_results (dict): Panoptic Quality results. + + Returns: + dict: Panoptic Quality results parsed. 
+ """ + result = dict() + result['PQ'] = 100 * pq_results['All']['pq'] + result['SQ'] = 100 * pq_results['All']['sq'] + result['RQ'] = 100 * pq_results['All']['rq'] + result['PQ_th'] = 100 * pq_results['Things']['pq'] + result['SQ_th'] = 100 * pq_results['Things']['sq'] + result['RQ_th'] = 100 * pq_results['Things']['rq'] + result['PQ_st'] = 100 * pq_results['Stuff']['pq'] + result['SQ_st'] = 100 * pq_results['Stuff']['sq'] + result['RQ_st'] = 100 * pq_results['Stuff']['rq'] + return result + + +def print_panoptic_table( + pq_results: dict, + classwise_results: Optional[dict] = None, + logger: Optional[Union['MMLogger', str]] = None) -> None: + """Print the panoptic evaluation results table. + + Args: + pq_results(dict): The Panoptic Quality results. + classwise_results(dict, optional): The classwise Panoptic Quality. + results. The keys are class names and the values are metrics. + Defaults to None. + logger (:obj:`MMLogger` | str, optional): Logger used for printing + related information during evaluation. Default: None. + """ + + headers = ['', 'PQ', 'SQ', 'RQ', 'categories'] + data = [headers] + for name in ['All', 'Things', 'Stuff']: + numbers = [ + f'{(pq_results[name][k] * 100):0.3f}' for k in ['pq', 'sq', 'rq'] + ] + row = [name] + numbers + [pq_results[name]['n']] + data.append(row) + table = AsciiTable(data) + print_log('Panoptic Evaluation Results:\n' + table.table, logger=logger) + + if classwise_results is not None: + class_metrics = [(name, ) + tuple(f'{(metrics[k] * 100):0.3f}' + for k in ['pq', 'sq', 'rq']) + for name, metrics in classwise_results.items()] + num_columns = min(8, len(class_metrics) * 4) + results_flatten = list(itertools.chain(*class_metrics)) + headers = ['category', 'PQ', 'SQ', 'RQ'] * (num_columns // 4) + results_2d = itertools.zip_longest( + *[results_flatten[i::num_columns] for i in range(num_columns)]) + data = [headers] + data += [result for result in results_2d] + table = AsciiTable(data) + print_log( + 'Classwise Panoptic Evaluation Results:\n' + table.table, + logger=logger) diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/coco_video_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_video_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..b5c75d025a6109762db21a600e3d866764caf1cb --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/coco_video_metric.py @@ -0,0 +1,80 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Sequence + +from mmengine.dist import broadcast_object_list, is_main_process + +from mmdet.registry import METRICS +from .base_video_metric import collect_tracking_results +from .coco_metric import CocoMetric + + +@METRICS.register_module() +class CocoVideoMetric(CocoMetric): + """COCO evaluation metric. + + Evaluate AR, AP, and mAP for detection tasks including proposal/box + detection and instance segmentation. Please refer to + https://cocodataset.org/#detection-eval for more details. + """ + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
+ """ + for track_data_sample in data_samples: + video_data_samples = track_data_sample['video_data_samples'] + ori_video_len = video_data_samples[0].ori_video_length + video_len = len(video_data_samples) + if ori_video_len == video_len: + # video process + for frame_id in range(video_len): + img_data_sample = video_data_samples[frame_id].to_dict() + super().process(None, [img_data_sample]) + else: + # image process + img_data_sample = video_data_samples[0].to_dict() + super().process(None, [img_data_sample]) + + def evaluate(self, size: int = 1) -> dict: + """Evaluate the model performance of the whole dataset after processing + all batches. + + Args: + size (int): Length of the entire validation dataset. + Returns: + dict: Evaluation metrics dict on the val dataset. The keys are the + names of the metrics, and the values are corresponding results. + """ + if len(self.results) == 0: + warnings.warn( + f'{self.__class__.__name__} got empty `self.results`. Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.') + + results = collect_tracking_results(self.results, self.collect_device) + + if is_main_process(): + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results list + self.results.clear() + return metrics[0] diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/crowdhuman_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/crowdhuman_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..50ac210ae8606bab6cada69418334c113c90fb38 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/crowdhuman_metric.py @@ -0,0 +1,824 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import json +import os.path as osp +import tempfile +from collections import OrderedDict +from multiprocessing import Process, Queue +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import dump, get_text, load +from mmengine.logging import MMLogger +from scipy.sparse import csr_matrix +from scipy.sparse.csgraph import maximum_bipartite_matching + +from mmdet.evaluation.functional.bbox_overlaps import bbox_overlaps +from mmdet.registry import METRICS + +PERSON_CLASSES = ['background', 'person'] + + +@METRICS.register_module() +class CrowdHumanMetric(BaseMetric): + """CrowdHuman evaluation metric. + + Evaluate Average Precision (AP), Miss Rate (MR) and Jaccard Index (JI) + for detection tasks. + + Args: + ann_file (str): Path to the annotation file. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'AP', 'MR' and 'JI'. Defaults to 'AP'. + format_only (bool): Format the output results without perform + evaluation. It is useful when you want to format the result + to a specific format and submit it to the test server. + Defaults to False. + outfile_prefix (str, optional): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Defaults to None. + file_client_args (dict, optional): Arguments to instantiate the + corresponding backend in mmdet <= 3.0.0rc6. Defaults to None. 
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        eval_mode (int): Select the evaluation mode. Valid modes are
+            0 (body boxes only), 1 (head boxes only) and 2 (both of them).
+            Defaults to 0.
+        iou_thres (float): IoU threshold. Defaults to 0.5.
+        compare_matching_method (str, optional): Matching method to compare
+            the detection results with the ground truth when computing 'AP'
+            and 'MR'. Valid methods are 'VOC' and None (CALTECH). Defaults
+            to None.
+        mr_ref (str): Reference points used to calculate MR. Valid refs are
+            'CALTECH_-2' and 'CALTECH_-4'. Defaults to 'CALTECH_-2'.
+        num_ji_process (int): The number of processes used to evaluate JI.
+            Defaults to 10.
+    """
+    default_prefix: Optional[str] = 'crowd_human'
+
+    def __init__(self,
+                 ann_file: str,
+                 metric: Union[str, List[str]] = ['AP', 'MR', 'JI'],
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 file_client_args: Optional[dict] = None,
+                 backend_args: Optional[dict] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 eval_mode: int = 0,
+                 iou_thres: float = 0.5,
+                 compare_matching_method: Optional[str] = None,
+                 mr_ref: str = 'CALTECH_-2',
+                 num_ji_process: int = 10) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.ann_file = ann_file
+        # crowdhuman evaluation metrics
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['MR', 'AP', 'JI']
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(f"metric should be one of 'MR', 'AP', 'JI', "
+                               f'but got {metric}.')
+
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, \
+                'outfile_prefix must not be None when format_only is True, ' \
+                'otherwise the result files will be saved to a temp ' \
+                'directory which will be cleaned up at the end.'
+        self.outfile_prefix = outfile_prefix
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead, please refer to '
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        assert eval_mode in [0, 1, 2], \
+            'Unknown eval mode. eval_mode should be one of 0, 1, 2.'
+        assert compare_matching_method is None or \
+            compare_matching_method == 'VOC', \
+            'The alternative compare_matching_method is VOC. ' \
+            'This parameter defaults to CALTECH (None).'
+        assert mr_ref == 'CALTECH_-2' or mr_ref == 'CALTECH_-4', \
+            "mr_ref should be one of 'CALTECH_-2', 'CALTECH_-4'."
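+        # Typical config-style usage (illustrative; the odgt path is a
+        # placeholder):
+        #   val_evaluator = dict(
+        #       type='CrowdHumanMetric',
+        #       ann_file='data/crowdhuman/annotation_val.odgt',
+        #       metric=['AP', 'MR', 'JI'],
+        #       eval_mode=0)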
+ self.eval_mode = eval_mode + self.iou_thres = iou_thres + self.compare_matching_method = compare_matching_method + self.mr_ref = mr_ref + self.num_ji_process = num_ji_process + + @staticmethod + def results2json(results: Sequence[dict], outfile_prefix: str) -> str: + """Dump the detection results to a json file.""" + result_file_path = f'{outfile_prefix}.json' + bbox_json_results = [] + for i, result in enumerate(results): + ann, pred = result + dump_dict = dict() + dump_dict['ID'] = ann['ID'] + dump_dict['width'] = ann['width'] + dump_dict['height'] = ann['height'] + dtboxes = [] + bboxes = pred.tolist() + for _, single_bbox in enumerate(bboxes): + temp_dict = dict() + x1, y1, x2, y2, score = single_bbox + temp_dict['box'] = [x1, y1, x2 - x1, y2 - y1] + temp_dict['score'] = score + temp_dict['tag'] = 1 + dtboxes.append(temp_dict) + dump_dict['dtboxes'] = dtboxes + bbox_json_results.append(dump_dict) + dump(bbox_json_results, result_file_path) + return result_file_path + + def process(self, data_batch: Sequence[dict], + data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. + """ + for data_sample in data_samples: + ann = dict() + ann['ID'] = data_sample['img_id'] + ann['width'] = data_sample['ori_shape'][1] + ann['height'] = data_sample['ori_shape'][0] + pred_bboxes = data_sample['pred_instances']['bboxes'].cpu().numpy() + pred_scores = data_sample['pred_instances']['scores'].cpu().numpy() + + pred_bbox_scores = np.hstack( + [pred_bboxes, pred_scores.reshape((-1, 1))]) + + self.results.append((ann, pred_bbox_scores)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + eval_results(Dict[str, float]): The computed metrics. + The keys are the names of the metrics, and the values + are corresponding results. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'result') + else: + outfile_prefix = self.outfile_prefix + + # convert predictions to coco format and dump to json file + result_file = self.results2json(results, outfile_prefix) + eval_results = OrderedDict() + if self.format_only: + logger.info(f'results are saved in {osp.dirname(outfile_prefix)}') + return eval_results + + # load evaluation samples + eval_samples = self.load_eval_samples(result_file) + + if 'AP' in self.metrics or 'MR' in self.metrics: + score_list = self.compare(eval_samples) + gt_num = sum([eval_samples[i].gt_num for i in eval_samples]) + ign_num = sum([eval_samples[i].ign_num for i in eval_samples]) + gt_num = gt_num - ign_num + img_num = len(eval_samples) + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + if metric == 'AP': + AP = self.eval_ap(score_list, gt_num, img_num) + eval_results['mAP'] = float(f'{round(AP, 4)}') + if metric == 'MR': + MR = self.eval_mr(score_list, gt_num, img_num) + eval_results['mMR'] = float(f'{round(MR, 4)}') + if metric == 'JI': + JI = self.eval_ji(eval_samples) + eval_results['JI'] = float(f'{round(JI, 4)}') + if tmp_dir is not None: + tmp_dir.cleanup() + + return eval_results + + def load_eval_samples(self, result_file): + """Load data from annotations file and detection results. + + Args: + result_file (str): The file path of the saved detection results. + + Returns: + Dict[Image]: The detection result packaged by Image + """ + gt_str = get_text( + self.ann_file, backend_args=self.backend_args).strip().split('\n') + gt_records = [json.loads(line) for line in gt_str] + + pred_records = load(result_file, backend_args=self.backend_args) + eval_samples = dict() + for gt_record, pred_record in zip(gt_records, pred_records): + assert gt_record['ID'] == pred_record['ID'], \ + 'please set val_dataloader.sampler.shuffle=False and try again' + eval_samples[pred_record['ID']] = Image(self.eval_mode) + eval_samples[pred_record['ID']].load(gt_record, 'box', None, + PERSON_CLASSES, True) + eval_samples[pred_record['ID']].load(pred_record, 'box', None, + PERSON_CLASSES, False) + eval_samples[pred_record['ID']].clip_all_boader() + return eval_samples + + def compare(self, samples): + """Match the detection results with the ground_truth. + + Args: + samples (dict[Image]): The detection result packaged by Image. + + Returns: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. + """ + score_list = list() + for id in samples: + if self.compare_matching_method == 'VOC': + result = samples[id].compare_voc(self.iou_thres) + else: + result = samples[id].compare_caltech(self.iou_thres) + score_list.extend(result) + # In the descending sort of dtbox score. + score_list.sort(key=lambda x: x[0][-1], reverse=True) + return score_list + + @staticmethod + def eval_ap(score_list, gt_num, img_num): + """Evaluate by average precision. + + Args: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. + gt_num(int): The number of gt boxes in the entire dataset. + img_num(int): The number of images in the entire dataset. + + Returns: + ap(float): result of average precision. 
+ """ + + # calculate general ap score + def _calculate_map(_recall, _precision): + assert len(_recall) == len(_precision) + area = 0 + for k in range(1, len(_recall)): + delta_h = (_precision[k - 1] + _precision[k]) / 2 + delta_w = _recall[k] - _recall[k - 1] + area += delta_w * delta_h + return area + + tp, fp = 0.0, 0.0 + rpX, rpY = list(), list() + + fpn = [] + recalln = [] + thr = [] + fppi = [] + for i, item in enumerate(score_list): + if item[1] == 1: + tp += 1.0 + elif item[1] == 0: + fp += 1.0 + fn = gt_num - tp + recall = tp / (tp + fn) + precision = tp / (tp + fp) + rpX.append(recall) + rpY.append(precision) + fpn.append(fp) + recalln.append(tp) + thr.append(item[0][-1]) + fppi.append(fp / img_num) + + ap = _calculate_map(rpX, rpY) + return ap + + def eval_mr(self, score_list, gt_num, img_num): + """Evaluate by Caltech-style log-average miss rate. + + Args: + score_list(list[tuple[ndarray, int, str]]): Matching result. + a list of tuples (dtbox, label, imgID) in the descending + sort of dtbox.score. + gt_num(int): The number of gt boxes in the entire dataset. + img_num(int): The number of image in the entire dataset. + + Returns: + mr(float): result of miss rate. + """ + + # find greater_than + def _find_gt(lst, target): + for idx, _item in enumerate(lst): + if _item >= target: + return idx + return len(lst) - 1 + + if self.mr_ref == 'CALTECH_-2': + # CALTECH_MRREF_2: anchor points (from 10^-2 to 1) as in + # P.Dollar's paper + ref = [ + 0.0100, 0.0178, 0.03160, 0.0562, 0.1000, 0.1778, 0.3162, + 0.5623, 1.000 + ] + else: + # CALTECH_MRREF_4: anchor points (from 10^-4 to 1) as in + # S.Zhang's paper + ref = [ + 0.0001, 0.0003, 0.00100, 0.0032, 0.0100, 0.0316, 0.1000, + 0.3162, 1.000 + ] + + tp, fp = 0.0, 0.0 + fppiX, fppiY = list(), list() + for i, item in enumerate(score_list): + if item[1] == 1: + tp += 1.0 + elif item[1] == 0: + fp += 1.0 + + fn = gt_num - tp + recall = tp / (tp + fn) + missrate = 1.0 - recall + fppi = fp / img_num + fppiX.append(fppi) + fppiY.append(missrate) + + score = list() + for pos in ref: + argmin = _find_gt(fppiX, pos) + if argmin >= 0: + score.append(fppiY[argmin]) + score = np.array(score) + mr = np.exp(np.log(score).mean()) + return mr + + def eval_ji(self, samples): + """Evaluate by JI using multi_process. + + Args: + samples(Dict[str, Image]): The detection result packaged by Image. + + Returns: + ji(float): result of jaccard index. + """ + import math + res_line = [] + res_ji = [] + for i in range(10): + score_thr = 1e-1 * i + total = len(samples) + stride = math.ceil(total / self.num_ji_process) + result_queue = Queue(10000) + results, procs = [], [] + records = list(samples.items()) + for i in range(self.num_ji_process): + start = i * stride + end = np.min([start + stride, total]) + sample_data = dict(records[start:end]) + p = Process( + target=self.compute_ji_with_ignore, + args=(result_queue, sample_data, score_thr)) + p.start() + procs.append(p) + for i in range(total): + t = result_queue.get() + results.append(t) + for p in procs: + p.join() + line, mean_ratio = self.gather(results) + line = 'score_thr:{:.1f}, {}'.format(score_thr, line) + res_line.append(line) + res_ji.append(mean_ratio) + return max(res_ji) + + def compute_ji_with_ignore(self, result_queue, dt_result, score_thr): + """Compute JI with ignore. + + Args: + result_queue(Queue): The Queue for save compute result when + multi_process. + dt_result(dict[Image]): Detection result packaged by Image. + score_thr(float): The threshold of detection score. 
+ Returns: + dict: compute result. + """ + for ID, record in dt_result.items(): + gt_boxes = record.gt_boxes + dt_boxes = record.dt_boxes + keep = dt_boxes[:, -1] > score_thr + dt_boxes = dt_boxes[keep][:, :-1] + + gt_tag = np.array(gt_boxes[:, -1] != -1) + matches = self.compute_ji_matching(dt_boxes, gt_boxes[gt_tag, :4]) + # get the unmatched_indices + matched_indices = np.array([j for (j, _) in matches]) + unmatched_indices = list( + set(np.arange(dt_boxes.shape[0])) - set(matched_indices)) + num_ignore_dt = self.get_ignores(dt_boxes[unmatched_indices], + gt_boxes[~gt_tag, :4]) + matched_indices = np.array([j for (_, j) in matches]) + unmatched_indices = list( + set(np.arange(gt_boxes[gt_tag].shape[0])) - + set(matched_indices)) + num_ignore_gt = self.get_ignores( + gt_boxes[gt_tag][unmatched_indices], gt_boxes[~gt_tag, :4]) + # compute results + eps = 1e-6 + k = len(matches) + m = gt_tag.sum() - num_ignore_gt + n = dt_boxes.shape[0] - num_ignore_dt + ratio = k / (m + n - k + eps) + recall = k / (m + eps) + cover = k / (n + eps) + noise = 1 - cover + result_dict = dict( + ratio=ratio, + recall=recall, + cover=cover, + noise=noise, + k=k, + m=m, + n=n) + result_queue.put_nowait(result_dict) + + @staticmethod + def gather(results): + """Integrate test results.""" + assert len(results) + img_num = 0 + for result in results: + if result['n'] != 0 or result['m'] != 0: + img_num += 1 + mean_ratio = np.sum([rb['ratio'] for rb in results]) / img_num + valids = np.sum([rb['k'] for rb in results]) + total = np.sum([rb['n'] for rb in results]) + gtn = np.sum([rb['m'] for rb in results]) + line = 'mean_ratio:{:.4f}, valids:{}, total:{}, gtn:{}'\ + .format(mean_ratio, valids, total, gtn) + return line, mean_ratio + + def compute_ji_matching(self, dt_boxes, gt_boxes): + """Match the annotation box for each detection box. + + Args: + dt_boxes(ndarray): Detection boxes. + gt_boxes(ndarray): Ground_truth boxes. + + Returns: + matches_(list[tuple[int, int]]): Match result. + """ + assert dt_boxes.shape[-1] > 3 and gt_boxes.shape[-1] > 3 + if dt_boxes.shape[0] < 1 or gt_boxes.shape[0] < 1: + return list() + + ious = bbox_overlaps(dt_boxes, gt_boxes, mode='iou') + input_ = copy.deepcopy(ious) + input_[input_ < self.iou_thres] = 0 + match_scipy = maximum_bipartite_matching( + csr_matrix(input_), perm_type='column') + matches_ = [] + for i in range(len(match_scipy)): + if match_scipy[i] != -1: + matches_.append((i, int(match_scipy[i]))) + return matches_ + + def get_ignores(self, dt_boxes, gt_boxes): + """Get the number of ignore bboxes.""" + if gt_boxes.size: + ioas = bbox_overlaps(dt_boxes, gt_boxes, mode='iof') + ioas = np.max(ioas, axis=1) + rows = np.where(ioas > self.iou_thres)[0] + return len(rows) + else: + return 0 + + +class Image(object): + """Data structure for evaluation of CrowdHuman. + + Note: + This implementation is modified from https://github.com/Purkialo/ + CrowdDet/blob/master/lib/evaluate/APMRToolkits/image.py + + Args: + mode (int): Select the mode of evaluate. Valid mode include + 0(just body box), 1(just head box) and 2(both of them). + Defaults to 0. + """ + + def __init__(self, mode): + self.ID = None + self.width = None + self.height = None + self.dt_boxes = None + self.gt_boxes = None + self.eval_mode = mode + + self.ign_num = None + self.gt_num = None + self.dt_num = None + + def load(self, record, body_key, head_key, class_names, gt_flag): + """Loading information for evaluation. + + Args: + record (dict): Label information or test results. 
+                The format might look something like this:
+                {
+                    'ID': '273271,c9db000d5146c15',
+                    'gtboxes': [
+                        {'fbox': [72, 202, 163, 503], 'tag': 'person', ...},
+                        {'fbox': [199, 180, 144, 499], 'tag': 'person', ...},
+                        ...
+                    ]
+                }
+                or:
+                {
+                    'ID': '273271,c9db000d5146c15',
+                    'width': 800,
+                    'height': 1067,
+                    'dtboxes': [
+                        {
+                            'box': [306.22, 205.95, 164.05, 394.04],
+                            'score': 0.99,
+                            'tag': 1
+                        },
+                        {
+                            'box': [403.60, 178.66, 157.15, 421.33],
+                            'score': 0.99,
+                            'tag': 1
+                        },
+                        ...
+                    ]
+                }
+            body_key (str, None): key of detection body box.
+                Valid when loading detection results and self.eval_mode!=1.
+            head_key (str, None): key of detection head box.
+                Valid when loading detection results and self.eval_mode!=0.
+            class_names (list[str]): class names of the dataset.
+                Defaults to ['background', 'person'].
+            gt_flag (bool): Whether the record is a ground-truth
+                annotation (True) or a prediction (False).
+        """
+        if 'ID' in record and self.ID is None:
+            self.ID = record['ID']
+        if 'width' in record and self.width is None:
+            self.width = record['width']
+        if 'height' in record and self.height is None:
+            self.height = record['height']
+        if gt_flag:
+            self.gt_num = len(record['gtboxes'])
+            body_bbox, head_bbox = self.load_gt_boxes(record, 'gtboxes',
+                                                      class_names)
+            if self.eval_mode == 0:
+                self.gt_boxes = body_bbox
+                self.ign_num = (body_bbox[:, -1] == -1).sum()
+            elif self.eval_mode == 1:
+                self.gt_boxes = head_bbox
+                self.ign_num = (head_bbox[:, -1] == -1).sum()
+            else:
+                gt_tag = np.array([
+                    body_bbox[i, -1] != -1 and head_bbox[i, -1] != -1
+                    for i in range(len(body_bbox))
+                ])
+                self.ign_num = (gt_tag == 0).sum()
+                self.gt_boxes = np.hstack(
+                    (body_bbox[:, :-1], head_bbox[:, :-1],
+                     gt_tag.reshape(-1, 1)))
+
+        if not gt_flag:
+            self.dt_num = len(record['dtboxes'])
+            if self.eval_mode == 0:
+                self.dt_boxes = self.load_det_boxes(record, 'dtboxes',
+                                                    body_key, 'score')
+            elif self.eval_mode == 1:
+                self.dt_boxes = self.load_det_boxes(record, 'dtboxes',
+                                                    head_key, 'score')
+            else:
+                body_dtboxes = self.load_det_boxes(record, 'dtboxes',
+                                                   body_key, 'score')
+                head_dtboxes = self.load_det_boxes(record, 'dtboxes',
+                                                   head_key, 'score')
+                self.dt_boxes = np.hstack((body_dtboxes, head_dtboxes))
+
+    @staticmethod
+    def load_gt_boxes(dict_input, key_name, class_names):
+        """load ground_truth and transform [x, y, w, h] to [x1, y1, x2, y2]"""
+        assert key_name in dict_input
+        if len(dict_input[key_name]) < 1:
+            # return one empty array per box type so that callers can
+            # always unpack (body_bbox, head_bbox)
+            return np.empty([0, 5]), np.empty([0, 5])
+        head_bbox = []
+        body_bbox = []
+        for rb in dict_input[key_name]:
+            if rb['tag'] in class_names:
+                body_tag = class_names.index(rb['tag'])
+                head_tag = copy.deepcopy(body_tag)
+            else:
+                body_tag = -1
+                head_tag = -1
+            if 'extra' in rb:
+                if 'ignore' in rb['extra']:
+                    if rb['extra']['ignore'] != 0:
+                        body_tag = -1
+                        head_tag = -1
+            if 'head_attr' in rb:
+                if 'ignore' in rb['head_attr']:
+                    if rb['head_attr']['ignore'] != 0:
+                        head_tag = -1
+            head_bbox.append(np.hstack((rb['hbox'], head_tag)))
+            body_bbox.append(np.hstack((rb['fbox'], body_tag)))
+        head_bbox = np.array(head_bbox)
+        head_bbox[:, 2:4] += head_bbox[:, :2]
+        body_bbox = np.array(body_bbox)
+        body_bbox[:, 2:4] += body_bbox[:, :2]
+        return body_bbox, head_bbox
+
+    @staticmethod
+    def load_det_boxes(dict_input, key_name, key_box, key_score, key_tag=None):
+        """load detection boxes."""
+        assert key_name in dict_input
+        if len(dict_input[key_name]) < 1:
+            return np.empty([0, 5])
+        else:
+            assert key_box in dict_input[key_name][0]
+            if key_score:
+                assert key_score in dict_input[key_name][0]
+            if key_tag:
+                assert key_tag in dict_input[key_name][0]
+            if key_score:
+                if key_tag:
+                    bboxes = np.vstack([
+                        np.hstack((rb[key_box], rb[key_score], rb[key_tag]))
+                        for rb in dict_input[key_name]
+                    ])
+                else:
+                    bboxes = np.vstack([
+                        np.hstack((rb[key_box], rb[key_score]))
+                        for rb in dict_input[key_name]
+                    ])
+            else:
+                if key_tag:
+                    bboxes = np.vstack([
+                        np.hstack((rb[key_box], rb[key_tag]))
+                        for rb in dict_input[key_name]
+                    ])
+                else:
+                    bboxes = np.vstack(
+                        [rb[key_box] for rb in dict_input[key_name]])
+            # transform [x, y, w, h] to [x1, y1, x2, y2]
+            bboxes[:, 2:4] += bboxes[:, :2]
+            return bboxes
+
+    def clip_all_boader(self):
+        """Make sure boxes are within the image range."""
+
+        def _clip_boundary(boxes, height, width):
+            assert boxes.shape[-1] >= 4
+            boxes[:, 0] = np.minimum(np.maximum(boxes[:, 0], 0), width - 1)
+            boxes[:, 1] = np.minimum(np.maximum(boxes[:, 1], 0), height - 1)
+            boxes[:, 2] = np.maximum(np.minimum(boxes[:, 2], width), 0)
+            boxes[:, 3] = np.maximum(np.minimum(boxes[:, 3], height), 0)
+            return boxes
+
+        assert self.dt_boxes.shape[-1] >= 4
+        assert self.gt_boxes.shape[-1] >= 4
+        assert self.width is not None and self.height is not None
+        if self.eval_mode == 2:
+            self.dt_boxes[:, :4] = _clip_boundary(self.dt_boxes[:, :4],
+                                                  self.height, self.width)
+            self.gt_boxes[:, :4] = _clip_boundary(self.gt_boxes[:, :4],
+                                                  self.height, self.width)
+            self.dt_boxes[:, 4:8] = _clip_boundary(self.dt_boxes[:, 4:8],
+                                                   self.height, self.width)
+            self.gt_boxes[:, 4:8] = _clip_boundary(self.gt_boxes[:, 4:8],
+                                                   self.height, self.width)
+        else:
+            self.dt_boxes = _clip_boundary(self.dt_boxes, self.height,
+                                           self.width)
+            self.gt_boxes = _clip_boundary(self.gt_boxes, self.height,
+                                           self.width)
+
+    def compare_voc(self, thres):
+        """Match the detection results with the ground_truth by VOC.
+
+        Args:
+            thres (float): IOU threshold.
+
+        Returns:
+            score_list(list[tuple[ndarray, int, str]]): Matching result.
+            a list of tuples (dtbox, label, imgID) in the descending
+            sort of dtbox.score.
+        """
+        if self.dt_boxes is None:
+            return list()
+        dtboxes = self.dt_boxes
+        gtboxes = self.gt_boxes if self.gt_boxes is not None \
+            else np.empty((0, 5))
+        # sort detections by score (last column, descending) and put the
+        # non-ignored gt boxes (tag != -1) before the ignored ones
+        dtboxes = np.array(sorted(dtboxes, key=lambda x: x[-1], reverse=True))
+        gtboxes = np.array(sorted(gtboxes, key=lambda x: x[-1], reverse=True))
+        gt_matched = np.zeros(gtboxes.shape[0])
+        if len(dtboxes) and len(gtboxes):
+            overlap_iou = bbox_overlaps(
+                dtboxes[:, :4], gtboxes[:, :4], mode='iou')
+
+        score_list = list()
+        for i, dt in enumerate(dtboxes):
+            maxpos = -1
+            maxiou = thres
+
+            for j, gt in enumerate(gtboxes):
+                if gt_matched[j] == 1:
+                    continue
+                overlap = overlap_iou[i][j]
+                if overlap > maxiou:
+                    maxiou = overlap
+                    maxpos = j
+
+            if maxpos >= 0:
+                # an ignored gt box (tag == -1) suppresses the detection
+                # without producing a score entry
+                if gtboxes[maxpos, -1] != -1:
+                    gt_matched[maxpos] = 1
+                    score_list.append((dt, 1, self.ID))
+            else:
+                score_list.append((dt, 0, self.ID))
+        return score_list
+
+    def compare_caltech(self, thres):
+        """Match the detection results with the ground_truth by Caltech
+        matching strategy.
+
+        Args:
+            thres (float): IOU threshold.
+
+        Returns:
+            score_list(list[tuple[ndarray, int, str]]): Matching result.
+            a list of tuples (dtbox, label, imgID) in the descending
+            sort of dtbox.score.
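+
+        Example of one returned entry (illustrative):
+            (np.array([x1, y1, x2, y2, score]), 1, '273271,c9db000d5146c15')
+            where the middle element is 1 for a matched detection and 0
+            for an unmatched one.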
+ """ + if self.dt_boxes is None or self.gt_boxes is None: + return list() + + dtboxes = self.dt_boxes if self.dt_boxes is not None else list() + gtboxes = self.gt_boxes if self.gt_boxes is not None else list() + dt_matched = np.zeros(dtboxes.shape[0]) + gt_matched = np.zeros(gtboxes.shape[0]) + + dtboxes = np.array(sorted(dtboxes, key=lambda x: x[-1], reverse=True)) + gtboxes = np.array(sorted(gtboxes, key=lambda x: x[-1], reverse=True)) + if len(dtboxes): + overlap_iou = bbox_overlaps(dtboxes, gtboxes, mode='iou') + overlap_ioa = bbox_overlaps(dtboxes, gtboxes, mode='iof') + else: + return list() + + score_list = list() + for i, dt in enumerate(dtboxes): + maxpos = -1 + maxiou = thres + for j, gt in enumerate(gtboxes): + if gt_matched[j] == 1: + continue + if gt[-1] > 0: + overlap = overlap_iou[i][j] + if overlap > maxiou: + maxiou = overlap + maxpos = j + else: + if maxpos >= 0: + break + else: + overlap = overlap_ioa[i][j] + if overlap > thres: + maxiou = overlap + maxpos = j + if maxpos >= 0: + if gtboxes[maxpos, -1] > 0: + gt_matched[maxpos] = 1 + dt_matched[i] = 1 + score_list.append((dt, 1, self.ID)) + else: + dt_matched[i] = -1 + else: + dt_matched[i] = 0 + score_list.append((dt, 0, self.ID)) + return score_list diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/dod_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/dod_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..b47d07219dad112a336123444e58c72978953439 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/dod_metric.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from typing import List, Optional, Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import get_local_path +from mmengine.logging import MMLogger + +from mmdet.datasets.api_wrappers import COCO, COCOeval +from mmdet.registry import METRICS + + +@METRICS.register_module() +class DODCocoMetric(BaseMetric): + + default_prefix: Optional[str] = 'dod' + + def __init__(self, + ann_file: Optional[str] = None, + collect_device: str = 'cpu', + outfile_prefix: Optional[str] = None, + backend_args: dict = None, + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + self.outfile_prefix = outfile_prefix + with get_local_path(ann_file, backend_args=backend_args) as local_path: + self._coco_api = COCO(local_path) + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + + result['labels'] = pred['labels'].cpu().numpy() + result['labels'] = data_sample['sent_ids'][result['labels']] + self.results.append(result) + + def xyxy2xywh(self, bbox: np.ndarray) -> list: + """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO + evaluation. + + Args: + bbox (numpy.ndarray): The bounding boxes, shape (4, ), in + ``xyxy`` order. + + Returns: + list[float]: The converted bounding boxes, in ``xywh`` order. + """ + + _bbox: List = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + def results2json(self, results: Sequence[dict]) -> list: + """Dump the detection results to a COCO style json file. 
+
+        Only bbox predictions are dumped here; each prediction is
+        converted to a COCO style record.
+
+        Args:
+            results (Sequence[dict]): Testing results of the dataset.
+
+        Returns:
+            list[dict]: The COCO style bbox records.
+        """
+        bbox_json_results = []
+        for idx, result in enumerate(results):
+            image_id = result.get('img_id', idx)
+            labels = result['labels']
+            bboxes = result['bboxes']
+            scores = result['scores']
+            for i, label in enumerate(labels):
+                data = dict()
+                data['image_id'] = image_id
+                data['bbox'] = self.xyxy2xywh(bboxes[i])
+                data['score'] = float(scores[i])
+                data['category_id'] = label
+                bbox_json_results.append(data)
+        return bbox_json_results
+
+    def compute_metrics(self, results: list) -> dict:
+        logger: MMLogger = MMLogger.get_current_instance()
+        result_files = self.results2json(results)
+        d3_res = self._coco_api.loadRes(result_files)
+        cocoEval = COCOeval(self._coco_api, d3_res, 'bbox')
+        cocoEval.evaluate()
+        cocoEval.accumulate()
+        cocoEval.summarize()
+
+        aps = cocoEval.eval['precision'][:, :, :, 0, -1]
+        category_ids = self._coco_api.getCatIds()
+        category_names = [
+            cat['name'] for cat in self._coco_api.loadCats(category_ids)
+        ]
+
+        aps_lens = defaultdict(list)
+        counter_lens = defaultdict(int)
+        for i in range(len(category_names)):
+            ap = aps[:, :, i]
+            ap_value = ap[ap > -1].mean()
+            if not np.isnan(ap_value):
+                len_ref = len(category_names[i].split(' '))
+                aps_lens[len_ref].append(ap_value)
+                counter_lens[len_ref] += 1
+
+        # reference lengths start at 1 word, so the buckets are
+        # short: 1-3, mid: 4-6, long: 7-9, very long: >= 10
+        ap_sum_short = sum([sum(aps_lens[i]) for i in range(1, 4)])
+        ap_sum_mid = sum([sum(aps_lens[i]) for i in range(4, 7)])
+        ap_sum_long = sum([sum(aps_lens[i]) for i in range(7, 10)])
+        ap_sum_very_long = sum([
+            sum(aps_lens[i])
+            for i in range(10, max(counter_lens.keys()) + 1)
+        ])
+        c_sum_short = sum([counter_lens[i] for i in range(1, 4)])
+        c_sum_mid = sum([counter_lens[i] for i in range(4, 7)])
+        c_sum_long = sum([counter_lens[i] for i in range(7, 10)])
+        c_sum_very_long = sum(
+            [counter_lens[i] for i in range(10,
+                                            max(counter_lens.keys()) + 1)])
+        map_short = ap_sum_short / c_sum_short
+        map_mid = ap_sum_mid / c_sum_mid
+        map_long = ap_sum_long / c_sum_long
+        map_very_long = ap_sum_very_long / c_sum_very_long
+
+        coco_metric_names = {
+            'mAP': 0,
+            'mAP_50': 1,
+            'mAP_75': 2,
+            'mAP_s': 3,
+            'mAP_m': 4,
+            'mAP_l': 5,
+            'AR@100': 6,
+            'AR@300': 7,
+            'AR@1000': 8,
+            'AR_s@1000': 9,
+            'AR_m@1000': 10,
+            'AR_l@1000': 11
+        }
+        metric_items = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l']
+
+        eval_results = {}
+        for metric_item in metric_items:
+            key = f'{metric_item}'
+            val = cocoEval.stats[coco_metric_names[metric_item]]
+            eval_results[key] = float(f'{round(val, 3)}')
+
+        ap = cocoEval.stats[:6]
+        logger.info(f'mAP_copypaste: {ap[0]:.3f} '
+                    f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} '
+                    f'{ap[4]:.3f} {ap[5]:.3f}')
+
+        logger.info(f'mAP over reference length: short - {map_short:.4f}, '
+                    f'mid - {map_mid:.4f}, long - {map_long:.4f}, '
+                    f'very long - {map_very_long:.4f}')
+        eval_results['mAP_short'] = float(f'{round(map_short, 3)}')
+        eval_results['mAP_mid'] = float(f'{round(map_mid, 3)}')
+        eval_results['mAP_long'] = float(f'{round(map_long, 3)}')
+        eval_results['mAP_very_long'] = float(f'{round(map_very_long, 3)}')
+        return eval_results
diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/dump_det_results.py b/head_extractor/build/lib/mmdet/evaluation/metrics/dump_det_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3071d19a6ad0199458d13dfe6f570f181a5ea7f
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/metrics/dump_det_results.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Sequence
+
+from mmengine.evaluator import DumpResults
+from mmengine.evaluator.metric import _to_cpu
+
+from mmdet.registry import METRICS
+from mmdet.structures.mask import encode_mask_results
+
+
+@METRICS.register_module()
+class DumpDetResults(DumpResults):
+    """Dump model predictions to a pickle file for offline evaluation.
+
+    Different from `DumpResults` in MMEngine, it compresses instance
+    segmentation masks into RLE format.
+
+    Args:
+        out_file_path (str): Path of the dumped file. Must end with '.pkl'
+            or '.pickle'.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+    """
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Transfer tensors in predictions to CPU."""
+        data_samples = _to_cpu(data_samples)
+        for data_sample in data_samples:
+            # remove gt
+            data_sample.pop('gt_instances', None)
+            data_sample.pop('ignored_instances', None)
+            data_sample.pop('gt_panoptic_seg', None)
+
+            if 'pred_instances' in data_sample:
+                pred = data_sample['pred_instances']
+                # encode mask to RLE
+                if 'masks' in pred:
+                    pred['masks'] = encode_mask_results(pred['masks'].numpy())
+            if 'pred_panoptic_seg' in data_sample:
+                warnings.warn(
+                    'Panoptic segmentation map will not be compressed. '
+                    'The dumped file will be extremely large! '
+                    'Suggest using `CocoPanopticMetric` to save the coco '
+                    'format json and segmentation png files directly.')
+        self.results.extend(data_samples)
diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/dump_odvg_results.py b/head_extractor/build/lib/mmdet/evaluation/metrics/dump_odvg_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1446b0538053e14b6b9b21bebc6d91c9564d9b5
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/metrics/dump_odvg_results.py
@@ -0,0 +1,138 @@
+# Copyright (c) OpenMMLab. All rights reserved.
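+"""Dump predictions as ODVG-style JSON Lines for pseudo-labelling.
+
+One JSON object is written per image. A sketch of an object detection
+('od') record, derived from the `process` method below (field values
+are illustrative):
+
+    {"filename": "000001.jpg", "height": 480, "width": 640,
+     "detection": {"instances": [
+         {"bbox": [10.0, 20.0, 50.0, 90.0], "score": 0.92,
+          "label": 0, "category": "person"}]}}
+
+Visual grounding ('vg') records carry a "grounding" key with the
+caption and per-phrase regions instead.
+"""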
+from typing import Any, Optional, Sequence + +from mmcv.ops import batched_nms +from mmengine.evaluator import BaseMetric +from mmengine.logging import print_log + +from mmdet.registry import METRICS + +try: + import jsonlines +except ImportError: + jsonlines = None + + +@METRICS.register_module() +class DumpODVGResults(BaseMetric): + default_prefix: Optional[str] = 'pl_odvg' + + def __init__(self, + outfile_path, + img_prefix: str, + score_thr: float = 0.1, + collect_device: str = 'cpu', + nms_thr: float = 0.5, + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + self.outfile_path = outfile_path + self.score_thr = score_thr + self.img_prefix = img_prefix + self.nms_thr = nms_thr + + if jsonlines is None: + raise ImportError('Please run "pip install jsonlines" to install ' + 'this package.') + + def process(self, data_batch: Any, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + result = {} + + filename = data_sample['img_path'] + filename = filename.replace(self.img_prefix, '') + if filename.startswith('/'): + filename = filename[1:] + result['filename'] = filename + + height = data_sample['ori_shape'][0] + width = data_sample['ori_shape'][1] + result['height'] = height + result['width'] = width + + pred_instances = data_sample['pred_instances'] + + bboxes = pred_instances['bboxes'].cpu() + scores = pred_instances['scores'].cpu() + labels = pred_instances['labels'].cpu() + + bboxes = bboxes[scores > self.score_thr] + labels = labels[scores > self.score_thr] + scores = scores[scores > self.score_thr] + + if 'tokens_positive' in data_sample: + task = 'vg' + else: + task = 'od' + + if task == 'od': + classes_name = data_sample['text'] + result['detection'] = {} + + if len(bboxes) > 0: + det_bboxes, keep = batched_nms( + bboxes, scores, labels, + dict(type='nms', iou_threshold=self.nms_thr)) + _scores = det_bboxes[:, -1] + _bboxes = det_bboxes[:, :-1] + _labels = labels[keep] + + instances = [] + _bboxes = _bboxes.numpy().tolist() + _scores = _scores.numpy().tolist() + _labels = _labels.numpy().tolist() + for bbox, score, label in zip(_bboxes, _scores, _labels): + round_bbox = [round(b, 2) for b in bbox] + round_score = round(score, 2) + instances.append({ + 'bbox': round_bbox, + 'score': round_score, + 'label': label, + 'category': classes_name[label] + }) + result['detection']['instances'] = instances + else: + result['detection']['instances'] = [] + self.results.append(result) + else: + caption = data_sample['text'] + result['grounding'] = {} + result['grounding']['caption'] = caption + + tokens_positive = data_sample['tokens_positive'] + + region_list = [] + for label, positive in enumerate(tokens_positive): + phrase = [caption[pos[0]:pos[1]] for pos in positive] + + _bboxes = bboxes[labels == label] + _scores = scores[labels == label] + det_bboxes, _ = batched_nms( + _bboxes, + _scores, + None, + dict(type='nms', iou_threshold=self.nms_thr), + class_agnostic=True) + _scores = det_bboxes[:, -1].numpy().tolist() + _bboxes = det_bboxes[:, :-1].numpy().tolist() + + round_bboxes = [] + for bbox in _bboxes: + round_bboxes.append([round(b, 2) for b in bbox]) + _scores = [[round(s, 2) for s in _scores]] + region = { + 'phrase': phrase, + 'bbox': round_bboxes, + 'score': _scores, + 'tokens_positive': positive + } + region_list.append(region) + result['grounding']['regions'] = region_list + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + with jsonlines.open(self.outfile_path, 
mode='w') as writer:
+            writer.write_all(results)
+        print_log(
+            f'Results have been saved to {self.outfile_path}.',
+            logger='current')
+        return {}
diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/dump_proposals_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/dump_proposals_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9c53654c15d4b1f7e6555a9a7c53f844cb071f
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/metrics/dump_proposals_metric.py
@@ -0,0 +1,119 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import os.path as osp
+from typing import Optional, Sequence
+
+from mmengine.dist import is_main_process
+from mmengine.evaluator import BaseMetric
+from mmengine.fileio import dump
+from mmengine.logging import MMLogger
+from mmengine.structures import InstanceData
+
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class DumpProposals(BaseMetric):
+    """Dump proposals pseudo metric.
+
+    Args:
+        output_dir (str): The root directory for ``proposals_file``.
+            Defaults to ''.
+        proposals_file (str): Proposals file path. Defaults to
+            'proposals.pkl'.
+        num_max_proposals (int, optional): Maximum number of proposals to
+            dump. If not specified, all proposals will be dumped.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    default_prefix: Optional[str] = 'dump_proposals'
+
+    def __init__(self,
+                 output_dir: str = '',
+                 proposals_file: str = 'proposals.pkl',
+                 num_max_proposals: Optional[int] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.num_max_proposals = num_max_proposals
+        # TODO: update after mmengine finish refactor fileio.
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead; please refer to '
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+        self.output_dir = output_dir
+        assert proposals_file.endswith(('.pkl', '.pickle')), \
+            'The output file must be a pkl file.'
+
+        self.proposals_file = os.path.join(self.output_dir, proposals_file)
+        if is_main_process():
+            os.makedirs(self.output_dir, exist_ok=True)
+
+    def process(self, data_batch: Sequence[dict],
+                data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
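+
+        Example of one stored mapping (illustrative; the key joins the
+        last directory and the file name of ``img_path``):
+            {'val2017/000000000139.jpg': <InstanceData of ranked proposals>}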
+ """ + for data_sample in data_samples: + pred = data_sample['pred_instances'] + # `bboxes` is sorted by `scores` + ranked_scores, rank_inds = pred['scores'].sort(descending=True) + ranked_bboxes = pred['bboxes'][rank_inds, :] + + ranked_bboxes = ranked_bboxes.cpu().numpy() + ranked_scores = ranked_scores.cpu().numpy() + + pred_instance = InstanceData() + pred_instance.bboxes = ranked_bboxes + pred_instance.scores = ranked_scores + if self.num_max_proposals is not None: + pred_instance = pred_instance[:self.num_max_proposals] + + img_path = data_sample['img_path'] + # `file_name` is the key to obtain the proposals from the + # `proposals_list`. + file_name = osp.join( + osp.split(osp.split(img_path)[0])[-1], + osp.split(img_path)[-1]) + result = {file_name: pred_instance} + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + """Dump the processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: An empty dict. + """ + logger: MMLogger = MMLogger.get_current_instance() + dump_results = {} + for result in results: + dump_results.update(result) + dump( + dump_results, + file=self.proposals_file, + backend_args=self.backend_args) + logger.info(f'Results are saved at {self.proposals_file}') + return {} diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/flickr30k_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/flickr30k_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..f8b64bfda46b3e8cc4a1053d10082eff9bc421e8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/flickr30k_metric.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved +from collections import defaultdict +from typing import Dict, List, Optional, Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS +from ..functional import bbox_overlaps + + +class RecallTracker: + """Utility class to track recall@k for various k, split by categories.""" + + def __init__(self, topk: Sequence[int]): + """ + Parameters: + - topk : tuple of ints corresponding to the recalls being + tracked (eg, recall@1, recall@10, ...) + """ + + self.total_byk_bycat: Dict[int, Dict[str, int]] = { + k: defaultdict(int) + for k in topk + } + self.positives_byk_bycat: Dict[int, Dict[str, int]] = { + k: defaultdict(int) + for k in topk + } + + def add_positive(self, k: int, category: str): + """Log a positive hit @k for given category.""" + if k not in self.total_byk_bycat: + raise RuntimeError(f'{k} is not a valid recall threshold') + self.total_byk_bycat[k][category] += 1 + self.positives_byk_bycat[k][category] += 1 + + def add_negative(self, k: int, category: str): + """Log a negative hit @k for given category.""" + if k not in self.total_byk_bycat: + raise RuntimeError(f'{k} is not a valid recall threshold') + self.total_byk_bycat[k][category] += 1 + + def report(self) -> Dict[str, Dict[str, float]]: + """Return a condensed report of the results as a dict of dict. 
+
+        report[k][cat] is the recall@k for the given category
+        """
+        report: Dict[str, Dict[str, float]] = {}
+        for k in self.total_byk_bycat:
+            assert k in self.positives_byk_bycat
+            report[str(k)] = {
+                cat:
+                self.positives_byk_bycat[k][cat] / self.total_byk_bycat[k][cat]
+                for cat in self.total_byk_bycat[k]
+            }
+        return report
+
+
+@METRICS.register_module()
+class Flickr30kMetric(BaseMetric):
+    """Phrase Grounding Metric."""
+
+    def __init__(
+        self,
+        topk: Sequence[int] = (1, 5, 10, -1),
+        iou_thrs: float = 0.5,
+        merge_boxes: bool = False,
+        collect_device: str = 'cpu',
+        prefix: Optional[str] = None,
+    ) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.iou_thrs = iou_thrs
+        self.topk = topk
+        self.merge = merge_boxes
+
+    def merge_boxes(self, boxes: List[List[int]]) -> List[List[int]]:
+        """Return the box corresponding to the smallest enclosing box
+        containing all the provided boxes. The boxes are expected to be in
+        [x1, y1, x2, y2] format."""
+        if len(boxes) == 1:
+            return boxes
+
+        np_boxes = np.asarray(boxes)
+
+        return [[
+            np_boxes[:, 0].min(), np_boxes[:, 1].min(), np_boxes[:, 2].max(),
+            np_boxes[:, 3].max()
+        ]]
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions.
+
+        The processed results should be stored in ``self.results``, which
+        will be used to compute the metrics when all batches have been
+        processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        for data_sample in data_samples:
+            pred = data_sample['pred_instances']
+            gt = data_sample['gt_instances']['bboxes']
+            gt_label = data_sample['phrase_ids']
+            phrases = data_sample['phrases']
+            assert len(gt) == len(gt_label)
+
+            self.results.append((pred, gt, gt_label, phrases))
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results.
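+
+        Example of a returned report with the default ``topk``
+        (values are illustrative):
+            {'1': {'all': 0.83}, '5': {'all': 0.94},
+             '10': {'all': 0.96}, '-1': {'all': 0.97}}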
+ """ + logger: MMLogger = MMLogger.get_current_instance() + + pred_list, gt_list, gt_label_list, phrase_list = zip(*results) + + recall_tracker = RecallTracker(self.topk) + + for pred, gt_boxes, gt_labels, phrases in zip(pred_list, gt_list, + gt_label_list, + phrase_list): + pred_boxes = pred['bboxes'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + for i, phrase in enumerate(phrases): + cur_index = pred_labels == i + cur_boxes = pred_boxes[cur_index] + tar_index = [ + index for index, value in enumerate(gt_labels) + if value == i + ] + tar_boxes = gt_boxes[tar_index] + if self.merge: + tar_boxes = self.merge_boxes(tar_boxes) + if len(cur_boxes) == 0: + cur_boxes = [[0., 0., 0., 0.]] + ious = bbox_overlaps( + np.asarray(cur_boxes), np.asarray(tar_boxes)) + for k in self.topk: + if k == -1: + maxi = ious.max() + else: + assert k > 0 + maxi = ious[:k].max() + if maxi >= self.iou_thrs: + recall_tracker.add_positive(k, 'all') + # TODO: do not support class-wise evaluation yet + # for phrase_type in phrase['phrase_type']: + # recall_tracker.add_positive(k, phrase_type) + else: + recall_tracker.add_negative(k, 'all') + # for phrase_type in phrase['phrase_type']: + # recall_tracker.add_negative(k, phrase_type) + + results = recall_tracker.report() + logger.info(results) + return results diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/grefcoco_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/grefcoco_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..55cc638c5e4de11480a6858d15309017ba59a16a --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/grefcoco_metric.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Sequence + +import numpy as np +import torch +from mmengine.evaluator import BaseMetric +from mmengine.fileio import get_local_path +from mmengine.logging import MMLogger + +from mmdet.datasets.api_wrappers import COCO +from mmdet.registry import METRICS +from ..functional import bbox_overlaps + + +# refer from https://github.com/henghuiding/gRefCOCO/blob/main/mdetr/datasets/refexp.py # noqa +@METRICS.register_module() +class gRefCOCOMetric(BaseMetric): + default_prefix: Optional[str] = 'grefcoco' + + def __init__(self, + ann_file: Optional[str] = None, + metric: str = 'bbox', + iou_thrs: float = 0.5, + thresh_score: float = 0.7, + thresh_f1: float = 1.0, + **kwargs) -> None: + super().__init__(**kwargs) + self.metric = metric + self.iou_thrs = iou_thrs + self.thresh_score = thresh_score + self.thresh_f1 = thresh_f1 + + with get_local_path(ann_file) as local_path: + self.coco = COCO(local_path) + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu() + result['scores'] = pred['scores'].cpu() + self.results.append(result) + + def compute_metrics(self, results: list) -> Dict[str, float]: + logger: MMLogger = MMLogger.get_current_instance() + + correct_image = 0 + num_image = 0 + nt = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0} + + for result in results: + img_id = result['img_id'] + TP = 0 + + ann_ids = self.coco.getAnnIds(imgIds=img_id) + target = self.coco.loadAnns(ann_ids[0]) + + converted_bbox_all = [] + no_target_flag = False + for one_target in target: + if one_target['category_id'] == -1: + no_target_flag = True + target_bbox = one_target['bbox'] + 
converted_bbox = [ + target_bbox[0], + target_bbox[1], + target_bbox[2] + target_bbox[0], + target_bbox[3] + target_bbox[1], + ] + converted_bbox_all.append( + np.array(converted_bbox).reshape(-1, 4)) + gt_bbox_all = np.concatenate(converted_bbox_all, axis=0) + + idx = result['scores'] >= self.thresh_score + filtered_boxes = result['bboxes'][idx] + + iou = bbox_overlaps(filtered_boxes.numpy(), gt_bbox_all) + iou = torch.from_numpy(iou) + + num_prediction = filtered_boxes.shape[0] + num_gt = gt_bbox_all.shape[0] + if no_target_flag: + if num_prediction >= 1: + nt['FN'] += 1 + else: + nt['TP'] += 1 + if num_prediction >= 1: + f_1 = 0. + else: + f_1 = 1.0 + else: + if num_prediction >= 1: + nt['TN'] += 1 + else: + nt['FP'] += 1 + for i in range(min(num_prediction, num_gt)): + top_value, top_index = torch.topk(iou.flatten(0, 1), 1) + if top_value < self.iou_thrs: + break + else: + top_index_x = top_index // num_gt + top_index_y = top_index % num_gt + TP += 1 + iou[top_index_x[0], :] = 0.0 + iou[:, top_index_y[0]] = 0.0 + FP = num_prediction - TP + FN = num_gt - TP + f_1 = 2 * TP / (2 * TP + FP + FN) + + if f_1 >= self.thresh_f1: + correct_image += 1 + num_image += 1 + + score = correct_image / max(num_image, 1) + results = { + 'F1_score': score, + 'T_acc': nt['TN'] / (nt['TN'] + nt['FP']), + 'N_acc': nt['TP'] / (nt['TP'] + nt['FN']) + } + logger.info(results) + return results diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/lvis_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/lvis_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..a861c6ee7b48adb2e428dcdaa97e8dc7ba476a6c --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/lvis_metric.py @@ -0,0 +1,534 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools +import logging +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict, defaultdict +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +import torch +from mmengine.dist import (all_gather_object, broadcast_object_list, + is_main_process) +from mmengine.evaluator import BaseMetric +from mmengine.evaluator.metric import _to_cpu +from mmengine.fileio import get_local_path +from mmengine.logging import MMLogger, print_log +from terminaltables import AsciiTable + +from mmdet.registry import METRICS +from mmdet.structures.mask import encode_mask_results +from ..functional import eval_recalls +from .coco_metric import CocoMetric + +try: + import lvis + + if getattr(lvis, '__version__', '0') >= '10.5.3': + warnings.warn( + 'mmlvis is deprecated, please install official lvis-api by "pip install git+https://github.com/lvis-dataset/lvis-api.git"', # noqa: E501 + UserWarning) + from lvis import LVIS, LVISEval, LVISResults +except ImportError: + lvis = None + LVISEval = None + LVISResults = None + + +@METRICS.register_module() +class LVISMetric(CocoMetric): + """LVIS evaluation metric. + + Args: + ann_file (str, optional): Path to the coco format annotation file. + If not specified, ground truth annotations from the dataset will + be converted to coco format. Defaults to None. + metric (str | List[str]): Metrics to be evaluated. Valid metrics + include 'bbox', 'segm', 'proposal', and 'proposal_fast'. + Defaults to 'bbox'. + classwise (bool): Whether to evaluate the metric class-wise. + Defaults to False. + proposal_nums (Sequence[int]): Numbers of proposals to be evaluated. + Defaults to (100, 300, 1000). 
+        iou_thrs (float | List[float], optional): IoU threshold to compute AP
+            and AR. If not specified, IoUs from 0.5 to 0.95 will be used.
+            Defaults to None.
+        metric_items (List[str], optional): Metric result names to be
+            recorded in the evaluation result. Defaults to None.
+        format_only (bool): Format the output results without performing
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        outfile_prefix (str, optional): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+        file_client_args (dict, optional): Arguments to instantiate the
+            corresponding backend in mmdet <= 3.0.0rc6. Defaults to None.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+    """
+
+    default_prefix: Optional[str] = 'lvis'
+
+    def __init__(self,
+                 ann_file: Optional[str] = None,
+                 metric: Union[str, List[str]] = 'bbox',
+                 classwise: bool = False,
+                 proposal_nums: Sequence[int] = (100, 300, 1000),
+                 iou_thrs: Optional[Union[float, Sequence[float]]] = None,
+                 metric_items: Optional[Sequence[str]] = None,
+                 format_only: bool = False,
+                 outfile_prefix: Optional[str] = None,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 file_client_args: dict = None,
+                 backend_args: dict = None) -> None:
+        if lvis is None:
+            raise RuntimeError(
+                'Package lvis is not installed. Please run "pip install '
+                'git+https://github.com/lvis-dataset/lvis-api.git".')
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        # coco evaluation metrics
+        self.metrics = metric if isinstance(metric, list) else [metric]
+        allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
+        for metric in self.metrics:
+            if metric not in allowed_metrics:
+                raise KeyError(
+                    "metric should be one of 'bbox', 'segm', 'proposal', "
+                    f"'proposal_fast', but got {metric}.")
+
+        # do class wise evaluation, default False
+        self.classwise = classwise
+
+        # proposal_nums used to compute recall or precision.
+        self.proposal_nums = list(proposal_nums)
+
+        # iou_thrs used to compute recall or precision.
+        if iou_thrs is None:
+            iou_thrs = np.linspace(
+                .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+        self.iou_thrs = iou_thrs
+        self.metric_items = metric_items
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, \
+                'outfile_prefix must not be None when format_only is ' \
+                'True, otherwise the result files will be saved to a ' \
+                'temp directory which will be cleaned up at the end.'
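+            # e.g. with outfile_prefix='a/b/prefix' the converted
+            # predictions are typically kept as 'a/b/prefix.bbox.json'
+            # (written by `results2json` inherited from CocoMetric)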
+
+        self.outfile_prefix = outfile_prefix
+        self.backend_args = backend_args
+        if file_client_args is not None:
+            raise RuntimeError(
+                'The `file_client_args` is deprecated, '
+                'please use `backend_args` instead; please refer to '
+                'https://github.com/open-mmlab/mmdetection/blob/main/configs/_base_/datasets/coco_detection.py'  # noqa: E501
+            )
+
+        # if ann_file is not specified,
+        # initialize lvis api with the converted dataset
+        if ann_file is not None:
+            with get_local_path(
+                    ann_file, backend_args=self.backend_args) as local_path:
+                self._lvis_api = LVIS(local_path)
+        else:
+            self._lvis_api = None
+
+        # handle dataset lazy init
+        self.cat_ids = None
+        self.img_ids = None
+
+    def fast_eval_recall(self,
+                         results: List[dict],
+                         proposal_nums: Sequence[int],
+                         iou_thrs: Sequence[float],
+                         logger: Optional[MMLogger] = None) -> np.ndarray:
+        """Evaluate proposal recall with LVIS's fast_eval_recall.
+
+        Args:
+            results (List[dict]): Results of the dataset.
+            proposal_nums (Sequence[int]): Proposal numbers used for
+                evaluation.
+            iou_thrs (Sequence[float]): IoU thresholds used for evaluation.
+            logger (MMLogger, optional): Logger used for logging the recall
+                summary.
+        Returns:
+            np.ndarray: Averaged recall results.
+        """
+        gt_bboxes = []
+        pred_bboxes = [result['bboxes'] for result in results]
+        for i in range(len(self.img_ids)):
+            ann_ids = self._lvis_api.get_ann_ids(img_ids=[self.img_ids[i]])
+            ann_info = self._lvis_api.load_anns(ann_ids)
+            if len(ann_info) == 0:
+                gt_bboxes.append(np.zeros((0, 4)))
+                continue
+            bboxes = []
+            for ann in ann_info:
+                x1, y1, w, h = ann['bbox']
+                bboxes.append([x1, y1, x1 + w, y1 + h])
+            bboxes = np.array(bboxes, dtype=np.float32)
+            if bboxes.shape[0] == 0:
+                bboxes = np.zeros((0, 4))
+            gt_bboxes.append(bboxes)
+
+        recalls = eval_recalls(
+            gt_bboxes, pred_bboxes, proposal_nums, iou_thrs, logger=logger)
+        ar = recalls.mean(axis=1)
+        return ar
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    # parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+ """ + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + # encode mask to RLE + if 'masks' in pred: + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) + # some detectors use different scores for bbox and mask + if 'mask_scores' in pred: + result['mask_scores'] = pred['mask_scores'].cpu().numpy() + + # parse gt + gt = dict() + gt['width'] = data_sample['ori_shape'][1] + gt['height'] = data_sample['ori_shape'][0] + gt['img_id'] = data_sample['img_id'] + if self._lvis_api is None: + # TODO: Need to refactor to support LoadAnnotations + assert 'instances' in data_sample, \ + 'ground truth is required for evaluation when ' \ + '`ann_file` is not provided' + gt['anns'] = data_sample['instances'] + # add converted result to the results list + self.results.append((gt, result)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + # split gt and prediction list + gts, preds = zip(*results) + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + + if self._lvis_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=outfile_prefix) + self._lvis_api = LVIS(coco_json_path) + + # handle lazy init + if self.cat_ids is None: + self.cat_ids = self._lvis_api.get_cat_ids() + if self.img_ids is None: + self.img_ids = self._lvis_api.get_img_ids() + + # convert predictions to coco format and dump to json file + result_files = self.results2json(preds, outfile_prefix) + + eval_results = OrderedDict() + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(outfile_prefix)}') + return eval_results + + lvis_gt = self._lvis_api + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + + # TODO: May refactor fast_eval_recall to an independent metric? 
+ # fast eval recall + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + preds, self.proposal_nums, self.iou_thrs, logger=logger) + log_msg = [] + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + logger.info(log_msg) + continue + + try: + lvis_dt = LVISResults(lvis_gt, result_files[metric]) + except IndexError: + logger.info( + 'The testing results of the whole dataset is empty.') + break + + iou_type = 'bbox' if metric == 'proposal' else metric + lvis_eval = LVISEval(lvis_gt, lvis_dt, iou_type) + lvis_eval.params.imgIds = self.img_ids + metric_items = self.metric_items + if metric == 'proposal': + lvis_eval.params.useCats = 0 + lvis_eval.params.maxDets = list(self.proposal_nums) + lvis_eval.evaluate() + lvis_eval.accumulate() + lvis_eval.summarize() + if metric_items is None: + metric_items = ['AR@300', 'ARs@300', 'ARm@300', 'ARl@300'] + for k, v in lvis_eval.get_results().items(): + if k in metric_items: + val = float('{:.3f}'.format(float(v))) + eval_results[k] = val + + else: + lvis_eval.evaluate() + lvis_eval.accumulate() + lvis_eval.summarize() + lvis_results = lvis_eval.get_results() + if self.classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = lvis_eval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, catId in enumerate(self.cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + # the dimensions of precisions are + # [num_thrs, num_recalls, num_cats, num_area_rngs] + nm = self._lvis_api.load_cats([catId])[0] + precision = precisions[:, :, idx, 0] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (f'{nm["name"]}', f'{float(ap):0.3f}')) + eval_results[f'{nm["name"]}_precision'] = round(ap, 3) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + logger.info('\n' + table.table) + + if metric_items is None: + metric_items = [ + 'AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'APr', + 'APc', 'APf' + ] + + for k, v in lvis_results.items(): + if k in metric_items: + key = '{}_{}'.format(metric, k) + val = float('{:.3f}'.format(float(v))) + eval_results[key] = val + + lvis_eval.print_results() + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results + + +def _merge_lists(listA, listB, maxN, key): + result = [] + indA, indB = 0, 0 + while (indA < len(listA) or indB < len(listB)) and len(result) < maxN: + if (indB < len(listB)) and (indA >= len(listA) + or key(listA[indA]) < key(listB[indB])): + result.append(listB[indB]) + indB += 1 + else: + result.append(listA[indA]) + indA += 1 + return result + + +@METRICS.register_module() +class LVISFixedAPMetric(BaseMetric): + default_prefix: Optional[str] = 'lvis_fixed_ap' + + def __init__(self, + ann_file: str, + topk: int = 10000, + format_only: bool = False, + outfile_prefix: Optional[str] = None, + collect_device: str = 'cpu', + prefix: Optional[str] 
= None,
+                 backend_args: dict = None) -> None:
+
+        if lvis is None:
+            raise RuntimeError(
+                'Package lvis is not installed. Please run "pip install '
+                'git+https://github.com/lvis-dataset/lvis-api.git".')
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        self.format_only = format_only
+        if self.format_only:
+            assert outfile_prefix is not None, \
+                'outfile_prefix must not be None when format_only is ' \
+                'True, otherwise the result files will be saved to a ' \
+                'temp directory which will be cleaned up at the end.'
+
+        self.outfile_prefix = outfile_prefix
+        self.backend_args = backend_args
+
+        with get_local_path(
+                ann_file, backend_args=self.backend_args) as local_path:
+            self._lvis_api = LVIS(local_path)
+
+        self.cat_ids = self._lvis_api.get_cat_ids()
+
+        self.results = {}
+        self.topk = topk
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
+        """
+        cur_results = []
+        for data_sample in data_samples:
+            pred = data_sample['pred_instances']
+            xmin, ymin, xmax, ymax = pred['bboxes'].cpu().unbind(1)
+            boxes = torch.stack((xmin, ymin, xmax - xmin, ymax - ymin),
+                                dim=1).tolist()
+
+            scores = pred['scores'].cpu().numpy()
+            labels = pred['labels'].cpu().numpy()
+
+            if len(boxes) == 0:
+                continue
+
+            cur_results.extend([{
+                'image_id': data_sample['img_id'],
+                'category_id': self.cat_ids[labels[k]],
+                'bbox': box,
+                'score': scores[k],
+            } for k, box in enumerate(boxes)])
+
+        by_cat = defaultdict(list)
+        for ann in cur_results:
+            by_cat[ann['category_id']].append(ann)
+
+        for cat, cat_anns in by_cat.items():
+            if cat not in self.results:
+                self.results[cat] = []
+
+            cur = sorted(
+                cat_anns, key=lambda x: x['score'], reverse=True)[:self.topk]
+            self.results[cat] = _merge_lists(
+                self.results[cat], cur, self.topk, key=lambda x: x['score'])
+
+    def compute_metrics(self, results: dict) -> dict:
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        new_results = []
+
+        missing_dets_cats = set()
+        for cat, cat_anns in results.items():
+            if len(cat_anns) < self.topk:
+                missing_dets_cats.add(cat)
+            new_results.extend(
+                sorted(cat_anns, key=lambda x: x['score'],
+                       reverse=True)[:self.topk])
+
+        if missing_dets_cats:
+            logger.info(
+                f'\n===\n'
+                f'{len(missing_dets_cats)} classes had fewer than {self.topk} '
+                f'detections!\n Outputting {self.topk} detections for each '
+                f'class will improve AP further.\n ===')
+
+        new_results = LVISResults(self._lvis_api, new_results, max_dets=-1)
+        lvis_eval = LVISEval(self._lvis_api, new_results, iou_type='bbox')
+        params = lvis_eval.params
+        params.max_dets = -1  # No limit on detections per image.
+        lvis_eval.run()
+        lvis_eval.print_results()
+        metrics = {
+            k: v
+            for k, v in lvis_eval.results.items() if k.startswith('AP')
+        }
+        logger.info(f'mAP_copypaste: {metrics}')
+        return metrics
+
+    def evaluate(self, size: int) -> dict:
+        if len(self.results) == 0:
+            print_log(
+                f'{self.__class__.__name__} got empty `self.results`.
Please ' + 'ensure that the processed results are properly added into ' + '`self.results` in `process` method.', + logger='current', + level=logging.WARNING) + + all_cats = all_gather_object(self.results) + results = defaultdict(list) + for cats in all_cats: + for cat, cat_anns in cats.items(): + results[cat].extend(cat_anns) + + if is_main_process(): + # cast all tensors in results list to cpu + results = _to_cpu(results) + _metrics = self.compute_metrics(results) # type: ignore + # Add prefix to metric names + if self.prefix: + _metrics = { + '/'.join((self.prefix, k)): v + for k, v in _metrics.items() + } + metrics = [_metrics] + else: + metrics = [None] # type: ignore + + broadcast_object_list(metrics) + + # reset the results + self.results = {} + return metrics[0] diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/mot_challenge_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/mot_challenge_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..a5513c44e81de7dd869d4c5c802bfac0387bdbf6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/mot_challenge_metric.py @@ -0,0 +1,443 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +import tempfile +from collections import defaultdict +from typing import List, Optional, Union + +import numpy as np +import torch + +try: + import trackeval +except ImportError: + trackeval = None +from mmengine.dist import (all_gather_object, barrier, broadcast, + broadcast_object_list, get_dist_info, + is_main_process) +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS, TASK_UTILS +from .base_video_metric import BaseVideoMetric + + +def get_tmpdir() -> str: + """return the same tmpdir for all processes.""" + rank, world_size = get_dist_info() + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8) + if rank == 0: + tmpdir = tempfile.mkdtemp() + tmpdir = torch.tensor(bytearray(tmpdir.encode()), dtype=torch.uint8) + dir_tensor[:len(tmpdir)] = tmpdir + broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + return tmpdir + + +@METRICS.register_module() +class MOTChallengeMetric(BaseVideoMetric): + """Evaluation metrics for MOT Challenge. + + Args: + metric (str | list[str]): Metrics to be evaluated. Options are + 'HOTA', 'CLEAR', 'Identity'. + Defaults to ['HOTA', 'CLEAR', 'Identity']. + outfile_prefix (str, optional): Path to save the formatted results. + Defaults to None. + track_iou_thr (float): IoU threshold for tracking evaluation. + Defaults to 0.5. + benchmark (str): Benchmark to be evaluated. Defaults to 'MOT17'. + format_only (bool): If True, only formatting the results to the + official format and not performing evaluation. Defaults to False. + postprocess_tracklet_cfg (List[dict], optional): configs for tracklets + postprocessing methods. `InterpolateTracklets` is supported. + Defaults to [] + - InterpolateTracklets: + - min_num_frames (int, optional): The minimum length of a + track that will be interpolated. Defaults to 5. + - max_num_frames (int, optional): The maximum disconnected + length in a track. Defaults to 20. + - use_gsi (bool, optional): Whether to use the GSI (Gaussian- + smoothed interpolation) method. Defaults to False. + - smooth_tau (int, optional): smoothing parameter in GSI. + Defaults to 10. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. 
Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Default: None + Returns: + """ + TRACKER = 'default-tracker' + allowed_metrics = ['HOTA', 'CLEAR', 'Identity'] + allowed_benchmarks = ['MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack'] + default_prefix: Optional[str] = 'motchallenge-metric' + + def __init__(self, + metric: Union[str, List[str]] = ['HOTA', 'CLEAR', 'Identity'], + outfile_prefix: Optional[str] = None, + track_iou_thr: float = 0.5, + benchmark: str = 'MOT17', + format_only: bool = False, + use_postprocess: bool = False, + postprocess_tracklet_cfg: Optional[List[dict]] = [], + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + if trackeval is None: + raise RuntimeError( + 'trackeval is not installed,' + 'please install it by: pip install' + 'git+https://github.com/JonathonLuiten/TrackEval.git' + 'trackeval need low version numpy, please install it' + 'by: pip install -U numpy==1.23.5') + if isinstance(metric, list): + metrics = metric + elif isinstance(metric, str): + metrics = [metric] + else: + raise TypeError('metric must be a list or a str.') + for metric in metrics: + if metric not in self.allowed_metrics: + raise KeyError(f'metric {metric} is not supported.') + self.metrics = metrics + self.format_only = format_only + if self.format_only: + assert outfile_prefix is not None, 'outfile_prefix must be not' + 'None when format_only is True, otherwise the result files will' + 'be saved to a temp directory which will be cleaned up at the end.' + self.use_postprocess = use_postprocess + self.postprocess_tracklet_cfg = postprocess_tracklet_cfg.copy() + self.postprocess_tracklet_methods = [ + TASK_UTILS.build(cfg) for cfg in self.postprocess_tracklet_cfg + ] + assert benchmark in self.allowed_benchmarks + self.benchmark = benchmark + self.track_iou_thr = track_iou_thr + self.tmp_dir = tempfile.TemporaryDirectory() + self.tmp_dir.name = get_tmpdir() + self.seq_info = defaultdict( + lambda: dict(seq_length=-1, gt_tracks=[], pred_tracks=[])) + self.gt_dir = self._get_gt_dir() + self.pred_dir = self._get_pred_dir(outfile_prefix) + self.seqmap = osp.join(self.pred_dir, 'videoseq.txt') + with open(self.seqmap, 'w') as f: + f.write('name\n') + + def __del__(self): + # To avoid tmpdir being cleaned up too early, because in multiple + # consecutive ValLoops, the value of `self.tmp_dir.name` is unchanged, + # and calling `tmp_dir.cleanup()` in compute_metrics will cause errors. 
+        self.tmp_dir.cleanup()
+
+    def _get_pred_dir(self, outfile_prefix):
+        """Get directory to save the prediction results."""
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        if outfile_prefix is None:
+            outfile_prefix = self.tmp_dir.name
+        else:
+            if osp.exists(outfile_prefix) and is_main_process():
+                logger.info('remove previous results.')
+                shutil.rmtree(outfile_prefix)
+        pred_dir = osp.join(outfile_prefix, self.TRACKER)
+        os.makedirs(pred_dir, exist_ok=True)
+        return pred_dir
+
+    def _get_gt_dir(self):
+        """Get directory to save the gt files."""
+        output_dir = osp.join(self.tmp_dir.name, 'gt')
+        os.makedirs(output_dir, exist_ok=True)
+        return output_dir
+
+    def transform_gt_and_pred(self, img_data_sample, video, frame_id):
+        """Convert one frame's annotations and predictions to MOT rows."""
+        # load gts
+        if 'instances' in img_data_sample:
+            gt_instances = img_data_sample['instances']
+            gt_tracks = [
+                np.array([
+                    frame_id + 1, gt_instances[i]['instance_id'],
+                    gt_instances[i]['bbox'][0], gt_instances[i]['bbox'][1],
+                    gt_instances[i]['bbox'][2] - gt_instances[i]['bbox'][0],
+                    gt_instances[i]['bbox'][3] - gt_instances[i]['bbox'][1],
+                    gt_instances[i]['mot_conf'],
+                    gt_instances[i]['category_id'],
+                    gt_instances[i]['visibility']
+                ]) for i in range(len(gt_instances))
+            ]
+            self.seq_info[video]['gt_tracks'].extend(gt_tracks)
+
+        # load predictions
+        assert 'pred_track_instances' in img_data_sample
+        if self.use_postprocess:
+            pred_instances = img_data_sample['pred_track_instances']
+            pred_tracks = [
+                pred_instances['bboxes'][i]
+                for i in range(len(pred_instances['bboxes']))
+            ]
+        else:
+            pred_instances = img_data_sample['pred_track_instances']
+            pred_tracks = [
+                np.array([
+                    frame_id + 1, pred_instances['instances_id'][i].cpu(),
+                    pred_instances['bboxes'][i][0].cpu(),
+                    pred_instances['bboxes'][i][1].cpu(),
+                    (pred_instances['bboxes'][i][2] -
+                     pred_instances['bboxes'][i][0]).cpu(),
+                    (pred_instances['bboxes'][i][3] -
+                     pred_instances['bboxes'][i][1]).cpu(),
+                    pred_instances['scores'][i].cpu()
+                ]) for i in range(len(pred_instances['instances_id']))
+            ]
+        self.seq_info[video]['pred_tracks'].extend(pred_tracks)
+
+    def process_image(self, data_samples, video_len):
+        """Process the data samples of one frame."""
+        img_data_sample = data_samples[0].to_dict()
+        video = img_data_sample['img_path'].split(os.sep)[-3]
+        frame_id = img_data_sample['frame_id']
+        if self.seq_info[video]['seq_length'] == -1:
+            self.seq_info[video]['seq_length'] = video_len
+        self.transform_gt_and_pred(img_data_sample, video, frame_id)
+
+        if frame_id == video_len - 1:
+            # postprocessing
+            if self.postprocess_tracklet_cfg:
+                info = self.seq_info[video]
+                pred_tracks = np.array(info['pred_tracks'])
+                for postprocess_tracklet_methods in \
+                        self.postprocess_tracklet_methods:
+                    pred_tracks = postprocess_tracklet_methods\
+                        .forward(pred_tracks)
+                info['pred_tracks'] = pred_tracks
+            self._save_one_video_gts_preds(video)
+
+    def process_video(self, data_samples):
+        """Process the data samples of one whole video."""
+        video_len = len(data_samples)
+        for frame_id in range(video_len):
+            img_data_sample = data_samples[frame_id].to_dict()
+            # load basic info
+            video = img_data_sample['img_path'].split(os.sep)[-3]
+            if self.seq_info[video]['seq_length'] == -1:
+                self.seq_info[video]['seq_length'] = video_len
+            self.transform_gt_and_pred(img_data_sample, video, frame_id)
+
+        if self.postprocess_tracklet_cfg:
+            info = self.seq_info[video]
+            pred_tracks = np.array(info['pred_tracks'])
+            for postprocess_tracklet_methods in \
+                    self.postprocess_tracklet_methods:
+                pred_tracks = postprocess_tracklet_methods \
+                    .forward(pred_tracks)
+            info['pred_tracks'] = pred_tracks
+        self._save_one_video_gts_preds(video)
+
+    def _save_one_video_gts_preds(self, seq: str) -> None:
+        """Save the gt and prediction results."""
+        info = self.seq_info[seq]
+        # save predictions
+        pred_file = osp.join(self.pred_dir, seq + '.txt')
+
+        pred_tracks = np.array(info['pred_tracks'])
+
+        with open(pred_file, 'wt') as f:
+            for tracks in pred_tracks:
+                line = '%d,%d,%.3f,%.3f,%.3f,%.3f,%.3f,-1,-1,-1\n' % (
+                    tracks[0], tracks[1], tracks[2], tracks[3], tracks[4],
+                    tracks[5], tracks[6])
+                f.write(line)
+
+        info['pred_tracks'] = []
+        # save gts
+        if info['gt_tracks']:
+            gt_file = osp.join(self.gt_dir, seq + '.txt')
+            with open(gt_file, 'wt') as f:
+                for tracks in info['gt_tracks']:
+                    line = '%d,%d,%d,%d,%d,%d,%d,%d,%.5f\n' % (
+                        tracks[0], tracks[1], tracks[2], tracks[3], tracks[4],
+                        tracks[5], tracks[6], tracks[7], tracks[8])
+                    f.write(line)
+            info['gt_tracks'].clear()
+        # save seq info
+        with open(self.seqmap, 'a') as f:
+            f.write(seq + '\n')
+
+    def compute_metrics(self, results: list = None) -> dict:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+                Defaults to None.
+
+        Returns:
+            dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+
+        # NOTICE: don't access `self.results` from the method.
+        eval_results = dict()
+
+        if self.format_only:
+            return eval_results
+
+        eval_config = trackeval.Evaluator.get_default_eval_config()
+
+        # TrackEval expects TRACKERS_FOLDER/tracker_name, so the tracker
+        # name has to be split off the prediction directory.
+        pred_dir_tmp = self.pred_dir.rsplit(osp.sep, 1)[0]
+        dataset_config = self.get_dataset_cfg(self.gt_dir, pred_dir_tmp)
+
+        evaluator = trackeval.Evaluator(eval_config)
+        dataset = [trackeval.datasets.MotChallenge2DBox(dataset_config)]
+        metrics = [
+            getattr(trackeval.metrics,
+                    metric)(dict(METRICS=[metric], THRESHOLD=0.5))
+            for metric in self.metrics
+        ]
+        output_res, _ = evaluator.evaluate(dataset, metrics)
+        output_res = output_res['MotChallenge2DBox'][
+            self.TRACKER]['COMBINED_SEQ']['pedestrian']
+
+        if 'HOTA' in self.metrics:
+            logger.info('Evaluating HOTA Metrics...')
+            eval_results['HOTA'] = np.average(output_res['HOTA']['HOTA'])
+            eval_results['AssA'] = np.average(output_res['HOTA']['AssA'])
+            eval_results['DetA'] = np.average(output_res['HOTA']['DetA'])
+
+        if 'CLEAR' in self.metrics:
+            logger.info('Evaluating CLEAR Metrics...')
+            eval_results['MOTA'] = np.average(output_res['CLEAR']['MOTA'])
+            eval_results['MOTP'] = np.average(output_res['CLEAR']['MOTP'])
+            eval_results['IDSW'] = np.average(output_res['CLEAR']['IDSW'])
+            eval_results['TP'] = np.average(output_res['CLEAR']['CLR_TP'])
+            eval_results['FP'] = np.average(output_res['CLEAR']['CLR_FP'])
+            eval_results['FN'] = np.average(output_res['CLEAR']['CLR_FN'])
+            eval_results['Frag'] = np.average(output_res['CLEAR']['Frag'])
+            eval_results['MT'] = np.average(output_res['CLEAR']['MT'])
+            eval_results['ML'] = np.average(output_res['CLEAR']['ML'])
+
+        if 'Identity' in self.metrics:
+            logger.info('Evaluating Identity Metrics...')
+            eval_results['IDF1'] = np.average(output_res['Identity']['IDF1'])
+            eval_results['IDTP'] = np.average(output_res['Identity']['IDTP'])
+            eval_results['IDFN'] = np.average(output_res['Identity']['IDFN'])
+            eval_results['IDFP'] = np.average(output_res['Identity']['IDFP'])
+            eval_results['IDP'] = np.average(output_res['Identity']['IDP'])
+            eval_results['IDR'] = np.average(output_res['Identity']['IDR'])
+
+        return eval_results
+
+    def evaluate(self, size: int = 1) -> dict:
+        """Evaluate the model performance of the whole dataset after
+        processing all batches.
+
+        Args:
+            size (int): Length of the entire validation dataset.
+                Defaults to 1.
+
+        Returns:
+            dict: Evaluation metrics dict on the val dataset. The keys are the
+            names of the metrics, and the values are corresponding results.
+        """
+        # wait for all processes to complete prediction.
+        barrier()
+
+        # gather seq_info and convert the list of dict to a dict.
+        # convert self.seq_info to dict first to make it picklable.
+        gathered_seq_info = all_gather_object(dict(self.seq_info))
+        all_seq_info = dict()
+        for _seq_info in gathered_seq_info:
+            all_seq_info.update(_seq_info)
+        self.seq_info = all_seq_info
+
+        if is_main_process():
+            _metrics = self.compute_metrics()  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results list
+        self.results.clear()
+        return metrics[0]
+
+    def get_dataset_cfg(self, gt_folder: str, tracker_folder: str):
+        """Get default configs for trackeval.datasets.MotChallenge2DBox.
+
+        Args:
+            gt_folder (str): the name of the GT folder
+            tracker_folder (str): the name of the tracker folder
+
+        Returns:
+            Dataset Configs for MotChallenge2DBox.
+        """
+        dataset_config = dict(
+            # Location of GT data
+            GT_FOLDER=gt_folder,
+            # Trackers location
+            TRACKERS_FOLDER=tracker_folder,
+            # Where to save eval results
+            # (if None, same as TRACKERS_FOLDER)
+            OUTPUT_FOLDER=None,
+            # Use self.TRACKER as the default tracker
+            TRACKERS_TO_EVAL=[self.TRACKER],
+            # Option values: ['pedestrian']
+            CLASSES_TO_EVAL=['pedestrian'],
+            # Option Values: 'MOT15', 'MOT16', 'MOT17', 'MOT20', 'DanceTrack'
+            BENCHMARK=self.benchmark,
+            # Option Values: 'train', 'test'
+            SPLIT_TO_EVAL='val' if self.benchmark == 'DanceTrack' else 'train',
+            # Whether tracker input files are zipped
+            INPUT_AS_ZIP=False,
+            # Whether to print current config
+            PRINT_CONFIG=True,
+            # Whether to perform preprocessing
+            # (never done for MOT15)
+            DO_PREPROC=False if self.benchmark == 'MOT15' else True,
+            # Tracker files are in
+            # TRACKER_FOLDER/tracker_name/TRACKER_SUB_FOLDER
+            TRACKER_SUB_FOLDER='',
+            # Output files are saved in
+            # OUTPUT_FOLDER/tracker_name/OUTPUT_SUB_FOLDER
+            OUTPUT_SUB_FOLDER='',
+            # Names of trackers to display
+            # (if None: TRACKERS_TO_EVAL)
+            TRACKER_DISPLAY_NAMES=None,
+            # Where seqmaps are found
+            # (if None: GT_FOLDER/seqmaps)
+            SEQMAP_FOLDER=None,
+            # Directly specify seqmap file
+            # (if none use seqmap_folder/benchmark-split_to_eval)
+            SEQMAP_FILE=self.seqmap,
+            # If not None, specify sequences to eval
+            # and their number of timesteps
+            SEQ_INFO={
+                seq: info['seq_length']
+                for seq, info in self.seq_info.items()
+            },
+            # '{gt_folder}/{seq}.txt'
+            GT_LOC_FORMAT='{gt_folder}/{seq}.txt',
+            # If False, data is in GT_FOLDER/BENCHMARK-SPLIT_TO_EVAL/ and in
+            # TRACKERS_FOLDER/BENCHMARK-SPLIT_TO_EVAL/tracker/
+            # If True, the middle 'benchmark-split' folder is skipped for both.
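+            # With the settings above, the expected on-disk layout is simply
+            # GT_FOLDER/{seq}.txt and TRACKERS_FOLDER/default-tracker/{seq}.txt,
+            # which matches what _save_one_video_gts_preds writes out.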
+            SKIP_SPLIT_FOL=True,
+        )
+
+        return dataset_config
diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/openimages_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/openimages_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..d75c59e0e711c90bb1e5fbcc1529e95864e99e9a
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/metrics/openimages_metric.py
@@ -0,0 +1,237 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from collections import OrderedDict
+from typing import List, Optional, Sequence, Union
+
+import numpy as np
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger, print_log
+
+from mmdet.registry import METRICS
+from ..functional import eval_map
+
+
+@METRICS.register_module()
+class OpenImagesMetric(BaseMetric):
+    """OpenImages evaluation metric.
+
+    Evaluate detection mAP for OpenImages. Please refer to
+    https://storage.googleapis.com/openimages/web/evaluation.html for more
+    details.
+
+    Args:
+        iou_thrs (float or List[float]): IoU threshold. Defaults to 0.5.
+        ioa_thrs (float or List[float]): IoA threshold. Defaults to 0.5.
+        scale_ranges (List[tuple], optional): Scale ranges for evaluating
+            mAP. If not specified, all bounding boxes would be included in
+            evaluation. Defaults to None.
+        use_group_of (bool): Whether to consider group-of ground truth
+            bboxes during evaluation. Defaults to True.
+        get_supercategory (bool): Whether to get the parent class of the
+            current class. Default: True.
+        filter_labels (bool): Whether to filter out unannotated classes.
+            Default: True.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+    default_prefix: Optional[str] = 'openimages'
+
+    def __init__(self,
+                 iou_thrs: Union[float, List[float]] = 0.5,
+                 ioa_thrs: Union[float, List[float]] = 0.5,
+                 scale_ranges: Optional[List[tuple]] = None,
+                 use_group_of: bool = True,
+                 get_supercategory: bool = True,
+                 filter_labels: bool = True,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.iou_thrs = [iou_thrs] if isinstance(iou_thrs, float) else iou_thrs
+        self.ioa_thrs = [ioa_thrs] if (isinstance(ioa_thrs, float)
+                                       or ioa_thrs is None) else ioa_thrs
+        assert isinstance(self.iou_thrs, list) and isinstance(
+            self.ioa_thrs, list)
+        assert len(self.iou_thrs) == len(self.ioa_thrs)
+
+        self.scale_ranges = scale_ranges
+        self.use_group_of = use_group_of
+        self.get_supercategory = get_supercategory
+        self.filter_labels = filter_labels
+
+    def _get_supercategory_ann(self, instances: List[dict]) -> List[dict]:
+        """Get the parent classes' annotations of the corresponding class.
+
+        Args:
+            instances (List[dict]): A list of annotations of the instances.
+
+        Returns:
+            List[dict]: Annotations extended with super-category.
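+
+        Example:
+            With a hypothetical relation matrix in which class ``1`` is an
+            ancestor of class ``0``, an instance with ``bbox_label == 0``
+            yields one extra deep-copied instance whose ``bbox_label`` is
+            set to ``1``.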
+ """ + supercat_instances = [] + relation_matrix = self.dataset_meta['RELATION_MATRIX'] + for instance in instances: + labels = np.where(relation_matrix[instance['bbox_label']])[0] + for label in labels: + if label == instance['bbox_label']: + continue + new_instance = copy.deepcopy(instance) + new_instance['bbox_label'] = label + supercat_instances.append(new_instance) + return supercat_instances + + def _process_predictions(self, pred_bboxes: np.ndarray, + pred_scores: np.ndarray, pred_labels: np.ndarray, + gt_instances: list, + image_level_labels: np.ndarray) -> tuple: + """Process results of the corresponding class of the detection bboxes. + + Note: It will choose to do the following two processing according to + the parameters: + + 1. Whether to add parent classes of the corresponding class of the + detection bboxes. + + 2. Whether to ignore the classes that unannotated on that image. + + Args: + pred_bboxes (np.ndarray): bboxes predicted by the model + pred_scores (np.ndarray): scores predicted by the model + pred_labels (np.ndarray): labels predicted by the model + gt_instances (list): ground truth annotations + image_level_labels (np.ndarray): human-verified image level labels + + Returns: + tuple: Processed bboxes, scores, and labels. + """ + processed_bboxes = copy.deepcopy(pred_bboxes) + processed_scores = copy.deepcopy(pred_scores) + processed_labels = copy.deepcopy(pred_labels) + gt_labels = np.array([ins['bbox_label'] for ins in gt_instances], + dtype=np.int64) + if image_level_labels is not None: + allowed_classes = np.unique( + np.append(gt_labels, image_level_labels)) + else: + allowed_classes = np.unique(gt_labels) + relation_matrix = self.dataset_meta['RELATION_MATRIX'] + pred_classes = np.unique(pred_labels) + for pred_class in pred_classes: + classes = np.where(relation_matrix[pred_class])[0] + for cls in classes: + if (cls in allowed_classes and cls != pred_class + and self.get_supercategory): + # add super-supercategory preds + index = np.where(pred_labels == pred_class)[0] + processed_scores = np.concatenate( + [processed_scores, pred_scores[index]]) + processed_bboxes = np.concatenate( + [processed_bboxes, pred_bboxes[index]]) + extend_labels = np.full(index.shape, cls, dtype=np.int64) + processed_labels = np.concatenate( + [processed_labels, extend_labels]) + elif cls not in allowed_classes and self.filter_labels: + # remove unannotated preds + index = np.where(processed_labels != cls)[0] + processed_scores = processed_scores[index] + processed_bboxes = processed_bboxes[index] + processed_labels = processed_labels[index] + return processed_bboxes, processed_scores, processed_labels + + # TODO: data_batch is no longer needed, consider adjusting the + # parameter position + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. The processed + results should be stored in ``self.results``, which will be used to + compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
+ """ + for data_sample in data_samples: + gt = copy.deepcopy(data_sample) + # add super-category instances + # TODO: Need to refactor to support LoadAnnotations + instances = gt['instances'] + if self.get_supercategory: + supercat_instances = self._get_supercategory_ann(instances) + instances.extend(supercat_instances) + gt_labels = [] + gt_bboxes = [] + is_group_ofs = [] + for ins in instances: + gt_labels.append(ins['bbox_label']) + gt_bboxes.append(ins['bbox']) + is_group_ofs.append(ins['is_group_of']) + ann = dict( + labels=np.array(gt_labels, dtype=np.int64), + bboxes=np.array(gt_bboxes, dtype=np.float32).reshape((-1, 4)), + gt_is_group_ofs=np.array(is_group_ofs, dtype=bool)) + + image_level_labels = gt.get('image_level_labels', None) + pred = data_sample['pred_instances'] + pred_bboxes = pred['bboxes'].cpu().numpy() + pred_scores = pred['scores'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + + pred_bboxes, pred_scores, pred_labels = self._process_predictions( + pred_bboxes, pred_scores, pred_labels, instances, + image_level_labels) + + dets = [] + for label in range(len(self.dataset_meta['classes'])): + index = np.where(pred_labels == label)[0] + pred_bbox_scores = np.hstack( + [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]) + dets.append(pred_bbox_scores) + self.results.append((ann, dets)) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + logger = MMLogger.get_current_instance() + gts, preds = zip(*results) + eval_results = OrderedDict() + # get dataset type + dataset_type = self.dataset_meta.get('dataset_type') + if dataset_type not in ['oid_challenge', 'oid_v6']: + dataset_type = 'oid_v6' + print_log( + 'Cannot infer dataset type from the length of the' + ' classes. Set `oid_v6` as dataset type.', + logger='current') + mean_aps = [] + for i, (iou_thr, + ioa_thr) in enumerate(zip(self.iou_thrs, self.ioa_thrs)): + if self.use_group_of: + assert ioa_thr is not None, 'ioa_thr must have value when' \ + ' using group_of in evaluation.' + print_log(f'\n{"-" * 15}iou_thr, ioa_thr: {iou_thr}, {ioa_thr}' + f'{"-" * 15}') + mean_ap, _ = eval_map( + preds, + gts, + scale_ranges=self.scale_ranges, + iou_thr=iou_thr, + ioa_thr=ioa_thr, + dataset=dataset_type, + logger=logger, + use_group_of=self.use_group_of) + + mean_aps.append(mean_ap) + eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3) + eval_results['mAP'] = sum(mean_aps) / len(mean_aps) + return eval_results diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/ov_coco_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/ov_coco_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..08cb902514914947551a5047c9900947738adf24 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/ov_coco_metric.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import itertools +import os.path as osp +import tempfile +from collections import OrderedDict +from typing import Dict + +import numpy as np +from mmengine.fileio import load +from mmengine.logging import MMLogger +from terminaltables import AsciiTable + +from mmdet.datasets.api_wrappers import COCO, COCOeval, COCOevalMP +from mmdet.registry import METRICS +from .coco_metric import CocoMetric + + +@METRICS.register_module() +class OVCocoMetric(CocoMetric): + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + + # split gt and prediction list + gts, preds = zip(*results) + + tmp_dir = None + if self.outfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + outfile_prefix = osp.join(tmp_dir.name, 'results') + else: + outfile_prefix = self.outfile_prefix + + if self._coco_api is None: + # use converted gt json file to initialize coco api + logger.info('Converting ground truth to coco format...') + coco_json_path = self.gt_to_coco_json( + gt_dicts=gts, outfile_prefix=outfile_prefix) + self._coco_api = COCO(coco_json_path) + + # handle lazy init + if self.cat_ids is None: + self.cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['classes']) + self.base_cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['base_classes']) + self.novel_cat_ids = self._coco_api.get_cat_ids( + cat_names=self.dataset_meta['novel_classes']) + + if self.img_ids is None: + self.img_ids = self._coco_api.get_img_ids() + + # convert predictions to coco format and dump to json file + result_files = self.results2json(preds, outfile_prefix) + + eval_results = OrderedDict() + if self.format_only: + logger.info('results are saved in ' + f'{osp.dirname(outfile_prefix)}') + return eval_results + + for metric in self.metrics: + logger.info(f'Evaluating {metric}...') + + # TODO: May refactor fast_eval_recall to an independent metric? + # fast eval recall + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + preds, self.proposal_nums, self.iou_thrs, logger=logger) + log_msg = [] + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + logger.info(log_msg) + continue + + # evaluate proposal, bbox and segm + iou_type = 'bbox' if metric == 'proposal' else metric + if metric not in result_files: + raise KeyError(f'{metric} is not in results') + try: + predictions = load(result_files[metric]) + if iou_type == 'segm': + # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa + # When evaluating mask AP, if the results contain bbox, + # cocoapi will use the box area instead of the mask area + # for calculating the instance area. Though the overall AP + # is not affected, this leads to different + # small/medium/large mask AP results. 
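+                    # Popping 'bbox' below therefore forces cocoapi to
+                    # derive instance areas from the mask RLE instead.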
+                    for x in predictions:
+                        x.pop('bbox')
+                coco_dt = self._coco_api.loadRes(predictions)
+
+            except IndexError:
+                logger.error(
+                    'The testing results of the whole dataset are empty.')
+                break
+
+            if self.use_mp_eval:
+                coco_eval = COCOevalMP(self._coco_api, coco_dt, iou_type)
+            else:
+                coco_eval = COCOeval(self._coco_api, coco_dt, iou_type)
+
+            coco_eval.params.catIds = self.cat_ids
+            coco_eval.params.imgIds = self.img_ids
+            coco_eval.params.maxDets = list(self.proposal_nums)
+            coco_eval.params.iouThrs = self.iou_thrs
+
+            # mapping of cocoEval.stats
+            coco_metric_names = {
+                'mAP': 0,
+                'mAP_50': 1,
+                'mAP_75': 2,
+                'mAP_s': 3,
+                'mAP_m': 4,
+                'mAP_l': 5,
+                'AR@100': 6,
+                'AR@300': 7,
+                'AR@1000': 8,
+                'AR_s@1000': 9,
+                'AR_m@1000': 10,
+                'AR_l@1000': 11
+            }
+            metric_items = self.metric_items
+            if metric_items is not None:
+                for metric_item in metric_items:
+                    if metric_item not in coco_metric_names:
+                        raise KeyError(
+                            f'metric item "{metric_item}" is not supported')
+
+            if metric == 'proposal':
+                coco_eval.params.useCats = 0
+                coco_eval.evaluate()
+                coco_eval.accumulate()
+                coco_eval.summarize()
+                if metric_items is None:
+                    metric_items = [
+                        'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000',
+                        'AR_m@1000', 'AR_l@1000'
+                    ]
+
+                for item in metric_items:
+                    val = float(
+                        f'{coco_eval.stats[coco_metric_names[item]]:.3f}')
+                    eval_results[item] = val
+            else:
+                coco_eval.evaluate()
+                coco_eval.accumulate()
+                coco_eval.summarize()
+                if self.classwise:  # Compute per-category AP
+                    # from https://github.com/facebookresearch/detectron2/
+                    precisions = coco_eval.eval['precision']
+                    # precision: (iou, recall, cls, area range, max dets)
+                    assert len(self.cat_ids) == precisions.shape[2]
+
+                    results_per_category = []
+                    for idx, cat_id in enumerate(self.cat_ids):
+                        t = []
+                        # area range index 0: all area ranges
+                        # max dets index -1: typically 100 per image
+                        nm = self._coco_api.loadCats(cat_id)[0]
+                        precision = precisions[:, :, idx, 0, -1]
+                        precision = precision[precision > -1]
+                        if precision.size:
+                            ap = np.mean(precision)
+                        else:
+                            ap = float('nan')
+                        t.append(f'{nm["name"]}')
+                        t.append(f'{round(ap, 3)}')
+                        eval_results[f'{nm["name"]}_precision'] = round(ap, 3)
+
+                        # indexes of IoU @50 and @75
+                        for iou in [0, 5]:
+                            precision = precisions[iou, :, idx, 0, -1]
+                            precision = precision[precision > -1]
+                            if precision.size:
+                                ap = np.mean(precision)
+                            else:
+                                ap = float('nan')
+                            t.append(f'{round(ap, 3)}')
+
+                        # indexes of area of small, medium and large
+                        for area in [1, 2, 3]:
+                            precision = precisions[:, :, idx, area, -1]
+                            precision = precision[precision > -1]
+                            if precision.size:
+                                ap = np.mean(precision)
+                            else:
+                                ap = float('nan')
+                            t.append(f'{round(ap, 3)}')
+                        results_per_category.append(tuple(t))
+
+                    num_columns = len(results_per_category[0])
+                    results_flatten = list(
+                        itertools.chain(*results_per_category))
+                    headers = [
+                        'category', 'mAP', 'mAP_50', 'mAP_75', 'mAP_s',
+                        'mAP_m', 'mAP_l'
+                    ]
+                    results_2d = itertools.zip_longest(*[
+                        results_flatten[i::num_columns]
+                        for i in range(num_columns)
+                    ])
+                    table_data = [headers]
+                    table_data += [result for result in results_2d]
+                    table = AsciiTable(table_data)
+                    logger.info('\n' + table.table)
+
+                # ------------get novel_ap50 and base_ap50---------
+                precisions = coco_eval.eval['precision']
+                assert len(self.cat_ids) == precisions.shape[2]
+                base_inds, novel_inds = [], []
+
+                for idx, catId in enumerate(self.cat_ids):
+                    if catId in self.base_cat_ids:
+                        base_inds.append(idx)
+                    if catId in self.novel_cat_ids:
novel_inds.append(idx) + + base_ap = precisions[:, :, base_inds, 0, -1] + novel_ap = precisions[:, :, novel_inds, 0, -1] + base_ap50 = precisions[0, :, base_inds, 0, -1] + novel_ap50 = precisions[0, :, novel_inds, 0, -1] + + eval_results['base_ap'] = np.mean( + base_ap[base_ap > -1]) if len( + base_ap[base_ap > -1]) else -1 + eval_results['novel_ap'] = np.mean( + novel_ap[novel_ap > -1]) if len( + novel_ap[novel_ap > -1]) else -1 + eval_results['base_ap50'] = np.mean( + base_ap50[base_ap50 > -1]) if len( + base_ap50[base_ap50 > -1]) else -1 + eval_results['novel_ap50'] = np.mean( + novel_ap50[novel_ap50 > -1]) if len( + novel_ap50[novel_ap50 > -1]) else -1 + # ------------get novel_ap50 and base_ap50--------- + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = coco_eval.stats[coco_metric_names[metric_item]] + eval_results[key] = float(f'{round(val, 3)}') + + ap = coco_eval.stats[:6] + logger.info(f'{metric}_mAP_copypaste: {ap[0]:.3f} ' + f'{ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + f'{ap[4]:.3f} {ap[5]:.3f}') + + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/refexp_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/refexp_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..8bcdf1629b9bcd9519e0160769810168017a6d0d --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/refexp_metric.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Sequence + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.fileio import get_local_path +from mmengine.logging import MMLogger + +from mmdet.datasets.api_wrappers import COCO +from mmdet.registry import METRICS +from ..functional import bbox_overlaps + + +@METRICS.register_module() +class RefExpMetric(BaseMetric): + default_prefix: Optional[str] = 'refexp' + + def __init__(self, + ann_file: Optional[str] = None, + metric: str = 'bbox', + topk=(1, 5, 10), + iou_thrs: float = 0.5, + **kwargs) -> None: + super().__init__(**kwargs) + self.metric = metric + self.topk = topk + self.iou_thrs = iou_thrs + + with get_local_path(ann_file) as local_path: + self.coco = COCO(local_path) + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + for data_sample in data_samples: + result = dict() + pred = data_sample['pred_instances'] + result['img_id'] = data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + self.results.append(result) + + def compute_metrics(self, results: list) -> Dict[str, float]: + logger: MMLogger = MMLogger.get_current_instance() + + dataset2score = { + 'refcoco': {k: 0.0 + for k in self.topk}, + 'refcoco+': {k: 0.0 + for k in self.topk}, + 'refcocog': {k: 0.0 + for k in self.topk}, + } + dataset2count = {'refcoco': 0.0, 'refcoco+': 0.0, 'refcocog': 0.0} + + for result in results: + img_id = result['img_id'] + + ann_ids = self.coco.getAnnIds(imgIds=img_id) + assert len(ann_ids) == 1 + img_info = self.coco.loadImgs(img_id)[0] + target = self.coco.loadAnns(ann_ids[0]) + + target_bbox = target[0]['bbox'] + converted_bbox = [ + target_bbox[0], + target_bbox[1], + target_bbox[2] + target_bbox[0], + target_bbox[3] + target_bbox[1], + ] + iou = bbox_overlaps(result['bboxes'], + np.array(converted_bbox).reshape(-1, 4)) + for k in 
self.topk:
+                if max(iou[:k]) >= self.iou_thrs:
+                    dataset2score[img_info['dataset_name']][k] += 1.0
+            dataset2count[img_info['dataset_name']] += 1.0
+
+        for key, value in dataset2score.items():
+            for k in self.topk:
+                try:
+                    value[k] /= dataset2count[key]
+                except Exception as e:
+                    logger.warning(f'Failed to normalize {key}@{k}: {e}')
+
+        results = {}
+        mean_precision = 0.0
+        for key, value in dataset2score.items():
+            results[key] = sorted([v for k, v in value.items()])
+            mean_precision += sum(results[key])
+            logger.info(
+                f' Dataset: {key} - Precision @ 1, 5, 10: {results[key]}')
+
+        # `mean_precision` key is used for saving the best checkpoint;
+        # it averages over 3 datasets x 3 topk values = 9 entries.
+        out_results = {'mean_precision': mean_precision / 9.0}
+
+        for i, k in enumerate(self.topk):
+            out_results[f'refcoco_precision@{k}'] = results['refcoco'][i]
+        for i, k in enumerate(self.topk):
+            out_results[f'refcoco+_precision@{k}'] = results['refcoco+'][i]
+        for i, k in enumerate(self.topk):
+            out_results[f'refcocog_precision@{k}'] = results['refcocog'][i]
+        return out_results
diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/refseg_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/refseg_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..0faee07007e809ef08e86a88e8b11c2be1a64034
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/evaluation/metrics/refseg_metric.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence
+
+import torch
+from mmengine.evaluator import BaseMetric
+
+from mmdet.registry import METRICS
+
+
+@METRICS.register_module()
+class RefSegMetric(BaseMetric):
+    """Referring Expression Segmentation Metric."""
+
+    def __init__(self, metric: Sequence = ('cIoU', 'mIoU'), **kwargs):
+        super().__init__(**kwargs)
+        assert set(metric).issubset(['cIoU', 'mIoU']), \
+            f'Only support cIoU and mIoU, but got {metric}'
+        assert len(metric) > 0, 'metrics should not be empty'
+        self.metrics = metric
+
+    def compute_iou(self, pred_seg: torch.Tensor,
+                    gt_seg: torch.Tensor) -> tuple:
+        """Compute the element-wise overlap and union of two binary masks."""
+        overlap = pred_seg & gt_seg
+        union = pred_seg | gt_seg
+        return overlap, union
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
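+
+        Note:
+            Each stored result is a ``(overlap.sum(), union.sum(),
+            iou.sum(), batch_size)`` tuple: the first two terms accumulate
+            into ``cIoU`` and the last two into ``mIoU``.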
+ """ + for data_sample in data_samples: + pred_label = data_sample['pred_instances']['masks'].bool() + label = data_sample['gt_masks'].to_tensor( + pred_label.dtype, pred_label.device).bool() + # calculate iou + overlap, union = self.compute_iou(pred_label, label) + + bs = len(pred_label) + iou = overlap.reshape(bs, -1).sum(-1) * 1.0 / union.reshape( + bs, -1).sum(-1) + iou = torch.nan_to_num_(iou, nan=0.0) + self.results.append((overlap.sum(), union.sum(), iou.sum(), bs)) + + def compute_metrics(self, results: list) -> dict: + results = tuple(zip(*results)) + assert len(results) == 4 + cum_i = sum(results[0]) + cum_u = sum(results[1]) + iou = sum(results[2]) + seg_total = sum(results[3]) + + metrics = {} + if 'cIoU' in self.metrics: + metrics['cIoU'] = cum_i * 100 / cum_u + if 'mIoU' in self.metrics: + metrics['mIoU'] = iou * 100 / seg_total + return metrics diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/reid_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/reid_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..d74df1433cdb093cfb0377b734fc5479401e09e7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/reid_metric.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from mmengine.evaluator import BaseMetric + +from mmdet.registry import METRICS + + +@METRICS.register_module() +class ReIDMetrics(BaseMetric): + """mAP and CMC evaluation metrics for the ReID task. + + Args: + metric (str | list[str]): Metrics to be evaluated. + Default value is `mAP`. + metric_options: (dict, optional): Options for calculating metrics. + Allowed keys are 'rank_list' and 'max_rank'. Defaults to None. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Default: None + """ + allowed_metrics = ['mAP', 'CMC'] + default_prefix: Optional[str] = 'reid-metric' + + def __init__(self, + metric: Union[str, Sequence[str]] = 'mAP', + metric_options: Optional[dict] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None) -> None: + super().__init__(collect_device, prefix) + + if isinstance(metric, list): + metrics = metric + elif isinstance(metric, str): + metrics = [metric] + else: + raise TypeError('metric must be a list or a str.') + for metric in metrics: + if metric not in self.allowed_metrics: + raise KeyError(f'metric {metric} is not supported.') + self.metrics = metrics + + self.metric_options = metric_options or dict( + rank_list=[1, 5, 10, 20], max_rank=20) + for rank in self.metric_options['rank_list']: + assert 1 <= rank <= self.metric_options['max_rank'] + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data samples and predictions. + + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of data samples that + contain annotations and predictions. 
+ """ + for data_sample in data_samples: + pred_feature = data_sample['pred_feature'] + assert isinstance(pred_feature, torch.Tensor) + gt_label = data_sample.get('gt_label', data_sample['gt_label']) + assert isinstance(gt_label['label'], torch.Tensor) + result = dict( + pred_feature=pred_feature.data.cpu(), + gt_label=gt_label['label'].cpu()) + self.results.append(result) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + # NOTICE: don't access `self.results` from the method. + metrics = {} + + pids = torch.cat([result['gt_label'] for result in results]).numpy() + features = torch.stack([result['pred_feature'] for result in results]) + + n, c = features.size() + mat = torch.pow(features, 2).sum(dim=1, keepdim=True).expand(n, n) + distmat = mat + mat.t() + distmat.addmm_(features, features.t(), beta=1, alpha=-2) + distmat = distmat.numpy() + + indices = np.argsort(distmat, axis=1) + matches = (pids[indices] == pids[:, np.newaxis]).astype(np.int32) + + all_cmc = [] + all_AP = [] + num_valid_q = 0. + for q_idx in range(n): + # remove self + raw_cmc = matches[q_idx][1:] + if not np.any(raw_cmc): + # this condition is true when query identity + # does not appear in gallery + continue + + cmc = raw_cmc.cumsum() + cmc[cmc > 1] = 1 + + all_cmc.append(cmc[:self.metric_options['max_rank']]) + num_valid_q += 1. + + # compute average precision + num_rel = raw_cmc.sum() + tmp_cmc = raw_cmc.cumsum() + tmp_cmc = [x / (i + 1.) for i, x in enumerate(tmp_cmc)] + tmp_cmc = np.asarray(tmp_cmc) * raw_cmc + AP = tmp_cmc.sum() / num_rel + all_AP.append(AP) + + assert num_valid_q > 0, \ + 'Error: all query identities do not appear in gallery' + + all_cmc = np.asarray(all_cmc) + all_cmc = all_cmc.sum(0) / num_valid_q + mAP = np.mean(all_AP) + + if 'mAP' in self.metrics: + metrics['mAP'] = np.around(mAP, decimals=3) + if 'CMC' in self.metrics: + for rank in self.metric_options['rank_list']: + metrics[f'R{rank}'] = np.around(all_cmc[rank - 1], decimals=3) + + return metrics diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/semseg_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/semseg_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..3215f6788a6155bdbceb6a91259008b4d851868e --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/semseg_metric.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import OrderedDict +from typing import Dict, Optional, Sequence, Union + +import numpy as np +import torch +from mmcv import imwrite +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger, print_log +from mmengine.utils import mkdir_or_exist +from PIL import Image + +try: + from prettytable import PrettyTable +except ImportError: + PrettyTable = None + +from mmdet.registry import METRICS + + +@METRICS.register_module() +class SemSegMetric(BaseMetric): + """mIoU evaluation metric. + + Args: + iou_metrics (list[str] | str): Metrics to be calculated, the options + includes 'mIoU', 'mDice' and 'mFscore'. + beta (int): Determines the weight of recall in the combined score. + Default: 1. 
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        output_dir (str): The directory for output prediction. Defaults to
+            None.
+        format_only (bool): Only format the results for submission without
+            performing evaluation. It is useful when you want to save the
+            result to a specific format and submit it to the test server.
+            Defaults to False.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    def __init__(self,
+                 iou_metrics: Sequence[str] = ['mIoU'],
+                 beta: int = 1,
+                 collect_device: str = 'cpu',
+                 output_dir: Optional[str] = None,
+                 format_only: bool = False,
+                 backend_args: dict = None,
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        if isinstance(iou_metrics, str):
+            iou_metrics = [iou_metrics]
+        if not set(iou_metrics).issubset(set(['mIoU', 'mDice', 'mFscore'])):
+            raise KeyError(f'metrics {iou_metrics} are not supported. '
+                           f'Only mIoU/mDice/mFscore are supported.')
+        self.metrics = iou_metrics
+        self.beta = beta
+        self.output_dir = output_dir
+        if self.output_dir and is_main_process():
+            mkdir_or_exist(self.output_dir)
+        self.format_only = format_only
+        self.backend_args = backend_args
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        num_classes = len(self.dataset_meta['classes'])
+        for data_sample in data_samples:
+            pred_label = data_sample['pred_sem_seg']['sem_seg'].squeeze()
+            # format_only always for test dataset without ground truth
+            if not self.format_only:
+                label = data_sample['gt_sem_seg']['sem_seg'].squeeze().to(
+                    pred_label)
+                ignore_index = data_sample['pred_sem_seg'].get(
+                    'ignore_index', 255)
+                self.results.append(
+                    self._compute_pred_stats(pred_label, label, num_classes,
+                                             ignore_index))
+
+            # format_result
+            if self.output_dir is not None:
+                basename = osp.splitext(osp.basename(
+                    data_sample['img_path']))[0]
+                png_filename = osp.abspath(
+                    osp.join(self.output_dir, f'{basename}.png'))
+                output_mask = pred_label.cpu().numpy()
+                output = Image.fromarray(output_mask.astype(np.uint8))
+                imwrite(output, png_filename, backend_args=self.backend_args)
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are corresponding results. The key
+            mainly includes aAcc, mIoU, mAcc, mDice, mFscore, mPrecision,
+            mRecall.
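+
+        Note:
+            Per-class scores are derived from the accumulated area
+            histograms: ``IoU = I / U``, ``Dice = 2 * I / (P + G)`` and
+            ``Acc = I / G``, where ``I``, ``U``, ``P`` and ``G`` denote the
+            intersect, union, prediction and ground-truth areas.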
+ """ + logger: MMLogger = MMLogger.get_current_instance() + if self.format_only: + logger.info(f'results are saved to {osp.dirname(self.output_dir)}') + return OrderedDict() + + ret_metrics = self.get_return_metrics(results) + + # summary table + ret_metrics_summary = OrderedDict({ + ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) + for ret_metric, ret_metric_value in ret_metrics.items() + }) + metrics = dict() + for key, val in ret_metrics_summary.items(): + if key == 'aAcc': + metrics[key] = val + else: + metrics['m' + key] = val + + print_semantic_table(ret_metrics, self.dataset_meta['classes'], logger) + + return metrics + + def _compute_pred_stats(self, pred_label: torch.tensor, + label: torch.tensor, num_classes: int, + ignore_index: int): + """Parse semantic segmentation predictions. + + Args: + pred_label (torch.tensor): Prediction segmentation map + or predict result filename. The shape is (H, W). + label (torch.tensor): Ground truth segmentation map + or label filename. The shape is (H, W). + num_classes (int): Number of categories. + + Returns: + torch.Tensor: The intersection of prediction and ground truth + histogram on all classes. + torch.Tensor: The union of prediction and ground truth histogram on + all classes. + torch.Tensor: The prediction histogram on all classes. + torch.Tensor: The ground truth histogram on all classes. + """ + assert pred_label.shape == label.shape + mask = label != ignore_index + label, pred_label = label[mask], pred_label[mask] + + intersect = pred_label[pred_label == label] + area_intersect = torch.histc( + intersect.float(), bins=num_classes, min=0, max=num_classes - 1) + area_pred_label = torch.histc( + pred_label.float(), bins=num_classes, min=0, max=num_classes - 1) + area_label = torch.histc( + label.float(), bins=num_classes, min=0, max=num_classes - 1) + area_union = area_pred_label + area_label - area_intersect + result = dict( + area_intersect=area_intersect, + area_union=area_union, + area_pred_label=area_pred_label, + area_label=area_label) + return result + + def get_return_metrics(self, results: list) -> dict: + """Calculate evaluation metrics. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, np.ndarray]: per category evaluation metrics, + shape (num_classes, ). + """ + + def f_score(precision, recall, beta=1): + """calculate the f-score value. + + Args: + precision (float | torch.Tensor): The precision value. + recall (float | torch.Tensor): The recall value. + beta (int): Determines the weight of recall in the combined + score. Default: 1. + + Returns: + [torch.tensor]: The f-score value. 
+ """ + score = (1 + beta**2) * (precision * recall) / ( + (beta**2 * precision) + recall) + return score + + total_area_intersect = sum([r['area_intersect'] for r in results]) + total_area_union = sum([r['area_union'] for r in results]) + total_area_pred_label = sum([r['area_pred_label'] for r in results]) + total_area_label = sum([r['area_label'] for r in results]) + + all_acc = total_area_intersect / total_area_label + ret_metrics = OrderedDict({'aAcc': all_acc}) + for metric in self.metrics: + if metric == 'mIoU': + iou = total_area_intersect / total_area_union + acc = total_area_intersect / total_area_label + ret_metrics['IoU'] = iou + ret_metrics['Acc'] = acc + elif metric == 'mDice': + dice = 2 * total_area_intersect / ( + total_area_pred_label + total_area_label) + acc = total_area_intersect / total_area_label + ret_metrics['Dice'] = dice + ret_metrics['Acc'] = acc + elif metric == 'mFscore': + precision = total_area_intersect / total_area_pred_label + recall = total_area_intersect / total_area_label + f_value = torch.tensor([ + f_score(x[0], x[1], self.beta) + for x in zip(precision, recall) + ]) + ret_metrics['Fscore'] = f_value + ret_metrics['Precision'] = precision + ret_metrics['Recall'] = recall + + ret_metrics = { + metric: value.cpu().numpy() + for metric, value in ret_metrics.items() + } + + return ret_metrics + + +def print_semantic_table( + results: dict, + class_names: list, + logger: Optional[Union['MMLogger', str]] = None) -> None: + """Print semantic segmentation evaluation results table. + + Args: + results (dict): The evaluation results. + class_names (list): Class names. + logger (MMLogger | str, optional): Logger used for printing. + Default: None. + """ + # each class table + results.pop('aAcc', None) + ret_metrics_class = OrderedDict({ + ret_metric: np.round(ret_metric_value * 100, 2) + for ret_metric, ret_metric_value in results.items() + }) + + print_log('per class results:', logger) + if PrettyTable: + class_table_data = PrettyTable() + ret_metrics_class.update({'Class': class_names}) + ret_metrics_class.move_to_end('Class', last=False) + for key, val in ret_metrics_class.items(): + class_table_data.add_column(key, val) + print_log('\n' + class_table_data.get_string(), logger=logger) + else: + logger.warning( + '`prettytable` is not installed, for better table format, ' + 'please consider installing it with "pip install prettytable"') + print_result = {} + for class_name, iou, acc in zip(class_names, ret_metrics_class['IoU'], + ret_metrics_class['Acc']): + print_result[class_name] = {'IoU': iou, 'Acc': acc} + print_log(print_result, logger) diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/voc_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/voc_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..32d8c075de9c8b4fb842ad7f64f87a10c4d68546 --- /dev/null +++ b/head_extractor/build/lib/mmdet/evaluation/metrics/voc_metric.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from collections import OrderedDict +from typing import List, Optional, Sequence, Union + +import numpy as np +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger + +from mmdet.registry import METRICS +from ..functional import eval_map, eval_recalls + + +@METRICS.register_module() +class VOCMetric(BaseMetric): + """Pascal VOC evaluation metric. + + Args: + iou_thrs (float or List[float]): IoU threshold. Defaults to 0.5. 
+        scale_ranges (List[tuple], optional): Scale ranges for evaluating
+            mAP. If not specified, all bounding boxes would be included in
+            evaluation. Defaults to None.
+        metric (str | list[str]): Metrics to be evaluated. Options are
+            'mAP', 'recall'. If a list is given, only the first metric in
+            the list will be used.
+        proposal_nums (Sequence[int]): Proposal number used for evaluating
+            recalls, such as recall@100, recall@1000.
+            Default: (100, 300, 1000).
+        eval_mode (str): 'area' or '11points', 'area' means calculating the
+            area under the precision-recall curve, '11points' means
+            calculating the average precision of recalls at
+            [0, 0.1, ..., 1]. PASCAL VOC2007 uses '11points' by default,
+            while PASCAL VOC2012 uses 'area'.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    default_prefix: Optional[str] = 'pascal_voc'
+
+    def __init__(self,
+                 iou_thrs: Union[float, List[float]] = 0.5,
+                 scale_ranges: Optional[List[tuple]] = None,
+                 metric: Union[str, List[str]] = 'mAP',
+                 proposal_nums: Sequence[int] = (100, 300, 1000),
+                 eval_mode: str = '11points',
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        self.iou_thrs = [iou_thrs] if isinstance(iou_thrs, float) \
+            else iou_thrs
+        self.scale_ranges = scale_ranges
+        # voc evaluation metrics
+        if not isinstance(metric, str):
+            assert len(metric) == 1
+            metric = metric[0]
+        allowed_metrics = ['recall', 'mAP']
+        if metric not in allowed_metrics:
+            raise KeyError(
+                f"metric should be one of 'recall', 'mAP', but got {metric}.")
+        self.metric = metric
+        self.proposal_nums = proposal_nums
+        assert eval_mode in ['area', '11points'], \
+            'Unrecognized mode, only "area" and "11points" are supported'
+        self.eval_mode = eval_mode
+
+    # TODO: data_batch is no longer needed, consider adjusting the
+    #  parameter position
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of data samples that
+                contain annotations and predictions.
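+
+        Note:
+            Each stored result is an ``(ann, dets)`` tuple; ``dets[label]``
+            is an ``(n, 5)`` array of ``[x1, y1, x2, y2, score]`` rows for
+            the corresponding class.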
+ """ + for data_sample in data_samples: + gt = copy.deepcopy(data_sample) + # TODO: Need to refactor to support LoadAnnotations + gt_instances = gt['gt_instances'] + gt_ignore_instances = gt['ignored_instances'] + ann = dict( + labels=gt_instances['labels'].cpu().numpy(), + bboxes=gt_instances['bboxes'].cpu().numpy(), + bboxes_ignore=gt_ignore_instances['bboxes'].cpu().numpy(), + labels_ignore=gt_ignore_instances['labels'].cpu().numpy()) + + pred = data_sample['pred_instances'] + pred_bboxes = pred['bboxes'].cpu().numpy() + pred_scores = pred['scores'].cpu().numpy() + pred_labels = pred['labels'].cpu().numpy() + + dets = [] + for label in range(len(self.dataset_meta['classes'])): + index = np.where(pred_labels == label)[0] + pred_bbox_scores = np.hstack( + [pred_bboxes[index], pred_scores[index].reshape((-1, 1))]) + dets.append(pred_bbox_scores) + + self.results.append((ann, dets)) + + def compute_metrics(self, results: list) -> dict: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + dict: The computed metrics. The keys are the names of the metrics, + and the values are corresponding results. + """ + logger: MMLogger = MMLogger.get_current_instance() + gts, preds = zip(*results) + eval_results = OrderedDict() + if self.metric == 'mAP': + assert isinstance(self.iou_thrs, list) + dataset_type = self.dataset_meta.get('dataset_type') + if dataset_type in ['VOC2007', 'VOC2012']: + dataset_name = 'voc' + if dataset_type == 'VOC2007' and self.eval_mode != '11points': + warnings.warn('Pascal VOC2007 uses `11points` as default ' + 'evaluate mode, but you are using ' + f'{self.eval_mode}.') + elif dataset_type == 'VOC2012' and self.eval_mode != 'area': + warnings.warn('Pascal VOC2012 uses `area` as default ' + 'evaluate mode, but you are using ' + f'{self.eval_mode}.') + else: + dataset_name = self.dataset_meta['classes'] + + mean_aps = [] + for iou_thr in self.iou_thrs: + logger.info(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}') + # Follow the official implementation, + # http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar + # we should use the legacy coordinate system in mmdet 1.x, + # which means w, h should be computed as 'x2 - x1 + 1` and + # `y2 - y1 + 1` + mean_ap, _ = eval_map( + preds, + gts, + scale_ranges=self.scale_ranges, + iou_thr=iou_thr, + dataset=dataset_name, + logger=logger, + eval_mode=self.eval_mode, + use_legacy_coordinate=True) + mean_aps.append(mean_ap) + eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3) + eval_results['mAP'] = sum(mean_aps) / len(mean_aps) + eval_results.move_to_end('mAP', last=False) + elif self.metric == 'recall': + gt_bboxes = [gt['bboxes'] for gt in gts] + pr_bboxes = [pred[0] for pred in preds] + recalls = eval_recalls( + gt_bboxes, + pr_bboxes, + self.proposal_nums, + self.iou_thrs, + logger=logger, + use_legacy_coordinate=True) + for i, num in enumerate(self.proposal_nums): + for j, iou_thr in enumerate(self.iou_thrs): + eval_results[f'recall@{num}@{iou_thr}'] = recalls[i, j] + if recalls.shape[1] > 1: + ar = recalls.mean(axis=1) + for i, num in enumerate(self.proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + return eval_results diff --git a/head_extractor/build/lib/mmdet/evaluation/metrics/youtube_vis_metric.py b/head_extractor/build/lib/mmdet/evaluation/metrics/youtube_vis_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..5abc77a591c7ee5d67cdf4dc4c4926c84894ba1d --- /dev/null +++ 
b/head_extractor/build/lib/mmdet/evaluation/metrics/youtube_vis_metric.py
@@ -0,0 +1,426 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import tempfile
+import warnings
+import zipfile
+from collections import OrderedDict, defaultdict
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import mmengine
+import numpy as np
+from mmengine.dist import (all_gather_object, barrier, broadcast_object_list,
+                           is_main_process)
+from mmengine.logging import MMLogger
+
+from mmdet.registry import METRICS
+from mmdet.structures.mask import encode_mask_results
+from ..functional import YTVIS, YTVISeval
+from .base_video_metric import BaseVideoMetric, collect_tracking_results
+
+
+@METRICS.register_module()
+class YouTubeVISMetric(BaseVideoMetric):
+    """mAP evaluation metrics for the VIS task.
+
+    Args:
+        metric (str | list[str]): Metrics to be evaluated.
+            Default value is `youtube_vis_ap`.
+        metric_items (List[str], optional): Metric result names to be
+            recorded in the evaluation result. Defaults to None.
+        outfile_prefix (str | None): The prefix of json files. It includes
+            the file path and the prefix of filename, e.g., "a/b/prefix".
+            If not specified, a temp file will be created. Defaults to None.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different
+            evaluators. If prefix is not provided in the argument,
+            self.default_prefix will be used instead. Default: None
+        format_only (bool): If True, only format the results to the
+            official format without performing evaluation. Defaults to
+            False.
+ """ + + default_prefix: Optional[str] = 'youtube_vis' + + def __init__(self, + metric: Union[str, List[str]] = 'youtube_vis_ap', + metric_items: Optional[Sequence[str]] = None, + outfile_prefix: Optional[str] = None, + collect_device: str = 'cpu', + prefix: Optional[str] = None, + format_only: bool = False) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + # vis evaluation metrics + self.metrics = metric if isinstance(metric, list) else [metric] + self.format_only = format_only + allowed_metrics = ['youtube_vis_ap'] + for metric in self.metrics: + if metric not in allowed_metrics: + raise KeyError( + f"metric should be 'youtube_vis_ap', but got {metric}.") + + self.metric_items = metric_items + self.outfile_prefix = outfile_prefix + self.per_video_res = [] + self.categories = [] + self._vis_meta_info = defaultdict(list) # record video and image infos + + def process_video(self, data_samples): + + video_length = len(data_samples) + for frame_id in range(video_length): + result = dict() + img_data_sample = data_samples[frame_id].to_dict() + pred = img_data_sample['pred_track_instances'] + video_id = img_data_sample['video_id'] + + result['img_id'] = img_data_sample['img_id'] + result['bboxes'] = pred['bboxes'].cpu().numpy() + result['scores'] = pred['scores'].cpu().numpy() + result['labels'] = pred['labels'].cpu().numpy() + result['instances_id'] = pred['instances_id'].cpu().numpy() + # encode mask to RLE + assert 'masks' in pred, \ + 'masks must exist in YouTube-VIS metric' + result['masks'] = encode_mask_results( + pred['masks'].detach().cpu().numpy()) + + # parse gt + gt = dict() + gt['width'] = img_data_sample['ori_shape'][1] + gt['height'] = img_data_sample['ori_shape'][0] + gt['img_id'] = img_data_sample['img_id'] + gt['frame_id'] = frame_id + gt['video_id'] = video_id + gt['video_length'] = video_length + + if 'instances' in img_data_sample: + gt['anns'] = img_data_sample['instances'] + else: + gt['anns'] = dict() + self.per_video_res.append((result, gt)) + + preds, gts = zip(*self.per_video_res) + # format the results + # we must format gts first to update self._vis_meta_info + gt_results = self._format_one_video_gts(gts) + pred_results = self._format_one_video_preds(preds) + self.per_video_res.clear() + # add converted result to the results list + self.results.append((pred_results, gt_results)) + + def compute_metrics(self, results: List) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (List): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. 
+ """ + # split gt and prediction list + tmp_pred_results, tmp_gt_results = zip(*results) + gt_results = self.format_gts(tmp_gt_results) + pred_results = self.format_preds(tmp_pred_results) + + if self.format_only: + self.save_pred_results(pred_results) + return dict() + + ytvis = YTVIS(gt_results) + + ytvis_dets = ytvis.loadRes(pred_results) + vid_ids = ytvis.getVidIds() + + iou_type = metric = 'segm' + eval_results = OrderedDict() + ytvisEval = YTVISeval(ytvis, ytvis_dets, iou_type) + ytvisEval.params.vidIds = vid_ids + ytvisEval.evaluate() + ytvisEval.accumulate() + ytvisEval.summarize() + + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@1': 6, + 'AR@10': 7, + 'AR@100': 8, + 'AR_s@100': 9, + 'AR_m@100': 10, + 'AR_l@100': 11 + } + metric_items = self.metric_items + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item "{metric_item}" is not supported') + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = float( + f'{ytvisEval.stats[coco_metric_names[metric_item]]:.3f}') + eval_results[key] = val + + return eval_results + + def format_gts(self, gts: Tuple[List]) -> dict: + """Gather all ground-truth from self.results.""" + self.categories = [ + dict(id=id + 1, name=name) + for id, name in enumerate(self.dataset_meta['classes']) + ] + gt_results = dict( + categories=self.categories, + videos=self._vis_meta_info['videos'], + annotations=[]) + for gt_result in gts: + gt_results['annotations'].extend(gt_result) + return gt_results + + def format_preds(self, preds: Tuple[List]) -> List: + """Gather all predictions from self.results.""" + pred_results = [] + for pred_result in preds: + pred_results.extend(pred_result) + return pred_results + + def _format_one_video_preds(self, pred_dicts: Tuple[dict]) -> List: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + pred_dicts (Tuple[dict]): Prediction of the dataset. + + Returns: + List: The formatted predictions. + """ + # Collate preds scatters (tuple of dict to dict of list) + preds = defaultdict(list) + for pred in pred_dicts: + for key in pred.keys(): + preds[key].append(pred[key]) + + img_infos = self._vis_meta_info['images'] + vid_infos = self._vis_meta_info['videos'] + inds = [i for i, _ in enumerate(img_infos) if _['frame_id'] == 0] + inds.append(len(img_infos)) + json_results = [] + video_id = vid_infos[-1]['id'] + # collect data for each instances in a video. 
+ collect_data = dict() + for frame_id, (masks, scores, labels, ids) in enumerate( + zip(preds['masks'], preds['scores'], preds['labels'], + preds['instances_id'])): + + assert len(masks) == len(labels) + for j, id in enumerate(ids): + if id not in collect_data: + collect_data[id] = dict( + category_ids=[], scores=[], segmentations=dict()) + collect_data[id]['category_ids'].append(labels[j]) + collect_data[id]['scores'].append(scores[j]) + if isinstance(masks[j]['counts'], bytes): + masks[j]['counts'] = masks[j]['counts'].decode() + collect_data[id]['segmentations'][frame_id] = masks[j] + + # transform the collected data into official format + for id, id_data in collect_data.items(): + output = dict() + output['video_id'] = video_id + output['score'] = np.array(id_data['scores']).mean().item() + # majority voting for sequence category + output['category_id'] = np.bincount( + np.array(id_data['category_ids'])).argmax().item() + 1 + output['segmentations'] = [] + for frame_id in range(inds[-1] - inds[-2]): + if frame_id in id_data['segmentations']: + output['segmentations'].append( + id_data['segmentations'][frame_id]) + else: + output['segmentations'].append(None) + json_results.append(output) + + return json_results + + def _format_one_video_gts(self, gt_dicts: Tuple[dict]) -> List: + """Convert the annotation to the format of YouTube-VIS. + + This operation is to make it easier to use the official eval API. + + Args: + gt_dicts (Tuple[dict]): Ground truth of the dataset. + + Returns: + list: The formatted gts. + """ + video_infos = [] + image_infos = [] + instance_infos = defaultdict(list) + len_videos = dict() # mapping from instance_id to video_length + vis_anns = [] + + # get video infos + for gt_dict in gt_dicts: + frame_id = gt_dict['frame_id'] + video_id = gt_dict['video_id'] + img_id = gt_dict['img_id'] + image_info = dict( + id=img_id, + width=gt_dict['width'], + height=gt_dict['height'], + frame_id=frame_id, + file_name='') + image_infos.append(image_info) + if frame_id == 0: + video_info = dict( + id=video_id, + width=gt_dict['width'], + height=gt_dict['height'], + file_name='') + video_infos.append(video_info) + + for ann in gt_dict['anns']: + label = ann['bbox_label'] + bbox = ann['bbox'] + instance_id = ann['instance_id'] + # update video length + len_videos[instance_id] = gt_dict['video_length'] + coco_bbox = [ + bbox[0], + bbox[1], + bbox[2] - bbox[0], + bbox[3] - bbox[1], + ] + + annotation = dict( + video_id=video_id, + frame_id=frame_id, + bbox=coco_bbox, + instance_id=instance_id, + iscrowd=ann.get('ignore_flag', 0), + category_id=int(label) + 1, + area=coco_bbox[2] * coco_bbox[3]) + if ann.get('mask', None): + mask = ann['mask'] + # area = mask_util.area(mask) + if isinstance(mask, dict) and isinstance( + mask['counts'], bytes): + mask['counts'] = mask['counts'].decode() + annotation['segmentation'] = mask + + instance_infos[instance_id].append(annotation) + + # update vis meta info + self._vis_meta_info['images'].extend(image_infos) + self._vis_meta_info['videos'].extend(video_infos) + + for instance_id, ann_infos in instance_infos.items(): + cur_video_len = len_videos[instance_id] + segm = [None] * cur_video_len + bbox = [None] * cur_video_len + area = [None] * cur_video_len + # In the official format, no instances are represented by + # 'None', however, only images with instances are recorded + # in the current annotations, so we need to use 'None' to + # initialize these lists. 
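# --- Illustrative sketch (not part of the diff): the per-track score and
# category computed above are a mean over frame scores and a majority vote
# over frame labels (+1 because YouTube-VIS category ids start at 1).
import numpy as np

frame_scores = [0.9, 0.7, 0.8]
frame_labels = [2, 2, 1]
score = np.array(frame_scores).mean().item()  # 0.8
category_id = np.bincount(np.array(frame_labels)).argmax().item() + 1  # 3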
+            for ann_info in ann_infos:
+                frame_id = ann_info['frame_id']
+                segm[frame_id] = ann_info['segmentation']
+                bbox[frame_id] = ann_info['bbox']
+                area[frame_id] = ann_info['area']
+            instance = dict(
+                category_id=ann_infos[0]['category_id'],
+                segmentations=segm,
+                bboxes=bbox,
+                video_id=ann_infos[0]['video_id'],
+                areas=area,
+                id=instance_id,
+                iscrowd=ann_infos[0]['iscrowd'])
+            vis_anns.append(instance)
+        return vis_anns
+
+    def save_pred_results(self, pred_results: List) -> None:
+        """Save the results to a zip file (standard format for YouTube-VIS
+        Challenge).
+
+        Args:
+            pred_results (list): Testing results of the dataset.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        if self.outfile_prefix is None:
+            tmp_dir = tempfile.TemporaryDirectory()
+            outfile_prefix = osp.join(tmp_dir.name, 'results')
+        else:
+            outfile_prefix = self.outfile_prefix
+        mmengine.dump(pred_results, f'{outfile_prefix}.json')
+        # zip the json file in order to submit to the test server.
+        zip_file_name = f'{outfile_prefix}.submission_file.zip'
+        zf = zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED)
+        logger.info(f"zip the 'results.json' into '{zip_file_name}', "
+                    'please submit the zip file to the test server')
+        zf.write(f'{outfile_prefix}.json', 'results.json')
+        zf.close()
+
+    def evaluate(self, size: int) -> dict:
+        """Evaluate the model performance of the whole dataset after
+        processing all batches.
+
+        Args:
+            size (int): Length of the entire validation dataset.
+
+        Returns:
+            dict: Evaluation metrics dict on the val dataset. The keys are
+                the names of the metrics, and the values are corresponding
+                results.
+        """
+        # wait for all processes to complete prediction.
+        barrier()
+
+        if len(self.results) == 0:
+            warnings.warn(
+                f'{self.__class__.__name__} got empty `self.results`. Please '
+                'ensure that the processed results are properly added into '
+                '`self.results` in `process` method.')
+
+        results = collect_tracking_results(self.results, self.collect_device)
+
+        # gather seq_info
+        gathered_seq_info = all_gather_object(self._vis_meta_info['videos'])
+        all_seq_info = []
+        for _seq_info in gathered_seq_info:
+            all_seq_info.extend(_seq_info)
+        # update self._vis_meta_info
+        self._vis_meta_info = dict(videos=all_seq_info)
+
+        if is_main_process():
+            _metrics = self.compute_metrics(results)  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results list
+        self.results.clear()
+        # reset the vis_meta_info
+        self._vis_meta_info.clear()
+        return metrics[0]
diff --git a/head_extractor/build/lib/mmdet/models/__init__.py b/head_extractor/build/lib/mmdet/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0a0d5e8d350d81e72787ff73fd85c2176783b43
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
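# --- Illustrative sketch (not part of the diff): the wildcard imports below
# exist for their side effect of registering every component with mmdet's
# MODELS registry, so configs can build them by type name. A minimal usage
# sketch, assuming an environment where mmdet is importable (scope handling
# may differ between mmengine versions):
from mmdet.registry import MODELS
import mmdet.models  # noqa: F401  -- populates the registry

backbone = MODELS.build(dict(type='CSPDarknet', widen_factor=0.5))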
+from .backbones import * # noqa: F401,F403 +from .data_preprocessors import * # noqa: F401,F403 +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .language_models import * # noqa: F401,F403 +from .layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .mot import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .reid import * # noqa: F401,F403 +from .roi_heads import * # noqa: F401,F403 +from .seg_heads import * # noqa: F401,F403 +from .task_modules import * # noqa: F401,F403 +from .test_time_augs import * # noqa: F401,F403 +from .trackers import * # noqa: F401,F403 +from .tracking_heads import * # noqa: F401,F403 +from .vis import * # noqa: F401,F403 diff --git a/head_extractor/build/lib/mmdet/models/backbones/__init__.py b/head_extractor/build/lib/mmdet/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e16ff85f7037b36fb2046fcbcd3af523050a6516 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .csp_darknet import CSPDarknet +from .cspnext import CSPNeXt +from .darknet import Darknet +from .detectors_resnet import DetectoRS_ResNet +from .detectors_resnext import DetectoRS_ResNeXt +from .efficientnet import EfficientNet +from .hourglass import HourglassNet +from .hrnet import HRNet +from .mobilenet_v2 import MobileNetV2 +from .pvt import PyramidVisionTransformer, PyramidVisionTransformerV2 +from .regnet import RegNet +from .res2net import Res2Net +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1d +from .resnext import ResNeXt +from .ssd_vgg import SSDVGG +from .swin import SwinTransformer +from .trident_resnet import TridentResNet + +__all__ = [ + 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', + 'MobileNetV2', 'Res2Net', 'HourglassNet', 'DetectoRS_ResNet', + 'DetectoRS_ResNeXt', 'Darknet', 'ResNeSt', 'TridentResNet', 'CSPDarknet', + 'SwinTransformer', 'PyramidVisionTransformer', + 'PyramidVisionTransformerV2', 'EfficientNet', 'CSPNeXt' +] diff --git a/head_extractor/build/lib/mmdet/models/backbones/csp_darknet.py b/head_extractor/build/lib/mmdet/models/backbones/csp_darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..a890b486f255befa23fe5a3e9746f8f9298ac33f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/csp_darknet.py @@ -0,0 +1,286 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from ..layers import CSPLayer + + +class Focus(nn.Module): + """Focus width and height information into channel space. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_size (int): The kernel size of the convolution. Default: 1 + stride (int): The stride of the convolution. Default: 1 + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish')): + super().__init__() + self.conv = ConvModule( + in_channels * 4, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + # shape of x (b,c,w,h) -> y(b,4c,w/2,h/2) + patch_top_left = x[..., ::2, ::2] + patch_top_right = x[..., ::2, 1::2] + patch_bot_left = x[..., 1::2, ::2] + patch_bot_right = x[..., 1::2, 1::2] + x = torch.cat( + ( + patch_top_left, + patch_bot_left, + patch_top_right, + patch_bot_right, + ), + dim=1, + ) + return self.conv(x) + + +class SPPBottleneck(BaseModule): + """Spatial pyramid pooling layer used in YOLOv3-SPP. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + kernel_sizes (tuple[int]): Sequential of kernel sizes of pooling + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_sizes=(5, 9, 13), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=None): + super().__init__(init_cfg) + mid_channels = in_channels // 2 + self.conv1 = ConvModule( + in_channels, + mid_channels, + 1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.poolings = nn.ModuleList([ + nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2) + for ks in kernel_sizes + ]) + conv2_channels = mid_channels * (len(kernel_sizes) + 1) + self.conv2 = ConvModule( + conv2_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + x = self.conv1(x) + with torch.cuda.amp.autocast(enabled=False): + x = torch.cat( + [x] + [pooling(x) for pooling in self.poolings], dim=1) + x = self.conv2(x) + return x + + +@MODELS.register_module() +class CSPDarknet(BaseModule): + """CSP-Darknet backbone used in YOLOv5 and YOLOX. + + Args: + arch (str): Architecture of CSP-Darknet, from {P5, P6}. + Default: P5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Default: 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + out_indices (Sequence[int]): Output from which stages. + Default: (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Default: -1. + use_depthwise (bool): Whether to use depthwise separable convolution. + Default: False. + arch_ovewrite(list): Overwrite default arch settings. Default: None. + spp_kernal_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Default: (5, 9, 13). + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). 
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    Example:
+        >>> from mmdet.models import CSPDarknet
+        >>> import torch
+        >>> self = CSPDarknet(arch='P5')
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 416, 416)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        ...
+        (1, 256, 52, 52)
+        (1, 512, 26, 26)
+        (1, 1024, 13, 13)
+    """
+    # From left to right:
+    # in_channels, out_channels, num_blocks, add_identity, use_spp
+    arch_settings = {
+        'P5': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, False, True]]
+    }
+
+    def __init__(self,
+                 arch='P5',
+                 deepen_factor=1.0,
+                 widen_factor=1.0,
+                 out_indices=(2, 3, 4),
+                 frozen_stages=-1,
+                 use_depthwise=False,
+                 arch_ovewrite=None,
+                 spp_kernal_sizes=(5, 9, 13),
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
+                 act_cfg=dict(type='Swish'),
+                 norm_eval=False,
+                 init_cfg=dict(
+                     type='Kaiming',
+                     layer='Conv2d',
+                     a=math.sqrt(5),
+                     distribution='uniform',
+                     mode='fan_in',
+                     nonlinearity='leaky_relu')):
+        super().__init__(init_cfg)
+        arch_setting = self.arch_settings[arch]
+        if arch_ovewrite:
+            arch_setting = arch_ovewrite
+        assert set(out_indices).issubset(
+            i for i in range(len(arch_setting) + 1))
+        if frozen_stages not in range(-1, len(arch_setting) + 1):
+            raise ValueError('frozen_stages must be in range(-1, '
+                             'len(arch_setting) + 1). 
But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + self.stem = Focus( + 3, + int(arch_setting[0][0] * widen_factor), + kernel_size=3, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernal_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(CSPDarknet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/backbones/cspnext.py b/head_extractor/build/lib/mmdet/models/backbones/cspnext.py new file mode 100644 index 0000000000000000000000000000000000000000..269725a70224047a1f7f7564ba8199e38df25cc8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/cspnext.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from ..layers import CSPLayer +from .csp_darknet import SPPBottleneck + + +@MODELS.register_module() +class CSPNeXt(BaseModule): + """CSPNeXt backbone used in RTMDet. + + Args: + arch (str): Architecture of CSPNeXt, from {P5, P6}. + Defaults to P5. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + deepen_factor (float): Depth multiplier, multiply number of + blocks in CSP layer by this amount. Defaults to 1.0. + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Defaults to 1.0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (2, 3, 4). + frozen_stages (int): Stages to be frozen (stop grad and set eval + mode). -1 means not freezing any parameters. Defaults to -1. 
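# --- Illustrative sketch (not part of the diff): how deepen_factor and
# widen_factor rescale the P5 arch table, using a YOLOX-s style setting
# (deepen_factor=0.33, widen_factor=0.5) as a made-up example.
p5 = [[64, 128, 3], [128, 256, 9], [256, 512, 9], [512, 1024, 3]]
deepen_factor, widen_factor = 0.33, 0.5
for in_c, out_c, n_blocks in p5:
    print(int(in_c * widen_factor), int(out_c * widen_factor),
          max(round(n_blocks * deepen_factor), 1))
# prints: 32 64 1 / 64 128 3 / 128 256 3 / 256 512 1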
+ use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + arch_ovewrite (list): Overwrite default arch settings. + Defaults to None. + spp_kernel_sizes: (tuple[int]): Sequential of kernel sizes of SPP + layers. Defaults to (5, 9, 13). + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + """ + # From left to right: + # in_channels, out_channels, num_blocks, add_identity, use_spp + arch_settings = { + 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 1024, 3, False, True]], + 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], + [256, 512, 6, True, False], [512, 768, 3, True, False], + [768, 1024, 3, False, True]] + } + + def __init__( + self, + arch: str = 'P5', + deepen_factor: float = 1.0, + widen_factor: float = 1.0, + out_indices: Sequence[int] = (2, 3, 4), + frozen_stages: int = -1, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + arch_ovewrite: dict = None, + spp_kernel_sizes: Sequence[int] = (5, 9, 13), + channel_attention: bool = True, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + norm_eval: bool = False, + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + super().__init__(init_cfg=init_cfg) + arch_setting = self.arch_settings[arch] + if arch_ovewrite: + arch_setting = arch_ovewrite + assert set(out_indices).issubset( + i for i in range(len(arch_setting) + 1)) + if frozen_stages not in range(-1, len(arch_setting) + 1): + raise ValueError('frozen_stages must be in range(-1, ' + 'len(arch_setting) + 1). 
But received ' + f'{frozen_stages}') + + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.use_depthwise = use_depthwise + self.norm_eval = norm_eval + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.stem = nn.Sequential( + ConvModule( + 3, + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=2, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor // 2), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + int(arch_setting[0][0] * widen_factor // 2), + int(arch_setting[0][0] * widen_factor), + 3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.layers = ['stem'] + + for i, (in_channels, out_channels, num_blocks, add_identity, + use_spp) in enumerate(arch_setting): + in_channels = int(in_channels * widen_factor) + out_channels = int(out_channels * widen_factor) + num_blocks = max(round(num_blocks * deepen_factor), 1) + stage = [] + conv_layer = conv( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(conv_layer) + if use_spp: + spp = SPPBottleneck( + out_channels, + out_channels, + kernel_sizes=spp_kernel_sizes, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(spp) + csp_layer = CSPLayer( + out_channels, + out_channels, + num_blocks=num_blocks, + add_identity=add_identity, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + channel_attention=channel_attention, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + stage.append(csp_layer) + self.add_module(f'stage{i + 1}', nn.Sequential(*stage)) + self.layers.append(f'stage{i + 1}') + + def _freeze_stages(self) -> None: + if self.frozen_stages >= 0: + for i in range(self.frozen_stages + 1): + m = getattr(self, self.layers[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True) -> None: + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + def forward(self, x: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]: + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/backbones/darknet.py b/head_extractor/build/lib/mmdet/models/backbones/darknet.py new file mode 100644 index 0000000000000000000000000000000000000000..1d44da1e03f04a7e0801c10e5338277cf6244ab1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/darknet.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) 2019 Western Digital Corporation or its affiliates. + +import warnings + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS + + +class ResBlock(BaseModule): + """The basic residual block used in Darknet. Each ResBlock consists of two + ConvModules and the input is added to the final output. Each ConvModule is + composed of Conv, BN, and LeakyReLU. In YoloV3 paper, the first convLayer + has half of the number of the filters as much as the second convLayer. 
The + first convLayer has filter size of 1x1 and the second one has the filter + size of 3x3. + + Args: + in_channels (int): The input channels. Must be even. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + in_channels, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + init_cfg=None): + super(ResBlock, self).__init__(init_cfg) + assert in_channels % 2 == 0 # ensure the in_channels is even + half_in_channels = in_channels // 2 + + # shortcut + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + self.conv1 = ConvModule(in_channels, half_in_channels, 1, **cfg) + self.conv2 = ConvModule( + half_in_channels, in_channels, 3, padding=1, **cfg) + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.conv2(out) + out = out + residual + + return out + + +@MODELS.register_module() +class Darknet(BaseModule): + """Darknet backbone. + + Args: + depth (int): Depth of Darknet. Currently only support 53. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmdet.models import Darknet + >>> import torch + >>> self = Darknet(depth=53) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 416, 416) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + ... 
+ (1, 256, 52, 52) + (1, 512, 26, 26) + (1, 1024, 13, 13) + """ + + # Dict(depth: (layers, channels)) + arch_settings = { + 53: ((1, 2, 8, 8, 4), ((32, 64), (64, 128), (128, 256), (256, 512), + (512, 1024))) + } + + def __init__(self, + depth=53, + out_indices=(3, 4, 5), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', negative_slope=0.1), + norm_eval=True, + pretrained=None, + init_cfg=None): + super(Darknet, self).__init__(init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for darknet') + + self.depth = depth + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.layers, self.channels = self.arch_settings[depth] + + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + self.conv1 = ConvModule(3, 32, 3, padding=1, **cfg) + + self.cr_blocks = ['conv1'] + for i, n_layers in enumerate(self.layers): + layer_name = f'conv_res_block{i + 1}' + in_c, out_c = self.channels[i] + self.add_module( + layer_name, + self.make_conv_res_block(in_c, out_c, n_layers, **cfg)) + self.cr_blocks.append(layer_name) + + self.norm_eval = norm_eval + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.cr_blocks): + cr_block = getattr(self, layer_name) + x = cr_block(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for i in range(self.frozen_stages): + m = getattr(self, self.cr_blocks[i]) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(Darknet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() + + @staticmethod + def make_conv_res_block(in_channels, + out_channels, + res_repeat, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='LeakyReLU', + negative_slope=0.1)): + """In Darknet backbone, ConvLayer is usually followed by ResBlock. This + function will make that. The Conv layers always have 3x3 filters with + stride=2. The number of the filters in Conv layer is the same as the + out channels of the ResBlock. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + res_repeat (int): The number of ResBlocks. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). 
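# --- Illustrative sketch (not part of the diff): each Darknet stage built by
# the helper documented above is one stride-2 3x3 conv followed by
# `res_repeat` ResBlocks; a ResBlock halves channels with a 1x1 conv and
# restores them with a 3x3 conv before the residual add, so shapes are kept.
import torch
from mmdet.models.backbones.darknet import Darknet

block = Darknet.make_conv_res_block(
    in_channels=32, out_channels=64, res_repeat=2)
out = block(torch.rand(1, 32, 64, 64))
assert out.shape == (1, 64, 32, 32)  # downsampled by 2, channels doubled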
+ """ + + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + model = nn.Sequential() + model.add_module( + 'conv', + ConvModule( + in_channels, out_channels, 3, stride=2, padding=1, **cfg)) + for idx in range(res_repeat): + model.add_module('res{}'.format(idx), + ResBlock(out_channels, **cfg)) + return model diff --git a/head_extractor/build/lib/mmdet/models/backbones/detectors_resnet.py b/head_extractor/build/lib/mmdet/models/backbones/detectors_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f33424fce4a933d675f1f1d3d4ad89e0173c5f9e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/detectors_resnet.py @@ -0,0 +1,353 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.logging import MMLogger +from mmengine.model import Sequential, constant_init, kaiming_init +from mmengine.runner.checkpoint import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from .resnet import BasicBlock +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + r"""Bottleneck for the ResNet backbone in `DetectoRS + `_. + + This bottleneck allows the users to specify whether to use + SAC (Switchable Atrous Convolution) and RFP (Recursive Feature Pyramid). + + Args: + inplanes (int): The number of input channels. + planes (int): The number of output channels before expansion. + rfp_inplanes (int, optional): The number of channels from RFP. + Default: None. If specified, an additional conv layer will be + added for ``rfp_feat``. Otherwise, the structure is the same as + base class. + sac (dict, optional): Dictionary to construct SAC. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + rfp_inplanes=None, + sac=None, + init_cfg=None, + **kwargs): + super(Bottleneck, self).__init__( + inplanes, planes, init_cfg=init_cfg, **kwargs) + + assert sac is None or isinstance(sac, dict) + self.sac = sac + self.with_sac = sac is not None + if self.with_sac: + self.conv2 = build_conv_layer( + self.sac, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False) + + self.rfp_inplanes = rfp_inplanes + if self.rfp_inplanes: + self.rfp_conv = build_conv_layer( + None, + self.rfp_inplanes, + planes * self.expansion, + 1, + stride=1, + bias=True) + if init_cfg is None: + self.init_cfg = dict( + type='Constant', val=0, override=dict(name='rfp_conv')) + + def rfp_forward(self, x, rfp_feat): + """The forward function that also takes the RFP features as input.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + if self.rfp_inplanes: + rfp_feat = self.rfp_conv(rfp_feat) + out = out + rfp_feat + + out = self.relu(out) + + return out + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone for RPF in detectoRS. + + The difference between this module and base class is that we pass + ``rfp_inplanes`` to the first block. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + rfp_inplanes (int, optional): The number of channels from RFP. + Default: None. If specified, an additional conv layer will be + added for ``rfp_feat``. Otherwise, the structure is the same as + base class. 
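# --- Illustrative sketch (not part of the diff): in RFP, the second pass
# feeds a pyramid feature back into each stage; `rfp_forward` above projects
# it with a 1x1 conv and adds it to the block output before the final ReLU.
import torch
import torch.nn as nn

out = torch.rand(1, 256, 32, 32)      # bottleneck output (planes * expansion)
rfp_feat = torch.rand(1, 64, 32, 32)  # feedback feature (rfp_inplanes=64)
rfp_conv = nn.Conv2d(64, 256, kernel_size=1, stride=1, bias=True)
out = torch.relu(out + rfp_conv(rfp_feat))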
+ """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + rfp_inplanes=None, + **kwargs): + self.block = block + assert downsample_first, f'downsample_first={downsample_first} is ' \ + 'not supported in DetectoRS' + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down and stride != 1: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + rfp_inplanes=rfp_inplanes, + **kwargs)) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + super(ResLayer, self).__init__(*layers) + + +@MODELS.register_module() +class DetectoRS_ResNet(ResNet): + """ResNet backbone for DetectoRS. + + Args: + sac (dict, optional): Dictionary to construct SAC (Switchable Atrous + Convolution). Default: None. + stage_with_sac (list): Which stage to use sac. Default: (False, False, + False, False). + rfp_inplanes (int, optional): The number of channels from RFP. + Default: None. If specified, an additional conv layer will be + added for ``rfp_feat``. Otherwise, the structure is the same as + base class. + output_img (bool): If ``True``, the input image will be inserted into + the starting position of output. Default: False. 
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + sac=None, + stage_with_sac=(False, False, False, False), + rfp_inplanes=None, + output_img=False, + pretrained=None, + init_cfg=None, + **kwargs): + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + self.pretrained = pretrained + if init_cfg is not None: + assert isinstance(init_cfg, dict), \ + f'init_cfg must be a dict, but got {type(init_cfg)}' + if 'type' in init_cfg: + assert init_cfg.get('type') == 'Pretrained', \ + 'Only can initialize module by loading a pretrained model' + else: + raise KeyError('`init_cfg` must contain the key "type"') + self.pretrained = init_cfg.get('checkpoint') + self.sac = sac + self.stage_with_sac = stage_with_sac + self.rfp_inplanes = rfp_inplanes + self.output_img = output_img + super(DetectoRS_ResNet, self).__init__(**kwargs) + + self.inplanes = self.stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + sac = self.sac if self.stage_with_sac[i] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, i) + else: + stage_plugins = None + planes = self.base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + sac=sac, + rfp_inplanes=rfp_inplanes if i > 0 else None, + plugins=stage_plugins) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + # In order to be properly initialized by RFP + def init_weights(self): + # Calling this method will cause parameter initialization exception + # super(DetectoRS_ResNet, self).init_weights() + + if isinstance(self.pretrained, str): + logger = MMLogger.get_current_instance() + load_checkpoint(self, self.pretrained, strict=False, logger=logger) + elif self.pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.dcn is not None: + for m in self.modules(): + if isinstance(m, Bottleneck) and hasattr( + m.conv2, 'conv_offset'): + constant_init(m.conv2.conv_offset, 0) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer`` for DetectoRS.""" + return ResLayer(**kwargs) + + def forward(self, x): + """Forward function.""" + outs = list(super(DetectoRS_ResNet, self).forward(x)) + if self.output_img: + outs.insert(0, x) + return tuple(outs) + + def rfp_forward(self, x, rfp_feats): + """Forward function for RFP.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + rfp_feat = 
rfp_feats[i] if i > 0 else None + for layer in res_layer: + x = layer.rfp_forward(x, rfp_feat) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/backbones/detectors_resnext.py b/head_extractor/build/lib/mmdet/models/backbones/detectors_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..4bbd63154bb47910e27cf6a75e4b359e050063e1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/detectors_resnext.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmdet.registry import MODELS +from .detectors_resnet import Bottleneck as _Bottleneck +from .detectors_resnet import DetectoRS_ResNet + + +class Bottleneck(_Bottleneck): + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + **kwargs): + """Bottleneck block for ResNeXt. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_sac: + self.conv2 = build_conv_layer( + self.sac, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + elif not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@MODELS.register_module() +class DetectoRS_ResNeXt(DetectoRS_ResNet): + """ResNeXt backbone for DetectoRS. + + Args: + groups (int): The number of groups in ResNeXt. + base_width (int): The base width of ResNeXt. 
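# --- Illustrative sketch (not part of the diff): the grouped-conv width
# used by the ResNeXt-style bottleneck above,
# width = floor(planes * base_width / base_channels) * groups.
import math

planes, groups, base_width, base_channels = 64, 32, 4, 64
width = math.floor(planes * (base_width / base_channels)) * groups
assert width == 128  # 4 channels per group times 32 groups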
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, groups=1, base_width=4, **kwargs): + self.groups = groups + self.base_width = base_width + super(DetectoRS_ResNeXt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + return super().make_res_layer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + **kwargs) diff --git a/head_extractor/build/lib/mmdet/models/backbones/efficientnet.py b/head_extractor/build/lib/mmdet/models/backbones/efficientnet.py new file mode 100644 index 0000000000000000000000000000000000000000..8484afe2e34e2bf8327e8aefedb968bd9a1e7792 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/efficientnet.py @@ -0,0 +1,418 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn.bricks import ConvModule, DropPath +from mmengine.model import BaseModule, Sequential + +from mmdet.registry import MODELS +from ..layers import InvertedResidual, SELayer +from ..utils import make_divisible + + +class EdgeResidual(BaseModule): + """Edge Residual Block. + + Args: + in_channels (int): The input channels of this module. + out_channels (int): The output channels of this module. + mid_channels (int): The input channels of the second convolution. + kernel_size (int): The kernel size of the first convolution. + Defaults to 3. + stride (int): The stride of the first convolution. Defaults to 1. + se_cfg (dict, optional): Config dict for se layer. Defaults to None, + which means no se layer. + with_residual (bool): Use residual connection. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to ``dict(type='BN')``. + act_cfg (dict): Config dict for activation layer. + Defaults to ``dict(type='ReLU')``. + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + init_cfg (dict | list[dict], optional): Initialization config dict. 
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_residual=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None, + **kwargs): + super(EdgeResidual, self).__init__(init_cfg=init_cfg) + assert stride in [1, 2] + self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_residual = ( + stride == 1 and in_channels == out_channels and with_residual) + + if self.with_se: + assert isinstance(se_cfg, dict) + + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.conv2 = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + out = self.conv1(out) + + if self.with_se: + out = self.se(out) + + out = self.conv2(out) + + if self.with_residual: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +def model_scaling(layer_setting, arch_setting): + """Scaling operation to the layer's parameters according to the + arch_setting.""" + # scale width + new_layer_setting = copy.deepcopy(layer_setting) + for layer_cfg in new_layer_setting: + for block_cfg in layer_cfg: + block_cfg[1] = make_divisible(block_cfg[1] * arch_setting[0], 8) + + # scale depth + split_layer_setting = [new_layer_setting[0]] + for layer_cfg in new_layer_setting[1:-1]: + tmp_index = [0] + for i in range(len(layer_cfg) - 1): + if layer_cfg[i + 1][1] != layer_cfg[i][1]: + tmp_index.append(i + 1) + tmp_index.append(len(layer_cfg)) + for i in range(len(tmp_index) - 1): + split_layer_setting.append(layer_cfg[tmp_index[i]:tmp_index[i + + 1]]) + split_layer_setting.append(new_layer_setting[-1]) + + num_of_layers = [len(layer_cfg) for layer_cfg in split_layer_setting[1:-1]] + new_layers = [ + int(math.ceil(arch_setting[1] * num)) for num in num_of_layers + ] + + merge_layer_setting = [split_layer_setting[0]] + for i, layer_cfg in enumerate(split_layer_setting[1:-1]): + if new_layers[i] <= num_of_layers[i]: + tmp_layer_cfg = layer_cfg[:new_layers[i]] + else: + tmp_layer_cfg = copy.deepcopy(layer_cfg) + [layer_cfg[-1]] * ( + new_layers[i] - num_of_layers[i]) + if tmp_layer_cfg[0][3] == 1 and i != 0: + merge_layer_setting[-1] += tmp_layer_cfg.copy() + else: + merge_layer_setting.append(tmp_layer_cfg.copy()) + merge_layer_setting.append(split_layer_setting[-1]) + + return merge_layer_setting + + +@MODELS.register_module() +class EfficientNet(BaseModule): + """EfficientNet backbone. + + Args: + arch (str): Architecture of efficientnet. Defaults to b0. + out_indices (Sequence[int]): Output from which stages. + Defaults to (6, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Defaults to 0, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). 
+ act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Defaults to False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Defaults to False. + """ + + # Parameters to build layers. + # 'b' represents the architecture of normal EfficientNet family includes + # 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'b8'. + # 'e' represents the architecture of EfficientNet-EdgeTPU including 'es', + # 'em', 'el'. + # 6 parameters are needed to construct a layer, From left to right: + # - kernel_size: The kernel size of the block + # - out_channel: The number of out_channels of the block + # - se_ratio: The sequeeze ratio of SELayer. + # - stride: The stride of the block + # - expand_ratio: The expand_ratio of the mid_channels + # - block_type: -1: Not a block, 0: InvertedResidual, 1: EdgeResidual + layer_settings = { + 'b': [[[3, 32, 0, 2, 0, -1]], + [[3, 16, 4, 1, 1, 0]], + [[3, 24, 4, 2, 6, 0], + [3, 24, 4, 1, 6, 0]], + [[5, 40, 4, 2, 6, 0], + [5, 40, 4, 1, 6, 0]], + [[3, 80, 4, 2, 6, 0], + [3, 80, 4, 1, 6, 0], + [3, 80, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0], + [5, 112, 4, 1, 6, 0]], + [[5, 192, 4, 2, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [5, 192, 4, 1, 6, 0], + [3, 320, 4, 1, 6, 0]], + [[1, 1280, 0, 1, 0, -1]] + ], + 'e': [[[3, 32, 0, 2, 0, -1]], + [[3, 24, 0, 1, 3, 1]], + [[3, 32, 0, 2, 8, 1], + [3, 32, 0, 1, 8, 1]], + [[3, 48, 0, 2, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1], + [3, 48, 0, 1, 8, 1]], + [[5, 96, 0, 2, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 96, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0], + [5, 144, 0, 1, 8, 0]], + [[5, 192, 0, 2, 8, 0], + [5, 192, 0, 1, 8, 0]], + [[1, 1280, 0, 1, 0, -1]] + ] + } # yapf: disable + + # Parameters to build different kinds of architecture. + # From left to right: scaling factor for width, scaling factor for depth, + # resolution. + arch_settings = { + 'b0': (1.0, 1.0, 224), + 'b1': (1.0, 1.1, 240), + 'b2': (1.1, 1.2, 260), + 'b3': (1.2, 1.4, 300), + 'b4': (1.4, 1.8, 380), + 'b5': (1.6, 2.2, 456), + 'b6': (1.8, 2.6, 528), + 'b7': (2.0, 3.1, 600), + 'b8': (2.2, 3.6, 672), + 'es': (1.0, 1.0, 224), + 'em': (1.0, 1.1, 240), + 'el': (1.2, 1.4, 300) + } + + def __init__(self, + arch='b0', + drop_path_rate=0., + out_indices=(6, ), + frozen_stages=0, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='Swish'), + norm_eval=False, + with_cp=False, + init_cfg=[ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + layer=['_BatchNorm', 'GroupNorm'], + val=1) + ]): + super(EfficientNet, self).__init__(init_cfg) + assert arch in self.arch_settings, \ + f'"{arch}" is not one of the arch_settings ' \ + f'({", ".join(self.arch_settings.keys())})' + self.arch_setting = self.arch_settings[arch] + self.layer_setting = self.layer_settings[arch[:1]] + for index in out_indices: + if index not in range(0, len(self.layer_setting)): + raise ValueError('the item in out_indices must in ' + f'range(0, {len(self.layer_setting)}). ' + f'But received {index}') + + if frozen_stages not in range(len(self.layer_setting) + 1): + raise ValueError('frozen_stages must be in range(0, ' + f'{len(self.layer_setting) + 1}). 
' + f'But received {frozen_stages}') + self.drop_path_rate = drop_path_rate + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.layer_setting = model_scaling(self.layer_setting, + self.arch_setting) + block_cfg_0 = self.layer_setting[0][0] + block_cfg_last = self.layer_setting[-1][0] + self.in_channels = make_divisible(block_cfg_0[1], 8) + self.out_channels = block_cfg_last[1] + self.layers = nn.ModuleList() + self.layers.append( + ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=block_cfg_0[0], + stride=block_cfg_0[3], + padding=block_cfg_0[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.make_layer() + # Avoid building unused layers in mmdetection. + if len(self.layers) < max(self.out_indices) + 1: + self.layers.append( + ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=block_cfg_last[0], + stride=block_cfg_last[3], + padding=block_cfg_last[0] // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def make_layer(self): + # Without the first and the final conv block. + layer_setting = self.layer_setting[1:-1] + + total_num_blocks = sum([len(x) for x in layer_setting]) + block_idx = 0 + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, total_num_blocks) + ] # stochastic depth decay rule + + for i, layer_cfg in enumerate(layer_setting): + # Avoid building unused layers in mmdetection. + if i > max(self.out_indices) - 1: + break + layer = [] + for i, block_cfg in enumerate(layer_cfg): + (kernel_size, out_channels, se_ratio, stride, expand_ratio, + block_type) = block_cfg + + mid_channels = int(self.in_channels * expand_ratio) + out_channels = make_divisible(out_channels, 8) + if se_ratio <= 0: + se_cfg = None + else: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmpretrain. + se_cfg = dict( + channels=mid_channels, + ratio=expand_ratio * se_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + if block_type == 1: # edge tpu + if i > 0 and expand_ratio == 3: + with_residual = False + expand_ratio = 4 + else: + with_residual = True + mid_channels = int(self.in_channels * expand_ratio) + if se_cfg is not None: + # In mmdetection, the `divisor` is deleted to align + # the logic of SELayer with mmpretrain. + se_cfg = dict( + channels=mid_channels, + ratio=se_ratio * expand_ratio, + act_cfg=(self.act_cfg, dict(type='Sigmoid'))) + block = partial(EdgeResidual, with_residual=with_residual) + else: + block = InvertedResidual + layer.append( + block( + in_channels=self.in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + drop_path_rate=dpr[block_idx], + with_cp=self.with_cp, + # In mmdetection, `with_expand_conv` is set to align + # the logic of InvertedResidual with mmpretrain. 
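+                        # (with_expand_conv is False only when
+                        # expand_ratio == 1, i.e. mid_channels equals
+                        # in_channels, as in the first MBConv stage of the
+                        # 'b' settings; InvertedResidual then skips its
+                        # expand conv)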
+ with_expand_conv=(mid_channels != self.in_channels))) + self.in_channels = out_channels + block_idx += 1 + self.layers.append(Sequential(*layer)) + + def forward(self, x): + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i in self.out_indices: + outs.append(x) + + return tuple(outs) + + def _freeze_stages(self): + for i in range(self.frozen_stages): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(EfficientNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() diff --git a/head_extractor/build/lib/mmdet/models/backbones/hourglass.py b/head_extractor/build/lib/mmdet/models/backbones/hourglass.py new file mode 100644 index 0000000000000000000000000000000000000000..bb58799f7b32138b3f58383419ddce9aa6d5ca18 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/hourglass.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from ..layers import ResLayer +from .resnet import BasicBlock + + +class HourglassModule(BaseModule): + """Hourglass Module for HourglassNet backbone. + + Generate module recursively and use BasicBlock as the base unit. + + Args: + depth (int): Depth of current HourglassModule. + stage_channels (list[int]): Feature channels of sub-modules in current + and follow-up HourglassModule. + stage_blocks (list[int]): Number of sub-modules stacked in current and + follow-up HourglassModule. + norm_cfg (ConfigType): Dictionary to construct and config norm layer. + Defaults to `dict(type='BN', requires_grad=True)` + upsample_cfg (ConfigType): Config dict for interpolate layer. + Defaults to `dict(mode='nearest')` + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. + """ + + def __init__(self, + depth: int, + stage_channels: List[int], + stage_blocks: List[int], + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + upsample_cfg: ConfigType = dict(mode='nearest'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg) + + self.depth = depth + + cur_block = stage_blocks[0] + next_block = stage_blocks[1] + + cur_channel = stage_channels[0] + next_channel = stage_channels[1] + + self.up1 = ResLayer( + BasicBlock, cur_channel, cur_channel, cur_block, norm_cfg=norm_cfg) + + self.low1 = ResLayer( + BasicBlock, + cur_channel, + next_channel, + cur_block, + stride=2, + norm_cfg=norm_cfg) + + if self.depth > 1: + self.low2 = HourglassModule(depth - 1, stage_channels[1:], + stage_blocks[1:]) + else: + self.low2 = ResLayer( + BasicBlock, + next_channel, + next_channel, + next_block, + norm_cfg=norm_cfg) + + self.low3 = ResLayer( + BasicBlock, + next_channel, + cur_channel, + cur_block, + norm_cfg=norm_cfg, + downsample_first=False) + + self.up2 = F.interpolate + self.upsample_cfg = upsample_cfg + + def forward(self, x: torch.Tensor) -> nn.Module: + """Forward function.""" + up1 = self.up1(x) + low1 = self.low1(x) + low2 = self.low2(low1) + low3 = self.low3(low2) + # Fixing `scale factor` (e.g. 2) is common for upsampling, but + # in some cases the spatial size is mismatched and error will arise. 
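+        # E.g. an odd 15x15 input keeps up1 at 15x15 but the stride-2 low
+        # branch at 8x8; upsampling by scale_factor=2 would give 16x16, so
+        # the explicit size=up1.shape[2:] branch below is taken by default.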
+ if 'scale_factor' in self.upsample_cfg: + up2 = self.up2(low3, **self.upsample_cfg) + else: + shape = up1.shape[2:] + up2 = self.up2(low3, size=shape, **self.upsample_cfg) + return up1 + up2 + + +@MODELS.register_module() +class HourglassNet(BaseModule): + """HourglassNet backbone. + + Stacked Hourglass Networks for Human Pose Estimation. + More details can be found in the `paper + `_ . + + Args: + downsample_times (int): Downsample times in a HourglassModule. + num_stacks (int): Number of HourglassModule modules stacked, + 1 for Hourglass-52, 2 for Hourglass-104. + stage_channels (Sequence[int]): Feature channel of each sub-module in a + HourglassModule. + stage_blocks (Sequence[int]): Number of sub-modules stacked in a + HourglassModule. + feat_channel (int): Feature channel of conv after a HourglassModule. + norm_cfg (norm_cfg): Dictionary to construct and config norm layer. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. + + Example: + >>> from mmdet.models import HourglassNet + >>> import torch + >>> self = HourglassNet() + >>> self.eval() + >>> inputs = torch.rand(1, 3, 511, 511) + >>> level_outputs = self.forward(inputs) + >>> for level_output in level_outputs: + ... print(tuple(level_output.shape)) + (1, 256, 128, 128) + (1, 256, 128, 128) + """ + + def __init__(self, + downsample_times: int = 5, + num_stacks: int = 2, + stage_channels: Sequence = (256, 256, 384, 384, 384, 512), + stage_blocks: Sequence = (2, 2, 2, 2, 2, 4), + feat_channel: int = 256, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + init_cfg: OptMultiConfig = None) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg) + + self.num_stacks = num_stacks + assert self.num_stacks >= 1 + assert len(stage_channels) == len(stage_blocks) + assert len(stage_channels) > downsample_times + + cur_channel = stage_channels[0] + + self.stem = nn.Sequential( + ConvModule( + 3, cur_channel // 2, 7, padding=3, stride=2, + norm_cfg=norm_cfg), + ResLayer( + BasicBlock, + cur_channel // 2, + cur_channel, + 1, + stride=2, + norm_cfg=norm_cfg)) + + self.hourglass_modules = nn.ModuleList([ + HourglassModule(downsample_times, stage_channels, stage_blocks) + for _ in range(num_stacks) + ]) + + self.inters = ResLayer( + BasicBlock, + cur_channel, + cur_channel, + num_stacks - 1, + norm_cfg=norm_cfg) + + self.conv1x1s = nn.ModuleList([ + ConvModule( + cur_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.out_convs = nn.ModuleList([ + ConvModule( + cur_channel, feat_channel, 3, padding=1, norm_cfg=norm_cfg) + for _ in range(num_stacks) + ]) + + self.remap_convs = nn.ModuleList([ + ConvModule( + feat_channel, cur_channel, 1, norm_cfg=norm_cfg, act_cfg=None) + for _ in range(num_stacks - 1) + ]) + + self.relu = nn.ReLU(inplace=True) + + def init_weights(self) -> None: + """Init module weights.""" + # Training Centripetal Model needs to reset parameters for Conv2d + super().init_weights() + for m in self.modules(): + if isinstance(m, nn.Conv2d): + m.reset_parameters() + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + """Forward function.""" + inter_feat = self.stem(x) + out_feats = [] + + for ind in range(self.num_stacks): + single_hourglass = self.hourglass_modules[ind] + out_conv = self.out_convs[ind] + + hourglass_feat = single_hourglass(inter_feat) + out_feat = out_conv(hourglass_feat) + out_feats.append(out_feat) + + if 
ind < self.num_stacks - 1: + inter_feat = self.conv1x1s[ind]( + inter_feat) + self.remap_convs[ind]( + out_feat) + inter_feat = self.inters[ind](self.relu(inter_feat)) + + return out_feats diff --git a/head_extractor/build/lib/mmdet/models/backbones/hrnet.py b/head_extractor/build/lib/mmdet/models/backbones/hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..77bd3cc7125bb7ba03cd201ab3a55174b01dde50 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/hrnet.py @@ -0,0 +1,589 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule, ModuleList, Sequential +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from .resnet import BasicBlock, Bottleneck + + +class HRModule(BaseModule): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + block_init_cfg=None, + init_cfg=None): + super(HRModule, self).__init__(init_cfg) + self.block_init_cfg = block_init_cfg + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_BLOCKS({len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_CHANNELS({len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) ' \ + f'!= NUM_INCHANNELS({len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, num_channels[branch_index] * + block.expansion)[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + + return Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, 
num_channels): + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + nn.Upsample( + scale_factor=2**(j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@MODELS.register_module() +class HRNet(BaseModule): + """HRNet backbone. + + `High-Resolution Representations for Labeling Pixels and Regions + arXiv: `_. + + Args: + extra (dict): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + + - num_modules(int): The number of HRModule in this stage. + - num_branches(int): The number of branches in the HRModule. + - block(str): The type of convolution block. + - num_blocks(tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels(tuple): The number of channels in each branch. + The length must be equal to num_branches. + in_channels (int): Number of input image channels. Default: 3. + conv_cfg (dict): Dictionary to construct and config conv layer. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: False. + multiscale_output (bool): Whether to output multi-level features + produced by multiple branches. If False, only the first level + feature will be output. Default: True. + pretrained (str, optional): Model pretrained path. Default: None. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + + Example: + >>> from mmdet.models import HRNet + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + norm_eval=True, + with_cp=False, + zero_init_residual=False, + multiscale_output=True, + pretrained=None, + init_cfg=None): + super(HRNet, self).__init__(init_cfg) + + self.pretrained = pretrained + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + # Assert configurations of 4 stages are in extra + assert 'stage1' in extra and 'stage2' in extra \ + and 'stage3' in extra and 'stage4' in extra + # Assert whether the length of `num_blocks` and `num_channels` are + # equal to `num_branches` + for i in range(4): + cfg = extra[f'stage{i + 1}'] + assert len(cfg['num_blocks']) == cfg['num_branches'] and \ + len(cfg['num_channels']) == cfg['num_branches'] + + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * block.expansion + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + + # stage 2 + self.stage2_cfg = self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + 
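+        # num_channels holds per-branch widths before expansion; they are
+        # multiplied by block.expansion (1 for BasicBlock, 4 for Bottleneck)
+        # just below.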
block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multiscale_output=multiscale_output) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + layers.append( + block( + inplanes, + planes, + stride, + downsample=downsample, + with_cp=self.with_cp, + 
norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg, + )) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg)) + + return Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + block_init_cfg=block_init_cfg)) + + return Sequential(*hr_modules), in_channels + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super(HRNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/head_extractor/build/lib/mmdet/models/backbones/mobilenet_v2.py b/head_extractor/build/lib/mmdet/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..a4fd0519ad4d5106e1acb82624d6393052596ce8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/mobilenet_v2.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from ..layers import InvertedResidual +from ..utils import make_divisible + + +@MODELS.register_module() +class MobileNetV2(BaseModule): + """MobileNetV2 backbone. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. 
+ out_indices (Sequence[int], optional): Output from which stages. + Default: (1, 2, 4, 7). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + # Parameters to build layers. 4 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks, stride. + arch_settings = [[1, 16, 1, 1], [6, 24, 2, 2], [6, 32, 3, 2], + [6, 64, 4, 2], [6, 96, 3, 1], [6, 160, 3, 2], + [6, 320, 1, 1]] + + def __init__(self, + widen_factor=1., + out_indices=(1, 2, 4, 7), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False, + pretrained=None, + init_cfg=None): + super(MobileNetV2, self).__init__(init_cfg) + + self.pretrained = pretrained + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + self.widen_factor = widen_factor + self.out_indices = out_indices + if not set(out_indices).issubset(set(range(0, 8))): + raise ValueError('out_indices must be a subset of range' + f'(0, 8). But received {out_indices}') + + if frozen_stages not in range(-1, 8): + raise ValueError('frozen_stages must be in range(-1, 8). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks, stride = layer_cfg + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + if widen_factor > 1.0: + self.out_channel = int(1280 * widen_factor) + else: + self.out_channel = 1280 + + layer = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channel, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.add_module('conv2', layer) + self.layers.append('conv2') + + def make_layer(self, out_channels, num_blocks, stride, expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. Default: 6. + """ + layers = [] + for i in range(num_blocks): + if i >= 1: + stride = 1 + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + mid_channels=int(round(self.in_channels * expand_ratio)), + stride=stride, + with_expand_conv=expand_ratio != 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + frozen.""" + super(MobileNetV2, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/head_extractor/build/lib/mmdet/models/backbones/pvt.py b/head_extractor/build/lib/mmdet/models/backbones/pvt.py new file mode 100644 index 0000000000000000000000000000000000000000..8b250f63c1b22f21a892faf4c41ccc2d20e83e13 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/pvt.py @@ -0,0 +1,665 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
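+# This module implements the PVT v1/v2 backbones: the MixFFN,
+# SpatialReductionAttention, PVTEncoderLayer and AbsolutePositionEmbedding
+# building blocks, the PyramidVisionTransformer(V2) classes, and a
+# pvt_convert() helper that remaps official pre-trained checkpoints.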
+import math +import warnings +from collections import OrderedDict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import MultiheadAttention +from mmengine.logging import MMLogger +from mmengine.model import (BaseModule, ModuleList, Sequential, constant_init, + normal_init, trunc_normal_init) +from mmengine.model.weight_init import trunc_normal_ +from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict +from torch.nn.modules.utils import _pair as to_2tuple + +from mmdet.registry import MODELS +from ..layers import PatchEmbed, nchw_to_nlc, nlc_to_nchw + + +class MixFFN(BaseModule): + """An implementation of MixFFN of PVT. + + The differences between MixFFN & FFN: + 1. Use 1X1 Conv to replace Linear layer. + 2. Introduce 3X3 Depth-wise Conv to encode positional information. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. + feedforward_channels (int): The hidden dimension of FFNs. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='GELU'). + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + Default: None. + use_conv (bool): If True, add 3x3 DWConv between two Linear layers. + Defaults: False. + init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + feedforward_channels, + act_cfg=dict(type='GELU'), + ffn_drop=0., + dropout_layer=None, + use_conv=False, + init_cfg=None): + super(MixFFN, self).__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.act_cfg = act_cfg + activate = build_activation_layer(act_cfg) + + in_channels = embed_dims + fc1 = Conv2d( + in_channels=in_channels, + out_channels=feedforward_channels, + kernel_size=1, + stride=1, + bias=True) + if use_conv: + # 3x3 depth wise conv to provide positional encode information + dw_conv = Conv2d( + in_channels=feedforward_channels, + out_channels=feedforward_channels, + kernel_size=3, + stride=1, + padding=(3 - 1) // 2, + bias=True, + groups=feedforward_channels) + fc2 = Conv2d( + in_channels=feedforward_channels, + out_channels=in_channels, + kernel_size=1, + stride=1, + bias=True) + drop = nn.Dropout(ffn_drop) + layers = [fc1, activate, drop, fc2, drop] + if use_conv: + layers.insert(1, dw_conv) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + + def forward(self, x, hw_shape, identity=None): + out = nlc_to_nchw(x, hw_shape) + out = self.layers(out) + out = nchw_to_nlc(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +class SpatialReductionAttention(MultiheadAttention): + """An implementation of Spatial Reduction Attention of PVT. + + This module is modified from MultiheadAttention which is a module from + mmcv.cnn.bricks.transformer. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. 
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. Default: None. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: False. + qkv_bias (bool): enable bias for qkv if True. Default: True. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention of PVT. Default: 1. + init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=None, + batch_first=True, + qkv_bias=True, + norm_cfg=dict(type='LN'), + sr_ratio=1, + init_cfg=None): + super().__init__( + embed_dims, + num_heads, + attn_drop, + proj_drop, + batch_first=batch_first, + dropout_layer=dropout_layer, + bias=qkv_bias, + init_cfg=init_cfg) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = Conv2d( + in_channels=embed_dims, + out_channels=embed_dims, + kernel_size=sr_ratio, + stride=sr_ratio) + # The ret[0] of build_norm_layer is norm name. + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + + # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa + from mmdet import digit_version, mmcv_version + if mmcv_version < digit_version('1.3.17'): + warnings.warn('The legacy version of forward function in' + 'SpatialReductionAttention is deprecated in' + 'mmcv>=1.3.17 and will no longer support in the' + 'future. Please upgrade your mmcv.') + self.forward = self.legacy_forward + + def forward(self, x, hw_shape, identity=None): + + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_queries, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_queries, embed_dims) to num_queries_first + # (num_queries ,batch, embed_dims), and recover ``attn_output`` + # from num_queries_first to batch_first. + if self.batch_first: + x_q = x_q.transpose(0, 1) + x_kv = x_kv.transpose(0, 1) + + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + def legacy_forward(self, x, hw_shape, identity=None): + """multi head attention forward in mmcv version < 1.3.17.""" + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] + + return identity + self.dropout_layer(self.proj_drop(out)) + + +class PVTEncoderLayer(BaseModule): + """Implements one encoder layer in PVT. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed. + after the feed forward layer. Default: 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): stochastic depth rate. Default: 0.0. + qkv_bias (bool): enable bias for qkv if True. + Default: True. + act_cfg (dict): The activation config for FFNs. 
+ Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Spatial Reduction + Attention of PVT. Default: 1. + use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. + Default: False. + init_cfg (dict, optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + sr_ratio=1, + use_conv_ffn=False, + init_cfg=None): + super(PVTEncoderLayer, self).__init__(init_cfg=init_cfg) + + # The ret[0] of build_norm_layer is norm name. + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.attn = SpatialReductionAttention( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + sr_ratio=sr_ratio) + + # The ret[0] of build_norm_layer is norm name. + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.ffn = MixFFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + use_conv=use_conv_ffn, + act_cfg=act_cfg) + + def forward(self, x, hw_shape): + x = self.attn(self.norm1(x), hw_shape, identity=x) + x = self.ffn(self.norm2(x), hw_shape, identity=x) + + return x + + +class AbsolutePositionEmbedding(BaseModule): + """An implementation of the absolute position embedding in PVT. + + Args: + pos_shape (int): The shape of the absolute position embedding. + pos_dim (int): The dimension of the absolute position embedding. + drop_rate (float): Probability of an element to be zeroed. + Default: 0.0. + """ + + def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(pos_shape, int): + pos_shape = to_2tuple(pos_shape) + elif isinstance(pos_shape, tuple): + if len(pos_shape) == 1: + pos_shape = to_2tuple(pos_shape[0]) + assert len(pos_shape) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pos_shape)}' + self.pos_shape = pos_shape + self.pos_dim = pos_dim + + self.pos_embed = nn.Parameter( + torch.zeros(1, pos_shape[0] * pos_shape[1], pos_dim)) + self.drop = nn.Dropout(p=drop_rate) + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + + def resize_pos_embed(self, pos_embed, input_shape, mode='bilinear'): + """Resize pos_embed weights. + + Resize pos_embed using bilinear interpolate method. + + Args: + pos_embed (torch.Tensor): Position embedding weights. + input_shape (tuple): Tuple for (downsampled input image height, + downsampled input image width). + mode (str): Algorithm used for upsampling: + ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | + ``'trilinear'``. Default: ``'bilinear'``. + + Return: + torch.Tensor: The resized pos_embed of shape [B, L_new, C]. 
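+
+        Example:
+            >>> # Illustrative values (assumed, not from a config): a 7x7
+            >>> # grid of 64-d position embeddings resized to 14x14.
+            >>> ape = AbsolutePositionEmbedding(pos_shape=7, pos_dim=64)
+            >>> ape.resize_pos_embed(ape.pos_embed, (14, 14)).shape
+            torch.Size([1, 196, 64])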
+ """ + assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' + pos_h, pos_w = self.pos_shape + pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] + pos_embed_weight = pos_embed_weight.reshape( + 1, pos_h, pos_w, self.pos_dim).permute(0, 3, 1, 2).contiguous() + pos_embed_weight = F.interpolate( + pos_embed_weight, size=input_shape, mode=mode) + pos_embed_weight = torch.flatten(pos_embed_weight, + 2).transpose(1, 2).contiguous() + pos_embed = pos_embed_weight + + return pos_embed + + def forward(self, x, hw_shape, mode='bilinear'): + pos_embed = self.resize_pos_embed(self.pos_embed, hw_shape, mode) + return self.drop(x + pos_embed) + + +@MODELS.register_module() +class PyramidVisionTransformer(BaseModule): + """Pyramid Vision Transformer (PVT) + + Implementation of `Pyramid Vision Transformer: A Versatile Backbone for + Dense Prediction without Convolutions + `_. + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): Embedding dimension. Default: 64. + num_stags (int): The num of stages. Default: 4. + num_layers (Sequence[int]): The layer number of each transformer encode + layer. Default: [3, 4, 6, 3]. + num_heads (Sequence[int]): The attention heads of each transformer + encode layer. Default: [1, 2, 5, 8]. + patch_sizes (Sequence[int]): The patch_size of each patch embedding. + Default: [4, 2, 2, 2]. + strides (Sequence[int]): The stride of each patch embedding. + Default: [4, 2, 2, 2]. + paddings (Sequence[int]): The padding of each patch embedding. + Default: [0, 0, 0, 0]. + sr_ratios (Sequence[int]): The spatial reduction rate of each + transformer encode layer. Default: [8, 4, 2, 1]. + out_indices (Sequence[int] | int): Output from which stages. + Default: (0, 1, 2, 3). + mlp_ratios (Sequence[int]): The ratio of the mlp hidden dim to the + embedding dim of each transformer encode layer. + Default: [8, 8, 4, 4]. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: True. + use_conv_ffn (bool): If True, use Convolutional FFN to replace FFN. + Default: False. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + pretrained (str, optional): model pretrained path. Default: None. + convert_weights (bool): The flag indicates whether the + pre-trained model is from the original repo. We may need + to convert some keys to make it compatible. + Default: True. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + pretrain_img_size=224, + in_channels=3, + embed_dims=64, + num_stages=4, + num_layers=[3, 4, 6, 3], + num_heads=[1, 2, 5, 8], + patch_sizes=[4, 2, 2, 2], + strides=[4, 2, 2, 2], + paddings=[0, 0, 0, 0], + sr_ratios=[8, 4, 2, 1], + out_indices=(0, 1, 2, 3), + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + use_abs_pos_embed=True, + norm_after_stage=False, + use_conv_ffn=False, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + pretrained=None, + convert_weights=True, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.convert_weights = convert_weights + if isinstance(pretrain_img_size, int): + pretrain_img_size = to_2tuple(pretrain_img_size) + elif isinstance(pretrain_img_size, tuple): + if len(pretrain_img_size) == 1: + pretrain_img_size = to_2tuple(pretrain_img_size[0]) + assert len(pretrain_img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pretrain_img_size)}' + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + self.init_cfg = init_cfg + else: + raise TypeError('pretrained must be a str or None') + + self.embed_dims = embed_dims + + self.num_stages = num_stages + self.num_layers = num_layers + self.num_heads = num_heads + self.patch_sizes = patch_sizes + self.strides = strides + self.sr_ratios = sr_ratios + assert num_stages == len(num_layers) == len(num_heads) \ + == len(patch_sizes) == len(strides) == len(sr_ratios) + + self.out_indices = out_indices + assert max(out_indices) < self.num_stages + self.pretrained = pretrained + + # transformer encoder + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(num_layers)) + ] # stochastic num_layer decay rule + + cur = 0 + self.layers = ModuleList() + for i, num_layer in enumerate(num_layers): + embed_dims_i = embed_dims * num_heads[i] + patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims_i, + kernel_size=patch_sizes[i], + stride=strides[i], + padding=paddings[i], + bias=True, + norm_cfg=norm_cfg) + + layers = ModuleList() + if use_abs_pos_embed: + pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1]) + pos_embed = AbsolutePositionEmbedding( + pos_shape=pos_shape, + pos_dim=embed_dims_i, + drop_rate=drop_rate) + layers.append(pos_embed) + layers.extend([ + PVTEncoderLayer( + embed_dims=embed_dims_i, + num_heads=num_heads[i], + feedforward_channels=mlp_ratios[i] * embed_dims_i, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[cur + idx], + qkv_bias=qkv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + sr_ratio=sr_ratios[i], + use_conv_ffn=use_conv_ffn) for idx in range(num_layer) + ]) + in_channels = embed_dims_i + # The ret[0] of build_norm_layer is norm name. 
+ if norm_after_stage: + norm = build_norm_layer(norm_cfg, embed_dims_i)[1] + else: + norm = nn.Identity() + self.layers.append(ModuleList([patch_embed, layers, norm])) + cur += num_layer + + def init_weights(self): + logger = MMLogger.get_current_instance() + if self.init_cfg is None: + logger.warn(f'No pre-trained weights for ' + f'{self.__class__.__name__}, ' + f'training start from scratch') + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init(m, 0, math.sqrt(2.0 / fan_out)) + elif isinstance(m, AbsolutePositionEmbedding): + m.init_weights() + else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + checkpoint = CheckpointLoader.load_checkpoint( + self.init_cfg.checkpoint, logger=logger, map_location='cpu') + logger.warn(f'Load pre-trained model for ' + f'{self.__class__.__name__} from original repo') + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + if self.convert_weights: + # Because pvt backbones are not supported by mmpretrain, + # so we need to convert pre-trained weights to match this + # implementation. + state_dict = pvt_convert(state_dict) + load_state_dict(self, state_dict, strict=False, logger=logger) + + def forward(self, x): + outs = [] + + for i, layer in enumerate(self.layers): + x, hw_shape = layer[0](x) + + for block in layer[1]: + x = block(x, hw_shape) + x = layer[2](x) + x = nlc_to_nchw(x, hw_shape) + if i in self.out_indices: + outs.append(x) + + return outs + + +@MODELS.register_module() +class PyramidVisionTransformerV2(PyramidVisionTransformer): + """Implementation of `PVTv2: Improved Baselines with Pyramid Vision + Transformer `_.""" + + def __init__(self, **kwargs): + super(PyramidVisionTransformerV2, self).__init__( + patch_sizes=[7, 3, 3, 3], + paddings=[3, 1, 1, 1], + use_abs_pos_embed=False, + norm_after_stage=True, + use_conv_ffn=True, + **kwargs) + + +def pvt_convert(ckpt): + new_ckpt = OrderedDict() + # Process the concat between q linear weights and kv linear weights + use_abs_pos_embed = False + use_conv_ffn = False + for k in ckpt.keys(): + if k.startswith('pos_embed'): + use_abs_pos_embed = True + if k.find('dwconv') >= 0: + use_conv_ffn = True + for k, v in ckpt.items(): + if k.startswith('head'): + continue + if k.startswith('norm.'): + continue + if k.startswith('cls_token'): + continue + if k.startswith('pos_embed'): + stage_i = int(k.replace('pos_embed', '')) + new_k = k.replace(f'pos_embed{stage_i}', + f'layers.{stage_i - 1}.1.0.pos_embed') + if stage_i == 4 and v.size(1) == 50: # 1 (cls token) + 7 * 7 + new_v = v[:, 1:, :] # remove cls token + else: + new_v = v + elif k.startswith('patch_embed'): + stage_i = int(k.split('.')[0].replace('patch_embed', '')) + new_k = k.replace(f'patch_embed{stage_i}', + f'layers.{stage_i - 1}.0') + new_v = v + if 'proj.' in new_k: + new_k = new_k.replace('proj.', 'projection.') + elif k.startswith('block'): + stage_i = int(k.split('.')[0].replace('block', '')) + layer_i = int(k.split('.')[1]) + new_layer_i = layer_i + use_abs_pos_embed + new_k = k.replace(f'block{stage_i}.{layer_i}', + f'layers.{stage_i - 1}.1.{new_layer_i}') + new_v = v + if 'attn.q.' 
in new_k: + sub_item_k = k.replace('q.', 'kv.') + new_k = new_k.replace('q.', 'attn.in_proj_') + new_v = torch.cat([v, ckpt[sub_item_k]], dim=0) + elif 'attn.kv.' in new_k: + continue + elif 'attn.proj.' in new_k: + new_k = new_k.replace('proj.', 'attn.out_proj.') + elif 'attn.sr.' in new_k: + new_k = new_k.replace('sr.', 'sr.') + elif 'mlp.' in new_k: + string = f'{new_k}-' + new_k = new_k.replace('mlp.', 'ffn.layers.') + if 'fc1.weight' in new_k or 'fc2.weight' in new_k: + new_v = v.reshape((*v.shape, 1, 1)) + new_k = new_k.replace('fc1.', '0.') + new_k = new_k.replace('dwconv.dwconv.', '1.') + if use_conv_ffn: + new_k = new_k.replace('fc2.', '4.') + else: + new_k = new_k.replace('fc2.', '3.') + string += f'{new_k} {v.shape}-{new_v.shape}' + elif k.startswith('norm'): + stage_i = int(k[4]) + new_k = k.replace(f'norm{stage_i}', f'layers.{stage_i - 1}.2') + new_v = v + else: + new_k = k + new_v = v + new_ckpt[new_k] = new_v + + return new_ckpt diff --git a/head_extractor/build/lib/mmdet/models/backbones/regnet.py b/head_extractor/build/lib/mmdet/models/backbones/regnet.py new file mode 100644 index 0000000000000000000000000000000000000000..55d3ce075f0cec68de4537a71ed569151d684562 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/regnet.py @@ -0,0 +1,356 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmdet.registry import MODELS +from .resnet import ResNet +from .resnext import Bottleneck + + +@MODELS.register_module() +class RegNet(ResNet): + """RegNet backbone. + + More details can be found in `paper `_ . + + Args: + arch (dict): The parameter of RegNets. + + - w0 (int): initial width + - wa (float): slope of width + - wm (float): quantization parameter to quantize the width + - depth (int): depth of the backbone + - group_w (int): width of group + - bot_mul (float): bottleneck ratio, i.e. expansion of bottleneck. + strides (Sequence[int]): Strides of the first block of each stage. + base_channels (int): Base channels after stem layer. + in_channels (int): Number of input image channels. Default: 3. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmdet.models import RegNet + >>> import torch + >>> self = RegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0)) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + arch_settings = { + 'regnetx_400mf': + dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0), + 'regnetx_800mf': + dict(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16, bot_mul=1.0), + 'regnetx_1.6gf': + dict(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18, bot_mul=1.0), + 'regnetx_3.2gf': + dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0), + 'regnetx_4.0gf': + dict(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23, bot_mul=1.0), + 'regnetx_6.4gf': + dict(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17, bot_mul=1.0), + 'regnetx_8.0gf': + dict(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23, bot_mul=1.0), + 'regnetx_12gf': + dict(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, bot_mul=1.0), + } + + def __init__(self, + arch, + in_channels=3, + stem_channels=32, + base_channels=32, + strides=(2, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + zero_init_residual=True, + pretrained=None, + init_cfg=None): + super(ResNet, self).__init__(init_cfg) + + # Generate RegNet parameters first + if isinstance(arch, str): + assert arch in self.arch_settings, \ + f'"arch": "{arch}" is not one of the' \ + ' arch_settings' + arch = self.arch_settings[arch] + elif not isinstance(arch, dict): + raise ValueError('Expect "arch" to be either a string ' + f'or a dict, got {type(arch)}') + + widths, num_stages = self.generate_regnet( + arch['w0'], + arch['wa'], + arch['wm'], + arch['depth'], + ) + # Convert to per stage format + stage_widths, stage_blocks = self.get_stages_from_blocks(widths) + # Generate group widths and bot muls + group_widths = [arch['group_w'] for _ in range(num_stages)] + self.bottleneck_ratio = [arch['bot_mul'] for _ in range(num_stages)] + # Adjust the compatibility of stage_widths and group_widths + stage_widths, group_widths = self.adjust_width_group( + stage_widths, self.bottleneck_ratio, group_widths) + + # Group params by stage + self.stage_widths = stage_widths + self.group_widths = group_widths + self.depth = sum(stage_blocks) + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.zero_init_residual = zero_init_residual + self.block = Bottleneck + expansion_bak = self.block.expansion + self.block.expansion = 1 + self.stage_blocks = stage_blocks[:num_stages] + + self._make_stem_layer(in_channels, stem_channels) + + block_init_cfg = None + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use 
"init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + if self.zero_init_residual: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + else: + raise TypeError('pretrained must be a str or None') + + self.inplanes = stem_channels + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = self.strides[i] + dilation = self.dilations[i] + group_width = self.group_widths[i] + width = int(round(self.stage_widths[i] * self.bottleneck_ratio[i])) + stage_groups = width // group_width + + dcn = self.dcn if self.stage_with_dcn[i] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, i) + else: + stage_plugins = None + + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=self.stage_widths[i], + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + plugins=stage_plugins, + groups=stage_groups, + base_width=group_width, + base_channels=self.stage_widths[i], + init_cfg=block_init_cfg) + self.inplanes = self.stage_widths[i] + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = stage_widths[-1] + self.block.expansion = expansion_bak + + def _make_stem_layer(self, in_channels, base_channels): + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + base_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, base_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + + def generate_regnet(self, + initial_width, + width_slope, + width_parameter, + depth, + divisor=8): + """Generates per block width from RegNet parameters. + + Args: + initial_width ([int]): Initial width of the backbone + width_slope ([float]): Slope of the quantized linear function + width_parameter ([int]): Parameter used to quantize the width. + depth ([int]): Depth of the backbone. + divisor (int, optional): The divisor of channels. Defaults to 8. + + Returns: + list, int: return a list of widths of each stage and the number \ + of stages + """ + assert width_slope >= 0 + assert initial_width > 0 + assert width_parameter > 1 + assert initial_width % divisor == 0 + widths_cont = np.arange(depth) * width_slope + initial_width + ks = np.round( + np.log(widths_cont / initial_width) / np.log(width_parameter)) + widths = initial_width * np.power(width_parameter, ks) + widths = np.round(np.divide(widths, divisor)) * divisor + num_stages = len(np.unique(widths)) + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages + + @staticmethod + def quantize_float(number, divisor): + """Converts a float to closest non-zero int divisible by divisor. + + Args: + number (int): Original number to be quantized. + divisor (int): Divisor used to quantize the number. + + Returns: + int: quantized number that is divisible by devisor. 
+ """ + return int(round(number / divisor) * divisor) + + def adjust_width_group(self, widths, bottleneck_ratio, groups): + """Adjusts the compatibility of widths and groups. + + Args: + widths (list[int]): Width of each stage. + bottleneck_ratio (float): Bottleneck ratio. + groups (int): number of groups in each stage + + Returns: + tuple(list): The adjusted widths and groups of each stage. + """ + bottleneck_width = [ + int(w * b) for w, b in zip(widths, bottleneck_ratio) + ] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_width)] + bottleneck_width = [ + self.quantize_float(w_bot, g) + for w_bot, g in zip(bottleneck_width, groups) + ] + widths = [ + int(w_bot / b) + for w_bot, b in zip(bottleneck_width, bottleneck_ratio) + ] + return widths, groups + + def get_stages_from_blocks(self, widths): + """Gets widths/stage_blocks of network at each stage. + + Args: + widths (list[int]): Width in each stage. + + Returns: + tuple(list): width and depth of each stage + """ + width_diff = [ + width != width_prev + for width, width_prev in zip(widths + [0], [0] + widths) + ] + stage_widths = [ + width for width, diff in zip(widths, width_diff[:-1]) if diff + ] + stage_blocks = np.diff([ + depth for depth, diff in zip(range(len(width_diff)), width_diff) + if diff + ]).tolist() + return stage_widths, stage_blocks + + def forward(self, x): + """Forward function.""" + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/backbones/res2net.py b/head_extractor/build/lib/mmdet/models/backbones/res2net.py new file mode 100644 index 0000000000000000000000000000000000000000..958fc88465c6769cb4c50907c92335331e8b7834 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/res2net.py @@ -0,0 +1,327 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import Sequential + +from mmdet.registry import MODELS +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottle2neck(_Bottleneck): + expansion = 4 + + def __init__(self, + inplanes, + planes, + scales=4, + base_width=26, + base_channels=64, + stage_type='normal', + **kwargs): + """Bottle2neck block for Res2Net. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottle2neck, self).__init__(inplanes, planes, **kwargs) + assert scales > 1, 'Res2Net degenerates to ResNet when scales = 1.' 
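+        # Illustrative arithmetic: for planes=64 with the default
+        # base_width=26 and base_channels=64, each scale branch gets
+        # width = floor(64 * 26 / 64) = 26 channels, so conv1 below expands
+        # its input to width * scales = 104 channels before the hierarchical
+        # 3x3 convolutions re-split it.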
+ width = int(math.floor(self.planes * (base_width / base_channels))) + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width * scales, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width * scales, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + + if stage_type == 'stage' and self.conv2_stride != 1: + self.pool = nn.AvgPool2d( + kernel_size=3, stride=self.conv2_stride, padding=1) + convs = [] + bns = [] + + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + for i in range(scales - 1): + convs.append( + build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False)) + bns.append( + build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1]) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + for i in range(scales - 1): + convs.append( + build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + bias=False)) + bns.append( + build_norm_layer(self.norm_cfg, width, postfix=i + 1)[1]) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width * scales, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.stage_type = stage_type + self.scales = scales + self.width = width + delattr(self, 'conv2') + delattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + spx = torch.split(out, self.width, 1) + sp = self.convs[0](spx[0].contiguous()) + sp = self.relu(self.bns[0](sp)) + out = sp + for i in range(1, self.scales - 1): + if self.stage_type == 'stage': + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp.contiguous()) + sp = self.relu(self.bns[i](sp)) + out = torch.cat((out, sp), 1) + + if self.stage_type == 'normal' or self.conv2_stride == 1: + out = torch.cat((out, spx[self.scales - 1]), 1) + elif self.stage_type == 'stage': + out = torch.cat((out, self.pool(spx[self.scales - 1])), 1) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Res2Layer(Sequential): + """Res2Layer to build Res2Net style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. 
Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottle2neck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + scales (int): Scales used in Res2Net. Default: 4 + base_width (int): Basic width of each scale. Default: 26 + """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + scales=4, + base_width=26, + **kwargs): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False), + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=1, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1], + ) + + layers = [] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + scales=scales, + base_width=base_width, + stage_type='stage', + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + scales=scales, + base_width=base_width, + **kwargs)) + super(Res2Layer, self).__init__(*layers) + + +@MODELS.register_module() +class Res2Net(ResNet): + """Res2Net backbone. + + Args: + scales (int): Scales used in Res2Net. Default: 4 + base_width (int): Basic width of each scale. Default: 26 + depth (int): Depth of res2net, from {50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Res2net stages. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottle2neck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - position (str, required): Position inside block to insert + plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. 
+            Default: None
+
+    Example:
+        >>> from mmdet.models import Res2Net
+        >>> import torch
+        >>> self = Res2Net(depth=50, scales=4, base_width=26)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)
+    """
+
+    arch_settings = {
+        50: (Bottle2neck, (3, 4, 6, 3)),
+        101: (Bottle2neck, (3, 4, 23, 3)),
+        152: (Bottle2neck, (3, 8, 36, 3))
+    }
+
+    def __init__(self,
+                 scales=4,
+                 base_width=26,
+                 style='pytorch',
+                 deep_stem=True,
+                 avg_down=True,
+                 pretrained=None,
+                 init_cfg=None,
+                 **kwargs):
+        self.scales = scales
+        self.base_width = base_width
+        super(Res2Net, self).__init__(
+            style='pytorch',
+            deep_stem=True,
+            avg_down=True,
+            pretrained=pretrained,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def make_res_layer(self, **kwargs):
+        return Res2Layer(
+            scales=self.scales,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            **kwargs)
diff --git a/head_extractor/build/lib/mmdet/models/backbones/resnest.py b/head_extractor/build/lib/mmdet/models/backbones/resnest.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4466c4cc416237bee1f870b52e3c20a849c5a60
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/backbones/resnest.py
@@ -0,0 +1,322 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from ..layers import ResLayer
+from .resnet import Bottleneck as _Bottleneck
+from .resnet import ResNetV1d
+
+
+class RSoftmax(nn.Module):
+    """Radix Softmax module in ``SplitAttentionConv2d``.
+
+    Args:
+        radix (int): Radix of input.
+        groups (int): Groups of input.
+    """
+
+    def __init__(self, radix, groups):
+        super().__init__()
+        self.radix = radix
+        self.groups = groups
+
+    def forward(self, x):
+        batch = x.size(0)
+        if self.radix > 1:
+            x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2)
+            x = F.softmax(x, dim=1)
+            x = x.reshape(batch, -1)
+        else:
+            x = torch.sigmoid(x)
+        return x
+
+
+class SplitAttentionConv2d(BaseModule):
+    """Split-Attention Conv2d in ResNeSt.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        channels (int): Number of intermediate channels.
+        kernel_size (int | tuple[int]): Size of the convolution kernel.
+        stride (int | tuple[int]): Stride of the convolution.
+        padding (int | tuple[int]): Zero-padding added to both sides of
+            the input.
+        dilation (int | tuple[int]): Spacing between kernel elements.
+        groups (int): Number of blocked connections from input channels to
+            output channels. Same as nn.Conv2d.
+        radix (int): Radix of SplitAttentionConv2d. Default: 2
+        reduction_factor (int): Reduction factor of inter_channels. Default: 4.
+        conv_cfg (dict): Config dict for convolution layer. Default: None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        dcn (dict): Config dict for DCN. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None + """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + init_cfg=None): + super(SplitAttentionConv2d, self).__init__(init_cfg) + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.with_dcn = dcn is not None + self.dcn = dcn + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_dcn and not fallback_on_stride: + assert conv_cfg is None, 'conv_cfg must be None for DCN' + conv_cfg = dcn + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + # To be consistent with original implementation, starting from 0 + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + """nn.Module: the normalization layer named "norm0" """ + return getattr(self, self.norm0_name) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + batch = x.size(0) + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + inplane (int): Input planes of this block. + planes (int): Middle planes of this block. + groups (int): Groups of conv2. + base_width (int): Base of width in terms of base channels. Default: 4. + base_channels (int): Base of channels for calculating width. + Default: 64. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Key word arguments for base class. 
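+
+    Note (illustrative): with ``groups=1`` the split-attention conv runs at
+    ``width = planes`` channels; for ``groups > 1`` the ResNeXt-style width
+    ``floor(planes * base_width / base_channels) * groups`` is used instead,
+    matching the computation in ``__init__`` below.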
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + """Bottleneck block for ResNeSt.""" + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.with_modulated_dcn = False + self.conv2 = SplitAttentionConv2d( + width, + width, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=self.dcn) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNeSt(ResNetV1d): + """ResNeSt backbone. + + Args: + groups (int): Number of groups of Bottleneck. Default: 1 + base_width (int): Base width of Bottleneck. Default: 4 + radix (int): Radix of SplitAttentionConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Keyword arguments for ResNet. 
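+
+    Example (an illustrative sketch; the stage shapes mirror the Res2Net
+    example above, since ResNeSt keeps the same stage strides):
+        >>> from mmdet.models import ResNeSt
+        >>> import torch
+        >>> self = ResNeSt(depth=50, radix=2, reduction_factor=4)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)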
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)), + 200: (Bottleneck, (3, 24, 36, 3)) + } + + def __init__(self, + groups=1, + base_width=4, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + self.groups = groups + self.base_width = base_width + self.radix = radix + self.reduction_factor = reduction_factor + self.avg_down_stride = avg_down_stride + super(ResNeSt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + radix=self.radix, + reduction_factor=self.reduction_factor, + avg_down_stride=self.avg_down_stride, + **kwargs) diff --git a/head_extractor/build/lib/mmdet/models/backbones/resnet.py b/head_extractor/build/lib/mmdet/models/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1d6f48f94f286e3c5e3179f752a7b36ea77c0d45 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/resnet.py @@ -0,0 +1,672 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmdet.registry import MODELS +from ..layers import ResLayer + + +class BasicBlock(BaseModule): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + super(BasicBlock, self).__init__(init_cfg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' 
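+
+        # build_norm_layer returns a (name, module) pair; the module is
+        # registered under that generated name (e.g. 'bn1' for BN) and
+        # re-exposed via the `norm1`/`norm2` properties below, so checkpoint
+        # keys stay stable for any norm type configured in norm_cfg.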
+ + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(BaseModule): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + """Bottleneck block for ResNet. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. 
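+
+        For example (illustrative): with the default style='pytorch' and
+        stride=2, conv1_stride is 1 and conv2_stride is 2, so the spatial
+        downsampling happens in the 3x3 convolution.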
+ """ + super(Bottleneck, self).__init__(init_cfg) + assert style in ['pytorch', 'caffe'] + assert dcn is None or isinstance(dcn, dict) + assert plugins is None or isinstance(plugins, list) + if plugins is not None: + allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] + assert all(p['position'] in allowed_position for p in plugins) + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.dcn = dcn + self.with_dcn = dcn is not None + self.plugins = plugins + self.with_plugins = plugins is not None + + if self.with_plugins: + # collect plugins for conv1/conv2/conv3 + self.after_conv1_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv1' + ] + self.after_conv2_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv2' + ] + self.after_conv3_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv3' + ] + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + dcn, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + if self.with_plugins: + self.after_conv1_plugin_names = self.make_block_plugins( + planes, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + planes, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + planes * self.expansion, self.after_conv3_plugins) + + def make_block_plugins(self, in_channels, plugins): + """make plugins for block. + + Args: + in_channels (int): Input channels of plugin. + plugins (list[dict]): List of plugins cfg to build. + + Returns: + list[str]: List of the names of plugin. 
+ """ + assert isinstance(plugins, list) + plugin_names = [] + for plugin in plugins: + plugin = plugin.copy() + name, layer = build_plugin_layer( + plugin, + in_channels=in_channels, + postfix=plugin.pop('postfix', '')) + assert not hasattr(self, name), f'duplicate plugin {name}' + self.add_module(name, layer) + plugin_names.append(name) + return plugin_names + + def forward_plugin(self, x, plugin_names): + out = x + for name in plugin_names: + out = getattr(self, name)(out) + return out + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: normalization layer after the third convolution layer""" + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNet(BaseModule): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + stem_channels (int | None): Number of stem channels. If not specified, + it will be the same as `base_channels`. Default: None. + base_channels (int): Number of base channels of res layer. Default: 64. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - position (str, required): Position inside block to insert + plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmdet.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=None, + base_channels=64, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + zero_init_residual=True, + pretrained=None, + init_cfg=None): + super(ResNet, self).__init__(init_cfg) + self.zero_init_residual = zero_init_residual + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + + block_init_cfg = None + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + block = self.arch_settings[depth][0] + if self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm3')) + else: + raise TypeError('pretrained must be a str or None') + + self.depth = depth + if stem_channels is None: + stem_channels = base_channels + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = stem_channels + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + if plugins is not None: + 
stage_plugins = self.make_stage_plugins(plugins, i) + else: + stage_plugins = None + planes = base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=stage_plugins, + init_cfg=block_init_cfg) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = self.block.expansion * base_channels * 2**( + len(self.stage_blocks) - 1) + + def make_stage_plugins(self, plugins, stage_idx): + """Make plugins for ResNet ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block`` into the backbone + like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of + Bottleneck. + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True), + ... position='after_conv2'), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='1'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='2'), + ... stages=(True, True, True, True), + ... position='after_conv3') + ... ] + >>> self = ResNet(depth=18) + >>> stage_plugins = self.make_stage_plugins(plugins, 0) + >>> assert len(stage_plugins) == 3 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->conv3->yyy->zzz1->zzz2 + + Suppose 'stage_idx=1', the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 + + If stages is missing, the plugin would be applied to all stages. + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. 
+            stage_idx (int): Index of stage to build
+
+        Returns:
+            list[dict]: Plugins for current stage
+        """
+        stage_plugins = []
+        for plugin in plugins:
+            plugin = plugin.copy()
+            stages = plugin.pop('stages', None)
+            assert stages is None or len(stages) == self.num_stages
+            # whether to insert plugin into current stage
+            if stages is None or stages[stage_idx]:
+                stage_plugins.append(plugin)
+
+        return stage_plugins
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``."""
+        return ResLayer(**kwargs)
+
+    @property
+    def norm1(self):
+        """nn.Module: the normalization layer named "norm1" """
+        return getattr(self, self.norm1_name)
+
+    def _make_stem_layer(self, in_channels, stem_channels):
+        if self.deep_stem:
+            self.stem = nn.Sequential(
+                build_conv_layer(
+                    self.conv_cfg,
+                    in_channels,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels // 2,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+                nn.ReLU(inplace=True),
+                build_conv_layer(
+                    self.conv_cfg,
+                    stem_channels // 2,
+                    stem_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    bias=False),
+                build_norm_layer(self.norm_cfg, stem_channels)[1],
+                nn.ReLU(inplace=True))
+        else:
+            self.conv1 = build_conv_layer(
+                self.conv_cfg,
+                in_channels,
+                stem_channels,
+                kernel_size=7,
+                stride=2,
+                padding=3,
+                bias=False)
+            self.norm1_name, norm1 = build_norm_layer(
+                self.norm_cfg, stem_channels, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+            self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            if self.deep_stem:
+                self.stem.eval()
+                for param in self.stem.parameters():
+                    param.requires_grad = False
+            else:
+                self.norm1.eval()
+                for m in [self.conv1, self.norm1]:
+                    for param in m.parameters():
+                        param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f'layer{i}')
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def forward(self, x):
+        """Forward function."""
+        if self.deep_stem:
+            x = self.stem(x)
+        else:
+            x = self.conv1(x)
+            x = self.norm1(x)
+            x = self.relu(x)
+        x = self.maxpool(x)
+        outs = []
+        for i, layer_name in enumerate(self.res_layers):
+            res_layer = getattr(self, layer_name)
+            x = res_layer(x)
+            if i in self.out_indices:
+                outs.append(x)
+        return tuple(outs)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the
+        normalization layers frozen."""
+        super(ResNet, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval() has an effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
+
+
+@MODELS.register_module()
+class ResNetV1d(ResNet):
+    r"""ResNetV1d variant described in `Bag of Tricks
+    <https://arxiv.org/abs/1812.01187>`_.
+
+    Compared with the default ResNet (ResNetV1b), ResNetV1d replaces the 7x7
+    conv in the input stem with three 3x3 convs. And in the downsampling
+    block, a 2x2 avg_pool with stride 2 is added before conv, whose stride is
+    changed to 1.
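+
+    Example (an illustrative sketch; the stage widths match plain ResNet-50,
+    only the stem and downsampling path differ):
+        >>> from mmdet.models import ResNetV1d
+        >>> import torch
+        >>> self = ResNetV1d(depth=50)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)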
+ """ + + def __init__(self, **kwargs): + super(ResNetV1d, self).__init__( + deep_stem=True, avg_down=True, **kwargs) diff --git a/head_extractor/build/lib/mmdet/models/backbones/resnext.py b/head_extractor/build/lib/mmdet/models/backbones/resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..df3d79e046c3ab9b289bcfeb6f937c87f6c09bfa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/resnext.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmdet.registry import MODELS +from ..layers import ResLayer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + **kwargs): + """Bottleneck block for ResNeXt. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. + """ + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + if self.with_plugins: + self._del_block_plugins(self.after_conv1_plugin_names + + self.after_conv2_plugin_names + + self.after_conv3_plugin_names) + self.after_conv1_plugin_names = self.make_block_plugins( + width, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + width, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + self.planes * self.expansion, self.after_conv3_plugins) + + def _del_block_plugins(self, plugin_names): + """delete plugins for block if exist. + + Args: + plugin_names (list[str]): List of plugins name to delete. + """ + assert isinstance(plugin_names, list) + for plugin_name in plugin_names: + del self._modules[plugin_name] + + +@MODELS.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. 
+        groups (int): Group of resnext.
+        base_width (int): Base width of resnext.
+        strides (Sequence[int]): Strides of the first block of each stage.
+        dilations (Sequence[int]): Dilation of each stage.
+        out_indices (Sequence[int]): Output from which stages.
+        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+            layer is the 3x3 conv layer, otherwise the stride-two layer is
+            the first 1x1 conv layer.
+        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+            not freezing any parameters.
+        norm_cfg (dict): dictionary to construct and config norm layer.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed.
+        zero_init_residual (bool): whether to use zero init for last norm layer
+            in resblocks to let them behave as identity.
+    """
+
+    arch_settings = {
+        50: (Bottleneck, (3, 4, 6, 3)),
+        101: (Bottleneck, (3, 4, 23, 3)),
+        152: (Bottleneck, (3, 8, 36, 3))
+    }
+
+    def __init__(self, groups=1, base_width=4, **kwargs):
+        self.groups = groups
+        self.base_width = base_width
+        super(ResNeXt, self).__init__(**kwargs)
+
+    def make_res_layer(self, **kwargs):
+        """Pack all blocks in a stage into a ``ResLayer``"""
+        return ResLayer(
+            groups=self.groups,
+            base_width=self.base_width,
+            base_channels=self.base_channels,
+            **kwargs)
diff --git a/head_extractor/build/lib/mmdet/models/backbones/ssd_vgg.py b/head_extractor/build/lib/mmdet/models/backbones/ssd_vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..843e82e2722f93b9b2abb5180c827c8f2a430b48
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/backbones/ssd_vgg.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmcv.cnn import VGG
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+from ..necks import ssd_neck
+
+
+@MODELS.register_module()
+class SSDVGG(VGG, BaseModule):
+    """VGG Backbone network for single-shot-detection.
+
+    Args:
+        depth (int): Depth of vgg, from {11, 13, 16, 19}.
+        with_last_pool (bool): Whether to add a pooling layer at the last
+            of the model
+        ceil_mode (bool): When True, will use `ceil` instead of `floor`
+            to compute the output shape.
+        out_indices (Sequence[int]): Output from which stages.
+        out_feature_indices (Sequence[int]): Output from which feature map.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+        input_size (int, optional): Deprecated argument.
+            Width and height of input, from {300, 512}.
+        l2_norm_scale (float, optional): Deprecated argument.
+            L2 normalization layer init scale.
+
+    Example:
+        >>> self = SSDVGG(input_size=300, depth=11)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 300, 300)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...
print(tuple(level_out.shape)) + (1, 1024, 19, 19) + (1, 512, 10, 10) + (1, 256, 5, 5) + (1, 256, 3, 3) + (1, 256, 1, 1) + """ + extra_setting = { + 300: (256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256), + 512: (256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128), + } + + def __init__(self, + depth, + with_last_pool=False, + ceil_mode=True, + out_indices=(3, 4), + out_feature_indices=(22, 34), + pretrained=None, + init_cfg=None, + input_size=None, + l2_norm_scale=None): + # TODO: in_channels for mmcv.VGG + super(SSDVGG, self).__init__( + depth, + with_last_pool=with_last_pool, + ceil_mode=ceil_mode, + out_indices=out_indices) + + self.features.add_module( + str(len(self.features)), + nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) + self.features.add_module( + str(len(self.features)), + nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)) + self.features.add_module( + str(len(self.features)), nn.ReLU(inplace=True)) + self.features.add_module( + str(len(self.features)), nn.Conv2d(1024, 1024, kernel_size=1)) + self.features.add_module( + str(len(self.features)), nn.ReLU(inplace=True)) + self.out_feature_indices = out_feature_indices + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + + if init_cfg is not None: + self.init_cfg = init_cfg + elif isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict(type='Constant', val=1, layer='BatchNorm2d'), + dict(type='Normal', std=0.01, layer='Linear'), + ] + else: + raise TypeError('pretrained must be a str or None') + + if input_size is not None: + warnings.warn('DeprecationWarning: input_size is deprecated') + if l2_norm_scale is not None: + warnings.warn('DeprecationWarning: l2_norm_scale in VGG is ' + 'deprecated, it has been moved to SSDNeck.') + + def init_weights(self, pretrained=None): + super(VGG, self).init_weights() + + def forward(self, x): + """Forward function.""" + outs = [] + for i, layer in enumerate(self.features): + x = layer(x) + if i in self.out_feature_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + +class L2Norm(ssd_neck.L2Norm): + + def __init__(self, **kwargs): + super(L2Norm, self).__init__(**kwargs) + warnings.warn('DeprecationWarning: L2Norm in ssd_vgg.py ' + 'is deprecated, please use L2Norm in ' + 'mmdet/models/necks/ssd_neck.py instead') diff --git a/head_extractor/build/lib/mmdet/models/backbones/swin.py b/head_extractor/build/lib/mmdet/models/backbones/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..062190fa077d7b01e0c1db76bea0cfb5dc7b6620 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/swin.py @@ -0,0 +1,819 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
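+# Note on the relative position bias used by WindowMSA below (illustrative):
+# for a window of size (Wh, Ww), relative offsets along each axis fall in
+# [-(Wh - 1), Wh - 1], so the bias table needs
+# (2 * Wh - 1) * (2 * Ww - 1) rows per head, e.g. 13 * 13 = 169 rows for
+# the usual 7x7 window.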
+import warnings +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, build_dropout +from mmengine.logging import MMLogger +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, trunc_normal_, + trunc_normal_init) +from mmengine.runner.checkpoint import CheckpointLoader +from mmengine.utils import to_2tuple + +from mmdet.registry import MODELS +from ..layers import PatchEmbed, PatchMerging + + +class WindowMSA(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None): + + super().__init__() + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + self.init_cfg = init_cfg + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # About 2x faster than original impl + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (num_windows*B, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. 
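+
+        Returns:
+            tensor: Attended features of shape (num_windows*B, N, C), the
+                same shape as the input ``x``.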
+ """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class ShiftWindowMSA(BaseModule): + """Shifted Window Multihead Self-Attention Module. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): The height and width of the window. + shift_size (int, optional): The shift step of each window towards + right-bottom. If zero, act as regular window-msa. Defaults to 0. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Defaults: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Defaults: 0. + proj_drop_rate (float, optional): Dropout ratio of output. + Defaults: 0. + dropout_layer (dict, optional): The dropout_layer used before output. + Defaults: dict(type='DropPath', drop_prob=0.). + init_cfg (dict, optional): The extra config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + shift_size=0, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0, + proj_drop_rate=0, + dropout_layer=dict(type='DropPath', drop_prob=0.), + init_cfg=None): + super().__init__(init_cfg) + + self.window_size = window_size + self.shift_size = shift_size + assert 0 <= self.shift_size < self.window_size + + self.w_msa = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=to_2tuple(window_size), + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + init_cfg=None) + + self.drop = build_dropout(dropout_layer) + + def forward(self, query, hw_shape): + B, L, C = query.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + query = query.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) + H_pad, W_pad = query.shape[1], query.shape[2] + + # cyclic shift + if self.shift_size > 0: + shifted_query = torch.roll( + query, + shifts=(-self.shift_size, -self.shift_size), + dims=(1, 2)) + + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + # nW, window_size, window_size, 1 + mask_windows = self.window_partition(img_mask) + mask_windows = mask_windows.view( + -1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + else: + shifted_query = query + attn_mask = None + + # nW*B, window_size, window_size, C + query_windows = self.window_partition(shifted_query) + # nW*B, window_size*window_size, C + query_windows = query_windows.view(-1, self.window_size**2, C) + + # W-MSA/SW-MSA (nW*B, window_size*window_size, C) + attn_windows = self.w_msa(query_windows, mask=attn_mask) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + + # B H' W' C + shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + x = self.drop(x) + return x + + def window_reverse(self, windows, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + window_size = self.window_size + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + def window_partition(self, x): + """ + Args: + x: (B, H, W, C) + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + window_size = self.window_size + x = 
x.view(B, H // window_size, window_size, W // window_size, + window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() + windows = windows.view(-1, window_size, window_size, C) + return windows + + +class SwinBlock(BaseModule): + """" + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + window_size (int, optional): The local window scale. Default: 7. + shift (bool, optional): whether to shift window or not. Default False. + qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float, optional): Stochastic depth rate. Default: 0. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + window_size=7, + shift=False, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + init_cfg=None): + + super(SwinBlock, self).__init__() + + self.init_cfg = init_cfg + self.with_cp = with_cp + + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + self.attn = ShiftWindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + shift_size=window_size // 2 if shift else 0, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + init_cfg=None) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=2, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=True, + init_cfg=None) + + def forward(self, x, hw_shape): + + def _inner_forward(x): + identity = x + x = self.norm1(x) + x = self.attn(x, hw_shape) + + x = x + identity + + identity = x + x = self.norm2(x) + x = self.ffn(x, identity=identity) + + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + + return x + + +class SwinBlockSequence(BaseModule): + """Implements one stage in Swin Transformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + depth (int): The number of blocks in this stage. + window_size (int, optional): The local window scale. Default: 7. + qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float | list[float], optional): Stochastic depth + rate. Default: 0. 
+ downsample (BaseModule | None, optional): The downsample operation + module. Default: None. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + depth, + window_size=7, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + downsample=None, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(drop_path_rate, list): + drop_path_rates = drop_path_rate + assert len(drop_path_rates) == depth + else: + drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] + + self.blocks = ModuleList() + for i in range(depth): + block = SwinBlock( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=feedforward_channels, + window_size=window_size, + shift=False if i % 2 == 0 else True, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rates[i], + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + init_cfg=None) + self.blocks.append(block) + + self.downsample = downsample + + def forward(self, x, hw_shape): + for block in self.blocks: + x = block(x, hw_shape) + + if self.downsample: + x_down, down_hw_shape = self.downsample(x, hw_shape) + return x_down, down_hw_shape, x, hw_shape + else: + return x, hw_shape, x, hw_shape + + +@MODELS.register_module() +class SwinTransformer(BaseModule): + """ Swin Transformer + A PyTorch implement of : `Swin Transformer: + Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/abs/2103.14030 + + Inspiration from + https://github.com/microsoft/Swin-Transformer + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): The num of input channels. + Defaults: 3. + embed_dims (int): The feature dimension. Default: 96. + patch_size (int | tuple[int]): Patch size. Default: 4. + window_size (int): Window size. Default: 7. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + Default: 4. + depths (tuple[int]): Depths of each Swin Transformer stage. + Default: (2, 2, 6, 2). + num_heads (tuple[int]): Parallel attention heads of each Swin + Transformer stage. Default: (3, 6, 12, 24). + strides (tuple[int]): The patch merging or patch embedding stride of + each Swin Transformer stage. (In swin, we set kernel size equal to + stride.) Default: (4, 2, 2, 2). + out_indices (tuple[int]): Output from which stages. + Default: (0, 1, 2, 3). + qkv_bias (bool, optional): If True, add a learnable bias to query, key, + value. Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + patch_norm (bool): If add a norm layer for patch embed and patch + merging. Default: True. + drop_rate (float): Dropout rate. Defaults: 0. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: False. 
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer at
+            output of backbone. Defaults: dict(type='LN').
+        with_cp (bool, optional): Use checkpoint or not. Using checkpoint
+            will save some memory while slowing down the training speed.
+            Default: False.
+        pretrained (str, optional): Model pretrained path. Default: None.
+        convert_weights (bool): The flag indicates whether the
+            pre-trained model is from the original repo. We may need
+            to convert some keys to make it compatible.
+            Default: False.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            Default: -1 (-1 means not freezing any parameters).
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 in_channels=3,
+                 embed_dims=96,
+                 patch_size=4,
+                 window_size=7,
+                 mlp_ratio=4,
+                 depths=(2, 2, 6, 2),
+                 num_heads=(3, 6, 12, 24),
+                 strides=(4, 2, 2, 2),
+                 out_indices=(0, 1, 2, 3),
+                 qkv_bias=True,
+                 qk_scale=None,
+                 patch_norm=True,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.1,
+                 use_abs_pos_embed=False,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 with_cp=False,
+                 pretrained=None,
+                 convert_weights=False,
+                 frozen_stages=-1,
+                 init_cfg=None):
+        self.convert_weights = convert_weights
+        self.frozen_stages = frozen_stages
+        if isinstance(pretrain_img_size, int):
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+        elif isinstance(pretrain_img_size, tuple):
+            if len(pretrain_img_size) == 1:
+                pretrain_img_size = to_2tuple(pretrain_img_size[0])
+            assert len(pretrain_img_size) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(pretrain_img_size)}'
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        # assign to the local ``init_cfg`` so that ``BaseModule.__init__``
+        # below receives the resolved config instead of overwriting it
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            init_cfg = init_cfg
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        super(SwinTransformer, self).__init__(init_cfg=init_cfg)
+
+        num_layers = len(depths)
+        self.out_indices = out_indices
+        self.use_abs_pos_embed = use_abs_pos_embed
+
+        assert strides[0] == patch_size, 'Use non-overlapping patch embed.'
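+        # With kernel_size == stride == patch_size below, the patch
+        # embedding yields non-overlapping tokens and returns
+        # hw_shape == (H // patch_size, W // patch_size).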
+
+        self.patch_embed = PatchEmbed(
+            in_channels=in_channels,
+            embed_dims=embed_dims,
+            conv_type='Conv2d',
+            kernel_size=patch_size,
+            stride=strides[0],
+            norm_cfg=norm_cfg if patch_norm else None,
+            init_cfg=None)
+
+        if self.use_abs_pos_embed:
+            patch_row = pretrain_img_size[0] // patch_size
+            patch_col = pretrain_img_size[1] // patch_size
+            num_patches = patch_row * patch_col
+            self.absolute_pos_embed = nn.Parameter(
+                torch.zeros((1, num_patches, embed_dims)))
+
+        self.drop_after_pos = nn.Dropout(p=drop_rate)
+
+        # set stochastic depth decay rule
+        total_depth = sum(depths)
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
+        ]
+
+        self.stages = ModuleList()
+        in_channels = embed_dims
+        for i in range(num_layers):
+            if i < num_layers - 1:
+                downsample = PatchMerging(
+                    in_channels=in_channels,
+                    out_channels=2 * in_channels,
+                    stride=strides[i + 1],
+                    norm_cfg=norm_cfg if patch_norm else None,
+                    init_cfg=None)
+            else:
+                downsample = None
+
+            stage = SwinBlockSequence(
+                embed_dims=in_channels,
+                num_heads=num_heads[i],
+                feedforward_channels=mlp_ratio * in_channels,
+                depth=depths[i],
+                window_size=window_size,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop_rate=drop_rate,
+                attn_drop_rate=attn_drop_rate,
+                drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])],
+                downsample=downsample,
+                act_cfg=act_cfg,
+                norm_cfg=norm_cfg,
+                with_cp=with_cp,
+                init_cfg=None)
+            self.stages.append(stage)
+            if downsample:
+                in_channels = downsample.out_channels
+
+        self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)]
+        # Add a norm layer for each output
+        for i in out_indices:
+            layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
+            layer_name = f'norm{i}'
+            self.add_module(layer_name, layer)
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping layers frozen."""
+        super(SwinTransformer, self).train(mode)
+        self._freeze_stages()
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.requires_grad = False
+            if self.use_abs_pos_embed:
+                self.absolute_pos_embed.requires_grad = False
+            self.drop_after_pos.eval()
+
+        for i in range(1, self.frozen_stages + 1):
+
+            if (i - 1) in self.out_indices:
+                norm_layer = getattr(self, f'norm{i-1}')
+                norm_layer.eval()
+                for param in norm_layer.parameters():
+                    param.requires_grad = False
+
+            m = self.stages[i - 1]
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def init_weights(self):
+        logger = MMLogger.get_current_instance()
+        if self.init_cfg is None:
+            logger.warning(f'No pre-trained weights for '
+                           f'{self.__class__.__name__}, '
+                           f'training starts from scratch')
+            if self.use_abs_pos_embed:
+                trunc_normal_(self.absolute_pos_embed, std=0.02)
+            for m in self.modules():
+                if isinstance(m, nn.Linear):
+                    trunc_normal_init(m, std=.02, bias=0.)
+ elif isinstance(m, nn.LayerNorm): + constant_init(m, 1.0) + else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + ckpt = CheckpointLoader.load_checkpoint( + self.init_cfg.checkpoint, logger=logger, map_location='cpu') + if 'state_dict' in ckpt: + _state_dict = ckpt['state_dict'] + elif 'model' in ckpt: + _state_dict = ckpt['model'] + else: + _state_dict = ckpt + if self.convert_weights: + # supported loading weight from original repo, + _state_dict = swin_converter(_state_dict) + + state_dict = OrderedDict() + for k, v in _state_dict.items(): + if k.startswith('backbone.'): + state_dict[k[9:]] = v + + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # reshape absolute position embedding + if state_dict.get('absolute_pos_embed') is not None: + absolute_pos_embed = state_dict['absolute_pos_embed'] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = self.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H * W: + logger.warning('Error in loading absolute_pos_embed, pass') + else: + state_dict['absolute_pos_embed'] = absolute_pos_embed.view( + N2, H, W, C2).permute(0, 3, 1, 2).contiguous() + + # interpolate position bias table if needed + relative_position_bias_table_keys = [ + k for k in state_dict.keys() + if 'relative_position_bias_table' in k + ] + for table_key in relative_position_bias_table_keys: + table_pretrained = state_dict[table_key] + table_current = self.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + logger.warning(f'Error in loading {table_key}, pass') + elif L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = F.interpolate( + table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + state_dict[table_key] = table_pretrained_resized.view( + nH2, L2).permute(1, 0).contiguous() + + # load state_dict + self.load_state_dict(state_dict, False) + + def forward(self, x): + x, hw_shape = self.patch_embed(x) + + if self.use_abs_pos_embed: + x = x + self.absolute_pos_embed + x = self.drop_after_pos(x) + + outs = [] + for i, stage in enumerate(self.stages): + x, hw_shape, out, out_hw_shape = stage(x, hw_shape) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(out) + out = out.view(-1, *out_hw_shape, + self.num_features[i]).permute(0, 3, 1, + 2).contiguous() + outs.append(out) + + return outs + + +def swin_converter(ckpt): + + new_ckpt = OrderedDict() + + def correct_unfold_reduction_order(x): + out_channel, in_channel = x.shape + x = x.reshape(out_channel, 4, in_channel // 4) + x = x[:, [0, 2, 1, 3], :].transpose(1, + 2).reshape(out_channel, in_channel) + return x + + def correct_unfold_norm_order(x): + in_channel = x.shape[0] + x = x.reshape(4, in_channel // 4) + x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel) + return x + + for k, v in ckpt.items(): + if k.startswith('head'): + continue + elif k.startswith('layers'): + new_v = v + if 'attn.' in k: + new_k = k.replace('attn.', 'attn.w_msa.') + elif 'mlp.' in k: + if 'mlp.fc1.' in k: + new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.') + elif 'mlp.fc2.' in k: + new_k = k.replace('mlp.fc2.', 'ffn.layers.1.') + else: + new_k = k.replace('mlp.', 'ffn.') + elif 'downsample' in k: + new_k = k + if 'reduction.' 
in k: + new_v = correct_unfold_reduction_order(v) + elif 'norm.' in k: + new_v = correct_unfold_norm_order(v) + else: + new_k = k + new_k = new_k.replace('layers', 'stages', 1) + elif k.startswith('patch_embed'): + new_v = v + if 'proj' in k: + new_k = k.replace('proj', 'projection') + else: + new_k = k + else: + new_v = v + new_k = k + + new_ckpt['backbone.' + new_k] = new_v + + return new_ckpt diff --git a/head_extractor/build/lib/mmdet/models/backbones/trident_resnet.py b/head_extractor/build/lib/mmdet/models/backbones/trident_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..22c76354522ff8533b094df6858ec361ba400c1e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/backbones/trident_resnet.py @@ -0,0 +1,298 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from torch.nn.modules.utils import _pair + +from mmdet.models.backbones.resnet import Bottleneck, ResNet +from mmdet.registry import MODELS + + +class TridentConv(BaseModule): + """Trident Convolution Module. + + Args: + in_channels (int): Number of channels in input. + out_channels (int): Number of channels in output. + kernel_size (int): Size of convolution kernel. + stride (int, optional): Convolution stride. Default: 1. + trident_dilations (tuple[int, int, int], optional): Dilations of + different trident branch. Default: (1, 2, 3). + test_branch_idx (int, optional): In inference, all 3 branches will + be used if `test_branch_idx==-1`, otherwise only branch with + index `test_branch_idx` will be used. Default: 1. + bias (bool, optional): Whether to use bias in convolution or not. + Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + trident_dilations=(1, 2, 3), + test_branch_idx=1, + bias=False, + init_cfg=None): + super(TridentConv, self).__init__(init_cfg) + self.num_branch = len(trident_dilations) + self.with_bias = bias + self.test_branch_idx = test_branch_idx + self.stride = _pair(stride) + self.kernel_size = _pair(kernel_size) + self.paddings = _pair(trident_dilations) + self.dilations = trident_dilations + self.in_channels = in_channels + self.out_channels = out_channels + self.bias = bias + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels, *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.bias = None + + def extra_repr(self): + tmpstr = f'in_channels={self.in_channels}' + tmpstr += f', out_channels={self.out_channels}' + tmpstr += f', kernel_size={self.kernel_size}' + tmpstr += f', num_branch={self.num_branch}' + tmpstr += f', test_branch_idx={self.test_branch_idx}' + tmpstr += f', stride={self.stride}' + tmpstr += f', paddings={self.paddings}' + tmpstr += f', dilations={self.dilations}' + tmpstr += f', bias={self.bias}' + return tmpstr + + def forward(self, inputs): + if self.training or self.test_branch_idx == -1: + outputs = [ + F.conv2d(input, self.weight, self.bias, self.stride, padding, + dilation) for input, dilation, padding in zip( + inputs, self.dilations, self.paddings) + ] + else: + assert len(inputs) == 1 + outputs = [ + F.conv2d(inputs[0], self.weight, self.bias, self.stride, + self.paddings[self.test_branch_idx], + self.dilations[self.test_branch_idx]) + ] + + return outputs + + +# Since TridentNet is defined over ResNet50 and ResNet101, here we +# only support TridentBottleneckBlock. +class TridentBottleneck(Bottleneck): + """BottleBlock for TridentResNet. + + Args: + trident_dilations (tuple[int, int, int]): Dilations of different + trident branch. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + concat_output (bool): Whether to concat the output list to a Tensor. + `True` only in the last Block. 
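+
+    Note:
+        ``conv1``/``conv3`` are single modules applied to every branch, so
+        their weights are shared; ``conv2`` is a ``TridentConv`` whose
+        branches also share one weight tensor but use different dilations.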
+ """ + + def __init__(self, trident_dilations, test_branch_idx, concat_output, + **kwargs): + + super(TridentBottleneck, self).__init__(**kwargs) + self.trident_dilations = trident_dilations + self.num_branch = len(trident_dilations) + self.concat_output = concat_output + self.test_branch_idx = test_branch_idx + self.conv2 = TridentConv( + self.planes, + self.planes, + kernel_size=3, + stride=self.conv2_stride, + bias=False, + trident_dilations=self.trident_dilations, + test_branch_idx=test_branch_idx, + init_cfg=dict( + type='Kaiming', + distribution='uniform', + mode='fan_in', + override=dict(name='conv2'))) + + def forward(self, x): + + def _inner_forward(x): + num_branch = ( + self.num_branch + if self.training or self.test_branch_idx == -1 else 1) + identity = x + if not isinstance(x, list): + x = (x, ) * num_branch + identity = x + if self.downsample is not None: + identity = [self.downsample(b) for b in x] + + out = [self.conv1(b) for b in x] + out = [self.norm1(b) for b in out] + out = [self.relu(b) for b in out] + + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv1_plugin_names) + + out = self.conv2(out) + out = [self.norm2(b) for b in out] + out = [self.relu(b) for b in out] + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv2_plugin_names) + + out = [self.conv3(b) for b in out] + out = [self.norm3(b) for b in out] + + if self.with_plugins: + for k in range(len(out)): + out[k] = self.forward_plugin(out[k], + self.after_conv3_plugin_names) + + out = [ + out_b + identity_b for out_b, identity_b in zip(out, identity) + ] + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = [self.relu(b) for b in out] + if self.concat_output: + out = torch.cat(out, dim=0) + return out + + +def make_trident_res_layer(block, + inplanes, + planes, + num_blocks, + stride=1, + trident_dilations=(1, 2, 3), + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + test_branch_idx=-1): + """Build Trident Res Layers.""" + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + for i in range(num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride if i == 0 else 1, + trident_dilations=trident_dilations, + downsample=downsample if i == 0 else None, + style=style, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=plugins, + test_branch_idx=test_branch_idx, + concat_output=True if i == num_blocks - 1 else False)) + inplanes = planes * block.expansion + return nn.Sequential(*layers) + + +@MODELS.register_module() +class TridentResNet(ResNet): + """The stem layer, stage 1 and stage 2 in Trident ResNet are identical to + ResNet, while in stage 3, Trident BottleBlock is utilized to replace the + normal BottleBlock to yield trident output. Different branch shares the + convolution weight but uses different dilations to achieve multi-scale + output. 
+ + / stage3(b0) \ + x - stem - stage1 - stage2 - stage3(b1) - output + \ stage3(b2) / + + Args: + depth (int): Depth of resnet, from {50, 101, 152}. + num_branch (int): Number of branches in TridentNet. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + trident_dilations (tuple[int]): Dilations of different trident branch. + len(trident_dilations) should be equal to num_branch. + """ # noqa + + def __init__(self, depth, num_branch, test_branch_idx, trident_dilations, + **kwargs): + + assert num_branch == len(trident_dilations) + assert depth in (50, 101, 152) + super(TridentResNet, self).__init__(depth, **kwargs) + assert self.num_stages == 3 + self.test_branch_idx = test_branch_idx + self.num_branch = num_branch + + last_stage_idx = self.num_stages - 1 + stride = self.strides[last_stage_idx] + dilation = trident_dilations + dcn = self.dcn if self.stage_with_dcn[last_stage_idx] else None + if self.plugins is not None: + stage_plugins = self.make_stage_plugins(self.plugins, + last_stage_idx) + else: + stage_plugins = None + planes = self.base_channels * 2**last_stage_idx + res_layer = make_trident_res_layer( + TridentBottleneck, + inplanes=(self.block.expansion * self.base_channels * + 2**(last_stage_idx - 1)), + planes=planes, + num_blocks=self.stage_blocks[last_stage_idx], + stride=stride, + trident_dilations=dilation, + style=self.style, + with_cp=self.with_cp, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=dcn, + plugins=stage_plugins, + test_branch_idx=self.test_branch_idx) + + layer_name = f'layer{last_stage_idx + 1}' + + self.__setattr__(layer_name, res_layer) + self.res_layers.pop(last_stage_idx) + self.res_layers.insert(last_stage_idx, layer_name) + + self._freeze_stages() diff --git a/head_extractor/build/lib/mmdet/models/data_preprocessors/__init__.py b/head_extractor/build/lib/mmdet/models/data_preprocessors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..201a1da6a4f320a17cea9c65d5c102bfdd7700d8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/data_preprocessors/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_preprocessor import (BatchFixedSizePad, BatchResize, + BatchSyncRandomResize, BoxInstDataPreprocessor, + DetDataPreprocessor, + MultiBranchDataPreprocessor) +from .reid_data_preprocessor import ReIDDataPreprocessor +from .track_data_preprocessor import TrackDataPreprocessor + +__all__ = [ + 'DetDataPreprocessor', 'BatchSyncRandomResize', 'BatchFixedSizePad', + 'MultiBranchDataPreprocessor', 'BatchResize', 'BoxInstDataPreprocessor', + 'TrackDataPreprocessor', 'ReIDDataPreprocessor' +] diff --git a/head_extractor/build/lib/mmdet/models/data_preprocessors/data_preprocessor.py b/head_extractor/build/lib/mmdet/models/data_preprocessors/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..55b5c35b3a4888c95c6646df3fa080347afe4704 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/data_preprocessors/data_preprocessor.py @@ -0,0 +1,793 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
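+# Padding sketch (illustrative): ``DetDataPreprocessor._get_pad_shape``
+# below rounds every image size up to the nearest multiple of
+# ``pad_size_divisor``:
+#
+#   pad_h = int(np.ceil(h / divisor)) * divisor
+#   pad_w = int(np.ceil(w / divisor)) * divisor
+#   # e.g. (h, w) = (427, 640) with divisor 32 pads to (448, 640)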
+import random
+from numbers import Number
+from typing import List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.dist import barrier, broadcast, get_dist_info
+from mmengine.logging import MessageHub
+from mmengine.model import BaseDataPreprocessor, ImgDataPreprocessor
+from mmengine.structures import PixelData
+from mmengine.utils import is_seq_of
+from torch import Tensor
+
+from mmdet.models.utils import unfold_wo_center
+from mmdet.models.utils.misc import samplelist_boxtype2tensor
+from mmdet.registry import MODELS
+from mmdet.structures import DetDataSample
+from mmdet.structures.mask import BitmapMasks
+from mmdet.utils import ConfigType
+
+try:
+    import skimage
+except ImportError:
+    skimage = None
+
+
+@MODELS.register_module()
+class DetDataPreprocessor(ImgDataPreprocessor):
+    """Image pre-processor for detection tasks.
+
+    Comparing with the :class:`mmengine.ImgDataPreprocessor`,
+
+    1. It supports batch augmentations.
+    2. It will additionally append batch_input_shape and pad_shape
+    to data_samples considering the object detection task.
+
+    It provides the data pre-processing as follows:
+
+    - Collate and move data to the target device.
+    - Pad inputs to the maximum size of the current batch with the defined
+      ``pad_value``. The padded size will be divisible by the defined
+      ``pad_size_divisor``.
+    - Stack inputs to batch_inputs.
+    - Convert inputs from bgr to rgb if the shape of input is (3, H, W).
+    - Normalize image with defined std and mean.
+    - Do batch augmentations during training.
+
+    Args:
+        mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
+            Defaults to None.
+        std (Sequence[Number], optional): The pixel standard deviation of
+            R, G, B channels. Defaults to None.
+        pad_size_divisor (int): The size of the padded image should be
+            divisible by ``pad_size_divisor``. Defaults to 1.
+        pad_value (Number): The padded pixel value. Defaults to 0.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        pad_seg (bool): Whether to pad semantic segmentation maps.
+            Defaults to False.
+        seg_pad_value (int): The padded pixel value for semantic
+            segmentation maps. Defaults to 255.
+        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
+            Defaults to False.
+        boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
+            bboxes data to ``Tensor`` type. Defaults to True.
+        non_blocking (bool): Whether to block the current process
+            when transferring data to device. Defaults to False.
+ batch_augments (list[dict], optional): Batch-level augmentations + """ + + def __init__(self, + mean: Sequence[Number] = None, + std: Sequence[Number] = None, + pad_size_divisor: int = 1, + pad_value: Union[float, int] = 0, + pad_mask: bool = False, + mask_pad_value: int = 0, + pad_seg: bool = False, + seg_pad_value: int = 255, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False, + boxtype2tensor: bool = True, + non_blocking: Optional[bool] = False, + batch_augments: Optional[List[dict]] = None): + super().__init__( + mean=mean, + std=std, + pad_size_divisor=pad_size_divisor, + pad_value=pad_value, + bgr_to_rgb=bgr_to_rgb, + rgb_to_bgr=rgb_to_bgr, + non_blocking=non_blocking) + if batch_augments is not None: + self.batch_augments = nn.ModuleList( + [MODELS.build(aug) for aug in batch_augments]) + else: + self.batch_augments = None + self.pad_mask = pad_mask + self.mask_pad_value = mask_pad_value + self.pad_seg = pad_seg + self.seg_pad_value = seg_pad_value + self.boxtype2tensor = boxtype2tensor + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization,padding and bgr2rgb conversion based on + ``BaseDataPreprocessor``. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + batch_pad_shape = self._get_pad_shape(data) + data = super().forward(data=data, training=training) + inputs, data_samples = data['inputs'], data['data_samples'] + + if data_samples is not None: + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + batch_input_shape = tuple(inputs[0].size()[-2:]) + for data_sample, pad_shape in zip(data_samples, batch_pad_shape): + data_sample.set_metainfo({ + 'batch_input_shape': batch_input_shape, + 'pad_shape': pad_shape + }) + + if self.boxtype2tensor: + samplelist_boxtype2tensor(data_samples) + + if self.pad_mask and training: + self.pad_gt_masks(data_samples) + + if self.pad_seg and training: + self.pad_gt_sem_seg(data_samples) + + if training and self.batch_augments is not None: + for batch_aug in self.batch_augments: + inputs, data_samples = batch_aug(inputs, data_samples) + + return {'inputs': inputs, 'data_samples': data_samples} + + def _get_pad_shape(self, data: dict) -> List[tuple]: + """Get the pad_shape of each image based on data and + pad_size_divisor.""" + _batch_inputs = data['inputs'] + # Process data with `pseudo_collate`. + if is_seq_of(_batch_inputs, torch.Tensor): + batch_pad_shape = [] + for ori_input in _batch_inputs: + pad_h = int( + np.ceil(ori_input.shape[1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(ori_input.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape.append((pad_h, pad_w)) + # Process data with `default_collate`. 
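+        # (``default_collate`` stacks same-sized images into one NCHW
+        # tensor, so a single pad shape applies to the whole batch.)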
+ elif isinstance(_batch_inputs, torch.Tensor): + assert _batch_inputs.dim() == 4, ( + 'The input of `ImgDataPreprocessor` should be a NCHW tensor ' + 'or a list of tensor, but got a tensor with shape: ' + f'{_batch_inputs.shape}') + pad_h = int( + np.ceil(_batch_inputs.shape[2] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int( + np.ceil(_batch_inputs.shape[3] / + self.pad_size_divisor)) * self.pad_size_divisor + batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0] + else: + raise TypeError('Output of `cast_data` should be a dict ' + 'or a tuple with inputs and data_samples, but got' + f'{type(data)}: {data}') + return batch_pad_shape + + def pad_gt_masks(self, + batch_data_samples: Sequence[DetDataSample]) -> None: + """Pad gt_masks to shape of batch_input_shape.""" + if 'masks' in batch_data_samples[0].gt_instances: + for data_samples in batch_data_samples: + masks = data_samples.gt_instances.masks + data_samples.gt_instances.masks = masks.pad( + data_samples.batch_input_shape, + pad_val=self.mask_pad_value) + + def pad_gt_sem_seg(self, + batch_data_samples: Sequence[DetDataSample]) -> None: + """Pad gt_sem_seg to shape of batch_input_shape.""" + if 'gt_sem_seg' in batch_data_samples[0]: + for data_samples in batch_data_samples: + gt_sem_seg = data_samples.gt_sem_seg.sem_seg + h, w = gt_sem_seg.shape[-2:] + pad_h, pad_w = data_samples.batch_input_shape + gt_sem_seg = F.pad( + gt_sem_seg, + pad=(0, max(pad_w - w, 0), 0, max(pad_h - h, 0)), + mode='constant', + value=self.seg_pad_value) + data_samples.gt_sem_seg = PixelData(sem_seg=gt_sem_seg) + + +@MODELS.register_module() +class BatchSyncRandomResize(nn.Module): + """Batch random resize which synchronizes the random size across ranks. + + Args: + random_size_range (tuple): The multi-scale random range during + multi-scale training. + interval (int): The iter interval of change + image size. Defaults to 10. + size_divisor (int): Image size divisible factor. + Defaults to 32. 
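+
+    Note:
+        A new target size is drawn on rank 0 once every ``interval``
+        iterations and broadcast to all other ranks, so every process
+        resizes its batch to the same resolution.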
+ """ + + def __init__(self, + random_size_range: Tuple[int, int], + interval: int = 10, + size_divisor: int = 32) -> None: + super().__init__() + self.rank, self.world_size = get_dist_info() + self._input_size = None + self._random_size_range = (round(random_size_range[0] / size_divisor), + round(random_size_range[1] / size_divisor)) + self._interval = interval + self._size_divisor = size_divisor + + def forward( + self, inputs: Tensor, data_samples: List[DetDataSample] + ) -> Tuple[Tensor, List[DetDataSample]]: + """resize a batch of images and bboxes to shape ``self._input_size``""" + h, w = inputs.shape[-2:] + if self._input_size is None: + self._input_size = (h, w) + scale_y = self._input_size[0] / h + scale_x = self._input_size[1] / w + if scale_x != 1 or scale_y != 1: + inputs = F.interpolate( + inputs, + size=self._input_size, + mode='bilinear', + align_corners=False) + for data_sample in data_samples: + img_shape = (int(data_sample.img_shape[0] * scale_y), + int(data_sample.img_shape[1] * scale_x)) + pad_shape = (int(data_sample.pad_shape[0] * scale_y), + int(data_sample.pad_shape[1] * scale_x)) + data_sample.set_metainfo({ + 'img_shape': img_shape, + 'pad_shape': pad_shape, + 'batch_input_shape': self._input_size + }) + data_sample.gt_instances.bboxes[ + ..., + 0::2] = data_sample.gt_instances.bboxes[..., + 0::2] * scale_x + data_sample.gt_instances.bboxes[ + ..., + 1::2] = data_sample.gt_instances.bboxes[..., + 1::2] * scale_y + if 'ignored_instances' in data_sample: + data_sample.ignored_instances.bboxes[ + ..., 0::2] = data_sample.ignored_instances.bboxes[ + ..., 0::2] * scale_x + data_sample.ignored_instances.bboxes[ + ..., 1::2] = data_sample.ignored_instances.bboxes[ + ..., 1::2] * scale_y + message_hub = MessageHub.get_current_instance() + if (message_hub.get_info('iter') + 1) % self._interval == 0: + self._input_size = self._get_random_size( + aspect_ratio=float(w / h), device=inputs.device) + return inputs, data_samples + + def _get_random_size(self, aspect_ratio: float, + device: torch.device) -> Tuple[int, int]: + """Randomly generate a shape in ``_random_size_range`` and broadcast to + all ranks.""" + tensor = torch.LongTensor(2).to(device) + if self.rank == 0: + size = random.randint(*self._random_size_range) + size = (self._size_divisor * size, + self._size_divisor * int(aspect_ratio * size)) + tensor[0] = size[0] + tensor[1] = size[1] + barrier() + broadcast(tensor, 0) + input_size = (tensor[0].item(), tensor[1].item()) + return input_size + + +@MODELS.register_module() +class BatchFixedSizePad(nn.Module): + """Fixed size padding for batch images. + + Args: + size (Tuple[int, int]): Fixed padding size. Expected padding + shape (h, w). Defaults to None. + img_pad_value (int): The padded pixel value for images. + Defaults to 0. + pad_mask (bool): Whether to pad instance masks. Defaults to False. + mask_pad_value (int): The padded pixel value for instance masks. + Defaults to 0. + pad_seg (bool): Whether to pad semantic segmentation maps. + Defaults to False. + seg_pad_value (int): The padded pixel value for semantic + segmentation maps. Defaults to 255. 
+ """ + + def __init__(self, + size: Tuple[int, int], + img_pad_value: int = 0, + pad_mask: bool = False, + mask_pad_value: int = 0, + pad_seg: bool = False, + seg_pad_value: int = 255) -> None: + super().__init__() + self.size = size + self.pad_mask = pad_mask + self.pad_seg = pad_seg + self.img_pad_value = img_pad_value + self.mask_pad_value = mask_pad_value + self.seg_pad_value = seg_pad_value + + def forward( + self, + inputs: Tensor, + data_samples: Optional[List[dict]] = None + ) -> Tuple[Tensor, Optional[List[dict]]]: + """Pad image, instance masks, segmantic segmentation maps.""" + src_h, src_w = inputs.shape[-2:] + dst_h, dst_w = self.size + + if src_h >= dst_h and src_w >= dst_w: + return inputs, data_samples + + inputs = F.pad( + inputs, + pad=(0, max(0, dst_w - src_w), 0, max(0, dst_h - src_h)), + mode='constant', + value=self.img_pad_value) + + if data_samples is not None: + # update batch_input_shape + for data_sample in data_samples: + data_sample.set_metainfo({ + 'batch_input_shape': (dst_h, dst_w), + 'pad_shape': (dst_h, dst_w) + }) + + if self.pad_mask: + for data_sample in data_samples: + masks = data_sample.gt_instances.masks + data_sample.gt_instances.masks = masks.pad( + (dst_h, dst_w), pad_val=self.mask_pad_value) + + if self.pad_seg: + for data_sample in data_samples: + gt_sem_seg = data_sample.gt_sem_seg.sem_seg + h, w = gt_sem_seg.shape[-2:] + gt_sem_seg = F.pad( + gt_sem_seg, + pad=(0, max(0, dst_w - w), 0, max(0, dst_h - h)), + mode='constant', + value=self.seg_pad_value) + data_sample.gt_sem_seg = PixelData(sem_seg=gt_sem_seg) + + return inputs, data_samples + + +@MODELS.register_module() +class MultiBranchDataPreprocessor(BaseDataPreprocessor): + """DataPreprocessor wrapper for multi-branch data. + + Take semi-supervised object detection as an example, assume that + the ratio of labeled data and unlabeled data in a batch is 1:2, + `sup` indicates the branch where the labeled data is augmented, + `unsup_teacher` and `unsup_student` indicate the branches where + the unlabeled data is augmented by different pipeline. + + The input format of multi-branch data is shown as below : + + .. code-block:: none + { + 'inputs': + { + 'sup': [Tensor, None, None], + 'unsup_teacher': [None, Tensor, Tensor], + 'unsup_student': [None, Tensor, Tensor], + }, + 'data_sample': + { + 'sup': [DetDataSample, None, None], + 'unsup_teacher': [None, DetDataSample, DetDataSample], + 'unsup_student': [NOne, DetDataSample, DetDataSample], + } + } + + The format of multi-branch data + after filtering None is shown as below : + + .. code-block:: none + { + 'inputs': + { + 'sup': [Tensor], + 'unsup_teacher': [Tensor, Tensor], + 'unsup_student': [Tensor, Tensor], + }, + 'data_sample': + { + 'sup': [DetDataSample], + 'unsup_teacher': [DetDataSample, DetDataSample], + 'unsup_student': [DetDataSample, DetDataSample], + } + } + + In order to reuse `DetDataPreprocessor` for the data + from different branches, the format of multi-branch data + grouped by branch is as below : + + .. code-block:: none + { + 'sup': + { + 'inputs': [Tensor] + 'data_sample': [DetDataSample, DetDataSample] + }, + 'unsup_teacher': + { + 'inputs': [Tensor, Tensor] + 'data_sample': [DetDataSample, DetDataSample] + }, + 'unsup_student': + { + 'inputs': [Tensor, Tensor] + 'data_sample': [DetDataSample, DetDataSample] + }, + } + + After preprocessing data from different branches, + the multi-branch data needs to be reformatted as: + + .. 
code-block:: none + { + 'inputs': + { + 'sup': [Tensor], + 'unsup_teacher': [Tensor, Tensor], + 'unsup_student': [Tensor, Tensor], + }, + 'data_sample': + { + 'sup': [DetDataSample], + 'unsup_teacher': [DetDataSample, DetDataSample], + 'unsup_student': [DetDataSample, DetDataSample], + } + } + + Args: + data_preprocessor (:obj:`ConfigDict` or dict): Config of + :class:`DetDataPreprocessor` to process the input data. + """ + + def __init__(self, data_preprocessor: ConfigType) -> None: + super().__init__() + self.data_preprocessor = MODELS.build(data_preprocessor) + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization,padding and bgr2rgb conversion based on + ``BaseDataPreprocessor`` for multi-branch data. + + Args: + data (dict): Data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: + + - 'inputs' (Dict[str, obj:`torch.Tensor`]): The forward data of + models from different branches. + - 'data_sample' (Dict[str, obj:`DetDataSample`]): The annotation + info of the sample from different branches. + """ + + if training is False: + return self.data_preprocessor(data, training) + + # Filter out branches with a value of None + for key in data.keys(): + for branch in data[key].keys(): + data[key][branch] = list( + filter(lambda x: x is not None, data[key][branch])) + + # Group data by branch + multi_branch_data = {} + for key in data.keys(): + for branch in data[key].keys(): + if multi_branch_data.get(branch, None) is None: + multi_branch_data[branch] = {key: data[key][branch]} + elif multi_branch_data[branch].get(key, None) is None: + multi_branch_data[branch][key] = data[key][branch] + else: + multi_branch_data[branch][key].append(data[key][branch]) + + # Preprocess data from different branches + for branch, _data in multi_branch_data.items(): + multi_branch_data[branch] = self.data_preprocessor(_data, training) + + # Format data by inputs and data_samples + format_data = {} + for branch in multi_branch_data.keys(): + for key in multi_branch_data[branch].keys(): + if format_data.get(key, None) is None: + format_data[key] = {branch: multi_branch_data[branch][key]} + elif format_data[key].get(branch, None) is None: + format_data[key][branch] = multi_branch_data[branch][key] + else: + format_data[key][branch].append( + multi_branch_data[branch][key]) + + return format_data + + @property + def device(self): + return self.data_preprocessor.device + + def to(self, device: Optional[Union[int, torch.device]], *args, + **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Args: + device (int or torch.device, optional): The desired device of the + parameters and buffers in this module. + + Returns: + nn.Module: The model itself. + """ + + return self.data_preprocessor.to(device, *args, **kwargs) + + def cuda(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + + return self.data_preprocessor.cuda(*args, **kwargs) + + def cpu(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + + return self.data_preprocessor.cpu(*args, **kwargs) + + +@MODELS.register_module() +class BatchResize(nn.Module): + """Batch resize during training. This implementation is modified from + https://github.com/Purkialo/CrowdDet/blob/master/lib/data/CrowdHuman.py. 
+ + It provides the data pre-processing as follows: + - A batch of all images will pad to a uniform size and stack them into + a torch.Tensor by `DetDataPreprocessor`. + - `BatchFixShapeResize` resize all images to the target size. + - Padding images to make sure the size of image can be divisible by + ``pad_size_divisor``. + + Args: + scale (tuple): Images scales for resizing. + pad_size_divisor (int): Image size divisible factor. + Defaults to 1. + pad_value (Number): The padded pixel value. Defaults to 0. + """ + + def __init__( + self, + scale: tuple, + pad_size_divisor: int = 1, + pad_value: Union[float, int] = 0, + ) -> None: + super().__init__() + self.min_size = min(scale) + self.max_size = max(scale) + self.pad_size_divisor = pad_size_divisor + self.pad_value = pad_value + + def forward( + self, inputs: Tensor, data_samples: List[DetDataSample] + ) -> Tuple[Tensor, List[DetDataSample]]: + """resize a batch of images and bboxes.""" + + batch_height, batch_width = inputs.shape[-2:] + target_height, target_width, scale = self.get_target_size( + batch_height, batch_width) + + inputs = F.interpolate( + inputs, + size=(target_height, target_width), + mode='bilinear', + align_corners=False) + + inputs = self.get_padded_tensor(inputs, self.pad_value) + + if data_samples is not None: + batch_input_shape = tuple(inputs.size()[-2:]) + for data_sample in data_samples: + img_shape = [ + int(scale * _) for _ in list(data_sample.img_shape) + ] + data_sample.set_metainfo({ + 'img_shape': tuple(img_shape), + 'batch_input_shape': batch_input_shape, + 'pad_shape': batch_input_shape, + 'scale_factor': (scale, scale) + }) + + data_sample.gt_instances.bboxes *= scale + data_sample.ignored_instances.bboxes *= scale + + return inputs, data_samples + + def get_target_size(self, height: int, + width: int) -> Tuple[int, int, float]: + """Get the target size of a batch of images based on data and scale.""" + im_size_min = np.min([height, width]) + im_size_max = np.max([height, width]) + scale = self.min_size / im_size_min + if scale * im_size_max > self.max_size: + scale = self.max_size / im_size_max + target_height, target_width = int(round(height * scale)), int( + round(width * scale)) + return target_height, target_width, scale + + def get_padded_tensor(self, tensor: Tensor, pad_value: int) -> Tensor: + """Pad images according to pad_size_divisor.""" + assert tensor.ndim == 4 + target_height, target_width = tensor.shape[-2], tensor.shape[-1] + divisor = self.pad_size_divisor + padded_height = (target_height + divisor - 1) // divisor * divisor + padded_width = (target_width + divisor - 1) // divisor * divisor + padded_tensor = torch.ones([ + tensor.shape[0], tensor.shape[1], padded_height, padded_width + ]) * pad_value + padded_tensor = padded_tensor.type_as(tensor) + padded_tensor[:, :, :target_height, :target_width] = tensor + return padded_tensor + + +@MODELS.register_module() +class BoxInstDataPreprocessor(DetDataPreprocessor): + """Pseudo mask pre-processor for BoxInst. + + Comparing with the :class:`mmdet.DetDataPreprocessor`, + + 1. It generates masks using box annotations. + 2. It computes the images color similarity in LAB color space. + + Args: + mask_stride (int): The mask output stride in boxinst. Defaults to 4. + pairwise_size (int): The size of neighborhood for each pixel. + Defaults to 3. + pairwise_dilation (int): The dilation of neighborhood for each pixel. + Defaults to 2. + pairwise_color_thresh (float): The thresh of image color similarity. + Defaults to 0.3. 
+ bottom_pixels_removed (int): The length of removed pixels in bottom. + It is caused by the annotation error in coco dataset. + Defaults to 10. + """ + + def __init__(self, + *arg, + mask_stride: int = 4, + pairwise_size: int = 3, + pairwise_dilation: int = 2, + pairwise_color_thresh: float = 0.3, + bottom_pixels_removed: int = 10, + **kwargs) -> None: + super().__init__(*arg, **kwargs) + self.mask_stride = mask_stride + self.pairwise_size = pairwise_size + self.pairwise_dilation = pairwise_dilation + self.pairwise_color_thresh = pairwise_color_thresh + self.bottom_pixels_removed = bottom_pixels_removed + + if skimage is None: + raise RuntimeError('skimage is not installed,\ + please install it by: pip install scikit-image') + + def get_images_color_similarity(self, inputs: Tensor, + image_masks: Tensor) -> Tensor: + """Compute the image color similarity in LAB color space.""" + assert inputs.dim() == 4 + assert inputs.size(0) == 1 + + unfolded_images = unfold_wo_center( + inputs, + kernel_size=self.pairwise_size, + dilation=self.pairwise_dilation) + diff = inputs[:, :, None] - unfolded_images + similarity = torch.exp(-torch.norm(diff, dim=1) * 0.5) + + unfolded_weights = unfold_wo_center( + image_masks[None, None], + kernel_size=self.pairwise_size, + dilation=self.pairwise_dilation) + unfolded_weights = torch.max(unfolded_weights, dim=1)[0] + + return similarity * unfolded_weights + + def forward(self, data: dict, training: bool = False) -> dict: + """Get pseudo mask labels using color similarity.""" + det_data = super().forward(data, training) + inputs, data_samples = det_data['inputs'], det_data['data_samples'] + + if training: + # get image masks and remove bottom pixels + b_img_h, b_img_w = data_samples[0].batch_input_shape + img_masks = [] + for i in range(inputs.shape[0]): + img_h, img_w = data_samples[i].img_shape + img_mask = inputs.new_ones((img_h, img_w)) + pixels_removed = int(self.bottom_pixels_removed * + float(img_h) / float(b_img_h)) + if pixels_removed > 0: + img_mask[-pixels_removed:, :] = 0 + pad_w = b_img_w - img_w + pad_h = b_img_h - img_h + img_mask = F.pad(img_mask, (0, pad_w, 0, pad_h), 'constant', + 0.) + img_masks.append(img_mask) + img_masks = torch.stack(img_masks, dim=0) + start = int(self.mask_stride // 2) + img_masks = img_masks[:, start::self.mask_stride, + start::self.mask_stride] + + # Get origin rgb image for color similarity + ori_imgs = inputs * self.std + self.mean + downsampled_imgs = F.avg_pool2d( + ori_imgs.float(), + kernel_size=self.mask_stride, + stride=self.mask_stride, + padding=0) + + # Compute color similarity for pseudo mask generation + for im_i, data_sample in enumerate(data_samples): + # TODO: Support rgb2lab in mmengine? 
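+                # ``skimage.color.rgb2lab`` expects an (H, W, 3) array, so
+                # the CHW tensor is cast to uint8 and moved to HWC on CPU.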
+ images_lab = skimage.color.rgb2lab( + downsampled_imgs[im_i].byte().permute(1, 2, + 0).cpu().numpy()) + images_lab = torch.as_tensor( + images_lab, device=ori_imgs.device, dtype=torch.float32) + images_lab = images_lab.permute(2, 0, 1)[None] + images_color_similarity = self.get_images_color_similarity( + images_lab, img_masks[im_i]) + pairwise_mask = (images_color_similarity >= + self.pairwise_color_thresh).float() + + per_im_bboxes = data_sample.gt_instances.bboxes + if per_im_bboxes.shape[0] > 0: + per_im_masks = [] + for per_box in per_im_bboxes: + mask_full = torch.zeros((b_img_h, b_img_w), + device=self.device).float() + mask_full[int(per_box[1]):int(per_box[3] + 1), + int(per_box[0]):int(per_box[2] + 1)] = 1.0 + per_im_masks.append(mask_full) + per_im_masks = torch.stack(per_im_masks, dim=0) + pairwise_masks = torch.cat( + [pairwise_mask for _ in range(per_im_bboxes.shape[0])], + dim=0) + else: + per_im_masks = torch.zeros((0, b_img_h, b_img_w)) + pairwise_masks = torch.zeros( + (0, self.pairwise_size**2 - 1, b_img_h, b_img_w)) + + # TODO: Support BitmapMasks with tensor? + data_sample.gt_instances.masks = BitmapMasks( + per_im_masks.cpu().numpy(), b_img_h, b_img_w) + data_sample.gt_instances.pairwise_masks = pairwise_masks + return {'inputs': inputs, 'data_samples': data_samples} diff --git a/head_extractor/build/lib/mmdet/models/data_preprocessors/reid_data_preprocessor.py b/head_extractor/build/lib/mmdet/models/data_preprocessors/reid_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..3d0a1d45d97ba350e8845c6620f3b73f05545e61 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/data_preprocessors/reid_data_preprocessor.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from numbers import Number +from typing import Optional, Sequence + +import torch +import torch.nn.functional as F +from mmengine.model import BaseDataPreprocessor, stack_batch + +from mmdet.registry import MODELS + +try: + import mmpretrain + from mmpretrain.models.utils.batch_augments import RandomBatchAugment + from mmpretrain.structures import (batch_label_to_onehot, cat_batch_labels, + tensor_split) +except ImportError: + mmpretrain = None + + +def stack_batch_scores(elements, device=None): + """Stack the ``score`` of a batch of :obj:`LabelData` to a tensor. + + Args: + elements (List[LabelData]): A batch of :obj`LabelData`. + device (torch.device, optional): The output device of the batch label. + Defaults to None. + Returns: + torch.Tensor: The stacked score tensor. + """ + item = elements[0] + if 'score' not in item._data_fields: + return None + + batch_score = torch.stack([element.score for element in elements]) + if device is not None: + batch_score = batch_score.to(device) + return batch_score + + +@MODELS.register_module() +class ReIDDataPreprocessor(BaseDataPreprocessor): + """Image pre-processor for classification tasks. + + Comparing with the :class:`mmengine.model.ImgDataPreprocessor`, + + 1. It won't do normalization if ``mean`` is not specified. + 2. It does normalization and color space conversion after stacking batch. + 3. It supports batch augmentations like mixup and cutmix. + + It provides the data pre-processing as follows + + - Collate and move data to the target device. + - Pad inputs to the maximum size of current batch with defined + ``pad_value``. The padding size can be divisible by a defined + ``pad_size_divisor`` + - Stack inputs to batch_inputs. 
+ - Convert inputs from bgr to rgb if the shape of input is (3, H, W). + - Normalize image with defined std and mean. + - Do batch augmentations like Mixup and Cutmix during training. + + Args: + mean (Sequence[Number], optional): The pixel mean of R, G, B channels. + Defaults to None. + std (Sequence[Number], optional): The pixel standard deviation of + R, G, B channels. Defaults to None. + pad_size_divisor (int): The size of padded image should be + divisible by ``pad_size_divisor``. Defaults to 1. + pad_value (Number): The padded pixel value. Defaults to 0. + to_rgb (bool): whether to convert image from BGR to RGB. + Defaults to False. + to_onehot (bool): Whether to generate one-hot format gt-labels and set + to data samples. Defaults to False. + num_classes (int, optional): The number of classes. Defaults to None. + batch_augments (dict, optional): The batch augmentations settings, + including "augments" and "probs". For more details, see + :class:`mmpretrain.models.RandomBatchAugment`. + """ + + def __init__(self, + mean: Sequence[Number] = None, + std: Sequence[Number] = None, + pad_size_divisor: int = 1, + pad_value: Number = 0, + to_rgb: bool = False, + to_onehot: bool = False, + num_classes: Optional[int] = None, + batch_augments: Optional[dict] = None): + if mmpretrain is None: + raise RuntimeError('Please run "pip install openmim" and ' + 'run "mim install mmpretrain" to ' + 'install mmpretrain first.') + super().__init__() + self.pad_size_divisor = pad_size_divisor + self.pad_value = pad_value + self.to_rgb = to_rgb + self.to_onehot = to_onehot + self.num_classes = num_classes + + if mean is not None: + assert std is not None, 'To enable the normalization in ' \ + 'preprocessing, please specify both `mean` and `std`.' + # Enable the normalization in preprocessing. + self._enable_normalize = True + self.register_buffer('mean', + torch.tensor(mean).view(-1, 1, 1), False) + self.register_buffer('std', + torch.tensor(std).view(-1, 1, 1), False) + else: + self._enable_normalize = False + + if batch_augments is not None: + self.batch_augments = RandomBatchAugment(**batch_augments) + if not self.to_onehot: + from mmengine.logging import MMLogger + MMLogger.get_current_instance().info( + 'Because batch augmentations are enabled, the data ' + 'preprocessor automatically enables the `to_onehot` ' + 'option to generate one-hot format labels.') + self.to_onehot = True + else: + self.batch_augments = None + + def forward(self, data: dict, training: bool = False) -> dict: + """Perform normalization, padding, bgr2rgb conversion and batch + augmentation based on ``BaseDataPreprocessor``. + + Args: + data (dict): data sampled from dataloader. + training (bool): Whether to enable training time augmentation. + + Returns: + dict: Data in the same format as the model input. + """ + inputs = self.cast_data(data['inputs']) + + if isinstance(inputs, torch.Tensor): + # The branch if use `default_collate` as the collate_fn in the + # dataloader. 
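+            # (Editor note, illustrative) in this branch `inputs` is already
+            # a batched (N, C, H, W) tensor produced by `default_collate`,
+            # so the conversions below are vectorized over the whole batch.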
+ + # ------ To RGB ------ + if self.to_rgb and inputs.size(1) == 3: + inputs = inputs.flip(1) + + # -- Normalization --- + inputs = inputs.float() + if self._enable_normalize: + inputs = (inputs - self.mean) / self.std + + # ------ Padding ----- + if self.pad_size_divisor > 1: + h, w = inputs.shape[-2:] + + target_h = math.ceil( + h / self.pad_size_divisor) * self.pad_size_divisor + target_w = math.ceil( + w / self.pad_size_divisor) * self.pad_size_divisor + pad_h = target_h - h + pad_w = target_w - w + inputs = F.pad(inputs, (0, pad_w, 0, pad_h), 'constant', + self.pad_value) + else: + # The branch if use `pseudo_collate` as the collate_fn in the + # dataloader. + + processed_inputs = [] + for input_ in inputs: + # ------ To RGB ------ + if self.to_rgb and input_.size(0) == 3: + input_ = input_.flip(0) + + # -- Normalization --- + input_ = input_.float() + if self._enable_normalize: + input_ = (input_ - self.mean) / self.std + + processed_inputs.append(input_) + # Combine padding and stack + inputs = stack_batch(processed_inputs, self.pad_size_divisor, + self.pad_value) + + data_samples = data.get('data_samples', None) + sample_item = data_samples[0] if data_samples is not None else None + if 'gt_label' in sample_item: + gt_labels = [sample.gt_label for sample in data_samples] + gt_labels_tensor = [gt_label.label for gt_label in gt_labels] + batch_label, label_indices = cat_batch_labels(gt_labels_tensor) + batch_label = batch_label.to(self.device) + + batch_score = stack_batch_scores(gt_labels, device=self.device) + if batch_score is None and self.to_onehot: + assert batch_label is not None, \ + 'Cannot generate onehot format labels because no labels.' + num_classes = self.num_classes or data_samples[0].get( + 'num_classes') + assert num_classes is not None, \ + 'Cannot generate one-hot format labels because not set ' \ + '`num_classes` in `data_preprocessor`.' + batch_score = batch_label_to_onehot(batch_label, label_indices, + num_classes) + + # ----- Batch Augmentations ---- + if training and self.batch_augments is not None: + inputs, batch_score = self.batch_augments(inputs, batch_score) + + # ----- scatter labels and scores to data samples --- + if batch_label is not None: + for sample, label in zip( + data_samples, tensor_split(batch_label, + label_indices)): + sample.set_gt_label(label) + if batch_score is not None: + for sample, score in zip(data_samples, batch_score): + sample.set_gt_score(score) + + return {'inputs': inputs, 'data_samples': data_samples} diff --git a/head_extractor/build/lib/mmdet/models/data_preprocessors/track_data_preprocessor.py b/head_extractor/build/lib/mmdet/models/data_preprocessors/track_data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..40a65b8eaebacdaddd574768fbb00e8c5a072d85 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/data_preprocessors/track_data_preprocessor.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence, Union + +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.model.utils import stack_batch + +from mmdet.models.utils.misc import samplelist_boxtype2tensor +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample +from mmdet.structures.mask import BitmapMasks +from .data_preprocessor import DetDataPreprocessor + + +@MODELS.register_module() +class TrackDataPreprocessor(DetDataPreprocessor): + """Image pre-processor for tracking tasks. 
+
+    Accepts the data sampled by the dataloader, and preprocesses
+    it into the format of the model input. ``TrackDataPreprocessor``
+    provides the tracking data pre-processing as follows:
+
+    - Collate and move data to the target device.
+    - Pad inputs to the maximum size of the current batch with the defined
+      ``pad_value``. The padded size is made divisible by the defined
+      ``pad_size_divisor``.
+    - Stack inputs into a batched tensor.
+    - Convert inputs from bgr to rgb if the shape of input is (1, 3, H, W).
+    - Normalize image with defined std and mean.
+    - Do batch augmentations during training.
+    - Record the information of ``batch_input_shape`` and ``pad_shape``.
+
+    Args:
+        mean (Sequence[Number], optional): The pixel mean of R, G, B
+            channels. Defaults to None.
+        std (Sequence[Number], optional): The pixel standard deviation of
+            R, G, B channels. Defaults to None.
+        pad_size_divisor (int): The size of padded image should be
+            divisible by ``pad_size_divisor``. Defaults to 1.
+        pad_value (Number): The padded pixel value. Defaults to 0.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
+            Defaults to False.
+        use_det_processor (bool): Whether to use ``DetDataPreprocessor``
+            in the training phase. This is mainly for tracking models that
+            are fed a single image rather than a group of images during
+            training. Defaults to False.
+        boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
+            bboxes data to ``Tensor`` type. Defaults to True.
+        batch_augments (list[dict], optional): Batch-level augmentations.
+    """
+
+    def __init__(self,
+                 mean: Optional[Sequence[Union[float, int]]] = None,
+                 std: Optional[Sequence[Union[float, int]]] = None,
+                 use_det_processor: bool = False,
+                 **kwargs):
+        super().__init__(mean=mean, std=std, **kwargs)
+        self.use_det_processor = use_det_processor
+        if mean is not None and not self.use_det_processor:
+            # overwrite the ``register_buffer`` in ``ImgDataPreprocessor``
+            # since the shape of ``mean`` and ``std`` in tracking tasks must
+            # be (T, C, H, W), where T is the temporal length of the video.
+            self.register_buffer('mean',
+                                 torch.tensor(mean).view(1, -1, 1, 1), False)
+            self.register_buffer('std',
+                                 torch.tensor(std).view(1, -1, 1, 1), False)
+
+    def forward(self, data: dict, training: bool = False) -> Dict:
+        """Perform normalization, padding and bgr2rgb conversion based on
+        ``TrackDataPreprocessor``.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+
+        Returns:
+            Dict: Data in the same format as the model input.
+        """
+        if self.use_det_processor and training:
+            batch_pad_shape = self._get_pad_shape(data)
+        else:
+            batch_pad_shape = self._get_track_pad_shape(data)
+
+        data = self.cast_data(data)
+        imgs, data_samples = data['inputs'], data['data_samples']
+
+        if self.use_det_processor and training:
+            assert imgs[0].dim() == 3, \
+                'Only 3-dim inputs are supported when using the det ' \
+                'preprocessor in training'
+            if self._channel_conversion:
+                imgs = [_img[[2, 1, 0], ...] for _img in imgs]
+            # Convert to `float`
+            imgs = [_img.float() for _img in imgs]
+            if self._enable_normalize:
+                imgs = [(_img - self.mean) / self.std for _img in imgs]
+            inputs = stack_batch(imgs, self.pad_size_divisor, self.pad_value)
+        else:
+            assert imgs[0].dim() == 4, \
+                'Only 4-dim inputs are supported when using the track ' \
+                'processor in training'
+            # The shape of imgs[0] is (T, C, H, W).
+            channel = imgs[0].size(1)
+            if self._channel_conversion and channel == 3:
+                imgs = [_img[:, [2, 1, 0], ...] for _img in imgs]
+            # Convert to `float`
+            imgs = [_img.float() for _img in imgs]
+            if self._enable_normalize:
+                imgs = [(_img - self.mean) / self.std for _img in imgs]
+            inputs = stack_track_batch(imgs, self.pad_size_divisor,
+                                       self.pad_value)
+
+        if data_samples is not None:
+            # NOTE the batched image size information may be useful, e.g.
+            # in DETR, this is needed for the construction of masks, which is
+            # then used for the transformer_head.
+            batch_input_shape = tuple(inputs.size()[-2:])
+            if self.use_det_processor and training:
+                for data_sample, pad_shape in zip(data_samples,
+                                                  batch_pad_shape):
+                    data_sample.set_metainfo({
+                        'batch_input_shape': batch_input_shape,
+                        'pad_shape': pad_shape
+                    })
+                if self.boxtype2tensor:
+                    samplelist_boxtype2tensor(data_samples)
+                if self.pad_mask:
+                    self.pad_gt_masks(data_samples)
+            else:
+                for track_data_sample, pad_shapes in zip(
+                        data_samples, batch_pad_shape):
+                    for i in range(len(track_data_sample)):
+                        det_data_sample = track_data_sample[i]
+                        det_data_sample.set_metainfo({
+                            'batch_input_shape': batch_input_shape,
+                            'pad_shape': pad_shapes[i]
+                        })
+                if self.pad_mask and training:
+                    self.pad_track_gt_masks(data_samples)
+
+        if training and self.batch_augments is not None:
+            for batch_aug in self.batch_augments:
+                if self.use_det_processor and training:
+                    inputs, data_samples = batch_aug(inputs, data_samples)
+                else:
+                    # We only support T==1 when using batch augments.
+                    # Only YOLOX needs batch_aug, and YOLOX can only process
+                    # (N, C, H, W) shaped inputs.
+                    # The shape of `inputs` is (N, T, C, H, W), hence, we use
+                    # inputs[:, 0] to change the shape to (N, C, H, W).
+                    assert inputs.size(1) == 1 and len(data_samples[0]) == 1, \
+                        'Batch augmentation is only supported when the ' \
+                        'number of sequence images equals 1.'
+                    det_data_samples = [
+                        track_data_sample[0]
+                        for track_data_sample in data_samples
+                    ]
+                    aug_inputs, aug_det_samples = batch_aug(
+                        inputs[:, 0], det_data_samples)
+                    inputs = aug_inputs.unsqueeze(1)
+                    for track_data_sample, det_sample in zip(
+                            data_samples, aug_det_samples):
+                        track_data_sample.video_data_samples = [det_sample]
+
+        # Note: inputs may contain a large number of frames, so we must make
+        # sure that the memory is contiguous for a stable forward pass
+        inputs = inputs.contiguous()
+
+        return dict(inputs=inputs, data_samples=data_samples)
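+
+    # (Editor note, illustrative) in the tracking branch each element of
+    # ``data['inputs']`` is a (T, C, H, W) clip; after `stack_track_batch`
+    # the model receives one contiguous (N, T, C, H, W) tensor, e.g. two
+    # 2-frame RGB clips -> (2, 2, 3, H, W).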
+
+    def _get_track_pad_shape(self, data: dict) -> List[List[tuple]]:
+        """Get the pad_shape of each image based on data and pad_size_divisor.
+
+        Args:
+            data (dict): Data sampled from dataloader.
+
+        Returns:
+            List[List[tuple]]: The padded shapes, one list of (pad_h, pad_w)
+            tuples per sample in the batch.
+        """
+        batch_pad_shape = []
+        for imgs in data['inputs']:
+            # The sequence images in one sample among a batch have the same
+            # original shape
+            pad_h = int(np.ceil(imgs.shape[-2] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            pad_w = int(np.ceil(imgs.shape[-1] /
+                        self.pad_size_divisor)) * self.pad_size_divisor
+            pad_shapes = [(pad_h, pad_w)] * imgs.size(0)
+            batch_pad_shape.append(pad_shapes)
+        return batch_pad_shape
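+
+    # (Editor note, illustrative) e.g. with pad_size_divisor=32, a clip of
+    # shape (T, C, H, W) = (2, 3, 100, 120) yields pad_shapes
+    # [(128, 128), (128, 128)]: ceil(100 / 32) * 32 = 128 and
+    # ceil(120 / 32) * 32 = 128, one tuple per frame.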
+ """ + batch_pad_shape = dict() + batch_pad_shape = [] + for imgs in data['inputs']: + # The sequence images in one sample among a batch have the same + # original shape + pad_h = int(np.ceil(imgs.shape[-2] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_w = int(np.ceil(imgs.shape[-1] / + self.pad_size_divisor)) * self.pad_size_divisor + pad_shapes = [(pad_h, pad_w)] * imgs.size(0) + batch_pad_shape.append(pad_shapes) + return batch_pad_shape + + def pad_track_gt_masks(self, + data_samples: Sequence[TrackDataSample]) -> None: + """Pad gt_masks to shape of batch_input_shape.""" + if 'masks' in data_samples[0][0].get('gt_instances', None): + for track_data_sample in data_samples: + for i in range(len(track_data_sample)): + det_data_sample = track_data_sample[i] + masks = det_data_sample.gt_instances.masks + # TODO: whether to use BitmapMasks + assert isinstance(masks, BitmapMasks) + batch_input_shape = det_data_sample.batch_input_shape + det_data_sample.gt_instances.masks = masks.pad( + batch_input_shape, pad_val=self.mask_pad_value) + + +def stack_track_batch(tensors: List[torch.Tensor], + pad_size_divisor: int = 0, + pad_value: Union[int, float] = 0) -> torch.Tensor: + """Stack multiple tensors to form a batch and pad the images to the max + shape use the right bottom padding mode in these images. If + ``pad_size_divisor > 0``, add padding to ensure the common height and width + is divisible by ``pad_size_divisor``. The difference between this function + and ``stack_batch`` in MMEngine is that this function can process batch + sequence images with shape (N, T, C, H, W). + + Args: + tensors (List[Tensor]): The input multiple tensors. each is a + TCHW 4D-tensor. T denotes the number of key/reference frames. + pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding + to ensure the common height and width is divisible by + ``pad_size_divisor``. This depends on the model, and many + models need a divisibility of 32. Defaults to 0 + pad_value (int, float): The padding value. Defaults to 0 + + Returns: + Tensor: The NTCHW 5D-tensor. N denotes the batch size. 
+ """ + assert isinstance(tensors, list), \ + f'Expected input type to be list, but got {type(tensors)}' + assert len(set([tensor.ndim for tensor in tensors])) == 1, \ + f'Expected the dimensions of all tensors must be the same, ' \ + f'but got {[tensor.ndim for tensor in tensors]}' + assert tensors[0].ndim == 4, f'Expected tensor dimension to be 4, ' \ + f'but got {tensors[0].ndim}' + assert len(set([tensor.shape[0] for tensor in tensors])) == 1, \ + f'Expected the channels of all tensors must be the same, ' \ + f'but got {[tensor.shape[0] for tensor in tensors]}' + + tensor_sizes = [(tensor.shape[-2], tensor.shape[-1]) for tensor in tensors] + max_size = np.stack(tensor_sizes).max(0) + + if pad_size_divisor > 1: + # the last two dims are H,W, both subject to divisibility requirement + max_size = ( + max_size + + (pad_size_divisor - 1)) // pad_size_divisor * pad_size_divisor + + padded_samples = [] + for tensor in tensors: + padding_size = [ + 0, max_size[-1] - tensor.shape[-1], 0, + max_size[-2] - tensor.shape[-2] + ] + if sum(padding_size) == 0: + padded_samples.append(tensor) + else: + padded_samples.append(F.pad(tensor, padding_size, value=pad_value)) + + return torch.stack(padded_samples, dim=0) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/__init__.py b/head_extractor/build/lib/mmdet/models/dense_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9b55ec2a4230a741e9a2c696ec434bf9cc8bafa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/__init__.py @@ -0,0 +1,72 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .anchor_free_head import AnchorFreeHead +from .anchor_head import AnchorHead +from .atss_head import ATSSHead +from .atss_vlfusion_head import ATSSVLFusionHead +from .autoassign_head import AutoAssignHead +from .boxinst_head import BoxInstBboxHead, BoxInstMaskHead +from .cascade_rpn_head import CascadeRPNHead, StageCascadeRPNHead +from .centernet_head import CenterNetHead +from .centernet_update_head import CenterNetUpdateHead +from .centripetal_head import CentripetalHead +from .condinst_head import CondInstBboxHead, CondInstMaskHead +from .conditional_detr_head import ConditionalDETRHead +from .corner_head import CornerHead +from .dab_detr_head import DABDETRHead +from .ddod_head import DDODHead +from .ddq_detr_head import DDQDETRHead +from .deformable_detr_head import DeformableDETRHead +from .detr_head import DETRHead +from .dino_head import DINOHead +from .embedding_rpn_head import EmbeddingRPNHead +from .fcos_head import FCOSHead +from .fovea_head import FoveaHead +from .free_anchor_retina_head import FreeAnchorRetinaHead +from .fsaf_head import FSAFHead +from .ga_retina_head import GARetinaHead +from .ga_rpn_head import GARPNHead +from .gfl_head import GFLHead +from .grounding_dino_head import GroundingDINOHead +from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead +from .lad_head import LADHead +from .ld_head import LDHead +from .mask2former_head import Mask2FormerHead +from .maskformer_head import MaskFormerHead +from .nasfcos_head import NASFCOSHead +from .paa_head import PAAHead +from .pisa_retinanet_head import PISARetinaHead +from .pisa_ssd_head import PISASSDHead +from .reppoints_head import RepPointsHead +from .retina_head import RetinaHead +from .retina_sepbn_head import RetinaSepBNHead +from .rpn_head import RPNHead +from .rtmdet_head import RTMDetHead, RTMDetSepBNHead +from .rtmdet_ins_head import RTMDetInsHead, RTMDetInsSepBNHead +from .sabl_retina_head import 
SABLRetinaHead +from .solo_head import DecoupledSOLOHead, DecoupledSOLOLightHead, SOLOHead +from .solov2_head import SOLOV2Head +from .ssd_head import SSDHead +from .tood_head import TOODHead +from .vfnet_head import VFNetHead +from .yolact_head import YOLACTHead, YOLACTProtonet +from .yolo_head import YOLOV3Head +from .yolof_head import YOLOFHead +from .yolox_head import YOLOXHead + +__all__ = [ + 'AnchorFreeHead', 'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption', + 'RPNHead', 'GARPNHead', 'RetinaHead', 'RetinaSepBNHead', 'GARetinaHead', + 'SSDHead', 'FCOSHead', 'RepPointsHead', 'FoveaHead', + 'FreeAnchorRetinaHead', 'ATSSHead', 'FSAFHead', 'NASFCOSHead', + 'PISARetinaHead', 'PISASSDHead', 'GFLHead', 'CornerHead', 'YOLACTHead', + 'YOLACTProtonet', 'YOLOV3Head', 'PAAHead', 'SABLRetinaHead', + 'CentripetalHead', 'VFNetHead', 'StageCascadeRPNHead', 'CascadeRPNHead', + 'EmbeddingRPNHead', 'LDHead', 'AutoAssignHead', 'DETRHead', 'YOLOFHead', + 'DeformableDETRHead', 'CenterNetHead', 'YOLOXHead', 'SOLOHead', + 'DecoupledSOLOHead', 'DecoupledSOLOLightHead', 'SOLOV2Head', 'LADHead', + 'TOODHead', 'MaskFormerHead', 'Mask2FormerHead', 'DDODHead', + 'CenterNetUpdateHead', 'RTMDetHead', 'RTMDetSepBNHead', 'CondInstBboxHead', + 'CondInstMaskHead', 'RTMDetInsHead', 'RTMDetInsSepBNHead', + 'BoxInstBboxHead', 'BoxInstMaskHead', 'ConditionalDETRHead', 'DINOHead', + 'ATSSVLFusionHead', 'DABDETRHead', 'DDQDETRHead', 'GroundingDINOHead' +] diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/anchor_free_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/anchor_free_head.py new file mode 100644 index 0000000000000000000000000000000000000000..90a9b3625b8fef12a2ee3a964c89597b597cb2ec --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/anchor_free_head.py @@ -0,0 +1,317 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import Any, List, Sequence, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..utils import multi_apply +from .base_dense_head import BaseDenseHead + +StrideType = Union[Sequence[int], Sequence[Tuple[int, int]]] + + +@MODELS.register_module() +class AnchorFreeHead(BaseDenseHead): + """Anchor-free head (FCOS, Fovea, RepPoints, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + stacked_convs (int): Number of stacking convs of the head. + strides (Sequence[int] or Sequence[Tuple[int, int]]): Downsample + factor of each feature map. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Default: "auto". + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. Defaults + 'DistancePointBBoxCoder'. + conv_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for + convolution layer. Defaults to None. 
+ norm_cfg (:obj:`ConfigDict` or dict, Optional): Config dict for + normalization layer. Defaults to None. + train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of + anchor-free head. + test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of + anchor-free head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. + """ # noqa: W605 + + _version = 1 + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + strides: StrideType = (4, 8, 16, 32, 64), + dcn_on_last_conv: bool = False, + conv_bias: Union[bool, str] = 'auto', + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox: ConfigType = dict(type='IoULoss', loss_weight=1.0), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.bbox_coder = TASK_UTILS.build(bbox_coder) + + self.prior_generator = MlvlPointGenerator(strides) + + # In order to keep a more general interface and be consistent with + # anchor_head. 
We can think of point like one anchor + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + + def _init_cls_convs(self) -> None: + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_reg_convs(self) -> None: + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_predictor(self) -> None: + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Hack some keys of the model state dict so that can load checkpoints + of previous version.""" + version = local_metadata.get('version', None) + if version is None: + # the key is different in early versions + # for example, 'fcos_cls' become 'conv_cls' now + bbox_head_keys = [ + k for k in state_dict.keys() if k.startswith(prefix) + ] + ori_predictor_keys = [] + new_predictor_keys = [] + # e.g. 'fcos_cls' or 'fcos_reg' + for key in bbox_head_keys: + ori_predictor_keys.append(key) + key = key.split('.') + if len(key) < 2: + conv_name = None + elif key[1].endswith('cls'): + conv_name = 'conv_cls' + elif key[1].endswith('reg'): + conv_name = 'conv_reg' + elif key[1].endswith('centerness'): + conv_name = 'conv_centerness' + else: + conv_name = None + if conv_name is not None: + key[1] = conv_name + new_predictor_keys.append('.'.join(key)) + else: + ori_predictor_keys.pop(-1) + for i in range(len(new_predictor_keys)): + state_dict[new_predictor_keys[i]] = state_dict.pop( + ori_predictor_keys[i]) + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually contain classification scores and bbox predictions. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is \ + num_points * num_classes. 
+ - bbox_preds (list[Tensor]): Box energies / deltas for each scale \ + level, each is a 4D-tensor, the channel number is num_points * 4. + """ + return multi_apply(self.forward_single, x)[:2] + + def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: Scores for each class, bbox predictions, features + after classification and regression conv layers, some + models needs these features like FCOS. + """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + return cls_score, bbox_pred, cls_feat, reg_feat + + @abstractmethod + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + """ + + raise NotImplementedError + + @abstractmethod + def get_targets(self, points: List[Tensor], + batch_gt_instances: InstanceList) -> Any: + """Compute regression, classification and centerness targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + """ + raise NotImplementedError + + # TODO refactor aug_test + def aug_test(self, + aug_batch_feats: List[Tensor], + aug_batch_img_metas: List[List[Tensor]], + rescale: bool = False) -> List[ndarray]: + """Test function with test time augmentation. + + Args: + aug_batch_feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + aug_batch_img_metas (list[list[dict]]): the outer list indicates + test-time augs (multiscale, flip, etc.) and the inner list + indicates images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. 
+ + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes( + aug_batch_feats, aug_batch_img_metas, rescale=rescale) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/anchor_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/anchor_head.py new file mode 100644 index 0000000000000000000000000000000000000000..4578caca818550397875a0df34c128f461e6ec75 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/anchor_head.py @@ -0,0 +1,530 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, cat_boxes, get_box_tensor +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from ..task_modules.prior_generators import (AnchorGenerator, + anchor_inside_flags) +from ..task_modules.samplers import PseudoSampler +from ..utils import images_to_levels, multi_apply, unmap +from .base_dense_head import BaseDenseHead + + +@MODELS.register_module() +class AnchorHead(BaseDenseHead): + """Anchor-based head (RPN, RetinaNet, SSD, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + anchor_generator (dict): Config dict for anchor generator + bbox_coder (dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Default False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ # noqa: W605 + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + anchor_generator: ConfigType = dict( + type='AnchorGenerator', + scales=[8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0)), + reg_decoded_bbox: bool = False, + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = dict( + type='Normal', layer='Conv2d', std=0.01) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + if self.cls_out_channels <= 0: + raise ValueError(f'num_classes={num_classes} is too small') + self.reg_decoded_bbox = reg_decoded_bbox + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.fp16_enabled = False + + self.prior_generator = TASK_UTILS.build(anchor_generator) + + # Usually the numbers of anchors for each level are the same + # except SSD detectors. So it is an int in the most dense + # heads but a list of int in SSDHead + self.num_base_priors = self.prior_generator.num_base_priors[0] + self._init_layers() + + @property + def num_anchors(self) -> int: + warnings.warn('DeprecationWarning: `num_anchors` is deprecated, ' + 'for consistency or also use ' + '`num_base_priors` instead') + return self.prior_generator.num_base_priors[0] + + @property + def anchor_generator(self) -> AnchorGenerator: + warnings.warn('DeprecationWarning: anchor_generator is deprecated, ' + 'please use "prior_generator" instead') + return self.prior_generator + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.conv_cls = nn.Conv2d(self.in_channels, + self.num_base_priors * self.cls_out_channels, + 1) + reg_dim = self.bbox_coder.encode_size + self.conv_reg = nn.Conv2d(self.in_channels, + self.num_base_priors * reg_dim, 1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_base_priors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_base_priors * 4. + """ + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + return cls_score, bbox_pred + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
+ + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_scores (list[Tensor]): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * 4. + """ + return multi_apply(self.forward_single, x) + + def get_anchors(self, + featmap_sizes: List[tuple], + batch_img_metas: List[dict], + device: Union[torch.device, str] = 'cuda') \ + -> Tuple[List[List[Tensor]], List[List[Tensor]]]: + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (torch.device | str): Device for returned tensors. + Defaults to cuda. + + Returns: + tuple: + + - anchor_list (list[list[Tensor]]): Anchors of each image. + - valid_flag_list (list[list[Tensor]]): Valid flags of each + image. + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device) + valid_flag_list.append(multi_level_flags) + + return anchor_list, valid_flag_list + + def _get_targets_single(self, + flat_anchors: Union[Tensor, BaseBoxes], + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in a + single image. + + Args: + flat_anchors (Tensor or :obj:`BaseBoxes`): Multi-level anchors + of the image, which are concatenated into a single tensor + or box type of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors, ). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: + + - labels (Tensor): Labels of each level. + - label_weights (Tensor): Label weights of each level. + - bbox_targets (Tensor): BBox targets of each level. + - bbox_weights (Tensor): BBox weights of each level. + - pos_inds (Tensor): positive samples indexes. + - neg_inds (Tensor): negative samples indexes. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags] + + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + # No sampling is required except for RPN and + # Guided Anchoring algorithms + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + target_dim = gt_instances.bboxes.size(-1) if self.reg_decoded_bbox \ + else self.bbox_coder.encode_size + bbox_targets = anchors.new_zeros(num_valid_anchors, target_dim) + bbox_weights = anchors.new_zeros(num_valid_anchors, target_dim) + + # TODO: Considering saving memory, is it necessary to be long? + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # `bbox_coder.encode` accepts tensor or box type inputs and generates + # tensor targets. If regressing decoded boxes, the code will convert + # box type `pos_bbox_targets` to tensor. + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + else: + pos_bbox_targets = sampling_result.pos_gt_bboxes + pos_bbox_targets = get_box_tensor(pos_bbox_targets) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result) + + def get_targets(self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True, + return_sampling_results: bool = False) -> tuple: + """Compute regression and classification targets for anchors in + multiple images. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - bbox_weights_list (list[Tensor]): BBox weights of each level. + - avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors to a single tensor + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(cat_boxes(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + results = multi_apply( + self._get_targets_single, + concat_anchor_list, + concat_valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + pos_inds_list, neg_inds_list, sampling_results_list) = results[:7] + rest_results = list(results[7:]) # user-added return values + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # update `_raw_positive_infos`, which will be used when calling + # `get_positive_infos`. + self._raw_positive_infos.update(sampling_results=sampling_results_list) + # split targets to a list w.r.t. 
multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + res = (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) + if return_sampling_results: + res = res + (sampling_results_list, ) + for i, r in enumerate(rest_results): # user-added return values + rest_results[i] = images_to_levels(r, num_level_anchors) + + return res + tuple(rest_results) + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchors: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, avg_factor: int) -> tuple: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor + weight shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (N, num_total_anchors, 4). + avg_factor (int): Average factor that is used to average the loss. + + Returns: + tuple: loss components. + """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + # regression loss + target_dim = bbox_targets.size(-1) + bbox_targets = bbox_targets.reshape(-1, target_dim) + bbox_weights = bbox_weights.reshape(-1, target_dim) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, + self.bbox_coder.encode_size) + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + anchors = anchors.reshape(-1, anchors.size(-1)) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + bbox_pred = get_box_tensor(bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) + return loss_cls, loss_bbox + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+        batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+            Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+            data that is ignored during training and testing.
+            Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        (labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor) = cls_reg_targets
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors and flags to a single tensor
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(cat_boxes(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_by_feat_single,
+            cls_scores,
+            bbox_preds,
+            all_anchor_list,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            avg_factor=avg_factor)
+        return dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/atss_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/atss_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2ce71b3eff5e0ed624ec7ae16e8db80c90e8ffa1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/atss_head.py @@ -0,0 +1,524 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import images_to_levels, multi_apply, unmap
+from .anchor_head import AnchorHead
+
+
+@MODELS.register_module()
+class ATSSHead(AnchorHead):
+    """Detection Head of `ATSS <https://arxiv.org/abs/1912.02424>`_.
+
+    The ATSS head structure is similar to FCOS; however, ATSS uses anchor
+    boxes and assigns labels by Adaptive Training Sample Selection instead
+    of max-IoU.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        pred_kernel_size (int): Kernel size of ``nn.Conv2d``.
+        stacked_convs (int): Number of stacking convs of the head.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for
+            convolution layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to ``dict(type='GN', num_groups=32,
+            requires_grad=True)``.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Defaults to True. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        loss_centerness (:obj:`ConfigDict` or dict): Config of centerness loss.
+ Defaults to ``dict(type='CrossEntropyLoss', use_sigmoid=True, + loss_weight=1.0)``. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + pred_kernel_size: int = 3, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + reg_decoded_bbox: bool = True, + loss_centerness: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='atss_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.pred_kernel_size = pred_kernel_size + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + reg_decoded_bbox=reg_decoded_bbox, + init_cfg=init_cfg, + **kwargs) + + self.sampling = False + self.loss_centerness = MODELS.build(loss_centerness) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + pred_pad_size = self.pred_kernel_size // 2 + self.atss_cls = nn.Conv2d( + self.feat_channels, + self.num_anchors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size) + self.atss_reg = nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size) + self.atss_centerness = nn.Conv2d( + self.feat_channels, + self.num_base_priors * 1, + self.pred_kernel_size, + padding=pred_pad_size) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + return multi_apply(self.forward_single, x, self.scales) + + def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. + centerness (Tensor): Centerness for a single scale level, the + channel number is (N, num_anchors * 1, H, W). 
+ """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.atss_cls(cls_feat) + # we just follow atss, not apply exp in bbox_pred + bbox_pred = scale(self.atss_reg(reg_feat)).float() + centerness = self.atss_centerness(reg_feat) + return cls_score, bbox_pred, centerness + + def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, centerness: Tensor, + labels: Tensor, label_weights: Tensor, + bbox_targets: Tensor, avg_factor: float) -> dict: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + centerness = centerness.permute(0, 2, 3, 1).reshape(-1) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # classification loss + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_centerness = centerness[pos_inds] + + centerness_targets = self.centerness_target( + pos_anchors, pos_bbox_targets) + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchors, pos_bbox_pred) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_bbox_targets, + weight=centerness_targets, + avg_factor=1.0) + + # centerness loss + loss_centerness = self.loss_centerness( + pos_centerness, centerness_targets, avg_factor=avg_factor) + + else: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum() + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + centernesses (list[Tensor]): Centerness for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + losses_cls, losses_bbox, loss_centerness, \ + bbox_avg_factor = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + centernesses, + labels_list, + label_weights_list, + bbox_targets_list, + avg_factor=avg_factor) + + bbox_avg_factor = sum(bbox_avg_factor) + bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_centerness=loss_centerness) + + def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor: + """Calculate the centerness between anchors and gts. + + Only calculate pos centerness targets, otherwise there may be nan. + + Args: + anchors (Tensor): Anchors with shape (N, 4), "xyxy" format. + gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Centerness between anchors and gts. + """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + l_ = anchors_cx - gts[:, 0] + t_ = anchors_cy - gts[:, 1] + r_ = gts[:, 2] - anchors_cx + b_ = gts[:, 3] - anchors_cy + + left_right = torch.stack([l_, r_], dim=1) + top_bottom = torch.stack([t_, b_], dim=1) + centerness = torch.sqrt( + (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])) + assert not torch.isnan(centerness).any() + return centerness + + def get_targets(self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get targets for ATSS head. + + This method is almost the same as `AnchorHead.get_targets()`. Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. 
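+
+        Example:
+            A small, hypothetical sketch of the image-to-level regrouping
+            (``images_to_levels``) this method relies on: per-image target
+            tensors are stacked along a batch dimension and split back into
+            per-level tensors.
+
+            >>> import torch
+            >>> from mmdet.models.utils import images_to_levels
+            >>> per_image = [torch.arange(6), torch.arange(6) + 10]
+            >>> per_level = images_to_levels(per_image, [4, 2])
+            >>> [lvl.shape for lvl in per_level]
+            [torch.Size([2, 4]), torch.Size([2, 2])]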
+ """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors (List[int]): Number of anchors of each scale + level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + labels (Tensor): Labels of all anchors in the image with shape + (N,). + label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4) + pos_inds (Tensor): Indices of positive anchor with shape + (num_pos,). 
+ neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, + num_level_anchors_inside, + gt_instances, gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if self.reg_decoded_bbox: + pos_bbox_targets = sampling_result.pos_gt_bboxes + else: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def get_num_level_anchors_inside(self, num_level_anchors, inside_flags): + """Get the number of valid anchors in every level.""" + + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/atss_vlfusion_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/atss_vlfusion_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c5cd28b4a040ba447130aed07629f6312f95dcf3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/atss_vlfusion_head.py @@ -0,0 +1,949 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
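+# A self-contained sanity check of ``ATSSHead.centerness_target`` defined in
+# atss_head.py above (hypothetical numbers, "xyxy" boxes): the target is the
+# sqrt of the min/max ratios of the left/right and top/bottom offsets from
+# the anchor center to the gt box, so a perfectly centered anchor scores 1.
+#
+#     >>> import torch
+#     >>> anchors = torch.tensor([[4., 4., 12., 12.]])
+#     >>> gts = torch.tensor([[0., 0., 16., 16.]])
+#     >>> cx = (anchors[:, 0] + anchors[:, 2]) / 2
+#     >>> cy = (anchors[:, 1] + anchors[:, 3]) / 2
+#     >>> lr = torch.stack([cx - gts[:, 0], gts[:, 2] - cx], dim=1)
+#     >>> tb = torch.stack([cy - gts[:, 1], gts[:, 3] - cy], dim=1)
+#     >>> torch.sqrt((lr.min(-1)[0] / lr.max(-1)[0]) *
+#     ...            (tb.min(-1)[0] / tb.max(-1)[0]))
+#     tensor([1.])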
+import copy +import math +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Scale +from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d +from mmengine.config import ConfigDict +from mmengine.model import BaseModel +from mmengine.structures import InstanceData +from torch import Tensor + +try: + from transformers import BertConfig +except ImportError: + BertConfig = None + +from mmdet.registry import MODELS +from mmdet.structures.bbox import cat_boxes +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean +from ..utils import (BertEncoderLayer, VLFuse, filter_scores_and_topk, + permute_and_flatten, select_single_mlvl, + unpack_gt_instances) +from ..utils.vlfuse_helper import MAX_CLAMP_VALUE +from .atss_head import ATSSHead + + +def convert_grounding_to_cls_scores(logits: Tensor, + positive_maps: List[dict]) -> Tensor: + """Convert logits to class scores.""" + assert len(positive_maps) == logits.shape[0] # batch size + + scores = torch.zeros(logits.shape[0], logits.shape[1], + len(positive_maps[0])).to(logits.device) + if positive_maps is not None: + if all(x == positive_maps[0] for x in positive_maps): + # only need to compute once + positive_map = positive_maps[0] + for label_j in positive_map: + scores[:, :, label_j - + 1] = logits[:, :, + torch.LongTensor(positive_map[label_j] + )].mean(-1) + else: + for i, positive_map in enumerate(positive_maps): + for label_j in positive_map: + scores[i, :, label_j - 1] = logits[ + i, :, torch.LongTensor(positive_map[label_j])].mean(-1) + return scores + + +class Conv3x3Norm(nn.Module): + """Conv3x3 and norm.""" + + def __init__(self, + in_channels: int, + out_channels: int, + stride: int, + groups: int = 1, + use_dcn: bool = False, + norm_type: Optional[Union[Sequence, str]] = None): + super().__init__() + + if use_dcn: + self.conv = ModulatedDeformConv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=groups) + else: + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=groups) + + if isinstance(norm_type, Sequence): + assert len(norm_type) == 2 + assert norm_type[0] == 'gn' + gn_group = norm_type[1] + norm_type = norm_type[0] + + if norm_type == 'bn': + bn_op = nn.BatchNorm2d(out_channels) + elif norm_type == 'gn': + bn_op = nn.GroupNorm( + num_groups=gn_group, num_channels=out_channels) + if norm_type is not None: + self.bn = bn_op + else: + self.bn = None + + def forward(self, x, **kwargs): + x = self.conv(x, **kwargs) + if self.bn: + x = self.bn(x) + return x + + +class DyReLU(nn.Module): + """Dynamic ReLU.""" + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: int = 4): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.expand_ratio = expand_ratio + self.out_channels = out_channels + + self.fc = nn.Sequential( + nn.Linear(in_channels, in_channels // expand_ratio), + nn.ReLU(inplace=True), + nn.Linear(in_channels // expand_ratio, + out_channels * self.expand_ratio), + nn.Hardsigmoid(inplace=True)) + + def forward(self, x) -> Tensor: + x_out = x + b, c, h, w = x.size() + x = self.avg_pool(x).view(b, c) + x = self.fc(x).view(b, -1, 1, 1) + + a1, b1, a2, b2 = torch.split(x, self.out_channels, dim=1) + a1 = (a1 - 0.5) * 2 + 1.0 + a2 = (a2 - 0.5) * 2 + b1 = b1 - 0.5 + b2 = b2 - 0.5 + out = torch.max(x_out * a1 + b1, x_out * a2 + b2) + return out + + +class 
DyConv(nn.Module): + """Dynamic Convolution.""" + + def __init__(self, + conv_func: Callable, + in_channels: int, + out_channels: int, + use_dyfuse: bool = True, + use_dyrelu: bool = False, + use_dcn: bool = False): + super().__init__() + + self.dyconvs = nn.ModuleList() + self.dyconvs.append(conv_func(in_channels, out_channels, 1)) + self.dyconvs.append(conv_func(in_channels, out_channels, 1)) + self.dyconvs.append(conv_func(in_channels, out_channels, 2)) + + if use_dyfuse: + self.attnconv = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, 1, kernel_size=1), + nn.ReLU(inplace=True)) + self.h_sigmoid = nn.Hardsigmoid(inplace=True) + else: + self.attnconv = None + + if use_dyrelu: + self.relu = DyReLU(in_channels, out_channels) + else: + self.relu = nn.ReLU() + + if use_dcn: + self.offset = nn.Conv2d( + in_channels, 27, kernel_size=3, stride=1, padding=1) + else: + self.offset = None + + self.init_weights() + + def init_weights(self): + for m in self.dyconvs.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight.data, 0, 0.01) + if m.bias is not None: + m.bias.data.zero_() + if self.attnconv is not None: + for m in self.attnconv.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight.data, 0, 0.01) + if m.bias is not None: + m.bias.data.zero_() + + def forward(self, inputs: dict) -> dict: + visual_feats = inputs['visual'] + + out_vis_feats = [] + for level, feature in enumerate(visual_feats): + + offset_conv_args = {} + if self.offset is not None: + offset_mask = self.offset(feature) + offset = offset_mask[:, :18, :, :] + mask = offset_mask[:, 18:, :, :].sigmoid() + offset_conv_args = dict(offset=offset, mask=mask) + + temp_feats = [self.dyconvs[1](feature, **offset_conv_args)] + + if level > 0: + temp_feats.append(self.dyconvs[2](visual_feats[level - 1], + **offset_conv_args)) + if level < len(visual_feats) - 1: + temp_feats.append( + F.upsample_bilinear( + self.dyconvs[0](visual_feats[level + 1], + **offset_conv_args), + size=[feature.size(2), + feature.size(3)])) + mean_feats = torch.mean( + torch.stack(temp_feats), dim=0, keepdim=False) + + if self.attnconv is not None: + attn_feat = [] + res_feat = [] + for feat in temp_feats: + res_feat.append(feat) + attn_feat.append(self.attnconv(feat)) + + res_feat = torch.stack(res_feat) + spa_pyr_attn = self.h_sigmoid(torch.stack(attn_feat)) + + mean_feats = torch.mean( + res_feat * spa_pyr_attn, dim=0, keepdim=False) + + out_vis_feats.append(mean_feats) + + out_vis_feats = [self.relu(item) for item in out_vis_feats] + + features_dict = {'visual': out_vis_feats, 'lang': inputs['lang']} + + return features_dict + + +class VLFusionModule(BaseModel): + """Visual-lang Fusion Module.""" + + def __init__(self, + in_channels: int, + feat_channels: int, + num_base_priors: int, + early_fuse: bool = False, + num_dyhead_blocks: int = 6, + lang_model_name: str = 'bert-base-uncased', + use_dyrelu: bool = True, + use_dyfuse: bool = True, + use_dcn: bool = True, + use_checkpoint: bool = False, + **kwargs) -> None: + super().__init__(**kwargs) + if BertConfig is None: + raise RuntimeError( + 'transformers is not installed, please install it by: ' + 'pip install transformers.') + self.in_channels = in_channels + self.feat_channels = feat_channels + self.num_base_priors = num_base_priors + self.early_fuse = early_fuse + self.num_dyhead_blocks = num_dyhead_blocks + self.use_dyrelu = use_dyrelu + self.use_dyfuse = use_dyfuse + self.use_dcn = use_dcn + self.use_checkpoint = use_checkpoint + + self.lang_cfg = 
BertConfig.from_pretrained(lang_model_name) + self.lang_dim = self.lang_cfg.hidden_size + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the model.""" + bias_value = -math.log((1 - 0.01) / 0.01) + + dyhead_tower = [] + for i in range(self.num_dyhead_blocks): + if self.early_fuse: + # cross-modality fusion + dyhead_tower.append(VLFuse(use_checkpoint=self.use_checkpoint)) + # lang branch + dyhead_tower.append( + BertEncoderLayer( + self.lang_cfg, + clamp_min_for_underflow=True, + clamp_max_for_overflow=True)) + + # vision branch + dyhead_tower.append( + DyConv( + lambda i, o, s: Conv3x3Norm( + i, o, s, use_dcn=self.use_dcn, norm_type=['gn', 16]), + self.in_channels if i == 0 else self.feat_channels, + self.feat_channels, + use_dyrelu=(self.use_dyrelu + and self.in_channels == self.feat_channels) + if i == 0 else self.use_dyrelu, + use_dyfuse=(self.use_dyfuse + and self.in_channels == self.feat_channels) + if i == 0 else self.use_dyfuse, + use_dcn=(self.use_dcn + and self.in_channels == self.feat_channels) + if i == 0 else self.use_dcn, + )) + + self.add_module('dyhead_tower', nn.Sequential(*dyhead_tower)) + + self.bbox_pred = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, kernel_size=1) + self.centerness = nn.Conv2d( + self.feat_channels, self.num_base_priors * 1, kernel_size=1) + self.dot_product_projection_text = nn.Linear( + self.lang_dim, + self.num_base_priors * self.feat_channels, + bias=True) + self.log_scale = nn.Parameter(torch.Tensor([0.0]), requires_grad=True) + self.bias_lang = nn.Parameter( + torch.zeros(self.lang_dim), requires_grad=True) + self.bias0 = nn.Parameter( + torch.Tensor([bias_value]), requires_grad=True) + self.scales = nn.ModuleList([Scale(1.0) for _ in range(5)]) + + def forward(self, visual_feats: Tuple[Tensor], + language_feats: dict) -> Tuple: + feat_inputs = {'visual': visual_feats, 'lang': language_feats} + dyhead_tower = self.dyhead_tower(feat_inputs) + + if self.early_fuse: + embedding = dyhead_tower['lang']['hidden'] + else: + embedding = language_feats['embedded'] + + embedding = F.normalize(embedding, p=2, dim=-1) + dot_product_proj_tokens = self.dot_product_projection_text(embedding / + 2.0) + dot_product_proj_tokens_bias = torch.matmul( + embedding, self.bias_lang) + self.bias0 + + bbox_preds = [] + centerness = [] + cls_logits = [] + + for i, feature in enumerate(visual_feats): + visual = dyhead_tower['visual'][i] + B, C, H, W = visual.shape + + bbox_pred = self.scales[i](self.bbox_pred(visual)) + bbox_preds.append(bbox_pred) + centerness.append(self.centerness(visual)) + + dot_product_proj_queries = permute_and_flatten( + visual, B, self.num_base_priors, C, H, W) + + bias = dot_product_proj_tokens_bias.unsqueeze(1).repeat( + 1, self.num_base_priors, 1) + dot_product_logit = ( + torch.matmul(dot_product_proj_queries, + dot_product_proj_tokens.transpose(-1, -2)) / + self.log_scale.exp()) + bias + dot_product_logit = torch.clamp( + dot_product_logit, max=MAX_CLAMP_VALUE) + dot_product_logit = torch.clamp( + dot_product_logit, min=-MAX_CLAMP_VALUE) + cls_logits.append(dot_product_logit) + + return bbox_preds, centerness, cls_logits + + +@MODELS.register_module() +class ATSSVLFusionHead(ATSSHead): + """ATSS head with visual-language fusion module. + + Args: + early_fuse (bool): Whether to fuse visual and language features + Defaults to False. + use_checkpoint (bool): Whether to use checkpoint. Defaults to False. + num_dyhead_blocks (int): Number of dynamic head blocks. Defaults to 6. 
+ lang_model_name (str): Name of the language model. + Defaults to 'bert-base-uncased'. + """ + + def __init__(self, + *args, + early_fuse: bool = False, + use_checkpoint: bool = False, + num_dyhead_blocks: int = 6, + lang_model_name: str = 'bert-base-uncased', + init_cfg=None, + **kwargs): + super().__init__(*args, **kwargs, init_cfg=init_cfg) + self.head = VLFusionModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + num_base_priors=self.num_base_priors, + early_fuse=early_fuse, + use_checkpoint=use_checkpoint, + num_dyhead_blocks=num_dyhead_blocks, + lang_model_name=lang_model_name) + self.text_masks = None + + def _init_layers(self) -> None: + """No need to initialize the ATSS head layer.""" + pass + + def forward(self, visual_feats: Tuple[Tensor], + language_feats: dict) -> Tuple[Tensor]: + """Forward function.""" + bbox_preds, centerness, cls_logits = self.head(visual_feats, + language_feats) + return cls_logits, bbox_preds, centerness + + def loss(self, visual_feats: Tuple[Tensor], language_feats: dict, + batch_data_samples): + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + outs = self(visual_feats, language_feats) + self.text_masks = language_feats['masks'] + loss_inputs = outs + (batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + centernesses (list[Tensor]): Centerness for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
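+
+        Example:
+            Hypothetical shapes for the per-level flattening performed
+            below before all levels are concatenated along the prior axis:
+
+            >>> import torch
+            >>> bbox_pred = torch.randn(2, 9 * 4, 5, 5)  # (B, A*4, H, W)
+            >>> flat = bbox_pred.permute(0, 2, 3, 1).reshape(2, -1, 4)
+            >>> flat.shape                               # (B, H*W*A, 4)
+            torch.Size([2, 225, 4])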
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + anchors = torch.cat(anchor_list, dim=1) + labels = torch.cat(labels_list, dim=1) + label_weights = torch.cat(label_weights_list, dim=1) + bbox_targets = torch.cat(bbox_targets_list, dim=1) + cls_scores = torch.cat(cls_scores, dim=1) + + centernesses_ = [] + bbox_preds_ = [] + for bbox_pred, centerness in zip(bbox_preds, centernesses): + centernesses_.append( + centerness.permute(0, 2, 3, + 1).reshape(cls_scores.size(0), -1, 1)) + bbox_preds_.append( + bbox_pred.permute(0, 2, 3, + 1).reshape(cls_scores.size(0), -1, 4)) + bbox_preds = torch.cat(bbox_preds_, dim=1) + centernesses = torch.cat(centernesses_, dim=1) + + losses_cls, losses_bbox, loss_centerness, bbox_avg_factor = \ + self._loss_by_feat( + anchors, + cls_scores, + bbox_preds, + centernesses, + labels, + label_weights, + bbox_targets, + avg_factor=avg_factor) + + bbox_avg_factor = reduce_mean(bbox_avg_factor).clamp_(min=1).item() + losses_bbox = losses_bbox / bbox_avg_factor + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_centerness=loss_centerness) + + def _loss_by_feat(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, centerness: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + avg_factor: float) -> dict: + """Calculate the loss of all scale level based on the features + extracted by the detection head. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + anchors = anchors.reshape(-1, 4) + + # ===== this change ===== + pos_inds = (labels.sum(-1) > 0).reshape(-1) + + # Loss is not computed for the padded regions of the text. 
+ assert (self.text_masks.dim() == 2) + text_mask = (self.text_masks > 0).unsqueeze(1) + text_mask = text_mask.repeat(1, cls_score.size(1), 1) + cls_score = torch.masked_select(cls_score, text_mask).contiguous() + labels = torch.masked_select(labels, text_mask) + label_weights = label_weights[..., + None].repeat(1, 1, text_mask.size(-1)) + label_weights = torch.masked_select(label_weights, text_mask) + + bbox_pred = bbox_pred.reshape(-1, 4) + centerness = centerness.reshape(-1) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # classification loss + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + + if pos_inds.sum() > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_centerness = centerness[pos_inds] + + centerness_targets = self.centerness_target( + pos_anchors, pos_bbox_targets) + + if torch.isnan(centerness_targets).any(): + print('=====Centerness includes NaN=====') + mask = ~torch.isnan(centerness_targets) + centerness_targets = centerness_targets[mask] + pos_centerness = pos_centerness[mask] + pos_anchors = pos_anchors[mask] + pos_bbox_targets = pos_bbox_targets[mask] + pos_bbox_pred = pos_bbox_pred[mask] + + if pos_bbox_targets.shape[0] == 0: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.) + return loss_cls, loss_bbox, loss_centerness, \ + centerness_targets.sum() + + # The decoding process takes the offset into consideration. + pos_anchors[:, 2:] += 1 + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchors, pos_bbox_pred) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_bbox_targets, + weight=centerness_targets, + avg_factor=1.0) + + # centerness loss + loss_centerness = self.loss_centerness( + pos_centerness, centerness_targets, avg_factor=avg_factor) + else: + loss_bbox = bbox_pred.sum() * 0 + loss_centerness = centerness.sum() * 0 + centerness_targets = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, loss_centerness, centerness_targets.sum() + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors (List[int]): Number of anchors of each scale + level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + labels (Tensor): Labels of all anchors in the image with shape + (N,). 
+ label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4) + pos_inds (Tensor): Indices of positive anchor with shape + (num_pos,). + neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + sampling_result (:obj:`SamplingResult`): Sampling results. + """ + anchors = flat_anchors + # Align the official implementation + anchors[:, 2:] -= 1 + + num_level_anchors_inside = num_level_anchors + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, + num_level_anchors_inside, + gt_instances, gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + + # ===== this change ===== + labels = anchors.new_full((num_valid_anchors, self.feat_channels), + 0, + dtype=torch.float32) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if self.reg_decoded_bbox: + pos_bbox_targets = sampling_result.pos_gt_bboxes + else: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes) + + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + # ===== this change ===== + labels[pos_inds] = gt_instances.positive_maps[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def centerness_target(self, anchors: Tensor, gts: Tensor) -> Tensor: + """Calculate the centerness between anchors and gts. + + Only calculate pos centerness targets, otherwise there may be nan. + + Args: + anchors (Tensor): Anchors with shape (N, 4), "xyxy" format. + gts (Tensor): Ground truth bboxes with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Centerness between anchors and gts. + """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + l_ = anchors_cx - gts[:, 0] + t_ = anchors_cy - gts[:, 1] + r_ = gts[:, 2] - anchors_cx + b_ = gts[:, 3] - anchors_cy + + left_right = torch.stack([l_, r_], dim=1) + top_bottom = torch.stack([t_, b_], dim=1) + centerness = torch.sqrt( + (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])) + # assert not torch.isnan(centerness).any() + return centerness + + def predict(self, + visual_feats: Tuple[Tensor], + language_feats: dict, + batch_data_samples, + rescale: bool = True): + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + visual_feats (tuple[Tensor]): Multi-level visual features from the + upstream network, each is a 4D-tensor. + language_feats (dict): Language features from the upstream network. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. 
+ rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + batch_token_positive_maps = [ + data_samples.token_positive_map + for data_samples in batch_data_samples + ] + outs = self(visual_feats, language_feats) + + predictions = self.predict_by_feat( + *outs, + batch_img_metas=batch_img_metas, + batch_token_positive_maps=batch_token_positive_maps, + rescale=rescale) + return predictions + + def predict_by_feat(self, + cls_logits: List[Tensor], + bbox_preds: List[Tensor], + score_factors: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + batch_token_positive_maps: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_logits (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + batch_token_positive_maps (list[dict], Optional): Batch token + positive map. Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(bbox_preds) == len(score_factors) + num_levels = len(bbox_preds) + + featmap_sizes = [bbox_preds[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + token_positive_maps = batch_token_positive_maps[img_id] + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + cls_logit_list = select_single_mlvl( + cls_logits, img_id, detach=True) + + results = self._predict_by_feat_single( + bbox_pred_list=bbox_pred_list, + score_factor_list=score_factor_list, + cls_logit_list=cls_logit_list, + mlvl_priors=mlvl_priors, + token_positive_maps=token_positive_maps, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + cls_logit_list: List[Tensor], + mlvl_priors: List[Tensor], + token_positive_maps: dict, + img_meta: dict, + cfg: ConfigDict, + rescale: bool = True, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + cls_logit_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + token_positive_maps (dict): Token positive map. + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + score_thr = cfg.get('score_thr', 0) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + + for level_idx, (bbox_pred, score_factor, cls_logit, priors) in \ + enumerate(zip(bbox_pred_list, + score_factor_list, cls_logit_list, mlvl_priors)): + bbox_pred = bbox_pred.permute(1, 2, 0).reshape( + -1, self.bbox_coder.encode_size) + score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid() + + scores = convert_grounding_to_cls_scores( + logits=cls_logit.sigmoid()[None], + positive_maps=[token_positive_maps])[0] + + results = filter_scores_and_topk( + scores, score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + score_factor = score_factor[keep_idxs] + scores = torch.sqrt(scores * score_factor) + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_priors) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + predictions = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + if len(predictions) > 0: + # Note: GLIP adopts a very strange bbox decoder logic, + # and if 1 is not added here, it will not align with + # the official mAP. + predictions.bboxes[:, 2:] = predictions.bboxes[:, 2:] + 1 + return predictions diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/autoassign_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/autoassign_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b30ff0d7d41205f0a92ede7b8eb10a234c5942 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/autoassign_head.py @@ -0,0 +1,524 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Scale +from mmengine.model import bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean +from ..task_modules.prior_generators import MlvlPointGenerator +from ..utils import levels_to_images, multi_apply +from .fcos_head import FCOSHead + +EPS = 1e-12 + + +class CenterPrior(nn.Module): + """Center Weighting module to adjust the category-specific prior + distributions. + + Args: + force_topk (bool): When no point falls into gt_bbox, forcibly + select the k points closest to the center to calculate + the center prior. Defaults to False. + topk (int): The number of points used to calculate the + center prior when no point falls in gt_bbox. Only work when + force_topk if True. Defaults to 9. + num_classes (int): The class number of dataset. Defaults to 80. + strides (Sequence[int]): The stride of each input feature map. + Defaults to (8, 16, 32, 64, 128). 
+ """ + + def __init__( + self, + force_topk: bool = False, + topk: int = 9, + num_classes: int = 80, + strides: Sequence[int] = (8, 16, 32, 64, 128) + ) -> None: + super().__init__() + self.mean = nn.Parameter(torch.zeros(num_classes, 2)) + self.sigma = nn.Parameter(torch.ones(num_classes, 2)) + self.strides = strides + self.force_topk = force_topk + self.topk = topk + + def forward(self, anchor_points_list: List[Tensor], + gt_instances: InstanceData, + inside_gt_bbox_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Get the center prior of each point on the feature map for each + instance. + + Args: + anchor_points_list (list[Tensor]): list of coordinate + of points on feature map. Each with shape + (num_points, 2). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + inside_gt_bbox_mask (Tensor): Tensor of bool type, + with shape of (num_points, num_gt), each + value is used to mark whether this point falls + within a certain gt. + + Returns: + tuple[Tensor, Tensor]: + + - center_prior_weights(Tensor): Float tensor with shape of \ + (num_points, num_gt). Each value represents the center \ + weighting coefficient. + - inside_gt_bbox_mask (Tensor): Tensor of bool type, with shape \ + of (num_points, num_gt), each value is used to mark whether this \ + point falls within a certain gt or is the topk nearest points for \ + a specific gt_bbox. + """ + gt_bboxes = gt_instances.bboxes + labels = gt_instances.labels + + inside_gt_bbox_mask = inside_gt_bbox_mask.clone() + num_gts = len(labels) + num_points = sum([len(item) for item in anchor_points_list]) + if num_gts == 0: + return gt_bboxes.new_zeros(num_points, + num_gts), inside_gt_bbox_mask + center_prior_list = [] + for slvl_points, stride in zip(anchor_points_list, self.strides): + # slvl_points: points from single level in FPN, has shape (h*w, 2) + # single_level_points has shape (h*w, num_gt, 2) + single_level_points = slvl_points[:, None, :].expand( + (slvl_points.size(0), len(gt_bboxes), 2)) + gt_center_x = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2) + gt_center_y = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2) + gt_center = torch.stack((gt_center_x, gt_center_y), dim=1) + gt_center = gt_center[None] + # instance_center has shape (1, num_gt, 2) + instance_center = self.mean[labels][None] + # instance_sigma has shape (1, num_gt, 2) + instance_sigma = self.sigma[labels][None] + # distance has shape (num_points, num_gt, 2) + distance = (((single_level_points - gt_center) / float(stride) - + instance_center)**2) + center_prior = torch.exp(-distance / + (2 * instance_sigma**2)).prod(dim=-1) + center_prior_list.append(center_prior) + center_prior_weights = torch.cat(center_prior_list, dim=0) + + if self.force_topk: + gt_inds_no_points_inside = torch.nonzero( + inside_gt_bbox_mask.sum(0) == 0).reshape(-1) + if gt_inds_no_points_inside.numel(): + topk_center_index = \ + center_prior_weights[:, gt_inds_no_points_inside].topk( + self.topk, + dim=0)[1] + temp_mask = inside_gt_bbox_mask[:, gt_inds_no_points_inside] + inside_gt_bbox_mask[:, gt_inds_no_points_inside] = \ + torch.scatter(temp_mask, + dim=0, + index=topk_center_index, + src=torch.ones_like( + topk_center_index, + dtype=torch.bool)) + + center_prior_weights[~inside_gt_bbox_mask] = 0 + return center_prior_weights, inside_gt_bbox_mask + + +@MODELS.register_module() +class AutoAssignHead(FCOSHead): + """AutoAssignHead head used in AutoAssign. + + More details can be found in the `paper + `_ . 
+ + Args: + force_topk (bool): Used in center prior initialization to + handle extremely small gt. Default is False. + topk (int): The number of points used to calculate the + center prior when no point falls in gt_bbox. Only work when + force_topk if True. Defaults to 9. + pos_loss_weight (float): The loss weight of positive loss + and with default value 0.25. + neg_loss_weight (float): The loss weight of negative loss + and with default value 0.75. + center_loss_weight (float): The loss weight of center prior + loss and with default value 0.75. + """ + + def __init__(self, + *args, + force_topk: bool = False, + topk: int = 9, + pos_loss_weight: float = 0.25, + neg_loss_weight: float = 0.75, + center_loss_weight: float = 0.75, + **kwargs) -> None: + super().__init__(*args, conv_bias=True, **kwargs) + self.center_prior = CenterPrior( + force_topk=force_topk, + topk=topk, + num_classes=self.num_classes, + strides=self.strides) + self.pos_loss_weight = pos_loss_weight + self.neg_loss_weight = neg_loss_weight + self.center_loss_weight = center_loss_weight + self.prior_generator = MlvlPointGenerator(self.strides, offset=0) + + def init_weights(self) -> None: + """Initialize weights of the head. + + In particular, we have special initialization for classified conv's and + regression conv's bias + """ + + super(AutoAssignHead, self).init_weights() + bias_cls = bias_init_with_prob(0.02) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + normal_init(self.conv_reg, std=0.01, bias=4.0) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple[Tensor, Tensor, Tensor]: scores for each class, bbox + predictions and centerness predictions of input feature maps. + """ + cls_score, bbox_pred, cls_feat, reg_feat = super( + FCOSHead, self).forward_single(x) + centerness = self.conv_centerness(reg_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + bbox_pred *= stride + return cls_score, bbox_pred, centerness + + def get_pos_loss_single(self, cls_score: Tensor, objectness: Tensor, + reg_loss: Tensor, gt_instances: InstanceData, + center_prior_weights: Tensor) -> Tuple[Tensor]: + """Calculate the positive loss of all points in gt_bboxes. + + Args: + cls_score (Tensor): All category scores for each point on + the feature map. The shape is (num_points, num_class). + objectness (Tensor): Foreground probability of all points, + has shape (num_points, 1). + reg_loss (Tensor): The regression loss of each gt_bbox and each + prediction box, has shape of (num_points, num_gt). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + center_prior_weights (Tensor): Float tensor with shape + of (num_points, num_gt). Each value represents + the center weighting coefficient. 
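+
+        Example:
+            Toy numbers (hypothetical) for the confidence re-weighting
+            applied below: ``exp(3 * p_pos)`` times the center prior,
+            normalized per gt, so confident locations dominate the loss.
+
+            >>> import torch
+            >>> p_pos = torch.tensor([[0.9], [0.1]])  # (num_points, num_gt)
+            >>> prior = torch.ones_like(p_pos)
+            >>> w = torch.exp(p_pos * 3) * prior
+            >>> w / w.sum(0, keepdim=True)
+            tensor([[0.9168],
+                    [0.0832]])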
+ + Returns: + tuple[Tensor]: + + - pos_loss (Tensor): The positive loss of all points in the \ + gt_bboxes. + """ + gt_labels = gt_instances.labels + # p_loc: localization confidence + p_loc = torch.exp(-reg_loss) + # p_cls: classification confidence + p_cls = (cls_score * objectness)[:, gt_labels] + # p_pos: joint confidence indicator + p_pos = p_cls * p_loc + + # 3 is a hyper-parameter to control the contributions of high and + # low confidence locations towards positive losses. + confidence_weight = torch.exp(p_pos * 3) + p_pos_weight = (confidence_weight * center_prior_weights) / ( + (confidence_weight * center_prior_weights).sum( + 0, keepdim=True)).clamp(min=EPS) + reweighted_p_pos = (p_pos * p_pos_weight).sum(0) + pos_loss = F.binary_cross_entropy( + reweighted_p_pos, + torch.ones_like(reweighted_p_pos), + reduction='none') + pos_loss = pos_loss.sum() * self.pos_loss_weight + return pos_loss, + + def get_neg_loss_single(self, cls_score: Tensor, objectness: Tensor, + gt_instances: InstanceData, ious: Tensor, + inside_gt_bbox_mask: Tensor) -> Tuple[Tensor]: + """Calculate the negative loss of all points in feature map. + + Args: + cls_score (Tensor): All category scores for each point on + the feature map. The shape is (num_points, num_class). + objectness (Tensor): Foreground probability of all points + and is shape of (num_points, 1). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + ious (Tensor): Float tensor with shape of (num_points, num_gt). + Each value represent the iou of pred_bbox and gt_bboxes. + inside_gt_bbox_mask (Tensor): Tensor of bool type, + with shape of (num_points, num_gt), each + value is used to mark whether this point falls + within a certain gt. + + Returns: + tuple[Tensor]: + + - neg_loss (Tensor): The negative loss of all points in the \ + feature map. + """ + gt_labels = gt_instances.labels + num_gts = len(gt_labels) + joint_conf = (cls_score * objectness) + p_neg_weight = torch.ones_like(joint_conf) + if num_gts > 0: + # the order of dinmension would affect the value of + # p_neg_weight, we strictly follow the original + # implementation. + inside_gt_bbox_mask = inside_gt_bbox_mask.permute(1, 0) + ious = ious.permute(1, 0) + + foreground_idxs = torch.nonzero(inside_gt_bbox_mask, as_tuple=True) + temp_weight = (1 / (1 - ious[foreground_idxs]).clamp_(EPS)) + + def normalize(x): + return (x - x.min() + EPS) / (x.max() - x.min() + EPS) + + for instance_idx in range(num_gts): + idxs = foreground_idxs[0] == instance_idx + if idxs.any(): + temp_weight[idxs] = normalize(temp_weight[idxs]) + + p_neg_weight[foreground_idxs[1], + gt_labels[foreground_idxs[0]]] = 1 - temp_weight + + logits = (joint_conf * p_neg_weight) + neg_loss = ( + logits**2 * F.binary_cross_entropy( + logits, torch.zeros_like(logits), reduction='none')) + neg_loss = neg_loss.sum() * self.neg_loss_weight + return neg_loss, + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + objectnesses (list[Tensor]): objectness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + assert len(cls_scores) == len(bbox_preds) == len(objectnesses) + all_num_gt = sum([len(item) for item in batch_gt_instances]) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + inside_gt_bbox_mask_list, bbox_targets_list = self.get_targets( + all_level_points, batch_gt_instances) + + center_prior_weight_list = [] + temp_inside_gt_bbox_mask_list = [] + for gt_instances, inside_gt_bbox_mask in zip(batch_gt_instances, + inside_gt_bbox_mask_list): + center_prior_weight, inside_gt_bbox_mask = \ + self.center_prior(all_level_points, gt_instances, + inside_gt_bbox_mask) + center_prior_weight_list.append(center_prior_weight) + temp_inside_gt_bbox_mask_list.append(inside_gt_bbox_mask) + inside_gt_bbox_mask_list = temp_inside_gt_bbox_mask_list + mlvl_points = torch.cat(all_level_points, dim=0) + bbox_preds = levels_to_images(bbox_preds) + cls_scores = levels_to_images(cls_scores) + objectnesses = levels_to_images(objectnesses) + + reg_loss_list = [] + ious_list = [] + num_points = len(mlvl_points) + + for bbox_pred, encoded_targets, inside_gt_bbox_mask in zip( + bbox_preds, bbox_targets_list, inside_gt_bbox_mask_list): + temp_num_gt = encoded_targets.size(1) + expand_mlvl_points = mlvl_points[:, None, :].expand( + num_points, temp_num_gt, 2).reshape(-1, 2) + encoded_targets = encoded_targets.reshape(-1, 4) + expand_bbox_pred = bbox_pred[:, None, :].expand( + num_points, temp_num_gt, 4).reshape(-1, 4) + decoded_bbox_preds = self.bbox_coder.decode( + expand_mlvl_points, expand_bbox_pred) + decoded_target_preds = self.bbox_coder.decode( + expand_mlvl_points, encoded_targets) + with torch.no_grad(): + ious = bbox_overlaps( + decoded_bbox_preds, decoded_target_preds, is_aligned=True) + ious = ious.reshape(num_points, temp_num_gt) + if temp_num_gt: + ious = ious.max( + dim=-1, keepdim=True).values.repeat(1, temp_num_gt) + else: + ious = ious.new_zeros(num_points, temp_num_gt) + ious[~inside_gt_bbox_mask] = 0 + ious_list.append(ious) + loss_bbox = self.loss_bbox( + decoded_bbox_preds, + decoded_target_preds, + weight=None, + reduction_override='none') + reg_loss_list.append(loss_bbox.reshape(num_points, temp_num_gt)) + + cls_scores = [item.sigmoid() for item in cls_scores] + objectnesses = [item.sigmoid() for item in objectnesses] + pos_loss_list, = multi_apply(self.get_pos_loss_single, cls_scores, + objectnesses, reg_loss_list, + batch_gt_instances, + center_prior_weight_list) + pos_avg_factor = reduce_mean( + bbox_pred.new_tensor(all_num_gt)).clamp_(min=1) + pos_loss = sum(pos_loss_list) / pos_avg_factor + + neg_loss_list, = multi_apply(self.get_neg_loss_single, cls_scores, + objectnesses, 
batch_gt_instances,
+                                     ious_list, inside_gt_bbox_mask_list)
+        neg_avg_factor = sum(item.data.sum()
+                             for item in center_prior_weight_list)
+        neg_avg_factor = reduce_mean(neg_avg_factor).clamp_(min=1)
+        neg_loss = sum(neg_loss_list) / neg_avg_factor
+
+        center_loss = []
+        for i in range(len(batch_img_metas)):
+
+            if inside_gt_bbox_mask_list[i].any():
+                center_loss.append(
+                    len(batch_gt_instances[i]) /
+                    center_prior_weight_list[i].sum().clamp_(min=EPS))
+            # when width or height of gt_bbox is smaller than stride of p3
+            else:
+                center_loss.append(center_prior_weight_list[i].sum() * 0)
+
+        center_loss = torch.stack(center_loss).mean() * self.center_loss_weight
+
+        # avoid deadlock in DDP
+        if all_num_gt == 0:
+            pos_loss = bbox_preds[0].sum() * 0
+            dummy_center_prior_loss = self.center_prior.mean.sum(
+            ) * 0 + self.center_prior.sigma.sum() * 0
+            center_loss = objectnesses[0].sum() * 0 + dummy_center_prior_loss
+
+        loss = dict(
+            loss_pos=pos_loss, loss_neg=neg_loss, loss_center=center_loss)
+
+        return loss
+
+    def get_targets(
+        self, points: List[Tensor], batch_gt_instances: InstanceList
+    ) -> Tuple[List[Tensor], List[Tensor]]:
+        """Compute regression targets and whether each point is inside or
+        outside a gt_bbox for multiple images.
+
+        Args:
+            points (list[Tensor]): Points of all fpn level, each has shape
+                (num_points, 2).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            tuple(list[Tensor], list[Tensor]):
+
+            - inside_gt_bbox_mask_list (list[Tensor]): Each Tensor is with \
+            bool type and shape of (num_points, num_gt), each value is used \
+            to mark whether this point falls within a certain gt.
+            - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \
+            level. Each tensor has shape (num_points, num_gt, 4).
+        """
+
+        concat_points = torch.cat(points, dim=0)
+        # the number of points per img, per lvl
+        inside_gt_bbox_mask_list, bbox_targets_list = multi_apply(
+            self._get_targets_single, batch_gt_instances, points=concat_points)
+        return inside_gt_bbox_mask_list, bbox_targets_list
+
+    def _get_targets_single(self, gt_instances: InstanceData,
+                            points: Tensor) -> Tuple[Tensor, Tensor]:
+        """Compute regression targets and whether each point is inside or
+        outside a gt_bbox for a single image.
+
+        Args:
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should include ``bboxes`` and ``labels``
+                attributes.
+            points (Tensor): Points of all fpn level, has shape
+                (num_points, 2).
+
+        Returns:
+            tuple[Tensor, Tensor]: Containing the following Tensors:
+
+            - inside_gt_bbox_mask (Tensor): Bool tensor with shape \
+            (num_points, num_gt), each value is used to mark whether this \
+            point falls within a certain gt.
+            - bbox_targets (Tensor): BBox targets of each point with each \
+            gt_bbox, has shape (num_points, num_gt, 4).
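+
+        Example:
+            A minimal numeric sketch of the computation below (added for
+            illustration; not an upstream doctest)::
+
+                >>> import torch
+                >>> points = torch.tensor([[50., 50.], [200., 200.]])
+                >>> gt = torch.tensor([[0., 0., 100., 100.]])
+                >>> xs, ys = points[:, 0:1], points[:, 1:2]
+                >>> # (left, top, right, bottom) distances to the gt edges
+                >>> targets = torch.stack((xs - gt[:, 0], ys - gt[:, 1],
+                ...                        gt[:, 2] - xs, gt[:, 3] - ys), -1)
+                >>> (targets.min(-1)[0] > 0).tolist()
+                [[True], [False]]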
+        """
+        gt_bboxes = gt_instances.bboxes
+        num_points = points.size(0)
+        num_gts = gt_bboxes.size(0)
+        gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4)
+        xs, ys = points[:, 0], points[:, 1]
+        xs = xs[:, None]
+        ys = ys[:, None]
+        left = xs - gt_bboxes[..., 0]
+        right = gt_bboxes[..., 2] - xs
+        top = ys - gt_bboxes[..., 1]
+        bottom = gt_bboxes[..., 3] - ys
+        bbox_targets = torch.stack((left, top, right, bottom), -1)
+        if num_gts:
+            inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0
+        else:
+            inside_gt_bbox_mask = bbox_targets.new_zeros((num_points, num_gts),
+                                                         dtype=torch.bool)
+
+        return inside_gt_bbox_mask, bbox_targets
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/base_dense_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/base_dense_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0a4469e02c469d029cc2791289dbf41554d6a53
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/base_dense_head.py
@@ -0,0 +1,583 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from abc import ABCMeta, abstractmethod
+from inspect import signature
+from typing import List, Optional, Tuple
+
+import torch
+from mmcv.ops import batched_nms
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, constant_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import (cat_boxes, get_box_tensor, get_box_wh,
+                                   scale_boxes)
+from mmdet.utils import InstanceList, OptMultiConfig
+from ..test_time_augs import merge_aug_results
+from ..utils import (filter_scores_and_topk, select_single_mlvl,
+                     unpack_gt_instances)
+
+
+class BaseDenseHead(BaseModule, metaclass=ABCMeta):
+    """Base class for DenseHeads.
+
+    1. The ``init_weights`` method is used to initialize densehead's
+    model parameters. After detector initialization, ``init_weights``
+    is triggered when ``detector.init_weights()`` is called externally.
+
+    2. The ``loss`` method is used to calculate the loss of densehead,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``loss_by_feat`` method
+    is called based on the feature maps to calculate the loss.
+
+    .. code:: text
+
+    loss(): forward() -> loss_by_feat()
+
+    3. The ``predict`` method is used to predict detection results,
+    which includes two steps: (1) the densehead model performs forward
+    propagation to obtain the feature maps (2) The ``predict_by_feat`` method
+    is called based on the feature maps to predict detection results including
+    post-processing.
+
+    .. code:: text
+
+    predict(): forward() -> predict_by_feat()
+
+    4. The ``loss_and_predict`` method is used to return loss and detection
+    results at the same time. It will call densehead's ``forward``,
+    ``loss_by_feat`` and ``predict_by_feat`` methods in order. If one-stage is
+    used as RPN, the densehead needs to return both losses and predictions.
+    These predictions are used as the proposals of the roi_head.
+
+    .. code:: text
+
+    loss_and_predict(): forward() -> loss_by_feat() -> predict_by_feat()
+    """
+
+    def __init__(self, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        # `_raw_positive_infos` will be used in `get_positive_infos`, which
+        # can get positive information.
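+        # Heads that expose positive samples are expected to cache their
+        # sampling results in this dict under the 'sampling_results' key
+        # while computing losses; `get_positive_infos` below reads that
+        # entry back.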
+        self._raw_positive_infos = dict()
+
+    def init_weights(self) -> None:
+        """Initialize the weights."""
+        super().init_weights()
+        # avoid init_cfg overwriting the initialization of `conv_offset`
+        for m in self.modules():
+            # DeformConv2dPack, ModulatedDeformConv2dPack
+            if hasattr(m, 'conv_offset'):
+                constant_init(m.conv_offset, 0)
+
+    def get_positive_infos(self) -> InstanceList:
+        """Get positive information from sampling results.
+
+        Returns:
+            list[:obj:`InstanceData`]: Positive information of each image,
+            usually including positive bboxes, positive labels, positive
+            priors, etc.
+        """
+        if len(self._raw_positive_infos) == 0:
+            return None
+
+        sampling_results = self._raw_positive_infos.get(
+            'sampling_results', None)
+        assert sampling_results is not None
+        positive_infos = []
+        for sampling_result in sampling_results:
+            pos_info = InstanceData()
+            pos_info.bboxes = sampling_result.pos_gt_bboxes
+            pos_info.labels = sampling_result.pos_gt_labels
+            pos_info.priors = sampling_result.pos_priors
+            pos_info.pos_assigned_gt_inds = \
+                sampling_result.pos_assigned_gt_inds
+            pos_info.pos_inds = sampling_result.pos_inds
+            positive_infos.append(pos_info)
+        return positive_infos
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict:
+        """Perform forward propagation and loss calculation of the detection
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from the upstream network, each is
+                a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        outs = self(x)
+
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+        return losses
+
+    @abstractmethod
+    def loss_by_feat(self, **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head."""
+        pass
+
+    def loss_and_predict(
+        self,
+        x: Tuple[Tensor],
+        batch_data_samples: SampleList,
+        proposal_cfg: Optional[ConfigDict] = None
+    ) -> Tuple[dict, InstanceList]:
+        """Perform forward propagation of the head, then calculate loss and
+        predictions from the features and data samples.
+
+        Args:
+            x (tuple[Tensor]): Features from FPN.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+            proposal_cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+
+        Returns:
+            tuple: The return value is a tuple that contains:
+
+            - losses: (dict[str, Tensor]): A dictionary of loss components.
+            - predictions (list[:obj:`InstanceData`]): Detection
+              results of each image after the post process.
+        """
+        outputs = unpack_gt_instances(batch_data_samples)
+        (batch_gt_instances, batch_gt_instances_ignore,
+         batch_img_metas) = outputs
+
+        outs = self(x)
+
+        loss_inputs = outs + (batch_gt_instances, batch_img_metas,
+                              batch_gt_instances_ignore)
+        losses = self.loss_by_feat(*loss_inputs)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, cfg=proposal_cfg)
+        return losses, predictions
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        outs = self(x)
+
+        predictions = self.predict_by_feat(
+            *outs, batch_img_metas=batch_img_metas, rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        score_factors: Optional[List[Tensor]] = None,
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: Optional[ConfigDict] = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Note: When score_factors is not None, the cls_scores are
+        usually multiplied by it to obtain the real scores used in NMS,
+        such as the centerness in FCOS or the IoU branch in ATSS.
+
+        Args:
+            cls_scores (list[Tensor]): Classification scores for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for all
+                scale levels, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 4, H, W).
+            score_factors (list[Tensor], optional): Score factor for
+                all scale level, each is a 4D-tensor, has shape
+                (batch_size, num_priors * 1, H, W). Defaults to None.
+            batch_img_metas (list[dict], Optional): Batch image meta info.
+                Defaults to None.
+            cfg (ConfigDict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        assert len(cls_scores) == len(bbox_preds)
+
+        if score_factors is None:
+            # e.g. Retina, FreeAnchor, Foveabox, etc.
+            with_score_factors = False
+        else:
+            # e.g. FCOS, PAA, ATSS, AutoAssign, etc.
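+            # `score_factors` (e.g. a centerness or IoU branch) are kept
+            # per level here and later multiplied into the classification
+            # scores in `_bbox_post_process`.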
+ with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device) + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl( + cls_scores, img_id, detach=True) + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + if with_score_factors: + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + else: + score_factor_list = [None for _ in range(num_levels)] + + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + score_factor_list=score_factor_list, + mlvl_priors=mlvl_priors, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. 
+            with_score_factors = True
+
+        cfg = self.test_cfg if cfg is None else cfg
+        cfg = copy.deepcopy(cfg)
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bbox_preds = []
+        mlvl_valid_priors = []
+        mlvl_scores = []
+        mlvl_labels = []
+        if with_score_factors:
+            mlvl_score_factors = []
+        else:
+            mlvl_score_factors = None
+        for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              score_factor_list, mlvl_priors)):
+
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            dim = self.bbox_coder.encode_size
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim)
+            if with_score_factors:
+                score_factor = score_factor.permute(1, 2,
+                                                    0).reshape(-1).sigmoid()
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+
+            # the `custom_cls_channels` parameter is derived from
+            # CrossEntropyCustomLoss and FocalCustomLoss, and is currently used
+            # in v3det.
+            if getattr(self.loss_cls, 'custom_cls_channels', False):
+                scores = self.loss_cls.get_activation(cls_score)
+            elif self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                # remind that we set FG labels to [0, num_class-1]
+                # since mmdet v2.0
+                # BG cat_id: num_class
+                scores = cls_score.softmax(-1)[:, :-1]
+
+            # After https://github.com/open-mmlab/mmdetection/pull/6268/,
+            # this operation keeps fewer bboxes under the same `nms_pre`.
+            # There is no difference in performance for most models. If you
+            # find a slight drop in performance, you can set a larger
+            # `nms_pre` than before.
+            score_thr = cfg.get('score_thr', 0)
+
+            results = filter_scores_and_topk(
+                scores, score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, keep_idxs, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            if with_score_factors:
+                score_factor = score_factor[keep_idxs]
+
+            mlvl_bbox_preds.append(bbox_pred)
+            mlvl_valid_priors.append(priors)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+            if with_score_factors:
+                mlvl_score_factors.append(score_factor)
+
+        bbox_pred = torch.cat(mlvl_bbox_preds)
+        priors = cat_boxes(mlvl_valid_priors)
+        bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape)
+
+        results = InstanceData()
+        results.bboxes = bboxes
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+        if with_score_factors:
+            results.score_factors = torch.cat(mlvl_score_factors)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def _bbox_post_process(self,
+                           results: InstanceData,
+                           cfg: ConfigDict,
+                           rescale: bool = False,
+                           with_nms: bool = True,
+                           img_meta: Optional[dict] = None) -> InstanceData:
+        """bbox post-processing method.
+
+        The boxes are rescaled to the original image scale and NMS is
+        applied. Usually ``with_nms`` is False when the method is used for
+        aug test.
+
+        Args:
+            results (:obj:`InstanceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (ConfigDict): Test / postprocessing configuration,
+                if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            with_nms (bool): If True, do nms before return boxes.
+                Defaults to True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        if rescale:
+            assert img_meta.get('scale_factor') is not None
+            scale_factor = [1 / s for s in img_meta['scale_factor']]
+            results.bboxes = scale_boxes(results.bboxes, scale_factor)
+
+        if hasattr(results, 'score_factors'):
+            # TODO: Add sqrt operation in order to be consistent with
+            # the paper.
+            score_factors = results.pop('score_factors')
+            results.scores = results.scores * score_factors
+
+        # filter small size bboxes
+        if cfg.get('min_bbox_size', -1) >= 0:
+            w, h = get_box_wh(results.bboxes)
+            valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size)
+            if not valid_mask.all():
+                results = results[valid_mask]
+
+        # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg
+        if with_nms and results.bboxes.numel() > 0:
+            bboxes = get_box_tensor(results.bboxes)
+            det_bboxes, keep_idxs = batched_nms(bboxes, results.scores,
+                                                results.labels, cfg.nms)
+            results = results[keep_idxs]
+            # some nms would reweight the score, such as softnms
+            results.scores = det_bboxes[:, -1]
+            results = results[:cfg.max_per_img]
+
+        return results
+
+    def aug_test(self,
+                 aug_batch_feats,
+                 aug_batch_img_metas,
+                 rescale=False,
+                 with_ori_nms=False,
+                 **kwargs):
+        """Test function with test time augmentation.
+
+        Args:
+            aug_batch_feats (list[tuple[Tensor]]): The outer list
+                indicates test-time augmentations and the inner tuple
+                indicates the multi-level feats from
+                FPN; each Tensor should have a shape (B, C, H, W).
+            aug_batch_img_metas (list[list[dict]]): Meta information
+                of images under the different test-time augs
+                (multiscale, flip, etc.). The outer list indicates
+                test-time augmentations and the inner list indicates
+                the images in one batch.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+            with_ori_nms (bool): Whether to execute NMS in the original head.
+                Defaults to False. It will be `True` when the head is
+                adopted as `rpn_head`.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of the
+            input images. Each item usually contains the
+            following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance,)
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances,).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        """
+        # TODO: remove this for detr and deformdetr
+        sig_of_get_results = signature(self.get_results)
+        get_results_args = [
+            p.name for p in sig_of_get_results.parameters.values()
+        ]
+        get_results_single_sig = signature(self._get_results_single)
+        get_results_single_sig_args = [
+            p.name for p in get_results_single_sig.parameters.values()
+        ]
+        assert ('with_nms' in get_results_args) and \
+            ('with_nms' in get_results_single_sig_args), \
+            f'{self.__class__.__name__} ' \
+            'does not support test-time augmentation'
+
+        num_imgs = len(aug_batch_img_metas[0])
+        aug_batch_results = []
+        for x, img_metas in zip(aug_batch_feats, aug_batch_img_metas):
+            outs = self.forward(x)
+            batch_instance_results = self.get_results(
+                *outs,
+                img_metas=img_metas,
+                cfg=self.test_cfg,
+                rescale=False,
+                with_nms=with_ori_nms,
+                **kwargs)
+            aug_batch_results.append(batch_instance_results)
+
+        # after merging, bboxes will be rescaled to the original image
+        batch_results = merge_aug_results(aug_batch_results,
+                                          aug_batch_img_metas)
+
+        final_results = []
+        for img_id in range(num_imgs):
+            results = batch_results[img_id]
+            det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores,
+                                                results.labels,
+                                                self.test_cfg.nms)
+            results = results[keep_idxs]
+            # some nms operation may reweight the score, such as softnms
+            results.scores = det_bboxes[:, -1]
+            results = results[:self.test_cfg.max_per_img]
+            if rescale:
+                # all results have been mapped to the original scale
+                # in `merge_aug_results`, so just pass
+                pass
+            else:
+                # map to the first aug image scale
+                scale_factor = results.bboxes.new_tensor(
+                    aug_batch_img_metas[0][img_id]['scale_factor'])
+                results.bboxes = \
+                    results.bboxes * scale_factor
+
+            final_results.append(results)
+
+        return final_results
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/base_mask_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/base_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..7183d782829aa15bf12b9e2f7ade999c84d0593f
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/base_mask_head.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import List, Tuple, Union
+
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
+from ..utils import unpack_gt_instances
+
+
+class BaseMaskHead(BaseModule, metaclass=ABCMeta):
+    """Base class for mask heads used in One-Stage Instance Segmentation."""
+
+    def __init__(self, init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+
+    @abstractmethod
+    def loss_by_feat(self, *args, **kwargs):
+        """Calculate the loss based on the features extracted by the mask
+        head."""
+        pass
+
+    @abstractmethod
+    def predict_by_feat(self, *args, **kwargs):
+        """Transform a batch of output features extracted from the head into
+        mask results."""
+        pass
+
+    def loss(self,
+             x: Union[List[Tensor], Tuple[Tensor]],
+             batch_data_samples: SampleList,
+             positive_infos: OptInstanceList = None,
+             **kwargs) -> dict:
+        """Perform forward propagation and loss calculation of the mask head
+        on the features of the upstream network.
+
+        Args:
+            x (list[Tensor] | tuple[Tensor]): Features from FPN.
+                Each has a shape (B, C, H, W).
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+            positive_infos (list[:obj:`InstanceData`], optional): Information
+                of positive samples. Used when the label assignment is
+                done outside the MaskHead, e.g., BboxHead in
+                YOLACT or CondInst, etc. When the label assignment is done in
+                MaskHead, it would be None, like SOLO or SOLOv2. All values
+                in it should have shape (num_positive_samples, *).
+
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        if positive_infos is None:
+            outs = self(x)
+        else:
+            outs = self(x, positive_infos)
+
+        assert isinstance(outs, tuple), 'Forward results should be a tuple, ' \
+                                        'even if only one item is returned'
+
+        outputs = unpack_gt_instances(batch_data_samples)
+        batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \
+            = outputs
+        for gt_instances, img_metas in zip(batch_gt_instances,
+                                           batch_img_metas):
+            img_shape = img_metas['batch_input_shape']
+            gt_masks = gt_instances.masks.pad(img_shape)
+            gt_instances.masks = gt_masks
+
+        losses = self.loss_by_feat(
+            *outs,
+            batch_gt_instances=batch_gt_instances,
+            batch_img_metas=batch_img_metas,
+            positive_infos=positive_infos,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            **kwargs)
+        return losses
+
+    def predict(self,
+                x: Tuple[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = False,
+                results_list: OptInstanceList = None,
+                **kwargs) -> InstanceList:
+        """Test function without test-time augmentation.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to False.
+            results_list (list[:obj:`InstanceData`], optional): Detection
+                results of each image after the post process. Only exists
+                if there is a `bbox_head`, like `YOLACT`, `CondInst`, etc.
+
+        Returns:
+            list[:obj:`InstanceData`]: Instance segmentation
+            results of each image after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance,)
+            - labels (Tensor): Has a shape (num_instances,).
+            - masks (Tensor): Processed mask results, has a
+              shape (num_instances, h, w).
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        if results_list is None:
+            outs = self(x)
+        else:
+            outs = self(x, results_list)
+
+        results_list = self.predict_by_feat(
+            *outs,
+            batch_img_metas=batch_img_metas,
+            rescale=rescale,
+            results_list=results_list,
+            **kwargs)
+
+        return results_list
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/boxinst_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/boxinst_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d6e8f7777a852cad89b709e59af2d8e12b343a6
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/boxinst_head.py
@@ -0,0 +1,252 @@
+# Copyright (c) OpenMMLab. All rights reserved.
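+# Editor's note (illustrative sketch, not part of the original file): the
+# pairwise term below evaluates log(p_i * p_j + (1 - p_i) * (1 - p_j))
+# entirely in log space. The same stabilised log-sum-exp, standalone:
+#
+#     import torch
+#     log_a = torch.tensor(-200.)  # log(p_i * p_j)
+#     log_b = torch.tensor(-210.)  # log((1 - p_i) * (1 - p_j))
+#     m = torch.max(log_a, log_b)
+#     log_sum = torch.log(torch.exp(log_a - m) + torch.exp(log_b - m)) + m
+#     # equivalent to torch.logaddexp(log_a, log_b); the naive
+#     # torch.log(torch.exp(log_a) + torch.exp(log_b)) underflows to -inf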
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from mmengine import MessageHub
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList
+from ..utils.misc import unfold_wo_center
+from .condinst_head import CondInstBboxHead, CondInstMaskHead
+
+
+@MODELS.register_module()
+class BoxInstBboxHead(CondInstBboxHead):
+    """BoxInst box head used in https://arxiv.org/abs/2012.02310."""
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+
+@MODELS.register_module()
+class BoxInstMaskHead(CondInstMaskHead):
+    """BoxInst mask head used in https://arxiv.org/abs/2012.02310.
+
+    This head outputs the mask for BoxInst.
+
+    Args:
+        pairwise_size (int): The size of neighborhood for each pixel.
+            Defaults to 3.
+        pairwise_dilation (int): The dilation of neighborhood for each pixel.
+            Defaults to 2.
+        warmup_iters (int): Warmup iterations for pair-wise loss.
+            Defaults to 10000.
+    """
+
+    def __init__(self,
+                 *arg,
+                 pairwise_size: int = 3,
+                 pairwise_dilation: int = 2,
+                 warmup_iters: int = 10000,
+                 **kwargs) -> None:
+        self.pairwise_size = pairwise_size
+        self.pairwise_dilation = pairwise_dilation
+        self.warmup_iters = warmup_iters
+        super().__init__(*arg, **kwargs)
+
+    def get_pairwise_affinity(self, mask_logits: Tensor) -> Tensor:
+        """Compute the pairwise affinity for each pixel."""
+        log_fg_prob = F.logsigmoid(mask_logits).unsqueeze(1)
+        log_bg_prob = F.logsigmoid(-mask_logits).unsqueeze(1)
+
+        log_fg_prob_unfold = unfold_wo_center(
+            log_fg_prob,
+            kernel_size=self.pairwise_size,
+            dilation=self.pairwise_dilation)
+        log_bg_prob_unfold = unfold_wo_center(
+            log_bg_prob,
+            kernel_size=self.pairwise_size,
+            dilation=self.pairwise_dilation)
+
+        # the probability of making the same prediction:
+        # p_i * p_j + (1 - p_i) * (1 - p_j)
+        # we compute the probability in log space
+        # to avoid numerical instability
+        log_same_fg_prob = log_fg_prob[:, :, None] + log_fg_prob_unfold
+        log_same_bg_prob = log_bg_prob[:, :, None] + log_bg_prob_unfold
+
+        # TODO: Figure out the difference between it and a direct sum
+        max_ = torch.max(log_same_fg_prob, log_same_bg_prob)
+        log_same_prob = torch.log(
+            torch.exp(log_same_fg_prob - max_) +
+            torch.exp(log_same_bg_prob - max_)) + max_
+
+        return -log_same_prob[:, 0]
+
+    def loss_by_feat(self, mask_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], positive_infos: InstanceList,
+                     **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask
+        head.
+
+        Args:
+            mask_preds (list[Tensor]): List of predicted masks, each has
+                shape (num_classes, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple images.
+            positive_infos (list[:obj:`InstanceData`]): Information of
+                positive samples of each image that are assigned in detection
+                head.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert positive_infos is not None, \
+            'positive_infos should not be None in `BoxInstMaskHead`'
+        losses = dict()
+
+        loss_mask_project = 0.
+        loss_mask_pairwise = 0.
+        num_imgs = len(mask_preds)
+        total_pos = 0.
+        avg_factor = 0.
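+        # Per image, the loop below computes two box-supervised terms:
+        # a projection loss comparing max-projections of the predicted mask
+        # and the box mask onto the x- and y-axis, and a pairwise loss that
+        # pushes neighbouring pixels with similar colours
+        # (`pos_pairwise_masks`) towards the same label.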
+
+        for idx in range(num_imgs):
+            (mask_pred, pos_mask_targets, pos_pairwise_masks, num_pos) = \
+                self._get_targets_single(
+                    mask_preds[idx], batch_gt_instances[idx],
+                    positive_infos[idx])
+            # mask loss
+            total_pos += num_pos
+            if num_pos == 0 or pos_mask_targets is None:
+                loss_project = mask_pred.new_zeros(1).mean()
+                loss_pairwise = mask_pred.new_zeros(1).mean()
+                avg_factor += 0.
+            else:
+                # compute the project term
+                loss_project_x = self.loss_mask(
+                    mask_pred.max(dim=1, keepdim=True)[0],
+                    pos_mask_targets.max(dim=1, keepdim=True)[0],
+                    reduction_override='none').sum()
+                loss_project_y = self.loss_mask(
+                    mask_pred.max(dim=2, keepdim=True)[0],
+                    pos_mask_targets.max(dim=2, keepdim=True)[0],
+                    reduction_override='none').sum()
+                loss_project = loss_project_x + loss_project_y
+                # compute the pairwise term
+                pairwise_affinity = self.get_pairwise_affinity(mask_pred)
+                avg_factor += pos_pairwise_masks.sum().clamp(min=1.0)
+                loss_pairwise = (pairwise_affinity * pos_pairwise_masks).sum()
+
+            loss_mask_project += loss_project
+            loss_mask_pairwise += loss_pairwise
+
+        if total_pos == 0:
+            total_pos += 1  # avoid nan
+        if avg_factor == 0:
+            avg_factor += 1  # avoid nan
+        loss_mask_project = loss_mask_project / total_pos
+        loss_mask_pairwise = loss_mask_pairwise / avg_factor
+        message_hub = MessageHub.get_current_instance()
+        iter = message_hub.get_info('iter')
+        warmup_factor = min(iter / float(self.warmup_iters), 1.0)
+        loss_mask_pairwise *= warmup_factor
+
+        losses.update(
+            loss_mask_project=loss_mask_project,
+            loss_mask_pairwise=loss_mask_pairwise)
+        return losses
+
+    def _get_targets_single(self, mask_preds: Tensor,
+                            gt_instances: InstanceData,
+                            positive_info: InstanceData):
+        """Compute targets for predictions of a single image.
+
+        Args:
+            mask_preds (Tensor): Predicted prototypes with shape
+                (num_classes, H, W).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should include ``bboxes``, ``labels``,
+                and ``masks`` attributes.
+            positive_info (:obj:`InstanceData`): Information of positive
+                samples that are assigned in detection head. It usually
+                contains following keys.
+
+                - pos_assigned_gt_inds (Tensor): Assigned gt indexes of
+                  positive proposals, has shape (num_pos, )
+                - pos_inds (Tensor): Positive index of image, has
+                  shape (num_pos, ).
+                - param_pred (Tensor): Positive param predictions
+                  with shape (num_pos, num_params).
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+            - mask_preds (Tensor): Positive predicted mask with shape
+              (num_pos, mask_h, mask_w).
+            - pos_mask_targets (Tensor): Positive mask targets with shape
+              (num_pos, mask_h, mask_w).
+            - pos_pairwise_masks (Tensor): Positive pairwise masks with
+              shape: (num_pos, num_neighborhood, mask_h, mask_w).
+            - num_pos (int): Positive numbers.
+        """
+        gt_bboxes = gt_instances.bboxes
+        device = gt_bboxes.device
+        # Note that gt_masks are generated by full box
+        # from BoxInstDataPreprocessor
+        gt_masks = gt_instances.masks.to_tensor(
+            dtype=torch.bool, device=device).float()
+        # Note that pairwise_masks are generated by image color similarity
+        # from BoxInstDataPreprocessor
+        pairwise_masks = gt_instances.pairwise_masks
+        pairwise_masks = pairwise_masks.to(device=device)
+
+        # process with mask targets
+        pos_assigned_gt_inds = positive_info.get('pos_assigned_gt_inds')
+        scores = positive_info.get('scores')
+        centernesses = positive_info.get('centernesses')
+        num_pos = pos_assigned_gt_inds.size(0)
+
+        if gt_masks.size(0) == 0 or num_pos == 0:
+            return mask_preds, None, None, 0
+        # Since we're producing (near) full image masks,
+        # it'd take too much vram to backprop on every single mask.
+        # Thus we select only a subset.
+        if (self.max_masks_to_train != -1) and \
+                (num_pos > self.max_masks_to_train):
+            perm = torch.randperm(num_pos)
+            select = perm[:self.max_masks_to_train]
+            mask_preds = mask_preds[select]
+            pos_assigned_gt_inds = pos_assigned_gt_inds[select]
+            num_pos = self.max_masks_to_train
+        elif self.topk_masks_per_img != -1:
+            unique_gt_inds = pos_assigned_gt_inds.unique()
+            num_inst_per_gt = max(
+                int(self.topk_masks_per_img / len(unique_gt_inds)), 1)
+
+            keep_mask_preds = []
+            keep_pos_assigned_gt_inds = []
+            for gt_ind in unique_gt_inds:
+                per_inst_pos_inds = (pos_assigned_gt_inds == gt_ind)
+                mask_preds_per_inst = mask_preds[per_inst_pos_inds]
+                gt_inds_per_inst = pos_assigned_gt_inds[per_inst_pos_inds]
+                if sum(per_inst_pos_inds) > num_inst_per_gt:
+                    per_inst_scores = scores[per_inst_pos_inds].sigmoid().max(
+                        dim=1)[0]
+                    per_inst_centerness = centernesses[
+                        per_inst_pos_inds].sigmoid().reshape(-1, )
+                    select = (per_inst_scores * per_inst_centerness).topk(
+                        k=num_inst_per_gt, dim=0)[1]
+                    mask_preds_per_inst = mask_preds_per_inst[select]
+                    gt_inds_per_inst = gt_inds_per_inst[select]
+                keep_mask_preds.append(mask_preds_per_inst)
+                keep_pos_assigned_gt_inds.append(gt_inds_per_inst)
+            mask_preds = torch.cat(keep_mask_preds)
+            pos_assigned_gt_inds = torch.cat(keep_pos_assigned_gt_inds)
+            num_pos = pos_assigned_gt_inds.size(0)
+
+        # Follow the original implementation
+        start = int(self.mask_out_stride // 2)
+        gt_masks = gt_masks[:, start::self.mask_out_stride,
+                            start::self.mask_out_stride]
+        gt_masks = gt_masks.gt(0.5).float()
+        pos_mask_targets = gt_masks[pos_assigned_gt_inds]
+        pos_pairwise_masks = pairwise_masks[pos_assigned_gt_inds]
+        pos_pairwise_masks = pos_pairwise_masks * pos_mask_targets.unsqueeze(1)
+
+        return (mask_preds, pos_mask_targets, pos_pairwise_masks, num_pos)
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/cascade_rpn_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/cascade_rpn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8686cc2c9118094df34a04fdeabd87daa636707
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/cascade_rpn_head.py
@@ -0,0 +1,1110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
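+# Editor's note (illustrative usage, not part of the original file):
+# `AdaptiveConv` below is a plain dilated 3x3 conv in 'dilation' mode, while
+# 'offset' mode feeds anchor-derived offsets (2 values per tap of the 3x3
+# kernel, i.e. 18 channels) into a DeformConv2d. Dilation mode, standalone:
+#
+#     import torch
+#     conv = AdaptiveConv(256, 256, adapt_type='dilation', dilation=3)
+#     out = conv(torch.rand(1, 256, 32, 32), offset=None)  # (1, 256, 32, 32)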
+from __future__ import division
+import copy
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from mmcv.ops import DeformConv2d
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, ModuleList
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.structures import SampleList
+from mmdet.utils import (ConfigType, InstanceList, MultiConfig,
+                         OptInstanceList, OptMultiConfig)
+from ..task_modules.assigners import RegionAssigner
+from ..task_modules.samplers import PseudoSampler
+from ..utils import (images_to_levels, multi_apply, select_single_mlvl,
+                     unpack_gt_instances)
+from .base_dense_head import BaseDenseHead
+from .rpn_head import RPNHead
+
+
+class AdaptiveConv(BaseModule):
+    """AdaptiveConv used to adapt the sampling location with the anchors.
+
+    Args:
+        in_channels (int): Number of channels in the input image.
+        out_channels (int): Number of channels produced by the convolution.
+        kernel_size (int or tuple[int]): Size of the conv kernel.
+            Defaults to 3.
+        stride (int or tuple[int]): Stride of the convolution. Defaults to 1.
+        padding (int or tuple[int]): Zero-padding added to both sides of
+            the input. Defaults to 1.
+        dilation (int or tuple[int]): Spacing between kernel elements.
+            Defaults to 3.
+        groups (int): Number of blocked connections from input channels to
+            output channels. Defaults to 1.
+        bias (bool): If set True, adds a learnable bias to the output.
+            Defaults to False.
+        adapt_type (str): Type of adaptive conv, can be either ``offset``
+            (arbitrary anchors) or ``dilation`` (uniform anchors).
+            Defaults to ``dilation``.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \
+            list[dict]): Initialization config dict.
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: Union[int, Tuple[int]] = 3,
+        stride: Union[int, Tuple[int]] = 1,
+        padding: Union[int, Tuple[int]] = 1,
+        dilation: Union[int, Tuple[int]] = 3,
+        groups: int = 1,
+        bias: bool = False,
+        adapt_type: str = 'dilation',
+        init_cfg: MultiConfig = dict(
+            type='Normal', std=0.01, override=dict(name='conv'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert adapt_type in ['offset', 'dilation']
+        self.adapt_type = adapt_type
+
+        assert kernel_size == 3, 'Adaptive conv only supports kernel_size 3'
+        if self.adapt_type == 'offset':
+            assert stride == 1 and padding == 1 and groups == 1, \
+                'Adaptive conv offset mode only supports padding: 1, ' \
+                'stride: 1, groups: 1'
+            self.conv = DeformConv2d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=padding,
+                stride=stride,
+                groups=groups,
+                bias=bias)
+        else:
+            self.conv = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=dilation,
+                dilation=dilation)
+
+    def forward(self, x: Tensor, offset: Tensor) -> Tensor:
+        """Forward function."""
+        if self.adapt_type == 'offset':
+            N, _, H, W = x.shape
+            assert offset is not None
+            assert H * W == offset.shape[1]
+            # reshape [N, NA, 18] to (N, 18, H, W)
+            offset = offset.permute(0, 2, 1).reshape(N, -1, H, W)
+            offset = offset.contiguous()
+            x = self.conv(x, offset)
+        else:
+            assert offset is None
+            x = self.conv(x)
+        return x
+
+
+@MODELS.register_module()
+class StageCascadeRPNHead(RPNHead):
+    """Stage of CascadeRPNHead.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        anchor_generator (:obj:`ConfigDict` or dict): anchor generator config.
+        adapt_cfg (:obj:`ConfigDict` or dict): adaptation config.
+        bridged_feature (bool): whether to update the rpn feature.
+            Defaults to False.
+        with_cls (bool): whether to use the classification branch.
+            Defaults to True.
+        init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or
+            list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 anchor_generator: ConfigType = dict(
+                     type='AnchorGenerator',
+                     scales=[8],
+                     ratios=[1.0],
+                     strides=[4, 8, 16, 32, 64]),
+                 adapt_cfg: ConfigType = dict(type='dilation', dilation=3),
+                 bridged_feature: bool = False,
+                 with_cls: bool = True,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        self.with_cls = with_cls
+        self.anchor_strides = anchor_generator['strides']
+        self.anchor_scales = anchor_generator['scales']
+        self.bridged_feature = bridged_feature
+        self.adapt_cfg = adapt_cfg
+        super().__init__(
+            in_channels=in_channels,
+            anchor_generator=anchor_generator,
+            init_cfg=init_cfg,
+            **kwargs)
+
+        # override sampling and sampler
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            # use PseudoSampler when no sampler config is specified
+            if self.train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        if init_cfg is None:
+            self.init_cfg = dict(
+                type='Normal', std=0.01, override=[dict(name='rpn_reg')])
+            if self.with_cls:
+                self.init_cfg['override'].append(dict(name='rpn_cls'))
+
+    def _init_layers(self) -> None:
+        """Init layers of a CascadeRPN stage."""
+        adapt_cfg = copy.deepcopy(self.adapt_cfg)
+        adapt_cfg['adapt_type'] = adapt_cfg.pop('type')
+        self.rpn_conv = AdaptiveConv(self.in_channels, self.feat_channels,
+                                     **adapt_cfg)
+        if self.with_cls:
+            self.rpn_cls = nn.Conv2d(self.feat_channels,
+                                     self.num_anchors * self.cls_out_channels,
+                                     1)
+        self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward_single(self, x: Tensor, offset: Tensor) -> Tuple[Tensor]:
+        """Forward function of single scale."""
+        bridged_x = x
+        x = self.relu(self.rpn_conv(x, offset))
+        if self.bridged_feature:
+            bridged_x = x  # update feature
+        cls_score = self.rpn_cls(x) if self.with_cls else None
+        bbox_pred = self.rpn_reg(x)
+        return bridged_x, cls_score, bbox_pred
+
+    def forward(
+        self,
+        feats: List[Tensor],
+        offset_list: Optional[List[Tensor]] = None) -> Tuple[List[Tensor]]:
+        """Forward function."""
+        if offset_list is None:
+            offset_list = [None for _ in range(len(feats))]
+        return multi_apply(self.forward_single, feats, offset_list)
+
+    def _region_targets_single(self, flat_anchors: Tensor, valid_flags: Tensor,
+                               gt_instances: InstanceData, img_meta: dict,
+                               gt_instances_ignore: InstanceData,
+                               featmap_sizes: List[Tuple[int, int]],
+                               num_level_anchors: List[int]) -> tuple:
+        """Get anchor targets based on region for single level.
+
+        Args:
+            flat_anchors (Tensor): Multi-level anchors of the image, which are
+                concatenated into a single tensor of shape (num_anchors, 4)
+            valid_flags (Tensor): Multi level valid flags of the image,
+                which are concatenated into a single tensor of
+                shape (num_anchors, ).
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should include ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+ gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + featmap_sizes (list[Tuple[int, int]]): Feature map size each level. + num_level_anchors (list[int]): The number of anchors in each level. + + Returns: + tuple: + + - labels (Tensor): Labels of each level. + - label_weights (Tensor): Label weights of each level. + - bbox_targets (Tensor): BBox targets of each level. + - bbox_weights (Tensor): BBox weights of each level. + - pos_inds (Tensor): positive samples indexes. + - neg_inds (Tensor): negative samples indexes. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + pred_instances = InstanceData() + pred_instances.priors = flat_anchors + pred_instances.valid_flags = valid_flags + + assign_result = self.assigner.assign( + pred_instances, + gt_instances, + img_meta, + featmap_sizes, + num_level_anchors, + self.anchor_scales[0], + self.anchor_strides, + gt_instances_ignore=gt_instances_ignore, + allowed_border=self.train_cfg['allowed_border']) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_anchors = flat_anchors.shape[0] + bbox_targets = torch.zeros_like(flat_anchors) + bbox_weights = torch.zeros_like(flat_anchors) + labels = flat_anchors.new_zeros(num_anchors, dtype=torch.long) + label_weights = flat_anchors.new_zeros(num_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result) + + def region_targets( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + featmap_sizes: List[Tuple[int, int]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + return_sampling_results: bool = False, + ) -> tuple: + """Compute regression and classification targets for anchors when using + RegionAssigner. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. + featmap_sizes (list[Tuple[int, int]]): Feature map size each level. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. 
+ - bbox_weights_list (list[Tensor]): BBox weights of each level. + - avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + ``PseudoSampler``, ``avg_factor`` is usually equal to the + number of positive priors. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors to a single tensor + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(torch.cat(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply( + self._region_targets_single, + concat_anchor_list, + concat_valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + featmap_sizes=featmap_sizes, + num_level_anchors=num_level_anchors) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + res = (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) + if return_sampling_results: + res = res + (sampling_results_list, ) + return res + + def get_targets( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + featmap_sizes: List[Tuple[int, int]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + return_sampling_results: bool = False, + ) -> tuple: + """Compute regression and classification targets for anchors. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. + featmap_sizes (list[Tuple[int, int]]): Feature map size each level. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - bbox_weights_list (list[Tensor]): BBox weights of each level. 
+            - avg_factor (int): Average factor that is used to average
+              the loss. When using sampling method, avg_factor is usually
+              the sum of positive and negative priors. When using
+              ``PseudoSampler``, ``avg_factor`` is usually equal to the
+              number of positive priors.
+        """
+        if isinstance(self.assigner, RegionAssigner):
+            cls_reg_targets = self.region_targets(
+                anchor_list,
+                valid_flag_list,
+                featmap_sizes,
+                batch_gt_instances,
+                batch_img_metas,
+                batch_gt_instances_ignore=batch_gt_instances_ignore,
+                return_sampling_results=return_sampling_results)
+        else:
+            cls_reg_targets = super().get_targets(
+                anchor_list,
+                valid_flag_list,
+                batch_gt_instances,
+                batch_img_metas,
+                batch_gt_instances_ignore=batch_gt_instances_ignore,
+                return_sampling_results=return_sampling_results)
+        return cls_reg_targets
+
+    def anchor_offset(self, anchor_list: List[List[Tensor]],
+                      anchor_strides: List[int],
+                      featmap_sizes: List[Tuple[int, int]]) -> List[Tensor]:
+        """Get offset for deformable conv based on anchor shape.
+
+        NOTE: currently support deformable kernel_size=3 and dilation=1.
+
+        Args:
+            anchor_list (list[list[Tensor]]): [NI, NLVL, NA, 4] list of
+                multi-level anchors
+            anchor_strides (list[int]): anchor stride of each level
+            featmap_sizes (list[tuple[int, int]]): Feature map size each
+                level.
+
+        Returns:
+            list[tensor]: offset of DeformConv kernel with shapes of
+            [NLVL, NA, 2, 18].
+        """
+
+        def _shape_offset(anchors, stride, ks=3, dilation=1):
+            # currently support kernel_size=3 and dilation=1
+            assert ks == 3 and dilation == 1
+            pad = (ks - 1) // 2
+            idx = torch.arange(-pad, pad + 1, dtype=dtype, device=device)
+            yy, xx = torch.meshgrid(idx, idx)  # return order matters
+            xx = xx.reshape(-1)
+            yy = yy.reshape(-1)
+            w = (anchors[:, 2] - anchors[:, 0]) / stride
+            h = (anchors[:, 3] - anchors[:, 1]) / stride
+            w = w / (ks - 1) - dilation
+            h = h / (ks - 1) - dilation
+            offset_x = w[:, None] * xx  # (NA, ks**2)
+            offset_y = h[:, None] * yy  # (NA, ks**2)
+            return offset_x, offset_y
+
+        def _ctr_offset(anchors, stride, featmap_size):
+            feat_h, feat_w = featmap_size
+            assert len(anchors) == feat_h * feat_w
+
+            x = (anchors[:, 0] + anchors[:, 2]) * 0.5
+            y = (anchors[:, 1] + anchors[:, 3]) * 0.5
+            # compute centers on feature map
+            x = x / stride
+            y = y / stride
+            # compute predefine centers
+            xx = torch.arange(0, feat_w, device=anchors.device)
+            yy = torch.arange(0, feat_h, device=anchors.device)
+            yy, xx = torch.meshgrid(yy, xx)
+            xx = xx.reshape(-1).type_as(x)
+            yy = yy.reshape(-1).type_as(y)
+
+            offset_x = x - xx  # (NA, )
+            offset_y = y - yy  # (NA, )
+            return offset_x, offset_y
+
+        num_imgs = len(anchor_list)
+        num_lvls = len(anchor_list[0])
+        dtype = anchor_list[0][0].dtype
+        device = anchor_list[0][0].device
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+
+        offset_list = []
+        for i in range(num_imgs):
+            mlvl_offset = []
+            for lvl in range(num_lvls):
+                c_offset_x, c_offset_y = _ctr_offset(anchor_list[i][lvl],
+                                                     anchor_strides[lvl],
+                                                     featmap_sizes[lvl])
+                s_offset_x, s_offset_y = _shape_offset(anchor_list[i][lvl],
+                                                       anchor_strides[lvl])
+
+                # offset = ctr_offset + shape_offset
+                offset_x = s_offset_x + c_offset_x[:, None]
+                offset_y = s_offset_y + c_offset_y[:, None]
+
+                # offset order: (y0, x0, y1, x1, ..., y8, x8) over the
+                # nine taps of the 3x3 kernel
+                offset = torch.stack([offset_y, offset_x], dim=-1)
+                offset = offset.reshape(offset.size(0), -1)  # [NA, 2*ks**2]
+                mlvl_offset.append(offset)
+            offset_list.append(torch.cat(mlvl_offset))  # [totalNA, 2*ks**2]
+        offset_list = images_to_levels(offset_list, num_level_anchors)
+        return offset_list
+
+    def 
loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchors: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, avg_factor: int) -> tuple: + """Loss function on single scale.""" + # classification loss + if self.with_cls: + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + # regression loss + bbox_targets = bbox_targets.reshape(-1, 4) + bbox_weights = bbox_weights.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + anchors = anchors.reshape(-1, 4) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + loss_reg = self.loss_bbox( + bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) + if self.with_cls: + return loss_cls, loss_reg + return None, loss_reg + + def loss_by_feat( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Compute losses of the head. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in bbox_preds] + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + featmap_sizes, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results_list) = cls_reg_targets + if not sampling_results_list[0].avg_factor_with_neg: + # 200 is hard-coded average factor, + # which follows guided anchoring. 
+ avg_factor = sum([label.numel() for label in labels_list]) / 200.0 + + # change per image, per level anchor_list to per_level, per_image + mlvl_anchor_list = list(zip(*anchor_list)) + # concat mlvl_anchor_list + mlvl_anchor_list = [ + torch.cat(anchors, dim=0) for anchors in mlvl_anchor_list + ] + + losses = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + mlvl_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + if self.with_cls: + return dict(loss_rpn_cls=losses[0], loss_rpn_reg=losses[1]) + return dict(loss_rpn_reg=losses[1]) + + def predict_by_feat(self, + anchor_list: List[List[Tensor]], + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_img_metas: List[dict], + cfg: Optional[ConfigDict] = None, + rescale: bool = False) -> InstanceList: + """Get proposal predict. Overriding to enable input ``anchor_list`` + from outside. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + batch_img_metas (list[dict], Optional): Image meta info. + cfg (:obj:`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_list = select_single_mlvl(cls_scores, img_id) + bbox_pred_list = select_single_mlvl(bbox_preds, img_id) + proposals = self._predict_by_feat_single( + cls_scores=cls_score_list, + bbox_preds=bbox_pred_list, + mlvl_anchors=anchor_list[img_id], + img_meta=batch_img_metas[img_id], + cfg=cfg, + rescale=rescale) + result_list.append(proposals) + return result_list + + def _predict_by_feat_single(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + mlvl_anchors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False) -> InstanceData: + """Transform outputs of a single image into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has + shape (num_anchors * 4, H, W). + mlvl_anchors (list[Tensor]): Box reference from all scale + levels of a single image, each item has shape + (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arange as + (w_scale, h_scale, w_scale, h_scale). + cfg (:obj:`ConfigDict`): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. 
+ + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + # bboxes from different level should be independent during NMS, + # level_ids are used as labels for batched NMS to separate them + level_ids = [] + mlvl_scores = [] + mlvl_bbox_preds = [] + mlvl_valid_anchors = [] + nms_pre = cfg.get('nms_pre', -1) + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # We set FG labels to [0, num_class-1] and BG label to + # num_class in RPN head since mmdet v2.5, which is unified to + # be consistent with other head since mmdet v2.0. In mmdet v2.0 + # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head. + scores = rpn_cls_score.softmax(dim=1)[:, 0] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4) + anchors = mlvl_anchors[idx] + + if 0 < nms_pre < scores.shape[0]: + # sort is faster than topk + # _, topk_inds = scores.topk(cfg.nms_pre) + ranked_scores, rank_inds = scores.sort(descending=True) + topk_inds = rank_inds[:nms_pre] + scores = ranked_scores[:nms_pre] + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + mlvl_scores.append(scores) + mlvl_bbox_preds.append(rpn_bbox_pred) + mlvl_valid_anchors.append(anchors) + level_ids.append( + scores.new_full((scores.size(0), ), idx, dtype=torch.long)) + + anchors = torch.cat(mlvl_valid_anchors) + rpn_bbox_pred = torch.cat(mlvl_bbox_preds) + bboxes = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_meta['img_shape']) + + proposals = InstanceData() + proposals.bboxes = bboxes + proposals.scores = torch.cat(mlvl_scores) + proposals.level_ids = torch.cat(level_ids) + + return self._bbox_post_process( + results=proposals, cfg=cfg, rescale=rescale, img_meta=img_meta) + + def refine_bboxes(self, anchor_list: List[List[Tensor]], + bbox_preds: List[Tensor], + img_metas: List[dict]) -> List[List[Tensor]]: + """Refine bboxes through stages.""" + num_levels = len(bbox_preds) + new_anchor_list = [] + for img_id in range(len(img_metas)): + mlvl_anchors = [] + for i in range(num_levels): + bbox_pred = bbox_preds[i][img_id].detach() + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + img_shape = img_metas[img_id]['img_shape'] + bboxes = self.bbox_coder.decode(anchor_list[img_id][i], + bbox_pred, img_shape) + mlvl_anchors.append(bboxes) + new_anchor_list.append(mlvl_anchors) + return new_anchor_list + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. 
+ + Returns: + dict: A dictionary of loss components. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + if self.adapt_cfg['type'] == 'offset': + offset_list = self.anchor_offset(anchor_list, self.anchor_strides, + featmap_sizes) + else: + offset_list = None + + x, cls_score, bbox_pred = self(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred, + batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*rpn_loss_inputs) + + return losses + + def loss_and_predict( + self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + proposal_cfg: Optional[ConfigDict] = None, + ) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. + + Args: + x (tuple[Tensor]): Features from FPN. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + proposal_cfg (:obj`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + + Returns: + tuple: the return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + if self.adapt_cfg['type'] == 'offset': + offset_list = self.anchor_offset(anchor_list, self.anchor_strides, + featmap_sizes) + else: + offset_list = None + + x, cls_score, bbox_pred = self(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, bbox_pred, + batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*rpn_loss_inputs) + + predictions = self.predict_by_feat( + anchor_list, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + cfg=proposal_cfg) + return losses, predictions + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. 
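+
+        Example:
+            >>> # A minimal sketch with hypothetical names: ``head`` is a
+            >>> # built StageCascadeRPNHead, ``feats`` are FPN features and
+            >>> # ``data_samples`` come from the dataloader.
+            >>> results = head.predict(feats, data_samples, rescale=True)
+            >>> results[0].bboxes.shape[-1]  # boxes are (x1, y1, x2, y2)
+            4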
+ """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, _ = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + if self.adapt_cfg['type'] == 'offset': + offset_list = self.anchor_offset(anchor_list, self.anchor_strides, + featmap_sizes) + else: + offset_list = None + + x, cls_score, bbox_pred = self(x, offset_list) + predictions = self.stages[-1].predict_by_feat( + anchor_list, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + rescale=rescale) + return predictions + + +@MODELS.register_module() +class CascadeRPNHead(BaseDenseHead): + """The CascadeRPNHead will predict more accurate region proposals, which is + required for two-stage detectors (such as Fast/Faster R-CNN). CascadeRPN + consists of a sequence of RPNStage to progressively improve the accuracy of + the detected proposals. + + More details can be found in ``https://arxiv.org/abs/1909.06720``. + + Args: + num_stages (int): number of CascadeRPN stages. + stages (list[:obj:`ConfigDict` or dict]): list of configs to build + the stages. + train_cfg (list[:obj:`ConfigDict` or dict]): list of configs at + training time each stage. + test_cfg (:obj:`ConfigDict` or dict): config at testing time. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict]): Initialization config dict. + """ + + def __init__(self, + num_classes: int, + num_stages: int, + stages: List[ConfigType], + train_cfg: List[ConfigType], + test_cfg: ConfigType, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert num_classes == 1, 'Only support num_classes == 1' + assert num_stages == len(stages) + self.num_stages = num_stages + # Be careful! Pretrained weights cannot be loaded when use + # nn.ModuleList + self.stages = ModuleList() + for i in range(len(stages)): + train_cfg_i = train_cfg[i] if train_cfg is not None else None + stages[i].update(train_cfg=train_cfg_i) + stages[i].update(test_cfg=test_cfg) + self.stages.append(MODELS.build(stages[i])) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def loss_by_feat(self): + """loss_by_feat() is implemented in StageCascadeRPNHead.""" + pass + + def predict_by_feat(self): + """predict_by_feat() is implemented in StageCascadeRPNHead.""" + pass + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. 
+ """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.stages[0].get_anchors( + featmap_sizes, batch_img_metas, device=device) + + losses = dict() + + for i in range(self.num_stages): + stage = self.stages[i] + + if stage.adapt_cfg['type'] == 'offset': + offset_list = stage.anchor_offset(anchor_list, + stage.anchor_strides, + featmap_sizes) + else: + offset_list = None + x, cls_score, bbox_pred = stage(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, + bbox_pred, batch_gt_instances, batch_img_metas) + stage_loss = stage.loss_by_feat(*rpn_loss_inputs) + for name, value in stage_loss.items(): + losses['s{}.{}'.format(i, name)] = value + + # refine boxes + if i < self.num_stages - 1: + anchor_list = stage.refine_bboxes(anchor_list, bbox_pred, + batch_img_metas) + + return losses + + def loss_and_predict( + self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + proposal_cfg: Optional[ConfigDict] = None, + ) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. + + Args: + x (tuple[Tensor]): Features from FPN. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + proposal_cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + + Returns: + tuple: the return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, _, batch_img_metas = outputs + + featmap_sizes = [featmap.size()[-2:] for featmap in x] + device = x[0].device + anchor_list, valid_flag_list = self.stages[0].get_anchors( + featmap_sizes, batch_img_metas, device=device) + + losses = dict() + + for i in range(self.num_stages): + stage = self.stages[i] + + if stage.adapt_cfg['type'] == 'offset': + offset_list = stage.anchor_offset(anchor_list, + stage.anchor_strides, + featmap_sizes) + else: + offset_list = None + x, cls_score, bbox_pred = stage(x, offset_list) + rpn_loss_inputs = (anchor_list, valid_flag_list, cls_score, + bbox_pred, batch_gt_instances, batch_img_metas) + stage_loss = stage.loss_by_feat(*rpn_loss_inputs) + for name, value in stage_loss.items(): + losses['s{}.{}'.format(i, name)] = value + + # refine boxes + if i < self.num_stages - 1: + anchor_list = stage.refine_bboxes(anchor_list, bbox_pred, + batch_img_metas) + + predictions = self.stages[-1].predict_by_feat( + anchor_list, + cls_score, + bbox_pred, + batch_img_metas=batch_img_metas, + cfg=proposal_cfg) + return losses, predictions + + def predict(self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. 
+            Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        featmap_sizes = [featmap.size()[-2:] for featmap in x]
+        device = x[0].device
+        anchor_list, _ = self.stages[0].get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+
+        for i in range(self.num_stages):
+            stage = self.stages[i]
+            if stage.adapt_cfg['type'] == 'offset':
+                offset_list = stage.anchor_offset(anchor_list,
+                                                  stage.anchor_strides,
+                                                  featmap_sizes)
+            else:
+                offset_list = None
+            x, cls_score, bbox_pred = stage(x, offset_list)
+            if i < self.num_stages - 1:
+                anchor_list = stage.refine_bboxes(anchor_list, bbox_pred,
+                                                  batch_img_metas)
+
+        predictions = self.stages[-1].predict_by_feat(
+            anchor_list,
+            cls_score,
+            bbox_pred,
+            batch_img_metas=batch_img_metas,
+            rescale=rescale)
+        return predictions
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/centernet_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/centernet_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..09f3e599eb176965e53f270014cbd326858b7c17
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/centernet_head.py
@@ -0,0 +1,447 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.ops import batched_nms
+from mmengine.config import ConfigDict
+from mmengine.model import bias_init_with_prob, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, OptMultiConfig)
+from ..utils import (gaussian_radius, gen_gaussian_target, get_local_maximum,
+                     get_topk_from_heatmap, multi_apply,
+                     transpose_and_gather_feat)
+from .base_dense_head import BaseDenseHead
+
+
+@MODELS.register_module()
+class CenterNetHead(BaseDenseHead):
+    """Objects as Points Head. The CenterNet head uses the center point to
+    indicate an object's position. Paper link
+    `<https://arxiv.org/abs/1904.07850>`_.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of channels in the intermediate feature
+            map.
+        num_classes (int): Number of categories excluding the background
+            category.
+        loss_center_heatmap (:obj:`ConfigDict` or dict): Config of center
+            heatmap loss. Defaults to
+            dict(type='GaussianFocalLoss', loss_weight=1.0).
+        loss_wh (:obj:`ConfigDict` or dict): Config of wh loss. Defaults to
+            dict(type='L1Loss', loss_weight=0.1).
+        loss_offset (:obj:`ConfigDict` or dict): Config of offset loss.
+            Defaults to dict(type='L1Loss', loss_weight=1.0).
+        train_cfg (:obj:`ConfigDict` or dict, optional): Training config.
+            Unused in CenterNet, but kept for compatibility with
+            SingleStageDetector.
+        test_cfg (:obj:`ConfigDict` or dict, optional): Testing config
+            of CenterNet.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`], optional): Initialization
+            config dict.
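+
+    Example:
+        >>> # A minimal smoke test; channel/size values below are
+        >>> # illustrative placeholders, not a recommended config.
+        >>> import torch
+        >>> self = CenterNetHead(in_channels=16, feat_channels=16,
+        ...                      num_classes=4)
+        >>> feats = [torch.rand(1, 16, 32, 32)]
+        >>> center_out, wh_out, offset_out = self(feats)
+        >>> center_out[0].shape  # (N, num_classes, H, W)
+        torch.Size([1, 4, 32, 32])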
+ """ + + def __init__(self, + in_channels: int, + feat_channels: int, + num_classes: int, + loss_center_heatmap: ConfigType = dict( + type='GaussianFocalLoss', loss_weight=1.0), + loss_wh: ConfigType = dict(type='L1Loss', loss_weight=0.1), + loss_offset: ConfigType = dict( + type='L1Loss', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.heatmap_head = self._build_head(in_channels, feat_channels, + num_classes) + self.wh_head = self._build_head(in_channels, feat_channels, 2) + self.offset_head = self._build_head(in_channels, feat_channels, 2) + + self.loss_center_heatmap = MODELS.build(loss_center_heatmap) + self.loss_wh = MODELS.build(loss_wh) + self.loss_offset = MODELS.build(loss_offset) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.fp16_enabled = False + + def _build_head(self, in_channels: int, feat_channels: int, + out_channels: int) -> nn.Sequential: + """Build head for each branch.""" + layer = nn.Sequential( + nn.Conv2d(in_channels, feat_channels, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(feat_channels, out_channels, kernel_size=1)) + return layer + + def init_weights(self) -> None: + """Initialize weights of the head.""" + bias_init = bias_init_with_prob(0.1) + self.heatmap_head[-1].bias.data.fill_(bias_init) + for head in [self.wh_head, self.offset_head]: + for m in head.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + + def forward(self, x: Tuple[Tensor, ...]) -> Tuple[List[Tensor]]: + """Forward features. Notice CenterNet head does not use FPN. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + center_heatmap_preds (list[Tensor]): center predict heatmaps for + all levels, the channels number is num_classes. + wh_preds (list[Tensor]): wh predicts for all levels, the channels + number is 2. + offset_preds (list[Tensor]): offset predicts for all levels, the + channels number is 2. + """ + return multi_apply(self.forward_single, x) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, ...]: + """Forward feature of a single level. + + Args: + x (Tensor): Feature of a single level. + + Returns: + center_heatmap_pred (Tensor): center predict heatmaps, the + channels number is num_classes. + wh_pred (Tensor): wh predicts, the channels number is 2. + offset_pred (Tensor): offset predicts, the channels number is 2. + """ + center_heatmap_pred = self.heatmap_head(x).sigmoid() + wh_pred = self.wh_head(x) + offset_pred = self.offset_head(x) + return center_heatmap_pred, wh_pred, offset_pred + + def loss_by_feat( + self, + center_heatmap_preds: List[Tensor], + wh_preds: List[Tensor], + offset_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + center_heatmap_preds (list[Tensor]): center predict heatmaps for + all levels with shape (B, num_classes, H, W). + wh_preds (list[Tensor]): wh predicts for all levels with + shape (B, 2, H, W). + offset_preds (list[Tensor]): offset predicts for all levels + with shape (B, 2, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: which has components below: + - loss_center_heatmap (Tensor): loss of center heatmap. + - loss_wh (Tensor): loss of hw heatmap + - loss_offset (Tensor): loss of offset heatmap. + """ + assert len(center_heatmap_preds) == len(wh_preds) == len( + offset_preds) == 1 + center_heatmap_pred = center_heatmap_preds[0] + wh_pred = wh_preds[0] + offset_pred = offset_preds[0] + + gt_bboxes = [ + gt_instances.bboxes for gt_instances in batch_gt_instances + ] + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + img_shape = batch_img_metas[0]['batch_input_shape'] + target_result, avg_factor = self.get_targets(gt_bboxes, gt_labels, + center_heatmap_pred.shape, + img_shape) + + center_heatmap_target = target_result['center_heatmap_target'] + wh_target = target_result['wh_target'] + offset_target = target_result['offset_target'] + wh_offset_target_weight = target_result['wh_offset_target_weight'] + + # Since the channel of wh_target and offset_target is 2, the avg_factor + # of loss_center_heatmap is always 1/2 of loss_wh and loss_offset. + loss_center_heatmap = self.loss_center_heatmap( + center_heatmap_pred, center_heatmap_target, avg_factor=avg_factor) + loss_wh = self.loss_wh( + wh_pred, + wh_target, + wh_offset_target_weight, + avg_factor=avg_factor * 2) + loss_offset = self.loss_offset( + offset_pred, + offset_target, + wh_offset_target_weight, + avg_factor=avg_factor * 2) + return dict( + loss_center_heatmap=loss_center_heatmap, + loss_wh=loss_wh, + loss_offset=loss_offset) + + def get_targets(self, gt_bboxes: List[Tensor], gt_labels: List[Tensor], + feat_shape: tuple, img_shape: tuple) -> Tuple[dict, int]: + """Compute regression and classification targets in multiple images. + + Args: + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box. + feat_shape (tuple): feature map shape with value [B, _, H, W] + img_shape (tuple): image shape. + + Returns: + tuple[dict, float]: The float value is mean avg_factor, the dict + has components below: + - center_heatmap_target (Tensor): targets of center heatmap, \ + shape (B, num_classes, H, W). + - wh_target (Tensor): targets of wh predict, shape \ + (B, 2, H, W). + - offset_target (Tensor): targets of offset predict, shape \ + (B, 2, H, W). + - wh_offset_target_weight (Tensor): weights of wh and offset \ + predict, shape (B, 2, H, W). 
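+
+        Note:
+            ``avg_factor`` is computed below as
+            ``max(1, center_heatmap_target.eq(1).sum())``, i.e. the number
+            of exact heatmap peaks (one per annotated center), clamped to
+            at least 1 so images without ground truth cannot zero the
+            loss denominator.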
+ """ + img_h, img_w = img_shape[:2] + bs, _, feat_h, feat_w = feat_shape + + width_ratio = float(feat_w / img_w) + height_ratio = float(feat_h / img_h) + + center_heatmap_target = gt_bboxes[-1].new_zeros( + [bs, self.num_classes, feat_h, feat_w]) + wh_target = gt_bboxes[-1].new_zeros([bs, 2, feat_h, feat_w]) + offset_target = gt_bboxes[-1].new_zeros([bs, 2, feat_h, feat_w]) + wh_offset_target_weight = gt_bboxes[-1].new_zeros( + [bs, 2, feat_h, feat_w]) + + for batch_id in range(bs): + gt_bbox = gt_bboxes[batch_id] + gt_label = gt_labels[batch_id] + center_x = (gt_bbox[:, [0]] + gt_bbox[:, [2]]) * width_ratio / 2 + center_y = (gt_bbox[:, [1]] + gt_bbox[:, [3]]) * height_ratio / 2 + gt_centers = torch.cat((center_x, center_y), dim=1) + + for j, ct in enumerate(gt_centers): + ctx_int, cty_int = ct.int() + ctx, cty = ct + scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio + scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio + radius = gaussian_radius([scale_box_h, scale_box_w], + min_overlap=0.3) + radius = max(0, int(radius)) + ind = gt_label[j] + gen_gaussian_target(center_heatmap_target[batch_id, ind], + [ctx_int, cty_int], radius) + + wh_target[batch_id, 0, cty_int, ctx_int] = scale_box_w + wh_target[batch_id, 1, cty_int, ctx_int] = scale_box_h + + offset_target[batch_id, 0, cty_int, ctx_int] = ctx - ctx_int + offset_target[batch_id, 1, cty_int, ctx_int] = cty - cty_int + + wh_offset_target_weight[batch_id, :, cty_int, ctx_int] = 1 + + avg_factor = max(1, center_heatmap_target.eq(1).sum()) + target_result = dict( + center_heatmap_target=center_heatmap_target, + wh_target=wh_target, + offset_target=offset_target, + wh_offset_target_weight=wh_offset_target_weight) + return target_result, avg_factor + + def predict_by_feat(self, + center_heatmap_preds: List[Tensor], + wh_preds: List[Tensor], + offset_preds: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + rescale: bool = True, + with_nms: bool = False) -> InstanceList: + """Transform network output for a batch into bbox predictions. + + Args: + center_heatmap_preds (list[Tensor]): Center predict heatmaps for + all levels with shape (B, num_classes, H, W). + wh_preds (list[Tensor]): WH predicts for all levels with + shape (B, 2, H, W). + offset_preds (list[Tensor]): Offset predicts for all levels + with shape (B, 2, H, W). + batch_img_metas (list[dict], optional): Batch image meta info. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to True. + with_nms (bool): If True, do nms before return boxes. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Instance segmentation + results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(center_heatmap_preds) == len(wh_preds) == len( + offset_preds) == 1 + result_list = [] + for img_id in range(len(batch_img_metas)): + result_list.append( + self._predict_by_feat_single( + center_heatmap_preds[0][img_id:img_id + 1, ...], + wh_preds[0][img_id:img_id + 1, ...], + offset_preds[0][img_id:img_id + 1, ...], + batch_img_metas[img_id], + rescale=rescale, + with_nms=with_nms)) + return result_list + + def _predict_by_feat_single(self, + center_heatmap_pred: Tensor, + wh_pred: Tensor, + offset_pred: Tensor, + img_meta: dict, + rescale: bool = True, + with_nms: bool = False) -> InstanceData: + """Transform outputs of a single image into bbox results. + + Args: + center_heatmap_pred (Tensor): Center heatmap for current level with + shape (1, num_classes, H, W). + wh_pred (Tensor): WH heatmap for current level with shape + (1, num_classes, H, W). + offset_pred (Tensor): Offset for current level with shape + (1, corner_offset_channels, H, W). + img_meta (dict): Meta information of current image, e.g., + image size, scaling factor, etc. + rescale (bool): If True, return boxes in original image space. + Defaults to True. + with_nms (bool): If True, do nms before return boxes. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + batch_det_bboxes, batch_labels = self._decode_heatmap( + center_heatmap_pred, + wh_pred, + offset_pred, + img_meta['batch_input_shape'], + k=self.test_cfg.topk, + kernel=self.test_cfg.local_maximum_kernel) + + det_bboxes = batch_det_bboxes.view([-1, 5]) + det_labels = batch_labels.view(-1) + + batch_border = det_bboxes.new_tensor(img_meta['border'])[..., + [2, 0, 2, 0]] + det_bboxes[..., :4] -= batch_border + + if rescale and 'scale_factor' in img_meta: + det_bboxes[..., :4] /= det_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + if with_nms: + det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels, + self.test_cfg) + results = InstanceData() + results.bboxes = det_bboxes[..., :4] + results.scores = det_bboxes[..., 4] + results.labels = det_labels + return results + + def _decode_heatmap(self, + center_heatmap_pred: Tensor, + wh_pred: Tensor, + offset_pred: Tensor, + img_shape: tuple, + k: int = 100, + kernel: int = 3) -> Tuple[Tensor, Tensor]: + """Transform outputs into detections raw bbox prediction. + + Args: + center_heatmap_pred (Tensor): center predict heatmap, + shape (B, num_classes, H, W). + wh_pred (Tensor): wh predict, shape (B, 2, H, W). + offset_pred (Tensor): offset predict, shape (B, 2, H, W). + img_shape (tuple): image shape in hw format. + k (int): Get top k center keypoints from heatmap. Defaults to 100. + kernel (int): Max pooling kernel for extract local maximum pixels. + Defaults to 3. 
+
+        Returns:
+            tuple[Tensor]: Decoded output of CenterNetHead, containing
+            the following Tensors:
+
+            - batch_bboxes (Tensor): Coords of each box with shape (B, k, 5).
+            - batch_topk_labels (Tensor): Categories of each box with \
+              shape (B, k).
+        """
+        height, width = center_heatmap_pred.shape[2:]
+        inp_h, inp_w = img_shape
+
+        center_heatmap_pred = get_local_maximum(
+            center_heatmap_pred, kernel=kernel)
+
+        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
+            center_heatmap_pred, k=k)
+        batch_scores, batch_index, batch_topk_labels = batch_dets
+
+        wh = transpose_and_gather_feat(wh_pred, batch_index)
+        offset = transpose_and_gather_feat(offset_pred, batch_index)
+        topk_xs = topk_xs + offset[..., 0]
+        topk_ys = topk_ys + offset[..., 1]
+        tl_x = (topk_xs - wh[..., 0] / 2) * (inp_w / width)
+        tl_y = (topk_ys - wh[..., 1] / 2) * (inp_h / height)
+        br_x = (topk_xs + wh[..., 0] / 2) * (inp_w / width)
+        br_y = (topk_ys + wh[..., 1] / 2) * (inp_h / height)
+
+        batch_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], dim=2)
+        batch_bboxes = torch.cat((batch_bboxes, batch_scores[..., None]),
+                                 dim=-1)
+        return batch_bboxes, batch_topk_labels
+
+    def _bboxes_nms(self, bboxes: Tensor, labels: Tensor,
+                    cfg: ConfigDict) -> Tuple[Tensor, Tensor]:
+        """bboxes nms."""
+        if labels.numel() > 0:
+            max_num = cfg.max_per_img
+            bboxes, keep = batched_nms(bboxes[:, :4],
+                                       bboxes[:, -1].contiguous(),
+                                       labels, cfg.nms)
+            if max_num > 0:
+                bboxes = bboxes[:max_num]
+                labels = labels[keep][:max_num]
+
+        return bboxes, labels
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/centernet_update_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/centernet_update_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..00cfcb89806209c9416b1bd7e9a14d82a4911175
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/centernet_update_head.py
@@ -0,0 +1,624 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Sequence, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import Scale
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox2distance
+from mmdet.utils import (ConfigType, InstanceList, OptConfigType,
+                         OptInstanceList, reduce_mean)
+from ..utils import multi_apply
+from .anchor_free_head import AnchorFreeHead
+
+INF = 1000000000
+RangeType = Sequence[Tuple[int, int]]
+
+
+def _transpose(tensor_list: List[Tensor],
+               num_point_list: list) -> List[Tensor]:
+    """Transpose image-first tensors to level-first ones."""
+    for img_idx in range(len(tensor_list)):
+        tensor_list[img_idx] = torch.split(
+            tensor_list[img_idx], num_point_list, dim=0)
+
+    tensors_level_first = []
+    for targets_per_level in zip(*tensor_list):
+        tensors_level_first.append(torch.cat(targets_per_level, dim=0))
+    return tensors_level_first
+
+
+@MODELS.register_module()
+class CenterNetUpdateHead(AnchorFreeHead):
+    """CenterNetUpdateHead is an improved version of CenterNet in CenterNet2.
+    Paper link `<https://arxiv.org/abs/2103.07461>`_.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple
+            level points.
+        hm_min_radius (int): Heatmap target minimum radius of cls branch.
+            Defaults to 4.
+        hm_min_overlap (float): Heatmap target minimum overlap of cls branch.
+            Defaults to 0.8.
+ more_pos_thresh (float): The filtering threshold when the cls branch + adds more positive samples. Defaults to 0.2. + more_pos_topk (int): The maximum number of additional positive samples + added to each gt. Defaults to 9. + soft_weight_on_reg (bool): Whether to use the soft target of the + cls branch as the soft weight of the bbox branch. + Defaults to False. + loss_cls (:obj:`ConfigDict` or dict): Config of cls loss. Defaults to + dict(type='GaussianFocalLoss', loss_weight=1.0) + loss_bbox (:obj:`ConfigDict` or dict): Config of bbox loss. Defaults to + dict(type='GIoULoss', loss_weight=2.0). + norm_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct + and config norm layer. Defaults to + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Unused in CenterNet. Reserved for compatibility with + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config + of CenterNet. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((0, 80), (64, 160), (128, 320), + (256, 640), (512, INF)), + hm_min_radius: int = 4, + hm_min_overlap: float = 0.8, + more_pos_thresh: float = 0.2, + more_pos_topk: int = 9, + soft_weight_on_reg: bool = False, + loss_cls: ConfigType = dict( + type='GaussianFocalLoss', + pos_weight=0.25, + neg_weight=0.75, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='GIoULoss', loss_weight=2.0), + norm_cfg: OptConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + norm_cfg=norm_cfg, + train_cfg=train_cfg, + test_cfg=test_cfg, + **kwargs) + self.soft_weight_on_reg = soft_weight_on_reg + self.hm_min_radius = hm_min_radius + self.more_pos_thresh = more_pos_thresh + self.more_pos_topk = more_pos_topk + self.delta = (1 - hm_min_overlap) / (1 + hm_min_overlap) + self.sigmoid_clamp = 0.0001 + + # GaussianFocalLoss must be sigmoid mode + self.use_sigmoid_cls = True + self.cls_out_channels = num_classes + + self.regress_ranges = regress_ranges + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + def _init_predictor(self) -> None: + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + self.feat_channels, self.num_classes, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of each level outputs. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is 4. + """ + return multi_apply(self.forward_single, x, self.scales, self.strides) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps. 
+ + Returns: + tuple: scores for each class, bbox predictions of + input feature maps. + """ + cls_score, bbox_pred, _, _ = super().forward_single(x) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + return cls_score, bbox_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_imgs = cls_scores[0].size(0) + assert len(cls_scores) == len(bbox_preds) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + + # 1 flatten outputs + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + assert (torch.isfinite(flatten_bbox_preds).all().item()) + + # 2 calc reg and cls branch targets + cls_targets, bbox_targets = self.get_targets(all_level_points, + batch_gt_instances) + + # 3 add more pos index for cls branch + featmap_sizes = flatten_points.new_tensor(featmap_sizes) + pos_inds, cls_labels = self.add_cls_pos_inds(flatten_points, + flatten_bbox_preds, + featmap_sizes, + batch_gt_instances) + + # 4 calc cls loss + if pos_inds is None: + # num_gts=0 + num_pos_cls = bbox_preds[0].new_tensor(0, dtype=torch.float) + else: + num_pos_cls = bbox_preds[0].new_tensor( + len(pos_inds), dtype=torch.float) + num_pos_cls = max(reduce_mean(num_pos_cls), 1.0) + flatten_cls_scores = flatten_cls_scores.sigmoid().clamp( + min=self.sigmoid_clamp, max=1 - self.sigmoid_clamp) + cls_loss = self.loss_cls( + flatten_cls_scores, + cls_targets, + pos_inds=pos_inds, + pos_labels=cls_labels, + avg_factor=num_pos_cls) + + # 5 calc reg loss + pos_bbox_inds = torch.nonzero( + bbox_targets.max(dim=1)[0] >= 0).squeeze(1) + pos_bbox_preds = flatten_bbox_preds[pos_bbox_inds] + pos_bbox_targets = bbox_targets[pos_bbox_inds] + + 
bbox_weight_map = cls_targets.max(dim=1)[0] + bbox_weight_map = bbox_weight_map[pos_bbox_inds] + bbox_weight_map = bbox_weight_map if self.soft_weight_on_reg \ + else torch.ones_like(bbox_weight_map) + num_pos_bbox = max(reduce_mean(bbox_weight_map.sum()), 1.0) + + if len(pos_bbox_inds) > 0: + pos_points = flatten_points[pos_bbox_inds] + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + bbox_loss = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=bbox_weight_map, + avg_factor=num_pos_bbox) + else: + bbox_loss = flatten_bbox_preds.sum() * 0 + + return dict(loss_cls=cls_loss, loss_bbox=bbox_loss) + + def get_targets( + self, + points: List[Tensor], + batch_gt_instances: InstanceList, + ) -> Tuple[Tensor, Tensor]: + """Compute classification and bbox targets for points in multiple + images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Targets of each level. + + - concat_lvl_labels (Tensor): Labels of all level and batch. + - concat_lvl_bbox_targets (Tensor): BBox targets of all \ + level and batch. + """ + assert len(points) == len(self.regress_ranges) + + num_levels = len(points) + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + concat_strides = torch.cat([ + concat_points.new_ones(num_points[i]) * self.strides[i] + for i in range(num_levels) + ]) + + # get labels and bbox_targets of each image + cls_targets_list, bbox_targets_list = multi_apply( + self._get_targets_single, + batch_gt_instances, + points=concat_points, + regress_ranges=concat_regress_ranges, + strides=concat_strides) + + bbox_targets_list = _transpose(bbox_targets_list, num_points) + cls_targets_list = _transpose(cls_targets_list, num_points) + concat_lvl_bbox_targets = torch.cat(bbox_targets_list, 0) + concat_lvl_cls_targets = torch.cat(cls_targets_list, dim=0) + return concat_lvl_cls_targets, concat_lvl_bbox_targets + + def _get_targets_single(self, gt_instances: InstanceData, points: Tensor, + regress_ranges: Tensor, + strides: Tensor) -> Tuple[Tensor, Tensor]: + """Compute classification and bbox targets for a single image.""" + num_points = points.size(0) + num_gts = len(gt_instances) + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + if num_gts == 0: + return gt_labels.new_full((num_points, + self.num_classes), + self.num_classes), \ + gt_bboxes.new_full((num_points, 4), -1) + + # Calculate the regression tblr target corresponding to all points + points = points[:, None].expand(num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + strides = strides[:, None, None].expand(num_points, num_gts, 2) + + bbox_target = bbox2distance(points, gt_bboxes) # M x N x 4 + + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_target.min(dim=2)[0] > 0 # M x N + + # condition2: Calculate the nearest points from + # the upper, lower, left and 
right ranges from + # the center of the gt bbox + centers = ((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2) + centers_discret = ((centers / strides).int() * strides).float() + \ + strides / 2 + + centers_discret_dist = points - centers_discret + dist_x = centers_discret_dist[..., 0].abs() + dist_y = centers_discret_dist[..., 1].abs() + inside_gt_center3x3_mask = (dist_x <= strides[..., 0]) & \ + (dist_y <= strides[..., 0]) + + # condition3: limit the regression range for each location + bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:] + crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2 + inside_fpn_level_mask = (crit >= regress_ranges[:, [0]]) & \ + (crit <= regress_ranges[:, [1]]) + bbox_target_mask = inside_gt_bbox_mask & \ + inside_gt_center3x3_mask & \ + inside_fpn_level_mask + + # Calculate the distance weight map + gt_center_peak_mask = ((centers_discret_dist**2).sum(dim=2) == 0) + weighted_dist = ((points - centers)**2).sum(dim=2) # M x N + weighted_dist[gt_center_peak_mask] = 0 + + areas = (gt_bboxes[..., 2] - gt_bboxes[..., 0]) * ( + gt_bboxes[..., 3] - gt_bboxes[..., 1]) + radius = self.delta**2 * 2 * areas + radius = torch.clamp(radius, min=self.hm_min_radius**2) + weighted_dist = weighted_dist / radius + + # Calculate bbox_target + bbox_weighted_dist = weighted_dist.clone() + bbox_weighted_dist[bbox_target_mask == 0] = INF * 1.0 + min_dist, min_inds = bbox_weighted_dist.min(dim=1) + bbox_target = bbox_target[range(len(bbox_target)), + min_inds] # M x N x 4 --> M x 4 + bbox_target[min_dist == INF] = -INF + + # Convert to feature map scale + bbox_target /= strides[:, 0, :].repeat(1, 2) + + # Calculate cls_target + cls_target = self._create_heatmaps_from_dist(weighted_dist, gt_labels) + + return cls_target, bbox_target + + @torch.no_grad() + def add_cls_pos_inds( + self, flatten_points: Tensor, flatten_bbox_preds: Tensor, + featmap_sizes: Tensor, batch_gt_instances: InstanceList + ) -> Tuple[Optional[Tensor], Optional[Tensor]]: + """Provide additional adaptive positive samples to the classification + branch. + + Args: + flatten_points (Tensor): The point after flatten, including + batch image and all levels. The shape is (N, 2). + flatten_bbox_preds (Tensor): The bbox predicts after flatten, + including batch image and all levels. The shape is (N, 4). + featmap_sizes (Tensor): Feature map size of all layers. + The shape is (5, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + + - pos_inds (Tensor): Adaptively selected positive sample index. + - cls_labels (Tensor): Corresponding positive class label. 
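+
+        Note:
+            Returns ``(None, None)`` when the batch contains no
+            ground-truth boxes. Otherwise a 3x3 candidate is kept as an
+            extra positive when its bbox loss is below both the
+            ``more_pos_topk``-th smallest loss of its gt and
+            ``more_pos_thresh``; the loss at the discretized gt center is
+            forced to 0 so that it is preferentially selected.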
+ """ + outputs = self._get_center3x3_region_index_targets( + batch_gt_instances, featmap_sizes) + cls_labels, fpn_level_masks, center3x3_inds, \ + center3x3_bbox_targets, center3x3_masks = outputs + + num_gts, total_level, K = cls_labels.shape[0], len( + self.strides), center3x3_masks.shape[-1] + + if num_gts == 0: + return None, None + + # The out-of-bounds index is forcibly set to 0 + # to prevent loss calculation errors + center3x3_inds[center3x3_masks == 0] = 0 + reg_pred_center3x3 = flatten_bbox_preds[center3x3_inds] + center3x3_points = flatten_points[center3x3_inds].view(-1, 2) + + center3x3_bbox_targets_expand = center3x3_bbox_targets.view( + -1, 4).clamp(min=0) + + pos_decoded_bbox_preds = self.bbox_coder.decode( + center3x3_points, reg_pred_center3x3.view(-1, 4)) + pos_decoded_target_preds = self.bbox_coder.decode( + center3x3_points, center3x3_bbox_targets_expand) + center3x3_bbox_loss = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + None, + reduction_override='none').view(num_gts, total_level, + K) / self.loss_bbox.loss_weight + + # Invalid index Loss set to infinity + center3x3_bbox_loss[center3x3_masks == 0] = INF + + # 4 is the center point of the sampled 9 points, the center point + # of gt bbox after discretization. + # The center point of gt bbox after discretization + # must be a positive sample, so we force its loss to be set to 0. + center3x3_bbox_loss.view(-1, K)[fpn_level_masks.view(-1), 4] = 0 + center3x3_bbox_loss = center3x3_bbox_loss.view(num_gts, -1) + + loss_thr = torch.kthvalue( + center3x3_bbox_loss, self.more_pos_topk, dim=1)[0] + + loss_thr[loss_thr > self.more_pos_thresh] = self.more_pos_thresh + new_pos = center3x3_bbox_loss < loss_thr.view(num_gts, 1) + pos_inds = center3x3_inds.view(num_gts, -1)[new_pos] + cls_labels = cls_labels.view(num_gts, + 1).expand(num_gts, + total_level * K)[new_pos] + return pos_inds, cls_labels + + def _create_heatmaps_from_dist(self, weighted_dist: Tensor, + cls_labels: Tensor) -> Tensor: + """Generate heatmaps of classification branch based on weighted + distance map.""" + heatmaps = weighted_dist.new_zeros( + (weighted_dist.shape[0], self.num_classes)) + for c in range(self.num_classes): + inds = (cls_labels == c) # N + if inds.int().sum() == 0: + continue + heatmaps[:, c] = torch.exp(-weighted_dist[:, inds].min(dim=1)[0]) + zeros = heatmaps[:, c] < 1e-4 + heatmaps[zeros, c] = 0 + return heatmaps + + def _get_center3x3_region_index_targets(self, + bacth_gt_instances: InstanceList, + shapes_per_level: Tensor) -> tuple: + """Get the center (and the 3x3 region near center) locations and target + of each objects.""" + cls_labels = [] + inside_fpn_level_masks = [] + center3x3_inds = [] + center3x3_masks = [] + center3x3_bbox_targets = [] + + total_levels = len(self.strides) + batch = len(bacth_gt_instances) + + shapes_per_level = shapes_per_level.long() + area_per_level = (shapes_per_level[:, 0] * shapes_per_level[:, 1]) + + # Select a total of 9 positions of 3x3 in the center of the gt bbox + # as candidate positive samples + K = 9 + dx = shapes_per_level.new_tensor([-1, 0, 1, -1, 0, 1, -1, 0, + 1]).view(1, 1, K) + dy = shapes_per_level.new_tensor([-1, -1, -1, 0, 0, 0, 1, 1, + 1]).view(1, 1, K) + + regress_ranges = shapes_per_level.new_tensor(self.regress_ranges).view( + len(self.regress_ranges), 2) # L x 2 + strides = shapes_per_level.new_tensor(self.strides) + + start_coord_pre_level = [] + _start = 0 + for level in range(total_levels): + start_coord_pre_level.append(_start) + _start = _start + batch 
* area_per_level[level] + start_coord_pre_level = shapes_per_level.new_tensor( + start_coord_pre_level).view(1, total_levels, 1) + area_per_level = area_per_level.view(1, total_levels, 1) + + for im_i in range(batch): + gt_instance = bacth_gt_instances[im_i] + gt_bboxes = gt_instance.bboxes + gt_labels = gt_instance.labels + num_gts = gt_bboxes.shape[0] + if num_gts == 0: + continue + + cls_labels.append(gt_labels) + + gt_bboxes = gt_bboxes[:, None].expand(num_gts, total_levels, 4) + expanded_strides = strides[None, :, + None].expand(num_gts, total_levels, 2) + expanded_regress_ranges = regress_ranges[None].expand( + num_gts, total_levels, 2) + expanded_shapes_per_level = shapes_per_level[None].expand( + num_gts, total_levels, 2) + + # calc reg_target + centers = ((gt_bboxes[..., [0, 1]] + gt_bboxes[..., [2, 3]]) / 2) + centers_inds = (centers / expanded_strides).long() + centers_discret = centers_inds * expanded_strides \ + + expanded_strides // 2 + + bbox_target = bbox2distance(centers_discret, + gt_bboxes) # M x N x 4 + + # calc inside_fpn_level_mask + bbox_target_wh = bbox_target[..., :2] + bbox_target[..., 2:] + crit = (bbox_target_wh**2).sum(dim=2)**0.5 / 2 + inside_fpn_level_mask = \ + (crit >= expanded_regress_ranges[..., 0]) & \ + (crit <= expanded_regress_ranges[..., 1]) + + inside_gt_bbox_mask = bbox_target.min(dim=2)[0] >= 0 + inside_fpn_level_mask = inside_gt_bbox_mask & inside_fpn_level_mask + inside_fpn_level_masks.append(inside_fpn_level_mask) + + # calc center3x3_ind and mask + expand_ws = expanded_shapes_per_level[..., 1:2].expand( + num_gts, total_levels, K) + expand_hs = expanded_shapes_per_level[..., 0:1].expand( + num_gts, total_levels, K) + centers_inds_x = centers_inds[..., 0:1] + centers_inds_y = centers_inds[..., 1:2] + + center3x3_idx = start_coord_pre_level + \ + im_i * area_per_level + \ + (centers_inds_y + dy) * expand_ws + \ + (centers_inds_x + dx) + center3x3_mask = \ + ((centers_inds_y + dy) < expand_hs) & \ + ((centers_inds_y + dy) >= 0) & \ + ((centers_inds_x + dx) < expand_ws) & \ + ((centers_inds_x + dx) >= 0) + + # recalc center3x3 region reg target + bbox_target = bbox_target / expanded_strides.repeat(1, 1, 2) + center3x3_bbox_target = bbox_target[..., None, :].expand( + num_gts, total_levels, K, 4).clone() + center3x3_bbox_target[..., 0] += dx + center3x3_bbox_target[..., 1] += dy + center3x3_bbox_target[..., 2] -= dx + center3x3_bbox_target[..., 3] -= dy + # update center3x3_mask + center3x3_mask = center3x3_mask & ( + center3x3_bbox_target.min(dim=3)[0] >= 0) # n x L x K + + center3x3_inds.append(center3x3_idx) + center3x3_masks.append(center3x3_mask) + center3x3_bbox_targets.append(center3x3_bbox_target) + + if len(inside_fpn_level_masks) > 0: + cls_labels = torch.cat(cls_labels, dim=0) + inside_fpn_level_masks = torch.cat(inside_fpn_level_masks, dim=0) + center3x3_inds = torch.cat(center3x3_inds, dim=0).long() + center3x3_bbox_targets = torch.cat(center3x3_bbox_targets, dim=0) + center3x3_masks = torch.cat(center3x3_masks, dim=0) + else: + cls_labels = shapes_per_level.new_zeros(0).long() + inside_fpn_level_masks = shapes_per_level.new_zeros( + (0, total_levels)).bool() + center3x3_inds = shapes_per_level.new_zeros( + (0, total_levels, K)).long() + center3x3_bbox_targets = shapes_per_level.new_zeros( + (0, total_levels, K, 4)).float() + center3x3_masks = shapes_per_level.new_zeros( + (0, total_levels, K)).bool() + return cls_labels, inside_fpn_level_masks, center3x3_inds, \ + center3x3_bbox_targets, center3x3_masks diff --git 
a/head_extractor/build/lib/mmdet/models/dense_heads/centripetal_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/centripetal_head.py new file mode 100644 index 0000000000000000000000000000000000000000..18f6601ff82394864d53351b10b40f51eb2aec6b --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/centripetal_head.py @@ -0,0 +1,459 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import DeformConv2d +from mmengine.model import normal_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, OptInstanceList, + OptMultiConfig) +from ..utils import multi_apply +from .corner_head import CornerHead + + +@MODELS.register_module() +class CentripetalHead(CornerHead): + """Head of CentripetalNet: Pursuing High-quality Keypoint Pairs for Object + Detection. + + CentripetalHead inherits from :class:`CornerHead`. It removes the + embedding branch and adds guiding shift and centripetal shift branches. + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_feat_levels (int): Levels of feature from the previous module. + 2 for HourglassNet-104 and 1 for HourglassNet-52. HourglassNet-104 + outputs the final feature and intermediate supervision feature and + HourglassNet-52 only outputs the final feature. Defaults to 2. + corner_emb_channels (int): Channel of embedding vector. Defaults to 1. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Useless in CornerHead, but we keep this variable for + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + CornerHead. + loss_heatmap (:obj:`ConfigDict` or dict): Config of corner heatmap + loss. Defaults to GaussianFocalLoss. + loss_embedding (:obj:`ConfigDict` or dict): Config of corner embedding + loss. Defaults to AssociativeEmbeddingLoss. + loss_offset (:obj:`ConfigDict` or dict): Config of corner offset loss. + Defaults to SmoothL1Loss. + loss_guiding_shift (:obj:`ConfigDict` or dict): Config of + guiding shift loss. Defaults to SmoothL1Loss. + loss_centripetal_shift (:obj:`ConfigDict` or dict): Config of + centripetal shift loss. Defaults to SmoothL1Loss. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. 
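+
+    Note:
+        ``forward_single`` returns eight maps per feature level, in order:
+        ``tl_heat, br_heat, tl_off, br_off, tl_guiding_shift,
+        br_guiding_shift, tl_centripetal_shift, br_centripetal_shift``.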
+ """ + + def __init__(self, + *args, + centripetal_shift_channels: int = 2, + guiding_shift_channels: int = 2, + feat_adaption_conv_kernel: int = 3, + loss_guiding_shift: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=0.05), + loss_centripetal_shift: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1), + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + assert centripetal_shift_channels == 2, ( + 'CentripetalHead only support centripetal_shift_channels == 2') + self.centripetal_shift_channels = centripetal_shift_channels + assert guiding_shift_channels == 2, ( + 'CentripetalHead only support guiding_shift_channels == 2') + self.guiding_shift_channels = guiding_shift_channels + self.feat_adaption_conv_kernel = feat_adaption_conv_kernel + super().__init__(*args, init_cfg=init_cfg, **kwargs) + self.loss_guiding_shift = MODELS.build(loss_guiding_shift) + self.loss_centripetal_shift = MODELS.build(loss_centripetal_shift) + + def _init_centripetal_layers(self) -> None: + """Initialize centripetal layers. + + Including feature adaption deform convs (feat_adaption), deform offset + prediction convs (dcn_off), guiding shift (guiding_shift) and + centripetal shift ( centripetal_shift). Each branch has two parts: + prefix `tl_` for top-left and `br_` for bottom-right. + """ + self.tl_feat_adaption = nn.ModuleList() + self.br_feat_adaption = nn.ModuleList() + self.tl_dcn_offset = nn.ModuleList() + self.br_dcn_offset = nn.ModuleList() + self.tl_guiding_shift = nn.ModuleList() + self.br_guiding_shift = nn.ModuleList() + self.tl_centripetal_shift = nn.ModuleList() + self.br_centripetal_shift = nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_feat_adaption.append( + DeformConv2d(self.in_channels, self.in_channels, + self.feat_adaption_conv_kernel, 1, 1)) + self.br_feat_adaption.append( + DeformConv2d(self.in_channels, self.in_channels, + self.feat_adaption_conv_kernel, 1, 1)) + + self.tl_guiding_shift.append( + self._make_layers( + out_channels=self.guiding_shift_channels, + in_channels=self.in_channels)) + self.br_guiding_shift.append( + self._make_layers( + out_channels=self.guiding_shift_channels, + in_channels=self.in_channels)) + + self.tl_dcn_offset.append( + ConvModule( + self.guiding_shift_channels, + self.feat_adaption_conv_kernel**2 * + self.guiding_shift_channels, + 1, + bias=False, + act_cfg=None)) + self.br_dcn_offset.append( + ConvModule( + self.guiding_shift_channels, + self.feat_adaption_conv_kernel**2 * + self.guiding_shift_channels, + 1, + bias=False, + act_cfg=None)) + + self.tl_centripetal_shift.append( + self._make_layers( + out_channels=self.centripetal_shift_channels, + in_channels=self.in_channels)) + self.br_centripetal_shift.append( + self._make_layers( + out_channels=self.centripetal_shift_channels, + in_channels=self.in_channels)) + + def _init_layers(self) -> None: + """Initialize layers for CentripetalHead. 
+ + Including two parts: CornerHead layers and CentripetalHead layers + """ + super()._init_layers() # using _init_layers in CornerHead + self._init_centripetal_layers() + + def init_weights(self) -> None: + super().init_weights() + for i in range(self.num_feat_levels): + normal_init(self.tl_feat_adaption[i], std=0.01) + normal_init(self.br_feat_adaption[i], std=0.01) + normal_init(self.tl_dcn_offset[i].conv, std=0.1) + normal_init(self.br_dcn_offset[i].conv, std=0.1) + _ = [x.conv.reset_parameters() for x in self.tl_guiding_shift[i]] + _ = [x.conv.reset_parameters() for x in self.br_guiding_shift[i]] + _ = [ + x.conv.reset_parameters() for x in self.tl_centripetal_shift[i] + ] + _ = [ + x.conv.reset_parameters() for x in self.br_centripetal_shift[i] + ] + + def forward_single(self, x: Tensor, lvl_ind: int) -> List[Tensor]: + """Forward feature of a single level. + + Args: + x (Tensor): Feature of a single level. + lvl_ind (int): Level index of current feature. + + Returns: + tuple[Tensor]: A tuple of CentripetalHead's output for current + feature level. Containing the following Tensors: + + - tl_heat (Tensor): Predicted top-left corner heatmap. + - br_heat (Tensor): Predicted bottom-right corner heatmap. + - tl_off (Tensor): Predicted top-left offset heatmap. + - br_off (Tensor): Predicted bottom-right offset heatmap. + - tl_guiding_shift (Tensor): Predicted top-left guiding shift + heatmap. + - br_guiding_shift (Tensor): Predicted bottom-right guiding + shift heatmap. + - tl_centripetal_shift (Tensor): Predicted top-left centripetal + shift heatmap. + - br_centripetal_shift (Tensor): Predicted bottom-right + centripetal shift heatmap. + """ + tl_heat, br_heat, _, _, tl_off, br_off, tl_pool, br_pool = super( + ).forward_single( + x, lvl_ind, return_pool=True) + + tl_guiding_shift = self.tl_guiding_shift[lvl_ind](tl_pool) + br_guiding_shift = self.br_guiding_shift[lvl_ind](br_pool) + + tl_dcn_offset = self.tl_dcn_offset[lvl_ind](tl_guiding_shift.detach()) + br_dcn_offset = self.br_dcn_offset[lvl_ind](br_guiding_shift.detach()) + + tl_feat_adaption = self.tl_feat_adaption[lvl_ind](tl_pool, + tl_dcn_offset) + br_feat_adaption = self.br_feat_adaption[lvl_ind](br_pool, + br_dcn_offset) + + tl_centripetal_shift = self.tl_centripetal_shift[lvl_ind]( + tl_feat_adaption) + br_centripetal_shift = self.br_centripetal_shift[lvl_ind]( + br_feat_adaption) + + result_list = [ + tl_heat, br_heat, tl_off, br_off, tl_guiding_shift, + br_guiding_shift, tl_centripetal_shift, br_centripetal_shift + ] + return result_list + + def loss_by_feat( + self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + tl_guiding_shifts: List[Tensor], + br_guiding_shifts: List[Tensor], + tl_centripetal_shifts: List[Tensor], + br_centripetal_shifts: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). 
+ tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each + level with shape (N, guiding_shift_channels, H, W). + br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for + each level with shape (N, guiding_shift_channels, H, W). + tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts + for each level with shape (N, centripetal_shift_channels, H, + W). + br_centripetal_shifts (list[Tensor]): Bottom-right centripetal + shifts for each level with shape (N, + centripetal_shift_channels, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Specify which bounding boxes can be ignored when computing + the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. Containing the + following losses: + + - det_loss (list[Tensor]): Corner keypoint losses of all + feature levels. + - off_loss (list[Tensor]): Corner offset losses of all feature + levels. + - guiding_loss (list[Tensor]): Guiding shift losses of all + feature levels. + - centripetal_loss (list[Tensor]): Centripetal shift losses of + all feature levels. + """ + gt_bboxes = [ + gt_instances.bboxes for gt_instances in batch_gt_instances + ] + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + + targets = self.get_targets( + gt_bboxes, + gt_labels, + tl_heats[-1].shape, + batch_img_metas[0]['batch_input_shape'], + with_corner_emb=self.with_corner_emb, + with_guiding_shift=True, + with_centripetal_shift=True) + mlvl_targets = [targets for _ in range(self.num_feat_levels)] + [det_losses, off_losses, guiding_losses, centripetal_losses + ] = multi_apply(self.loss_by_feat_single, tl_heats, br_heats, tl_offs, + br_offs, tl_guiding_shifts, br_guiding_shifts, + tl_centripetal_shifts, br_centripetal_shifts, + mlvl_targets) + loss_dict = dict( + det_loss=det_losses, + off_loss=off_losses, + guiding_loss=guiding_losses, + centripetal_loss=centripetal_losses) + return loss_dict + + def loss_by_feat_single(self, tl_hmp: Tensor, br_hmp: Tensor, + tl_off: Tensor, br_off: Tensor, + tl_guiding_shift: Tensor, br_guiding_shift: Tensor, + tl_centripetal_shift: Tensor, + br_centripetal_shift: Tensor, + targets: dict) -> Tuple[Tensor, ...]: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + tl_hmp (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_hmp (Tensor): Bottom-right corner heatmap for current level with + shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + tl_guiding_shift (Tensor): Top-left guiding shift for current level + with shape (N, guiding_shift_channels, H, W). + br_guiding_shift (Tensor): Bottom-right guiding shift for current + level with shape (N, guiding_shift_channels, H, W). + tl_centripetal_shift (Tensor): Top-left centripetal shift for + current level with shape (N, centripetal_shift_channels, H, W). + br_centripetal_shift (Tensor): Bottom-right centripetal shift for + current level with shape (N, centripetal_shift_channels, H, W). + targets (dict): Corner target generated by `get_targets`. 
+ + Returns: + tuple[torch.Tensor]: Losses of the head's different branches + containing the following losses: + + - det_loss (Tensor): Corner keypoint loss. + - off_loss (Tensor): Corner offset loss. + - guiding_loss (Tensor): Guiding shift loss. + - centripetal_loss (Tensor): Centripetal shift loss. + """ + targets['corner_embedding'] = None + + det_loss, _, _, off_loss = super().loss_by_feat_single( + tl_hmp, br_hmp, None, None, tl_off, br_off, targets) + + gt_tl_guiding_shift = targets['topleft_guiding_shift'] + gt_br_guiding_shift = targets['bottomright_guiding_shift'] + gt_tl_centripetal_shift = targets['topleft_centripetal_shift'] + gt_br_centripetal_shift = targets['bottomright_centripetal_shift'] + + gt_tl_heatmap = targets['topleft_heatmap'] + gt_br_heatmap = targets['bottomright_heatmap'] + # We only compute the offset loss at the real corner position. + # The value of real corner would be 1 in heatmap ground truth. + # The mask is computed in class agnostic mode and its shape is + # batch * 1 * width * height. + tl_mask = gt_tl_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_tl_heatmap) + br_mask = gt_br_heatmap.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_br_heatmap) + + # Guiding shift loss + tl_guiding_loss = self.loss_guiding_shift( + tl_guiding_shift, + gt_tl_guiding_shift, + tl_mask, + avg_factor=tl_mask.sum()) + br_guiding_loss = self.loss_guiding_shift( + br_guiding_shift, + gt_br_guiding_shift, + br_mask, + avg_factor=br_mask.sum()) + guiding_loss = (tl_guiding_loss + br_guiding_loss) / 2.0 + # Centripetal shift loss + tl_centripetal_loss = self.loss_centripetal_shift( + tl_centripetal_shift, + gt_tl_centripetal_shift, + tl_mask, + avg_factor=tl_mask.sum()) + br_centripetal_loss = self.loss_centripetal_shift( + br_centripetal_shift, + gt_br_centripetal_shift, + br_mask, + avg_factor=br_mask.sum()) + centripetal_loss = (tl_centripetal_loss + br_centripetal_loss) / 2.0 + + return det_loss, off_loss, guiding_loss, centripetal_loss + + def predict_by_feat(self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + tl_guiding_shifts: List[Tensor], + br_guiding_shifts: List[Tensor], + tl_centripetal_shifts: List[Tensor], + br_centripetal_shifts: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + tl_guiding_shifts (list[Tensor]): Top-left guiding shifts for each + level with shape (N, guiding_shift_channels, H, W). Useless in + this function, we keep this arg because it's the raw output + from CentripetalHead. + br_guiding_shifts (list[Tensor]): Bottom-right guiding shifts for + each level with shape (N, guiding_shift_channels, H, W). + Useless in this function, we keep this arg because it's the + raw output from CentripetalHead. + tl_centripetal_shifts (list[Tensor]): Top-left centripetal shifts + for each level with shape (N, centripetal_shift_channels, H, + W). 
+ br_centripetal_shifts (list[Tensor]): Bottom-right centripetal + shifts for each level with shape (N, + centripetal_shift_channels, H, W). + batch_img_metas (list[dict], optional): Batch image meta info. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len( + batch_img_metas) + result_list = [] + for img_id in range(len(batch_img_metas)): + result_list.append( + self._predict_by_feat_single( + tl_heats[-1][img_id:img_id + 1, :], + br_heats[-1][img_id:img_id + 1, :], + tl_offs[-1][img_id:img_id + 1, :], + br_offs[-1][img_id:img_id + 1, :], + batch_img_metas[img_id], + tl_emb=None, + br_emb=None, + tl_centripetal_shift=tl_centripetal_shifts[-1][ + img_id:img_id + 1, :], + br_centripetal_shift=br_centripetal_shifts[-1][ + img_id:img_id + 1, :], + rescale=rescale, + with_nms=with_nms)) + + return result_list diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/condinst_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/condinst_head.py new file mode 100644 index 0000000000000000000000000000000000000000..35a25e6339a8161314cb0523e7181f9d400023ac --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/condinst_head.py @@ -0,0 +1,1226 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, kaiming_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import cat_boxes +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..utils import (aligned_bilinear, filter_scores_and_topk, multi_apply, + relative_coordinate_maps, select_single_mlvl) +from ..utils.misc import empty_instances +from .base_mask_head import BaseMaskHead +from .fcos_head import FCOSHead + +INF = 1e8 + + +@MODELS.register_module() +class CondInstBboxHead(FCOSHead): + """CondInst box head used in https://arxiv.org/abs/2003.05664. + + Note that CondInst Bbox Head is an extension of FCOS head. + Two differences are described as follows: + + 1. CondInst box head predicts a set of params for each instance. + 2. CondInst box head returns the pos_gt_inds and pos_inds. + + Args: + num_params (int): Number of params for instance segmentation.
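+ 
+ A shape sketch of the extra controller branch (hedged: sizes below are
+ illustrative assumptions, not values from this diff):
+ 
+ Example:
+ >>> import torch
+ >>> head = CondInstBboxHead(num_classes=80, in_channels=256)
+ >>> feats = [torch.rand(1, 256, 2**(6 - i), 2**(6 - i))
+ ...          for i in range(5)]
+ >>> cls_scores, bbox_preds, centernesses, param_preds = head(feats)
+ >>> # one vector of `num_params` dynamic-conv params per location
+ >>> assert param_preds[0].shape == (1, 169, 64, 64)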
+ """ + + def __init__(self, *args, num_params: int = 169, **kwargs) -> None: + self.num_params = num_params + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super()._init_layers() + self.controller = nn.Conv2d( + self.feat_channels, self.num_params, 3, padding=1) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox predictions, centerness + predictions and param predictions of input feature maps. + """ + cls_score, bbox_pred, cls_feat, reg_feat = \ + super(FCOSHead, self).forward_single(x) + if self.centerness_on_reg: + centerness = self.conv_centerness(reg_feat) + else: + centerness = self.conv_centerness(cls_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + if self.norm_on_bbox: + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + else: + bbox_pred = bbox_pred.exp() + param_pred = self.controller(reg_feat) + return cls_score, bbox_pred, centerness, param_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + param_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + centernesses (list[Tensor]): centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + param_preds (List[Tensor]): param_pred for each scale level, each + is a 4D-tensor, the channel number is num_params. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + # Need stride for rel coord compute + all_level_points_strides = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device, + with_stride=True) + all_level_points = [i[:, :2] for i in all_level_points_strides] + all_level_strides = [i[:, 2] for i in all_level_points_strides] + labels, bbox_targets, pos_inds_list, pos_gt_inds_list = \ + self.get_targets(all_level_points, batch_gt_instances) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < bg_class_ind)).nonzero().reshape(-1) + num_pos = torch.tensor( + len(pos_inds), dtype=torch.float, device=bbox_preds[0].device) + num_pos = max(reduce_mean(num_pos), 1.0) + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos) + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_centerness_targets = self.centerness_target(pos_bbox_targets) + # centerness weighted iou loss + centerness_denorm = max( + reduce_mean(pos_centerness_targets.sum().detach()), 1e-6) + + if len(pos_inds) > 0: + pos_points = flatten_points[pos_inds] + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + loss_bbox = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=pos_centerness_targets, + avg_factor=centerness_denorm) + loss_centerness = self.loss_centerness( + pos_centerness, pos_centerness_targets, avg_factor=num_pos) + else: + loss_bbox = pos_bbox_preds.sum() + loss_centerness = pos_centerness.sum() + + self._raw_positive_infos.update(cls_scores=cls_scores) + self._raw_positive_infos.update(centernesses=centernesses) + self._raw_positive_infos.update(param_preds=param_preds) + self._raw_positive_infos.update(all_level_points=all_level_points) + self._raw_positive_infos.update(all_level_strides=all_level_strides) + self._raw_positive_infos.update(pos_gt_inds_list=pos_gt_inds_list) + self._raw_positive_infos.update(pos_inds_list=pos_inds_list) + + return dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_centerness=loss_centerness) + + def get_targets( + self, points: List[Tensor], batch_gt_instances: InstanceList + ) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]: + """Compute regression, classification and centerness targets for points + in multiple images. 
+ + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Targets of each level. + + - concat_lvl_labels (list[Tensor]): Labels of each level. + - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. + - pos_inds_list (list[Tensor]): pos_inds of each image. + - pos_gt_inds_list (List[Tensor]): pos_gt_inds of each image. + """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # get labels and bbox_targets of each image + labels_list, bbox_targets_list, pos_inds_list, pos_gt_inds_list = \ + multi_apply( + self._get_targets_single, + batch_gt_instances, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_list = [labels.split(num_points, 0) for labels in labels_list] + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + + # concat per level image + concat_lvl_labels = [] + concat_lvl_bbox_targets = [] + for i in range(num_levels): + concat_lvl_labels.append( + torch.cat([labels[i] for labels in labels_list])) + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + if self.norm_on_bbox: + bbox_targets = bbox_targets / self.strides[i] + concat_lvl_bbox_targets.append(bbox_targets) + return (concat_lvl_labels, concat_lvl_bbox_targets, pos_inds_list, + pos_gt_inds_list) + + def _get_targets_single( + self, gt_instances: InstanceData, points: Tensor, + regress_ranges: Tensor, num_points_per_lvl: List[int] + ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = len(gt_instances) + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_masks = gt_instances.get('masks', None) + + if num_gts == 0: + return gt_labels.new_full((num_points,), self.num_classes), \ + gt_bboxes.new_zeros((num_points, 4)), \ + gt_bboxes.new_zeros((0,), dtype=torch.int64), \ + gt_bboxes.new_zeros((0,), dtype=torch.int64) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # TODO: figure out why these two are different + # areas = areas[None].expand(num_points, num_gts) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + if self.center_sampling: + # condition1: inside a `center bbox` + radius = self.center_sample_radius + # if gt_mask not None, use gt mask's centroid to 
determine + # the center region rather than gt_bbox center + if gt_masks is None: + center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2 + center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2 + else: + h, w = gt_masks.height, gt_masks.width + masks = gt_masks.to_tensor( + dtype=torch.bool, device=gt_bboxes.device) + yys = torch.arange( + 0, h, dtype=torch.float32, device=masks.device) + xxs = torch.arange( + 0, w, dtype=torch.float32, device=masks.device) + # m00/m10/m01 represent the moments of a contour + # centroid is computed by m00/m10 and m00/m01 + m00 = masks.sum(dim=-1).sum(dim=-1).clamp(min=1e-6) + m10 = (masks * xxs).sum(dim=-1).sum(dim=-1) + m01 = (masks * yys[:, None]).sum(dim=-1).sum(dim=-1) + center_xs = m10 / m00 + center_ys = m01 / m00 + + center_xs = center_xs[None].expand(num_points, num_gts) + center_ys = center_ys[None].expand(num_points, num_gts) + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + x_mins = center_xs - stride + y_mins = center_ys - stride + x_maxs = center_xs + stride + y_maxs = center_ys + stride + center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0], + x_mins, gt_bboxes[..., 0]) + center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1], + y_mins, gt_bboxes[..., 1]) + center_gts[..., 2] = torch.where(x_maxs > gt_bboxes[..., 2], + gt_bboxes[..., 2], x_maxs) + center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3], + gt_bboxes[..., 3], y_maxs) + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + else: + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + areas[inside_gt_bbox_mask == 0] = INF + areas[inside_regress_range == 0] = INF + min_area, min_area_inds = areas.min(dim=1) + + labels = gt_labels[min_area_inds] + labels[min_area == INF] = self.num_classes # set as BG + bbox_targets = bbox_targets[range(num_points), min_area_inds] + + # return pos_inds & pos_gt_inds + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().reshape(-1) + pos_gt_inds = min_area_inds[labels < self.num_classes] + return labels, bbox_targets, pos_inds, pos_gt_inds + + def get_positive_infos(self) -> InstanceList: + """Get positive information from sampling results. + + Returns: + list[:obj:`InstanceData`]: Positive information of each image, + usually including positive bboxes, positive labels, positive + priors, etc. 
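+ 
+ A hedged usage sketch (assumes ``loss_by_feat`` has already been called
+ on this head, since that call fills ``self._raw_positive_infos``):
+ 
+ Example:
+ >>> positive_infos = head.get_positive_infos()
+ >>> pos = positive_infos[0]  # per-image InstanceData
+ >>> # every positive location carries its dynamic-conv params
+ >>> assert pos.param_preds.shape[-1] == head.num_params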
+ """ + assert len(self._raw_positive_infos) > 0 + + pos_gt_inds_list = self._raw_positive_infos['pos_gt_inds_list'] + pos_inds_list = self._raw_positive_infos['pos_inds_list'] + num_imgs = len(pos_gt_inds_list) + + cls_score_list = [] + centerness_list = [] + param_pred_list = [] + point_list = [] + stride_list = [] + for cls_score_per_lvl, centerness_per_lvl, param_pred_per_lvl,\ + point_per_lvl, stride_per_lvl in \ + zip(self._raw_positive_infos['cls_scores'], + self._raw_positive_infos['centernesses'], + self._raw_positive_infos['param_preds'], + self._raw_positive_infos['all_level_points'], + self._raw_positive_infos['all_level_strides']): + cls_score_per_lvl = \ + cls_score_per_lvl.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, self.num_classes) + centerness_per_lvl = \ + centerness_per_lvl.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, 1) + param_pred_per_lvl = \ + param_pred_per_lvl.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, self.num_params) + point_per_lvl = point_per_lvl.unsqueeze(0).repeat(num_imgs, 1, 1) + stride_per_lvl = stride_per_lvl.unsqueeze(0).repeat(num_imgs, 1) + + cls_score_list.append(cls_score_per_lvl) + centerness_list.append(centerness_per_lvl) + param_pred_list.append(param_pred_per_lvl) + point_list.append(point_per_lvl) + stride_list.append(stride_per_lvl) + cls_scores = torch.cat(cls_score_list, dim=1) + centernesses = torch.cat(centerness_list, dim=1) + param_preds = torch.cat(param_pred_list, dim=1) + all_points = torch.cat(point_list, dim=1) + all_strides = torch.cat(stride_list, dim=1) + + positive_infos = [] + for i, (pos_gt_inds, + pos_inds) in enumerate(zip(pos_gt_inds_list, pos_inds_list)): + pos_info = InstanceData() + pos_info.points = all_points[i][pos_inds] + pos_info.strides = all_strides[i][pos_inds] + pos_info.scores = cls_scores[i][pos_inds] + pos_info.centernesses = centernesses[i][pos_inds] + pos_info.param_preds = param_preds[i][pos_inds] + pos_info.pos_assigned_gt_inds = pos_gt_inds + pos_info.pos_inds = pos_inds + positive_infos.append(pos_info) + return positive_infos + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + score_factors: Optional[List[Tensor]] = None, + param_preds: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + param_preds (list[Tensor], optional): Params for all scale + level, each is a 4D-tensor, has shape + (batch_size, num_priors * num_params, H, W) + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. 
+ Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, AutoAssign, etc. + with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + all_level_points_strides = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device, + with_stride=True) + all_level_points = [i[:, :2] for i in all_level_points_strides] + all_level_strides = [i[:, 2] for i in all_level_points_strides] + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl( + cls_scores, img_id, detach=True) + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + if with_score_factors: + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + else: + score_factor_list = [None for _ in range(num_levels)] + param_pred_list = select_single_mlvl( + param_preds, img_id, detach=True) + + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + score_factor_list=score_factor_list, + param_pred_list=param_pred_list, + mlvl_points=all_level_points, + mlvl_strides=all_level_strides, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + param_pred_list: List[Tensor], + mlvl_points: List[Tensor], + mlvl_strides: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + param_pred_list (List[Tensor]): Param prediction from all scale + levels of a single image, each item has shape + (num_priors * num_params, H, W). + mlvl_points (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. + It has shape (num_priors, 2) + mlvl_strides (List[Tensor]): Each element in the list is + the stride of a single level in feature pyramid. + It has shape (num_priors, ) + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False.
+ with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. + with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_param_preds = [] + mlvl_valid_points = [] + mlvl_valid_strides = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + for level_idx, (cls_score, bbox_pred, score_factor, + param_pred, points, strides) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + score_factor_list, param_pred_list, + mlvl_points, mlvl_strides)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + if with_score_factors: + score_factor = score_factor.permute(1, 2, + 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + param_pred = param_pred.permute(1, 2, + 0).reshape(-1, self.num_params) + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. 
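+            # `filter_scores_and_topk` first drops candidates below
+            # `score_thr` and then keeps at most `nms_pre` of the survivors
+            # for this level; the tensors in the dict below are gathered
+            # with the same kept indices, so bbox/param/point/stride rows
+            # stay aligned with their scores.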
+ score_thr = cfg.get('score_thr', 0) + + results = filter_scores_and_topk( + scores, score_thr, nms_pre, + dict( + bbox_pred=bbox_pred, + param_pred=param_pred, + points=points, + strides=strides)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + param_pred = filtered_results['param_pred'] + points = filtered_results['points'] + strides = filtered_results['strides'] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_param_preds.append(param_pred) + mlvl_valid_points.append(points) + mlvl_valid_strides.append(strides) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + if with_score_factors: + mlvl_score_factors.append(score_factor) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_points) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + results.param_preds = torch.cat(mlvl_param_preds) + results.points = torch.cat(mlvl_valid_points) + results.strides = torch.cat(mlvl_valid_strides) + if with_score_factors: + results.score_factors = torch.cat(mlvl_score_factors) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + +class MaskFeatModule(BaseModule): + """CondInst mask feature map branch used in \ + https://arxiv.org/abs/2003.05664. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + start_level (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + end_level (int): The ending feature map level from RPN that + will be used to predict the mask feature map. + out_channels (int): Number of output channels of the mask feature + map branch. This is the channel count of the mask + feature map that is to be dynamically convolved with the predicted + kernel. + mask_stride (int): Downsample factor of the mask feature map output. + Defaults to 4. + num_stacked_convs (int): Number of convs in mask feature branch. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict.
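+ 
+ A minimal shape sketch (hedged: channel and spatial sizes are
+ illustrative assumptions, not values from this diff):
+ 
+ Example:
+ >>> import torch
+ >>> mask_feat_head = MaskFeatModule(
+ ...     in_channels=256, feat_channels=128, start_level=0,
+ ...     end_level=2, out_channels=8)
+ >>> feats = [torch.rand(1, 256, 128 // 2**i, 128 // 2**i)
+ ...          for i in range(3)]
+ >>> # every level is upsampled to the first level's resolution
+ >>> assert mask_feat_head(feats).shape == (1, 8, 128, 128)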
+ """ + + def __init__(self, + in_channels: int, + feat_channels: int, + start_level: int, + end_level: int, + out_channels: int, + mask_stride: int = 4, + num_stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01) + ], + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.start_level = start_level + self.end_level = end_level + self.mask_stride = mask_stride + self.num_stacked_convs = num_stacked_convs + assert start_level >= 0 and end_level >= start_level + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.convs_all_levels = nn.ModuleList() + for i in range(self.start_level, self.end_level + 1): + convs_per_level = nn.Sequential() + convs_per_level.add_module( + f'conv{i}', + ConvModule( + self.in_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False, + bias=False)) + self.convs_all_levels.append(convs_per_level) + + conv_branch = [] + for _ in range(self.num_stacked_convs): + conv_branch.append( + ConvModule( + self.feat_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=False)) + self.conv_branch = nn.Sequential(*conv_branch) + + self.conv_pred = nn.Conv2d( + self.feat_channels, self.out_channels, 1, stride=1) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + super().init_weights() + kaiming_init(self.convs_all_levels, a=1, distribution='uniform') + kaiming_init(self.conv_branch, a=1, distribution='uniform') + kaiming_init(self.conv_pred, a=1, distribution='uniform') + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + Tensor: The predicted mask feature map. + """ + inputs = x[self.start_level:self.end_level + 1] + assert len(inputs) == (self.end_level - self.start_level + 1) + feature_add_all_level = self.convs_all_levels[0](inputs[0]) + target_h, target_w = feature_add_all_level.size()[2:] + for i in range(1, len(inputs)): + input_p = inputs[i] + x_p = self.convs_all_levels[i](input_p) + h, w = x_p.size()[2:] + factor_h = target_h // h + factor_w = target_w // w + assert factor_h == factor_w + feature_per_level = aligned_bilinear(x_p, factor_h) + feature_add_all_level = feature_add_all_level + \ + feature_per_level + + feature_add_all_level = self.conv_branch(feature_add_all_level) + feature_pred = self.conv_pred(feature_add_all_level) + return feature_pred + + +@MODELS.register_module() +class CondInstMaskHead(BaseMaskHead): + """CondInst mask head used in https://arxiv.org/abs/1904.02689. + + This head outputs the mask for CondInst. + + Args: + mask_feature_head (dict): Config of CondInstMaskFeatHead. + num_layers (int): Number of dynamic conv layers. + feat_channels (int): Number of channels in the dynamic conv. + mask_out_stride (int): The stride of the mask feat. + size_of_interest (int): The size of the region used in rel coord. + max_masks_to_train (int): Maximum number of masks to train for + each image. + loss_segm (:obj:`ConfigDict` or dict, optional): Config of + segmentation loss. 
+ train_cfg (:obj:`ConfigDict` or dict, optional): Training config + of head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + head. + """ + + def __init__(self, + mask_feature_head: ConfigType, + num_layers: int = 3, + feat_channels: int = 8, + mask_out_stride: int = 4, + size_of_interest: int = 8, + max_masks_to_train: int = -1, + topk_masks_per_img: int = -1, + loss_mask: ConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None) -> None: + super().__init__() + self.mask_feature_head = MaskFeatModule(**mask_feature_head) + self.mask_feat_stride = self.mask_feature_head.mask_stride + self.in_channels = self.mask_feature_head.out_channels + self.num_layers = num_layers + self.feat_channels = feat_channels + self.size_of_interest = size_of_interest + self.mask_out_stride = mask_out_stride + self.max_masks_to_train = max_masks_to_train + self.topk_masks_per_img = topk_masks_per_img + self.prior_generator = MlvlPointGenerator([self.mask_feat_stride]) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.loss_mask = MODELS.build(loss_mask) + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + weight_nums, bias_nums = [], [] + for i in range(self.num_layers): + if i == 0: + weight_nums.append((self.in_channels + 2) * self.feat_channels) + bias_nums.append(self.feat_channels) + elif i == self.num_layers - 1: + weight_nums.append(self.feat_channels * 1) + bias_nums.append(1) + else: + weight_nums.append(self.feat_channels * self.feat_channels) + bias_nums.append(self.feat_channels) + + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_params = sum(weight_nums) + sum(bias_nums) + + def parse_dynamic_params( + self, params: Tensor) -> Tuple[List[Tensor], List[Tensor]]: + """parse the dynamic params for dynamic conv.""" + num_insts = params.size(0) + params_splits = list( + torch.split_with_sizes( + params, self.weight_nums + self.bias_nums, dim=1)) + weight_splits = params_splits[:self.num_layers] + bias_splits = params_splits[self.num_layers:] + for i in range(self.num_layers): + if i < self.num_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + num_insts * self.in_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(num_insts * + self.in_channels) + else: + # out_channels x in_channels x 1 x 1 + weight_splits[i] = weight_splits[i].reshape( + num_insts * 1, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(num_insts) + + return weight_splits, bias_splits + + def dynamic_conv_forward(self, features: Tensor, weights: List[Tensor], + biases: List[Tensor], num_insts: int) -> Tensor: + """dynamic forward, each layer follow a relu.""" + n_layers = len(weights) + x = features + for i, (w, b) in enumerate(zip(weights, biases)): + x = F.conv2d(x, w, bias=b, stride=1, padding=0, groups=num_insts) + if i < n_layers - 1: + x = F.relu(x) + return x + + def forward(self, x: tuple, positive_infos: InstanceList) -> tuple: + """Forward feature from the upstream network to get prototypes and + linearly combine the prototypes, using masks coefficients, into + instance masks. Finally, crop the instance masks with given bboxes. + + Args: + x (Tuple[Tensor]): Feature from the upstream network, which is + a 4D-tensor. + positive_infos (List[:obj:``InstanceData``]): Positive information + that calculate from detect head. 
+ + Returns: + tuple: Predicted instance segmentation masks + """ + mask_feats = self.mask_feature_head(x) + return multi_apply(self.forward_single, mask_feats, positive_infos) + + def forward_single(self, mask_feat: Tensor, + positive_info: InstanceData) -> Tensor: + """Forward features of each image.""" + pos_param_preds = positive_info.get('param_preds') + pos_points = positive_info.get('points') + pos_strides = positive_info.get('strides') + + num_inst = pos_param_preds.shape[0] + mask_feat = mask_feat[None].repeat(num_inst, 1, 1, 1) + _, _, H, W = mask_feat.size() + if num_inst == 0: + return (pos_param_preds.new_zeros((0, 1, H, W)), ) + + locations = self.prior_generator.single_level_grid_priors( + mask_feat.size()[2:], 0, device=mask_feat.device) + + rel_coords = relative_coordinate_maps(locations, pos_points, + pos_strides, + self.size_of_interest, + mask_feat.size()[2:]) + mask_head_inputs = torch.cat([rel_coords, mask_feat], dim=1) + mask_head_inputs = mask_head_inputs.reshape(1, -1, H, W) + + weights, biases = self.parse_dynamic_params(pos_param_preds) + mask_preds = self.dynamic_conv_forward(mask_head_inputs, weights, + biases, num_inst) + mask_preds = mask_preds.reshape(-1, H, W) + mask_preds = aligned_bilinear( + mask_preds.unsqueeze(0), + int(self.mask_feat_stride / self.mask_out_stride)).squeeze(0) + + return (mask_preds, ) + + def loss_by_feat(self, mask_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], positive_infos: InstanceList, + **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (list[Tensor]): List of predicted masks, each has + shape (num_pos, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + positive_infos (List[:obj:``InstanceData``]): Information of + positive samples of each image that are assigned in detection + head. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert positive_infos is not None, \ + 'positive_infos should not be None in `CondInstMaskHead`' + losses = dict() + + loss_mask = 0. + num_imgs = len(mask_preds) + total_pos = 0 + + for idx in range(num_imgs): + (mask_pred, pos_mask_targets, num_pos) = \ + self._get_targets_single( + mask_preds[idx], batch_gt_instances[idx], + positive_infos[idx]) + # mask loss + total_pos += num_pos + if num_pos == 0 or pos_mask_targets is None: + loss = mask_pred.new_zeros(1).mean() + else: + loss = self.loss_mask( + mask_pred, pos_mask_targets, + reduction_override='none').sum() + loss_mask += loss + + if total_pos == 0: + total_pos += 1 # avoid nan + loss_mask = loss_mask / total_pos + losses.update(loss_mask=loss_mask) + return losses + + def _get_targets_single(self, mask_preds: Tensor, + gt_instances: InstanceData, + positive_info: InstanceData): + """Compute targets for predictions of a single image. + + Args: + mask_preds (Tensor): Predicted masks with shape + (num_pos, H, W). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should include ``bboxes``, ``labels``, + and ``masks`` attributes. + positive_info (:obj:`InstanceData`): Information of positive + samples that are assigned in detection head. It usually + contains following keys.
+ + - pos_assigned_gt_inds (Tensor): Assigned GT indexes of + positive proposals, has shape (num_pos, ) + - pos_inds (Tensor): Positive index of image, has + shape (num_pos, ). + - param_pred (Tensor): Positive param predictions + with shape (num_pos, num_params). + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - mask_preds (Tensor): Positive predicted mask with shape + (num_pos, mask_h, mask_w). + - pos_mask_targets (Tensor): Positive mask targets with shape + (num_pos, mask_h, mask_w). + - num_pos (int): Number of positive samples. + """ + gt_bboxes = gt_instances.bboxes + device = gt_bboxes.device + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device).float() + + # process with mask targets + pos_assigned_gt_inds = positive_info.get('pos_assigned_gt_inds') + scores = positive_info.get('scores') + centernesses = positive_info.get('centernesses') + num_pos = pos_assigned_gt_inds.size(0) + + if gt_masks.size(0) == 0 or num_pos == 0: + return mask_preds, None, 0 + # Since we're producing (near) full image masks, + # it'd take too much vram to backprop on every single mask. + # Thus we select only a subset. + if (self.max_masks_to_train != -1) and \ + (num_pos > self.max_masks_to_train): + perm = torch.randperm(num_pos) + select = perm[:self.max_masks_to_train] + mask_preds = mask_preds[select] + pos_assigned_gt_inds = pos_assigned_gt_inds[select] + num_pos = self.max_masks_to_train + elif self.topk_masks_per_img != -1: + unique_gt_inds = pos_assigned_gt_inds.unique() + num_inst_per_gt = max( + int(self.topk_masks_per_img / len(unique_gt_inds)), 1) + + keep_mask_preds = [] + keep_pos_assigned_gt_inds = [] + for gt_ind in unique_gt_inds: + per_inst_pos_inds = (pos_assigned_gt_inds == gt_ind) + mask_preds_per_inst = mask_preds[per_inst_pos_inds] + gt_inds_per_inst = pos_assigned_gt_inds[per_inst_pos_inds] + if sum(per_inst_pos_inds) > num_inst_per_gt: + per_inst_scores = scores[per_inst_pos_inds].sigmoid().max( + dim=1)[0] + per_inst_centerness = centernesses[ + per_inst_pos_inds].sigmoid().reshape(-1, ) + select = (per_inst_scores * per_inst_centerness).topk( + k=num_inst_per_gt, dim=0)[1] + mask_preds_per_inst = mask_preds_per_inst[select] + gt_inds_per_inst = gt_inds_per_inst[select] + keep_mask_preds.append(mask_preds_per_inst) + keep_pos_assigned_gt_inds.append(gt_inds_per_inst) + mask_preds = torch.cat(keep_mask_preds) + pos_assigned_gt_inds = torch.cat(keep_pos_assigned_gt_inds) + num_pos = pos_assigned_gt_inds.size(0) + + # Follow the original implementation + start = int(self.mask_out_stride // 2) + gt_masks = gt_masks[:, start::self.mask_out_stride, + start::self.mask_out_stride] + gt_masks = gt_masks.gt(0.5).float() + pos_mask_targets = gt_masks[pos_assigned_gt_inds] + + return (mask_preds, pos_mask_targets, num_pos) + + def predict_by_feat(self, + mask_preds: List[Tensor], + results_list: InstanceList, + batch_img_metas: List[dict], + rescale: bool = True, + **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mask_preds (list[Tensor]): Predicted masks of each image, each + with shape (num_pos, H, W). + results_list (List[:obj:``InstanceData``]): BBoxHead results. + batch_img_metas (list[dict]): Meta information of all images. + rescale (bool, optional): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images. Each :obj:`InstanceData` usually contains + following keys.
+ + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + assert len(mask_preds) == len(results_list) == len(batch_img_metas) + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = results_list[img_id] + bboxes = results.bboxes + mask_pred = mask_preds[img_id] + if bboxes.shape[0] == 0 or mask_pred.shape[0] == 0: + results_list[img_id] = empty_instances( + [img_meta], + bboxes.device, + task_type='mask', + instance_results=[results])[0] + else: + im_mask = self._predict_by_feat_single( + mask_preds=mask_pred, + bboxes=bboxes, + img_meta=img_meta, + rescale=rescale) + results.masks = im_mask + return results_list + + def _predict_by_feat_single(self, + mask_preds: Tensor, + bboxes: Tensor, + img_meta: dict, + rescale: bool, + cfg: OptConfigType = None): + """Transform a single image's features extracted from the head into + mask results. + + Args: + mask_preds (Tensor): Predicted prototypes, has shape [H, W, N]. + img_meta (dict): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If rescale is False, then returned masks will + fit the scale of imgs[0]. + cfg (dict, optional): Config used in test phase. + Defaults to None. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + cfg = self.test_cfg if cfg is None else cfg + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + img_h, img_w = img_meta['img_shape'][:2] + ori_h, ori_w = img_meta['ori_shape'][:2] + + mask_preds = mask_preds.sigmoid().unsqueeze(0) + mask_preds = aligned_bilinear(mask_preds, self.mask_out_stride) + mask_preds = mask_preds[:, :, :img_h, :img_w] + if rescale: # in-placed rescale the bboxes + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + bboxes /= scale_factor + + masks = F.interpolate( + mask_preds, (ori_h, ori_w), + mode='bilinear', + align_corners=False).squeeze(0) > cfg.mask_thr + else: + masks = mask_preds.squeeze(0) > cfg.mask_thr + + return masks diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/conditional_detr_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/conditional_detr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..cc2df2c215667121c5fe329f369510ecd4666faf --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/conditional_detr_head.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +import torch.nn as nn +from mmengine.model import bias_init_with_prob +from torch import Tensor + +from mmdet.models.layers.transformer import inverse_sigmoid +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList +from .detr_head import DETRHead + + +@MODELS.register_module() +class ConditionalDETRHead(DETRHead): + """Head of Conditional DETR. Conditional DETR: Conditional DETR for Fast + Training Convergence. More details can be found in the `paper. + + `_ . 
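+ 
+ A hedged shape sketch (layer, batch and query counts are illustrative
+ assumptions; an embedding dim of 256 matches the DETRHead default):
+ 
+ Example:
+ >>> import torch
+ >>> head = ConditionalDETRHead(num_classes=80)
+ >>> # (num_decoder_layers, bs, num_queries, dim) decoder outputs
+ >>> hidden_states = torch.rand(6, 2, 100, 256)
+ >>> references = torch.rand(2, 100, 2)  # normalized (cx, cy)
+ >>> cls_scores, bbox_preds = head(hidden_states, references)
+ >>> assert bbox_preds.shape == (6, 2, 100, 4)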
+ """ + + def init_weights(self): + """Initialize weights of the transformer head.""" + super().init_weights() + # The initialization below for transformer head is very + # important as we use Focal_loss for loss_cls + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + nn.init.constant_(self.fc_cls.bias, bias_init) + + def forward(self, hidden_states: Tensor, + references: Tensor) -> Tuple[Tensor, Tensor]: + """"Forward function. + + Args: + hidden_states (Tensor): Features from transformer decoder. If + `return_intermediate_dec` is True output has shape + (num_decoder_layers, bs, num_queries, dim), else has shape (1, + bs, num_queries, dim) which only contains the last layer + outputs. + references (Tensor): References from transformer decoder, has + shape (bs, num_queries, 2). + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - layers_cls_scores (Tensor): Outputs from the classification head, + shape (num_decoder_layers, bs, num_queries, cls_out_channels). + Note cls_out_channels should include background. + - layers_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h), has shape + (num_decoder_layers, bs, num_queries, 4). + """ + + references_unsigmoid = inverse_sigmoid(references) + layers_bbox_preds = [] + for layer_id in range(hidden_states.shape[0]): + tmp_reg_preds = self.fc_reg( + self.activate(self.reg_ffn(hidden_states[layer_id]))) + tmp_reg_preds[..., :2] += references_unsigmoid + outputs_coord = tmp_reg_preds.sigmoid() + layers_bbox_preds.append(outputs_coord) + layers_bbox_preds = torch.stack(layers_bbox_preds) + + layers_cls_scores = self.fc_cls(hidden_states) + return layers_cls_scores, layers_bbox_preds + + def loss(self, hidden_states: Tensor, references: Tensor, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + hidden_states (Tensor): Features from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): References from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_and_predict( + self, hidden_states: Tensor, references: Tensor, + batch_data_samples: SampleList) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. Over-write because + img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (Tensor): Features from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): References from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. 
+ + Returns: + tuple: The return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas) + return losses, predictions + + def predict(self, + hidden_states: Tensor, + references: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. Over-write + because img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (Tensor): Features from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): References from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + last_layer_hidden_state = hidden_states[-1].unsqueeze(0) + outs = self(last_layer_hidden_state, references) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + + return predictions diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/corner_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/corner_head.py new file mode 100644 index 0000000000000000000000000000000000000000..0cec71d50947ff58224ae698ec9c2f9406b58efb --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/corner_head.py @@ -0,0 +1,1084 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from logging import warning +from math import ceil, log +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import CornerPool, batched_nms +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from ..utils import (gather_feat, gaussian_radius, gen_gaussian_target, + get_local_maximum, get_topk_from_heatmap, multi_apply, + transpose_and_gather_feat) +from .base_dense_head import BaseDenseHead + + +class BiCornerPool(BaseModule): + """Bidirectional Corner Pooling Module (TopLeft, BottomRight, etc.) + + Args: + in_channels (int): Input channels of module. + directions (list[str]): Directions of two CornerPools. + out_channels (int): Output channels of module. + feat_channels (int): Feature channels of module. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct + and config norm layer. 
+ init_cfg (:obj:`ConfigDict` or dict, optional): the config to + control the initialization. + """ + + def __init__(self, + in_channels: int, + directions: List[int], + feat_channels: int = 128, + out_channels: int = 128, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg) + self.direction1_conv = ConvModule( + in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg) + self.direction2_conv = ConvModule( + in_channels, feat_channels, 3, padding=1, norm_cfg=norm_cfg) + + self.aftpool_conv = ConvModule( + feat_channels, + out_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None) + + self.conv1 = ConvModule( + in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.conv2 = ConvModule( + in_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg) + + self.direction1_pool = CornerPool(directions[0]) + self.direction2_pool = CornerPool(directions[1]) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (tensor): Input feature of BiCornerPool. + + Returns: + conv2 (tensor): Output feature of BiCornerPool. + """ + direction1_conv = self.direction1_conv(x) + direction2_conv = self.direction2_conv(x) + direction1_feat = self.direction1_pool(direction1_conv) + direction2_feat = self.direction2_pool(direction2_conv) + aftpool_conv = self.aftpool_conv(direction1_feat + direction2_feat) + conv1 = self.conv1(x) + relu = self.relu(aftpool_conv + conv1) + conv2 = self.conv2(relu) + return conv2 + + +@MODELS.register_module() +class CornerHead(BaseDenseHead): + """Head of CornerNet: Detecting Objects as Paired Keypoints. + + Code is modified from the `official github repo + `_ . + + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_feat_levels (int): Levels of feature from the previous module. + 2 for HourglassNet-104 and 1 for HourglassNet-52. Because + HourglassNet-104 outputs the final feature and intermediate + supervision feature and HourglassNet-52 only outputs the final + feature. Defaults to 2. + corner_emb_channels (int): Channel of embedding vector. Defaults to 1. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config. + Useless in CornerHead, but we keep this variable for + SingleStageDetector. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + CornerHead. + loss_heatmap (:obj:`ConfigDict` or dict): Config of corner heatmap + loss. Defaults to GaussianFocalLoss. + loss_embedding (:obj:`ConfigDict` or dict): Config of corner embedding + loss. Defaults to AssociativeEmbeddingLoss. + loss_offset (:obj:`ConfigDict` or dict): Config of corner offset loss. + Defaults to SmoothL1Loss. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. 
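+
+ Example:
+ A minimal usage sketch (the channel sizes and input shape here
+ are illustrative assumptions; the compiled mmcv ``CornerPool``
+ ops must be available):
+
+ >>> import torch
+ >>> pool = BiCornerPool(256, ['top', 'left'], out_channels=256)
+ >>> feat = torch.rand(1, 256, 32, 32)
+ >>> out = pool(feat)  # 3x3 convs keep spatial size: (1, 256, 32, 32)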
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + num_feat_levels: int = 2, + corner_emb_channels: int = 1, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + loss_heatmap: ConfigType = dict( + type='GaussianFocalLoss', + alpha=2.0, + gamma=4.0, + loss_weight=1), + loss_embedding: ConfigType = dict( + type='AssociativeEmbeddingLoss', + pull_weight=0.25, + push_weight=0.25), + loss_offset: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1), + init_cfg: OptMultiConfig = None) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.in_channels = in_channels + self.corner_emb_channels = corner_emb_channels + self.with_corner_emb = self.corner_emb_channels > 0 + self.corner_offset_channels = 2 + self.num_feat_levels = num_feat_levels + self.loss_heatmap = MODELS.build( + loss_heatmap) if loss_heatmap is not None else None + self.loss_embedding = MODELS.build( + loss_embedding) if loss_embedding is not None else None + self.loss_offset = MODELS.build( + loss_offset) if loss_offset is not None else None + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self._init_layers() + + def _make_layers(self, + out_channels: int, + in_channels: int = 256, + feat_channels: int = 256) -> nn.Sequential: + """Initialize conv sequential for CornerHead.""" + return nn.Sequential( + ConvModule(in_channels, feat_channels, 3, padding=1), + ConvModule( + feat_channels, out_channels, 1, norm_cfg=None, act_cfg=None)) + + def _init_corner_kpt_layers(self) -> None: + """Initialize corner keypoint layers. + + Including corner heatmap branch and corner offset branch. Each branch + has two parts: prefix `tl_` for top-left and `br_` for bottom-right. + """ + self.tl_pool, self.br_pool = nn.ModuleList(), nn.ModuleList() + self.tl_heat, self.br_heat = nn.ModuleList(), nn.ModuleList() + self.tl_off, self.br_off = nn.ModuleList(), nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_pool.append( + BiCornerPool( + self.in_channels, ['top', 'left'], + out_channels=self.in_channels)) + self.br_pool.append( + BiCornerPool( + self.in_channels, ['bottom', 'right'], + out_channels=self.in_channels)) + + self.tl_heat.append( + self._make_layers( + out_channels=self.num_classes, + in_channels=self.in_channels)) + self.br_heat.append( + self._make_layers( + out_channels=self.num_classes, + in_channels=self.in_channels)) + + self.tl_off.append( + self._make_layers( + out_channels=self.corner_offset_channels, + in_channels=self.in_channels)) + self.br_off.append( + self._make_layers( + out_channels=self.corner_offset_channels, + in_channels=self.in_channels)) + + def _init_corner_emb_layers(self) -> None: + """Initialize corner embedding layers. + + Only include corner embedding branch with two parts: prefix `tl_` for + top-left and `br_` for bottom-right. + """ + self.tl_emb, self.br_emb = nn.ModuleList(), nn.ModuleList() + + for _ in range(self.num_feat_levels): + self.tl_emb.append( + self._make_layers( + out_channels=self.corner_emb_channels, + in_channels=self.in_channels)) + self.br_emb.append( + self._make_layers( + out_channels=self.corner_emb_channels, + in_channels=self.in_channels)) + + def _init_layers(self) -> None: + """Initialize layers for CornerHead. 
+
+ Including two parts: corner keypoint layers and corner embedding
+ layers.
+ """
+ self._init_corner_kpt_layers()
+ if self.with_corner_emb:
+ self._init_corner_emb_layers()
+
+ def init_weights(self) -> None:
+ """Initialize weights of the head."""
+ super().init_weights()
+ bias_init = bias_init_with_prob(0.1)
+ for i in range(self.num_feat_levels):
+ # The initialization of parameters is different between
+ # nn.Conv2d and ConvModule. Our experiments show that
+ # using the original initialization of nn.Conv2d increases
+ # the final mAP by about 0.2%
+ self.tl_heat[i][-1].conv.reset_parameters()
+ self.tl_heat[i][-1].conv.bias.data.fill_(bias_init)
+ self.br_heat[i][-1].conv.reset_parameters()
+ self.br_heat[i][-1].conv.bias.data.fill_(bias_init)
+ self.tl_off[i][-1].conv.reset_parameters()
+ self.br_off[i][-1].conv.reset_parameters()
+ if self.with_corner_emb:
+ self.tl_emb[i][-1].conv.reset_parameters()
+ self.br_emb[i][-1].conv.reset_parameters()
+
+ def forward(self, feats: Tuple[Tensor]) -> tuple:
+ """Forward features from the upstream network.
+
+ Args:
+ feats (tuple[Tensor]): Features from the upstream network, each is
+ a 4D-tensor.
+
+ Returns:
+ tuple: Usually a tuple of corner heatmaps, offset heatmaps and
+ embedding heatmaps.
+ - tl_heats (list[Tensor]): Top-left corner heatmaps for all
+ levels, each is a 4D-tensor, the channels number is
+ num_classes.
+ - br_heats (list[Tensor]): Bottom-right corner heatmaps for all
+ levels, each is a 4D-tensor, the channels number is
+ num_classes.
+ - tl_embs (list[Tensor] | list[None]): Top-left embedding
+ heatmaps for all levels, each is a 4D-tensor or None.
+ If not None, the channels number is corner_emb_channels.
+ - br_embs (list[Tensor] | list[None]): Bottom-right embedding
+ heatmaps for all levels, each is a 4D-tensor or None.
+ If not None, the channels number is corner_emb_channels.
+ - tl_offs (list[Tensor]): Top-left offset heatmaps for all
+ levels, each is a 4D-tensor. The channels number is
+ corner_offset_channels.
+ - br_offs (list[Tensor]): Bottom-right offset heatmaps for all
+ levels, each is a 4D-tensor. The channels number is
+ corner_offset_channels.
+ """
+ lvl_ind = list(range(self.num_feat_levels))
+ return multi_apply(self.forward_single, feats, lvl_ind)
+
+ def forward_single(self,
+ x: Tensor,
+ lvl_ind: int,
+ return_pool: bool = False) -> List[Tensor]:
+ """Forward feature of a single level.
+
+ Args:
+ x (Tensor): Feature of a single level.
+ lvl_ind (int): Level index of current feature.
+ return_pool (bool): Return corner pool feature or not.
+ Defaults to False.
+
+ Returns:
+ tuple[Tensor]: A tuple of CornerHead's output for current feature
+ level. Containing the following Tensors:
+
+ - tl_heat (Tensor): Predicted top-left corner heatmap.
+ - br_heat (Tensor): Predicted bottom-right corner heatmap.
+ - tl_emb (Tensor | None): Predicted top-left embedding heatmap.
+ None for `self.with_corner_emb == False`.
+ - br_emb (Tensor | None): Predicted bottom-right embedding
+ heatmap. None for `self.with_corner_emb == False`.
+ - tl_off (Tensor): Predicted top-left offset heatmap.
+ - br_off (Tensor): Predicted bottom-right offset heatmap.
+ - tl_pool (Tensor): Top-left corner pool feature. Only returned
+ when `return_pool` is True.
+ - br_pool (Tensor): Bottom-right corner pool feature. Only
+ returned when `return_pool` is True.
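+
+ Example:
+ A sketch of unpacking the outputs (assumes ``head`` is an
+ already constructed CornerHead whose ``in_channels`` matches
+ ``x``; shapes are illustrative):
+
+ >>> x = torch.rand(1, 256, 32, 32)
+ >>> outs = head.forward_single(x, lvl_ind=0, return_pool=True)
+ >>> (tl_heat, br_heat, tl_emb, br_emb,
+ ...  tl_off, br_off, tl_pool, br_pool) = outs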
+ """ + tl_pool = self.tl_pool[lvl_ind](x) + tl_heat = self.tl_heat[lvl_ind](tl_pool) + br_pool = self.br_pool[lvl_ind](x) + br_heat = self.br_heat[lvl_ind](br_pool) + + tl_emb, br_emb = None, None + if self.with_corner_emb: + tl_emb = self.tl_emb[lvl_ind](tl_pool) + br_emb = self.br_emb[lvl_ind](br_pool) + + tl_off = self.tl_off[lvl_ind](tl_pool) + br_off = self.br_off[lvl_ind](br_pool) + + result_list = [tl_heat, br_heat, tl_emb, br_emb, tl_off, br_off] + if return_pool: + result_list.append(tl_pool) + result_list.append(br_pool) + + return result_list + + def get_targets(self, + gt_bboxes: List[Tensor], + gt_labels: List[Tensor], + feat_shape: Sequence[int], + img_shape: Sequence[int], + with_corner_emb: bool = False, + with_guiding_shift: bool = False, + with_centripetal_shift: bool = False) -> dict: + """Generate corner targets. + + Including corner heatmap, corner offset. + + Optional: corner embedding, corner guiding shift, centripetal shift. + + For CornerNet, we generate corner heatmap, corner offset and corner + embedding from this function. + + For CentripetalNet, we generate corner heatmap, corner offset, guiding + shift and centripetal shift from this function. + + Args: + gt_bboxes (list[Tensor]): Ground truth bboxes of each image, each + has shape (num_gt, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, each has + shape (num_gt, ). + feat_shape (Sequence[int]): Shape of output feature, + [batch, channel, height, width]. + img_shape (Sequence[int]): Shape of input image, + [height, width, channel]. + with_corner_emb (bool): Generate corner embedding target or not. + Defaults to False. + with_guiding_shift (bool): Generate guiding shift target or not. + Defaults to False. + with_centripetal_shift (bool): Generate centripetal shift target or + not. Defaults to False. + + Returns: + dict: Ground truth of corner heatmap, corner offset, corner + embedding, guiding shift and centripetal shift. Containing the + following keys: + + - topleft_heatmap (Tensor): Ground truth top-left corner + heatmap. + - bottomright_heatmap (Tensor): Ground truth bottom-right + corner heatmap. + - topleft_offset (Tensor): Ground truth top-left corner offset. + - bottomright_offset (Tensor): Ground truth bottom-right corner + offset. + - corner_embedding (list[list[list[int]]]): Ground truth corner + embedding. Not must have. + - topleft_guiding_shift (Tensor): Ground truth top-left corner + guiding shift. Not must have. + - bottomright_guiding_shift (Tensor): Ground truth bottom-right + corner guiding shift. Not must have. + - topleft_centripetal_shift (Tensor): Ground truth top-left + corner centripetal shift. Not must have. + - bottomright_centripetal_shift (Tensor): Ground truth + bottom-right corner centripetal shift. Not must have. 
+ """ + batch_size, _, height, width = feat_shape + img_h, img_w = img_shape[:2] + + width_ratio = float(width / img_w) + height_ratio = float(height / img_h) + + gt_tl_heatmap = gt_bboxes[-1].new_zeros( + [batch_size, self.num_classes, height, width]) + gt_br_heatmap = gt_bboxes[-1].new_zeros( + [batch_size, self.num_classes, height, width]) + gt_tl_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width]) + gt_br_offset = gt_bboxes[-1].new_zeros([batch_size, 2, height, width]) + + if with_corner_emb: + match = [] + + # Guiding shift is a kind of offset, from center to corner + if with_guiding_shift: + gt_tl_guiding_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + gt_br_guiding_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + # Centripetal shift is also a kind of offset, from center to corner + # and normalized by log. + if with_centripetal_shift: + gt_tl_centripetal_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + gt_br_centripetal_shift = gt_bboxes[-1].new_zeros( + [batch_size, 2, height, width]) + + for batch_id in range(batch_size): + # Ground truth of corner embedding per image is a list of coord set + corner_match = [] + for box_id in range(len(gt_labels[batch_id])): + left, top, right, bottom = gt_bboxes[batch_id][box_id] + center_x = (left + right) / 2.0 + center_y = (top + bottom) / 2.0 + label = gt_labels[batch_id][box_id] + + # Use coords in the feature level to generate ground truth + scale_left = left * width_ratio + scale_right = right * width_ratio + scale_top = top * height_ratio + scale_bottom = bottom * height_ratio + scale_center_x = center_x * width_ratio + scale_center_y = center_y * height_ratio + + # Int coords on feature map/ground truth tensor + left_idx = int(min(scale_left, width - 1)) + right_idx = int(min(scale_right, width - 1)) + top_idx = int(min(scale_top, height - 1)) + bottom_idx = int(min(scale_bottom, height - 1)) + + # Generate gaussian heatmap + scale_box_width = ceil(scale_right - scale_left) + scale_box_height = ceil(scale_bottom - scale_top) + radius = gaussian_radius((scale_box_height, scale_box_width), + min_overlap=0.3) + radius = max(0, int(radius)) + gt_tl_heatmap[batch_id, label] = gen_gaussian_target( + gt_tl_heatmap[batch_id, label], [left_idx, top_idx], + radius) + gt_br_heatmap[batch_id, label] = gen_gaussian_target( + gt_br_heatmap[batch_id, label], [right_idx, bottom_idx], + radius) + + # Generate corner offset + left_offset = scale_left - left_idx + top_offset = scale_top - top_idx + right_offset = scale_right - right_idx + bottom_offset = scale_bottom - bottom_idx + gt_tl_offset[batch_id, 0, top_idx, left_idx] = left_offset + gt_tl_offset[batch_id, 1, top_idx, left_idx] = top_offset + gt_br_offset[batch_id, 0, bottom_idx, right_idx] = right_offset + gt_br_offset[batch_id, 1, bottom_idx, + right_idx] = bottom_offset + + # Generate corner embedding + if with_corner_emb: + corner_match.append([[top_idx, left_idx], + [bottom_idx, right_idx]]) + # Generate guiding shift + if with_guiding_shift: + gt_tl_guiding_shift[batch_id, 0, top_idx, + left_idx] = scale_center_x - left_idx + gt_tl_guiding_shift[batch_id, 1, top_idx, + left_idx] = scale_center_y - top_idx + gt_br_guiding_shift[batch_id, 0, bottom_idx, + right_idx] = right_idx - scale_center_x + gt_br_guiding_shift[ + batch_id, 1, bottom_idx, + right_idx] = bottom_idx - scale_center_y + # Generate centripetal shift + if with_centripetal_shift: + gt_tl_centripetal_shift[batch_id, 0, top_idx, + left_idx] = 
log(scale_center_x - + scale_left) + gt_tl_centripetal_shift[batch_id, 1, top_idx, + left_idx] = log(scale_center_y - + scale_top) + gt_br_centripetal_shift[batch_id, 0, bottom_idx, + right_idx] = log(scale_right - + scale_center_x) + gt_br_centripetal_shift[batch_id, 1, bottom_idx, + right_idx] = log(scale_bottom - + scale_center_y) + + if with_corner_emb: + match.append(corner_match) + + target_result = dict( + topleft_heatmap=gt_tl_heatmap, + topleft_offset=gt_tl_offset, + bottomright_heatmap=gt_br_heatmap, + bottomright_offset=gt_br_offset) + + if with_corner_emb: + target_result.update(corner_embedding=match) + if with_guiding_shift: + target_result.update( + topleft_guiding_shift=gt_tl_guiding_shift, + bottomright_guiding_shift=gt_br_guiding_shift) + if with_centripetal_shift: + target_result.update( + topleft_centripetal_shift=gt_tl_centripetal_shift, + bottomright_centripetal_shift=gt_br_centripetal_shift) + + return target_result + + def loss_by_feat( + self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_embs: List[Tensor], + br_embs: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_embs (list[Tensor]): Top-left corner embeddings for each level + with shape (N, corner_emb_channels, H, W). + br_embs (list[Tensor]): Bottom-right corner embeddings for each + level with shape (N, corner_emb_channels, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Specify which bounding boxes can be ignored when computing + the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. Containing the + following losses: + + - det_loss (list[Tensor]): Corner keypoint losses of all + feature levels. + - pull_loss (list[Tensor]): Part one of AssociativeEmbedding + losses of all feature levels. + - push_loss (list[Tensor]): Part two of AssociativeEmbedding + losses of all feature levels. + - off_loss (list[Tensor]): Corner offset losses of all feature + levels. 
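+
+ Example:
+ The per-level losses below are computed with ``multi_apply``; a
+ minimal sketch of that helper on a toy function (the function
+ ``add_mul`` is hypothetical, only the helper is from mmdet):
+
+ >>> from mmdet.models.utils import multi_apply
+ >>> def add_mul(a, b):
+ ...     return a + b, a * b
+ >>> multi_apply(add_mul, [1, 2], [3, 4])
+ ([4, 6], [3, 8])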
+ """ + gt_bboxes = [ + gt_instances.bboxes for gt_instances in batch_gt_instances + ] + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + + targets = self.get_targets( + gt_bboxes, + gt_labels, + tl_heats[-1].shape, + batch_img_metas[0]['batch_input_shape'], + with_corner_emb=self.with_corner_emb) + mlvl_targets = [targets for _ in range(self.num_feat_levels)] + det_losses, pull_losses, push_losses, off_losses = multi_apply( + self.loss_by_feat_single, tl_heats, br_heats, tl_embs, br_embs, + tl_offs, br_offs, mlvl_targets) + loss_dict = dict(det_loss=det_losses, off_loss=off_losses) + if self.with_corner_emb: + loss_dict.update(pull_loss=pull_losses, push_loss=push_losses) + return loss_dict + + def loss_by_feat_single(self, tl_hmp: Tensor, br_hmp: Tensor, + tl_emb: Optional[Tensor], br_emb: Optional[Tensor], + tl_off: Tensor, br_off: Tensor, + targets: dict) -> Tuple[Tensor, ...]: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + tl_hmp (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_hmp (Tensor): Bottom-right corner heatmap for current level with + shape (N, num_classes, H, W). + tl_emb (Tensor, optional): Top-left corner embedding for current + level with shape (N, corner_emb_channels, H, W). + br_emb (Tensor, optional): Bottom-right corner embedding for + current level with shape (N, corner_emb_channels, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + targets (dict): Corner target generated by `get_targets`. + + Returns: + tuple[torch.Tensor]: Losses of the head's different branches + containing the following losses: + + - det_loss (Tensor): Corner keypoint loss. + - pull_loss (Tensor): Part one of AssociativeEmbedding loss. + - push_loss (Tensor): Part two of AssociativeEmbedding loss. + - off_loss (Tensor): Corner offset loss. + """ + gt_tl_hmp = targets['topleft_heatmap'] + gt_br_hmp = targets['bottomright_heatmap'] + gt_tl_off = targets['topleft_offset'] + gt_br_off = targets['bottomright_offset'] + gt_embedding = targets['corner_embedding'] + + # Detection loss + tl_det_loss = self.loss_heatmap( + tl_hmp.sigmoid(), + gt_tl_hmp, + avg_factor=max(1, + gt_tl_hmp.eq(1).sum())) + br_det_loss = self.loss_heatmap( + br_hmp.sigmoid(), + gt_br_hmp, + avg_factor=max(1, + gt_br_hmp.eq(1).sum())) + det_loss = (tl_det_loss + br_det_loss) / 2.0 + + # AssociativeEmbedding loss + if self.with_corner_emb and self.loss_embedding is not None: + pull_loss, push_loss = self.loss_embedding(tl_emb, br_emb, + gt_embedding) + else: + pull_loss, push_loss = None, None + + # Offset loss + # We only compute the offset loss at the real corner position. + # The value of real corner would be 1 in heatmap ground truth. + # The mask is computed in class agnostic mode and its shape is + # batch * 1 * width * height. 
+ tl_off_mask = gt_tl_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_tl_hmp) + br_off_mask = gt_br_hmp.eq(1).sum(1).gt(0).unsqueeze(1).type_as( + gt_br_hmp) + tl_off_loss = self.loss_offset( + tl_off, + gt_tl_off, + tl_off_mask, + avg_factor=max(1, tl_off_mask.sum())) + br_off_loss = self.loss_offset( + br_off, + gt_br_off, + br_off_mask, + avg_factor=max(1, br_off_mask.sum())) + + off_loss = (tl_off_loss + br_off_loss) / 2.0 + + return det_loss, pull_loss, push_loss, off_loss + + def predict_by_feat(self, + tl_heats: List[Tensor], + br_heats: List[Tensor], + tl_embs: List[Tensor], + br_embs: List[Tensor], + tl_offs: List[Tensor], + br_offs: List[Tensor], + batch_img_metas: Optional[List[dict]] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + tl_heats (list[Tensor]): Top-left corner heatmaps for each level + with shape (N, num_classes, H, W). + br_heats (list[Tensor]): Bottom-right corner heatmaps for each + level with shape (N, num_classes, H, W). + tl_embs (list[Tensor]): Top-left corner embeddings for each level + with shape (N, corner_emb_channels, H, W). + br_embs (list[Tensor]): Bottom-right corner embeddings for each + level with shape (N, corner_emb_channels, H, W). + tl_offs (list[Tensor]): Top-left corner offsets for each level + with shape (N, corner_offset_channels, H, W). + br_offs (list[Tensor]): Bottom-right corner offsets for each level + with shape (N, corner_offset_channels, H, W). + batch_img_metas (list[dict], optional): Batch image meta info. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert tl_heats[-1].shape[0] == br_heats[-1].shape[0] == len( + batch_img_metas) + result_list = [] + for img_id in range(len(batch_img_metas)): + result_list.append( + self._predict_by_feat_single( + tl_heats[-1][img_id:img_id + 1, :], + br_heats[-1][img_id:img_id + 1, :], + tl_offs[-1][img_id:img_id + 1, :], + br_offs[-1][img_id:img_id + 1, :], + batch_img_metas[img_id], + tl_emb=tl_embs[-1][img_id:img_id + 1, :], + br_emb=br_embs[-1][img_id:img_id + 1, :], + rescale=rescale, + with_nms=with_nms)) + + return result_list + + def _predict_by_feat_single(self, + tl_heat: Tensor, + br_heat: Tensor, + tl_off: Tensor, + br_off: Tensor, + img_meta: dict, + tl_emb: Optional[Tensor] = None, + br_emb: Optional[Tensor] = None, + tl_centripetal_shift: Optional[Tensor] = None, + br_centripetal_shift: Optional[Tensor] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + tl_heat (Tensor): Top-left corner heatmap for current level with + shape (N, num_classes, H, W). + br_heat (Tensor): Bottom-right corner heatmap for current level + with shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). 
+ br_off (Tensor): Bottom-right corner offset for current level with
+ shape (N, corner_offset_channels, H, W).
+ img_meta (dict): Meta information of current image, e.g.,
+ image size, scaling factor, etc.
+ tl_emb (Tensor): Top-left corner embedding for current level with
+ shape (N, corner_emb_channels, H, W).
+ br_emb (Tensor): Bottom-right corner embedding for current level
+ with shape (N, corner_emb_channels, H, W).
+ tl_centripetal_shift (Tensor, optional): Top-left corner's
+ centripetal shift for current level with shape (N, 2, H, W).
+ br_centripetal_shift (Tensor, optional): Bottom-right corner's
+ centripetal shift for current level with shape (N, 2, H, W).
+ rescale (bool): If True, return boxes in original image space.
+ Defaults to False.
+ with_nms (bool): If True, do nms before return boxes.
+ Defaults to True.
+
+ Returns:
+ :obj:`InstanceData`: Detection results of each image
+ after the post process.
+ Each item usually contains following keys.
+
+ - scores (Tensor): Classification scores, has a shape
+ (num_instance, )
+ - labels (Tensor): Labels of bboxes, has a shape
+ (num_instances, ).
+ - bboxes (Tensor): Has a shape (num_instances, 4),
+ the last dimension 4 arrange as (x1, y1, x2, y2).
+ """
+ if isinstance(img_meta, (list, tuple)):
+ img_meta = img_meta[0]
+
+ batch_bboxes, batch_scores, batch_clses = self._decode_heatmap(
+ tl_heat=tl_heat.sigmoid(),
+ br_heat=br_heat.sigmoid(),
+ tl_off=tl_off,
+ br_off=br_off,
+ tl_emb=tl_emb,
+ br_emb=br_emb,
+ tl_centripetal_shift=tl_centripetal_shift,
+ br_centripetal_shift=br_centripetal_shift,
+ img_meta=img_meta,
+ k=self.test_cfg.corner_topk,
+ kernel=self.test_cfg.local_maximum_kernel,
+ distance_threshold=self.test_cfg.distance_threshold)
+
+ if rescale and 'scale_factor' in img_meta:
+ batch_bboxes /= batch_bboxes.new_tensor(
+ img_meta['scale_factor']).repeat((1, 2))
+
+ bboxes = batch_bboxes.view([-1, 4])
+ scores = batch_scores.view(-1)
+ clses = batch_clses.view(-1)
+
+ det_bboxes = torch.cat([bboxes, scores.unsqueeze(-1)], -1)
+ keepinds = (det_bboxes[:, -1] > -0.1)
+ det_bboxes = det_bboxes[keepinds]
+ det_labels = clses[keepinds]
+
+ if with_nms:
+ det_bboxes, det_labels = self._bboxes_nms(det_bboxes, det_labels,
+ self.test_cfg)
+
+ results = InstanceData()
+ results.bboxes = det_bboxes[..., :4]
+ results.scores = det_bboxes[..., 4]
+ results.labels = det_labels
+ return results
+
+ def _bboxes_nms(self, bboxes: Tensor, labels: Tensor,
+ cfg: ConfigDict) -> Tuple[Tensor, Tensor]:
+ """NMS on bboxes."""
+ if 'nms_cfg' in cfg:
+ # `warning` is `logging.warning` imported at the top of this
+ # file; the original `warning.warn(...)` call would raise an
+ # AttributeError.
+ warning('nms_cfg in test_cfg will be deprecated. '
+ 'Please rename it as nms')
+ if 'nms' not in cfg:
+ cfg.nms = cfg.nms_cfg
+
+ if labels.numel() > 0:
+ max_num = cfg.max_per_img
+ bboxes, keep = batched_nms(bboxes[:, :4], bboxes[:,
+ -1].contiguous(),
+ labels, cfg.nms)
+ if max_num > 0:
+ bboxes = bboxes[:max_num]
+ labels = labels[keep][:max_num]
+
+ return bboxes, labels
+
+ def _decode_heatmap(self,
+ tl_heat: Tensor,
+ br_heat: Tensor,
+ tl_off: Tensor,
+ br_off: Tensor,
+ tl_emb: Optional[Tensor] = None,
+ br_emb: Optional[Tensor] = None,
+ tl_centripetal_shift: Optional[Tensor] = None,
+ br_centripetal_shift: Optional[Tensor] = None,
+ img_meta: Optional[dict] = None,
+ k: int = 100,
+ kernel: int = 3,
+ distance_threshold: float = 0.5,
+ num_dets: int = 1000) -> Tuple[Tensor, Tensor, Tensor]:
+ """Transform outputs into raw bbox predictions.
+
+ Args:
+ tl_heat (Tensor): Top-left corner heatmap for current level with
+ shape (N, num_classes, H, W).
+ br_heat (Tensor): Bottom-right corner heatmap for current level + with shape (N, num_classes, H, W). + tl_off (Tensor): Top-left corner offset for current level with + shape (N, corner_offset_channels, H, W). + br_off (Tensor): Bottom-right corner offset for current level with + shape (N, corner_offset_channels, H, W). + tl_emb (Tensor, Optional): Top-left corner embedding for current + level with shape (N, corner_emb_channels, H, W). + br_emb (Tensor, Optional): Bottom-right corner embedding for + current level with shape (N, corner_emb_channels, H, W). + tl_centripetal_shift (Tensor, Optional): Top-left centripetal shift + for current level with shape (N, 2, H, W). + br_centripetal_shift (Tensor, Optional): Bottom-right centripetal + shift for current level with shape (N, 2, H, W). + img_meta (dict): Meta information of current image, e.g., + image size, scaling factor, etc. + k (int): Get top k corner keypoints from heatmap. + kernel (int): Max pooling kernel for extract local maximum pixels. + distance_threshold (float): Distance threshold. Top-left and + bottom-right corner keypoints with feature distance less than + the threshold will be regarded as keypoints from same object. + num_dets (int): Num of raw boxes before doing nms. + + Returns: + tuple[torch.Tensor]: Decoded output of CornerHead, containing the + following Tensors: + + - bboxes (Tensor): Coords of each box. + - scores (Tensor): Scores of each box. + - clses (Tensor): Categories of each box. + """ + with_embedding = tl_emb is not None and br_emb is not None + with_centripetal_shift = ( + tl_centripetal_shift is not None + and br_centripetal_shift is not None) + assert with_embedding + with_centripetal_shift == 1 + batch, _, height, width = tl_heat.size() + if torch.onnx.is_in_onnx_export(): + inp_h, inp_w = img_meta['pad_shape_for_onnx'][:2] + else: + inp_h, inp_w = img_meta['batch_input_shape'][:2] + + # perform nms on heatmaps + tl_heat = get_local_maximum(tl_heat, kernel=kernel) + br_heat = get_local_maximum(br_heat, kernel=kernel) + + tl_scores, tl_inds, tl_clses, tl_ys, tl_xs = get_topk_from_heatmap( + tl_heat, k=k) + br_scores, br_inds, br_clses, br_ys, br_xs = get_topk_from_heatmap( + br_heat, k=k) + + # We use repeat instead of expand here because expand is a + # shallow-copy function. Thus it could cause unexpected testing result + # sometimes. Using expand will decrease about 10% mAP during testing + # compared to repeat. 
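+ # Pairing note (explanatory): after the views below, element (i, j)
+ # of each (batch, k, k) grid pairs top-left corner i with
+ # bottom-right corner j, enumerating all k * k candidate boxes per
+ # image; invalid pairs are filtered out by the score masking later.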
+ tl_ys = tl_ys.view(batch, k, 1).repeat(1, 1, k)
+ tl_xs = tl_xs.view(batch, k, 1).repeat(1, 1, k)
+ br_ys = br_ys.view(batch, 1, k).repeat(1, k, 1)
+ br_xs = br_xs.view(batch, 1, k).repeat(1, k, 1)
+
+ tl_off = transpose_and_gather_feat(tl_off, tl_inds)
+ tl_off = tl_off.view(batch, k, 1, 2)
+ br_off = transpose_and_gather_feat(br_off, br_inds)
+ br_off = br_off.view(batch, 1, k, 2)
+
+ tl_xs = tl_xs + tl_off[..., 0]
+ tl_ys = tl_ys + tl_off[..., 1]
+ br_xs = br_xs + br_off[..., 0]
+ br_ys = br_ys + br_off[..., 1]
+
+ if with_centripetal_shift:
+ tl_centripetal_shift = transpose_and_gather_feat(
+ tl_centripetal_shift, tl_inds).view(batch, k, 1, 2).exp()
+ br_centripetal_shift = transpose_and_gather_feat(
+ br_centripetal_shift, br_inds).view(batch, 1, k, 2).exp()
+
+ tl_ctxs = tl_xs + tl_centripetal_shift[..., 0]
+ tl_ctys = tl_ys + tl_centripetal_shift[..., 1]
+ br_ctxs = br_xs - br_centripetal_shift[..., 0]
+ br_ctys = br_ys - br_centripetal_shift[..., 1]
+
+ # all possible boxes based on top k corners (ignoring class)
+ tl_xs *= (inp_w / width)
+ tl_ys *= (inp_h / height)
+ br_xs *= (inp_w / width)
+ br_ys *= (inp_h / height)
+
+ if with_centripetal_shift:
+ tl_ctxs *= (inp_w / width)
+ tl_ctys *= (inp_h / height)
+ br_ctxs *= (inp_w / width)
+ br_ctys *= (inp_h / height)
+
+ x_off, y_off = 0, 0 # no crop
+ if not torch.onnx.is_in_onnx_export():
+ # Since `RandomCenterCropPad` is done on CPU with numpy and is
+ # not dynamically traceable when exporting to ONNX, 'border'
+ # does not appear as a key in 'img_meta'. As a temporary
+ # solution, we move this 'border' handling to the postprocess
+ # step after exporting to ONNX, which is handled in
+ # `mmdet/core/export/model_wrappers.py`. Though the PyTorch and
+ # exported ONNX models differ slightly, the difference can be
+ # ignored since comparable performance is achieved (e.g.
40.4 vs + # 40.6 on COCO val2017, for CornerNet without test-time flip) + if 'border' in img_meta: + x_off = img_meta['border'][2] + y_off = img_meta['border'][0] + + tl_xs -= x_off + tl_ys -= y_off + br_xs -= x_off + br_ys -= y_off + + zeros = tl_xs.new_zeros(*tl_xs.size()) + tl_xs = torch.where(tl_xs > 0.0, tl_xs, zeros) + tl_ys = torch.where(tl_ys > 0.0, tl_ys, zeros) + br_xs = torch.where(br_xs > 0.0, br_xs, zeros) + br_ys = torch.where(br_ys > 0.0, br_ys, zeros) + + bboxes = torch.stack((tl_xs, tl_ys, br_xs, br_ys), dim=3) + area_bboxes = ((br_xs - tl_xs) * (br_ys - tl_ys)).abs() + + if with_centripetal_shift: + tl_ctxs -= x_off + tl_ctys -= y_off + br_ctxs -= x_off + br_ctys -= y_off + + tl_ctxs *= tl_ctxs.gt(0.0).type_as(tl_ctxs) + tl_ctys *= tl_ctys.gt(0.0).type_as(tl_ctys) + br_ctxs *= br_ctxs.gt(0.0).type_as(br_ctxs) + br_ctys *= br_ctys.gt(0.0).type_as(br_ctys) + + ct_bboxes = torch.stack((tl_ctxs, tl_ctys, br_ctxs, br_ctys), + dim=3) + area_ct_bboxes = ((br_ctxs - tl_ctxs) * (br_ctys - tl_ctys)).abs() + + rcentral = torch.zeros_like(ct_bboxes) + # magic nums from paper section 4.1 + mu = torch.ones_like(area_bboxes) / 2.4 + mu[area_bboxes > 3500] = 1 / 2.1 # large bbox have smaller mu + + bboxes_center_x = (bboxes[..., 0] + bboxes[..., 2]) / 2 + bboxes_center_y = (bboxes[..., 1] + bboxes[..., 3]) / 2 + rcentral[..., 0] = bboxes_center_x - mu * (bboxes[..., 2] - + bboxes[..., 0]) / 2 + rcentral[..., 1] = bboxes_center_y - mu * (bboxes[..., 3] - + bboxes[..., 1]) / 2 + rcentral[..., 2] = bboxes_center_x + mu * (bboxes[..., 2] - + bboxes[..., 0]) / 2 + rcentral[..., 3] = bboxes_center_y + mu * (bboxes[..., 3] - + bboxes[..., 1]) / 2 + area_rcentral = ((rcentral[..., 2] - rcentral[..., 0]) * + (rcentral[..., 3] - rcentral[..., 1])).abs() + dists = area_ct_bboxes / area_rcentral + + tl_ctx_inds = (ct_bboxes[..., 0] <= rcentral[..., 0]) | ( + ct_bboxes[..., 0] >= rcentral[..., 2]) + tl_cty_inds = (ct_bboxes[..., 1] <= rcentral[..., 1]) | ( + ct_bboxes[..., 1] >= rcentral[..., 3]) + br_ctx_inds = (ct_bboxes[..., 2] <= rcentral[..., 0]) | ( + ct_bboxes[..., 2] >= rcentral[..., 2]) + br_cty_inds = (ct_bboxes[..., 3] <= rcentral[..., 1]) | ( + ct_bboxes[..., 3] >= rcentral[..., 3]) + + if with_embedding: + tl_emb = transpose_and_gather_feat(tl_emb, tl_inds) + tl_emb = tl_emb.view(batch, k, 1) + br_emb = transpose_and_gather_feat(br_emb, br_inds) + br_emb = br_emb.view(batch, 1, k) + dists = torch.abs(tl_emb - br_emb) + + tl_scores = tl_scores.view(batch, k, 1).repeat(1, 1, k) + br_scores = br_scores.view(batch, 1, k).repeat(1, k, 1) + + scores = (tl_scores + br_scores) / 2 # scores for all possible boxes + + # tl and br should have same class + tl_clses = tl_clses.view(batch, k, 1).repeat(1, 1, k) + br_clses = br_clses.view(batch, 1, k).repeat(1, k, 1) + cls_inds = (tl_clses != br_clses) + + # reject boxes based on distances + dist_inds = dists > distance_threshold + + # reject boxes based on widths and heights + width_inds = (br_xs <= tl_xs) + height_inds = (br_ys <= tl_ys) + + # No use `scores[cls_inds]`, instead we use `torch.where` here. + # Since only 1-D indices with type 'tensor(bool)' are supported + # when exporting to ONNX, any other bool indices with more dimensions + # (e.g. 
2-D bool tensors) as an input parameter of a node is invalid
+ negative_scores = -1 * torch.ones_like(scores)
+ scores = torch.where(cls_inds, negative_scores, scores)
+ scores = torch.where(width_inds, negative_scores, scores)
+ scores = torch.where(height_inds, negative_scores, scores)
+ scores = torch.where(dist_inds, negative_scores, scores)
+
+ if with_centripetal_shift:
+ scores[tl_ctx_inds] = -1
+ scores[tl_cty_inds] = -1
+ scores[br_ctx_inds] = -1
+ scores[br_cty_inds] = -1
+
+ scores = scores.view(batch, -1)
+ scores, inds = torch.topk(scores, num_dets)
+ scores = scores.unsqueeze(2)
+
+ bboxes = bboxes.view(batch, -1, 4)
+ bboxes = gather_feat(bboxes, inds)
+
+ clses = tl_clses.contiguous().view(batch, -1, 1)
+ clses = gather_feat(clses, inds)
+
+ return bboxes, scores, clses
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/dab_detr_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/dab_detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..892833ffce5f17f6f9e82e67b7d32c6b9c1bafc0
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/dab_detr_head.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn as nn
+from mmcv.cnn import Linear
+from mmengine.model import bias_init_with_prob, constant_init
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import InstanceList
+from ..layers import MLP, inverse_sigmoid
+from .conditional_detr_head import ConditionalDETRHead
+
+
+@MODELS.register_module()
+class DABDETRHead(ConditionalDETRHead):
+ """Head of DAB-DETR. DAB-DETR: Dynamic Anchor Boxes are Better Queries for
+ DETR.
+
+ More details can be found in the `paper
+ <https://arxiv.org/abs/2201.12329>`_ .
+ """
+
+ def _init_layers(self) -> None:
+ """Initialize layers of the transformer head."""
+ # cls branch
+ self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
+ # reg branch
+ self.fc_reg = MLP(self.embed_dims, self.embed_dims, 4, 3)
+
+ def init_weights(self) -> None:
+ """Initialize weights."""
+ if self.loss_cls.use_sigmoid:
+ bias_init = bias_init_with_prob(0.01)
+ nn.init.constant_(self.fc_cls.bias, bias_init)
+ constant_init(self.fc_reg.layers[-1], 0., bias=0.)
+
+ def forward(self, hidden_states: Tensor,
+ references: Tensor) -> Tuple[Tensor, Tensor]:
+ """Forward function.
+
+ Args:
+ hidden_states (Tensor): Features from transformer decoder. If
+ `return_intermediate_dec` is True output has shape
+ (num_decoder_layers, bs, num_queries, dim), else has shape (1,
+ bs, num_queries, dim) which only contains the last layer
+ outputs.
+ references (Tensor): References from transformer decoder. If
+ `return_intermediate_dec` is True output has shape
+ (num_decoder_layers, bs, num_queries, 2/4), else has shape
+ (1, bs, num_queries, 2/4) which only contains the last layer
+ reference.
+ Returns:
+ tuple[Tensor]: results of the head containing the following
+ tensors.
+
+ - layers_cls_scores (Tensor): Outputs from the classification head,
+ shape (num_decoder_layers, bs, num_queries, cls_out_channels).
+ Note cls_out_channels should include background.
+ - layers_bbox_preds (Tensor): Sigmoid outputs from the regression
+ head with normalized coordinate format (cx, cy, w, h), has shape
+ (num_decoder_layers, bs, num_queries, 4).
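+
+ Example:
+ A sketch of the anchor-update rule implemented below (shapes
+ and the zero regression output are illustrative assumptions):
+
+ >>> import torch
+ >>> from mmdet.models.layers import inverse_sigmoid
+ >>> references = torch.rand(1, 2, 300, 4)  # sigmoid-space anchors
+ >>> reg_preds = torch.zeros(1, 2, 300, 4)  # pretend head output
+ >>> updated = (reg_preds
+ ...            + inverse_sigmoid(references, eps=1e-3)).sigmoid()
+ >>> torch.allclose(updated, references, atol=1e-3)
+ True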
+ """ + layers_cls_scores = self.fc_cls(hidden_states) + references_before_sigmoid = inverse_sigmoid(references, eps=1e-3) + tmp_reg_preds = self.fc_reg(hidden_states) + tmp_reg_preds[..., :references_before_sigmoid. + size(-1)] += references_before_sigmoid + layers_bbox_preds = tmp_reg_preds.sigmoid() + return layers_cls_scores, layers_bbox_preds + + def predict(self, + hidden_states: Tensor, + references: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. Over-write + because img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (Tensor): Feature from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, dim). + references (Tensor): references from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, 2/4). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + last_layer_hidden_state = hidden_states[-1].unsqueeze(0) + last_layer_reference = references[-1].unsqueeze(0) + outs = self(last_layer_hidden_state, last_layer_reference) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + return predictions diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/ddod_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/ddod_head.py new file mode 100644 index 0000000000000000000000000000000000000000..64e91ff0135230a8d634c5964eb520e1461c872a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/ddod_head.py @@ -0,0 +1,794 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale +from mmengine.model import bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import anchor_inside_flags +from ..utils import images_to_levels, multi_apply, unmap +from .anchor_head import AnchorHead + +EPS = 1e-12 + + +@MODELS.register_module() +class DDODHead(AnchorHead): + """Detection Head of `DDOD `_. + + DDOD head decomposes conjunctions lying in most current one-stage + detectors via label assignment disentanglement, spatial feature + disentanglement, and pyramid supervision disentanglement. + + Args: + num_classes (int): Number of categories excluding the + background category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): The number of stacked Conv. Defaults to 4. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + use_dcn (bool): Use dcn, Same as ATSS when False. Defaults to True. + norm_cfg (:obj:`ConfigDict` or dict): Normal config of ddod head. + Defaults to dict(type='GN', num_groups=32, requires_grad=True). 
+ loss_iou (:obj:`ConfigDict` or dict): Config of IoU loss. Defaults to + dict(type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0). + """ + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + use_dcn: bool = True, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + loss_iou: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + **kwargs) -> None: + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.use_dcn = use_dcn + super().__init__(num_classes, in_channels, **kwargs) + + if self.train_cfg: + self.cls_assigner = TASK_UTILS.build(self.train_cfg['assigner']) + self.reg_assigner = TASK_UTILS.build( + self.train_cfg['reg_assigner']) + self.loss_iou = MODELS.build(loss_iou) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=dict(type='DCN', deform_groups=1) + if i == 0 and self.use_dcn else self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=dict(type='DCN', deform_groups=1) + if i == 0 and self.use_dcn else self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.atss_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.atss_reg = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + self.atss_iou = nn.Conv2d( + self.feat_channels, self.num_base_priors * 1, 3, padding=1) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + # we use the global list in loss + self.cls_num_pos_samples_per_level = [ + 0. for _ in range(len(self.prior_generator.strides)) + ] + self.reg_num_pos_samples_per_level = [ + 0. for _ in range(len(self.prior_generator.strides)) + ] + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.cls_convs: + normal_init(m.conv, std=0.01) + for m in self.reg_convs: + normal_init(m.conv, std=0.01) + normal_init(self.atss_reg, std=0.01) + normal_init(self.atss_iou, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.atss_cls, std=0.01, bias=bias_cls) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores, bbox predictions, + and iou predictions. + + - cls_scores (list[Tensor]): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number is \ + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number is \ + num_base_priors * 4. + - iou_preds (list[Tensor]): IoU scores for all scale levels, \ + each is a 4D-tensor, the channels number is num_base_priors * 1. + """ + return multi_apply(self.forward_single, x, self.scales) + + def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. 
+ scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + + - cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for a single \ + scale level, the channels number is num_base_priors * 4. + - iou_pred (Tensor): Iou for a single scale level, the \ + channel number is (N, num_base_priors * 1, H, W). + """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.atss_cls(cls_feat) + # we just follow atss, not apply exp in bbox_pred + bbox_pred = scale(self.atss_reg(reg_feat)).float() + iou_pred = self.atss_iou(reg_feat) + return cls_score, bbox_pred, iou_pred + + def loss_cls_by_feat_single(self, cls_score: Tensor, labels: Tensor, + label_weights: Tensor, + reweight_factor: List[float], + avg_factor: float) -> Tuple[Tensor]: + """Compute cls loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_base_priors * num_classes, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + reweight_factor (List[float]): Reweight factor for cls and reg + loss. + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + Tuple[Tensor]: A tuple of loss components. + """ + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + return reweight_factor * loss_cls, + + def loss_reg_by_feat_single(self, anchors: Tensor, bbox_pred: Tensor, + iou_pred: Tensor, labels, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, + reweight_factor: List[float], + avg_factor: float) -> Tuple[Tensor, Tensor]: + """Compute reg loss of a single scale level based on the features + extracted by the detection head. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_base_priors * 4, H, W). + iou_pred (Tensor): Iou for a single scale level, the + channel number is (N, num_base_priors * 1, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4) + reweight_factor (List[float]): Reweight factor for cls and reg + loss. + avg_factor (float): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + Returns: + Tuple[Tensor, Tensor]: A tuple of loss components. 
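+
+ Example:
+ Toy sketch of how an ``iou_targets`` entry is formed below (the
+ boxes are illustrative assumptions):
+
+ >>> import torch
+ >>> from mmdet.structures.bbox import bbox_overlaps
+ >>> pred = torch.tensor([[0., 0., 10., 10.]])
+ >>> target = torch.tensor([[0., 0., 10., 20.]])
+ >>> bbox_overlaps(pred, target, is_aligned=True)
+ tensor([0.5000])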
+ """ + anchors = anchors.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + iou_pred = iou_pred.permute(0, 2, 3, 1).reshape(-1, ) + bbox_targets = bbox_targets.reshape(-1, 4) + bbox_weights = bbox_weights.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + iou_targets = label_weights.new_zeros(labels.shape) + iou_weights = label_weights.new_zeros(labels.shape) + iou_weights[(bbox_weights.sum(axis=1) > 0).nonzero( + as_tuple=False)] = 1. + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & + (labels < bg_class_ind)).nonzero(as_tuple=False).squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchors, pos_bbox_pred) + pos_decode_bbox_targets = self.bbox_coder.decode( + pos_anchors, pos_bbox_targets) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + avg_factor=avg_factor) + + iou_targets[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + loss_iou = self.loss_iou( + iou_pred, iou_targets, iou_weights, avg_factor=avg_factor) + else: + loss_bbox = bbox_pred.sum() * 0 + loss_iou = iou_pred.sum() * 0 + + return reweight_factor * loss_bbox, reweight_factor * loss_iou + + def calc_reweight_factor(self, labels_list: List[Tensor]) -> List[float]: + """Compute reweight_factor for regression and classification loss.""" + # get pos samples for each level + bg_class_ind = self.num_classes + for ii, each_level_label in enumerate(labels_list): + pos_inds = ((each_level_label >= 0) & + (each_level_label < bg_class_ind)).nonzero( + as_tuple=False).squeeze(1) + self.cls_num_pos_samples_per_level[ii] += len(pos_inds) + # get reweight factor from 1 ~ 2 with bilinear interpolation + min_pos_samples = min(self.cls_num_pos_samples_per_level) + max_pos_samples = max(self.cls_num_pos_samples_per_level) + interval = 1. / (max_pos_samples - min_pos_samples + 1e-10) + reweight_factor_per_level = [] + for pos_samples in self.cls_num_pos_samples_per_level: + factor = 2. - (pos_samples - min_pos_samples) * interval + reweight_factor_per_level.append(factor) + return reweight_factor_per_level + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + iou_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_base_priors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_base_priors * 4, H, W) + iou_preds (list[Tensor]): Score factor for all scale level, + each is a 4D-tensor, has shape (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + # calculate common vars for cls and reg assigners at once + targets_com = self.process_predictions_and_anchors( + anchor_list, valid_flag_list, cls_scores, bbox_preds, + batch_img_metas, batch_gt_instances_ignore) + (anchor_list, valid_flag_list, num_level_anchors_list, cls_score_list, + bbox_pred_list, batch_gt_instances_ignore) = targets_com + + # classification branch assigner + cls_targets = self.get_cls_targets( + anchor_list, + valid_flag_list, + num_level_anchors_list, + cls_score_list, + bbox_pred_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (cls_anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + avg_factor = max(avg_factor, 1.0) + + reweight_factor_per_level = self.calc_reweight_factor(labels_list) + + cls_losses_cls, = multi_apply( + self.loss_cls_by_feat_single, + cls_scores, + labels_list, + label_weights_list, + reweight_factor_per_level, + avg_factor=avg_factor) + + # regression branch assigner + reg_targets = self.get_reg_targets( + anchor_list, + valid_flag_list, + num_level_anchors_list, + cls_score_list, + bbox_pred_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (reg_anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = reg_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + avg_factor = max(avg_factor, 1.0) + + reweight_factor_per_level = self.calc_reweight_factor(labels_list) + + reg_losses_bbox, reg_losses_iou = multi_apply( + self.loss_reg_by_feat_single, + reg_anchor_list, + bbox_preds, + iou_preds, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + reweight_factor_per_level, + avg_factor=avg_factor) + + return dict( + loss_cls=cls_losses_cls, + loss_bbox=reg_losses_bbox, + loss_iou=reg_losses_iou) + + def process_predictions_and_anchors( + self, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> tuple: + """Compute common vars for regression and classification targets. + + Args: + anchor_list (List[List[Tensor]]): anchors of each image. + valid_flag_list (List[List[Tensor]]): Valid flags of each image. + cls_scores (List[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Return: + tuple[Tensor]: A tuple of common loss vars. 
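+
+        Note:
+            Editor's note: per image, each level's score map is permuted
+            from (num_base_priors * num_classes, H, W) to
+            (H * W, num_base_priors * num_classes) and concatenated across
+            levels (and likewise for bbox predictions), so that both
+            assigners later consume one flat tensor per image.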
+ """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + anchor_list_ = [] + valid_flag_list_ = [] + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list_.append(torch.cat(anchor_list[i])) + valid_flag_list_.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None for _ in range(num_imgs)] + + num_levels = len(cls_scores) + cls_score_list = [] + bbox_pred_list = [] + + mlvl_cls_score_list = [ + cls_score.permute(0, 2, 3, 1).reshape( + num_imgs, -1, self.num_base_priors * self.cls_out_channels) + for cls_score in cls_scores + ] + mlvl_bbox_pred_list = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_base_priors * 4) + for bbox_pred in bbox_preds + ] + + for i in range(num_imgs): + mlvl_cls_tensor_list = [ + mlvl_cls_score_list[j][i] for j in range(num_levels) + ] + mlvl_bbox_tensor_list = [ + mlvl_bbox_pred_list[j][i] for j in range(num_levels) + ] + cat_mlvl_cls_score = torch.cat(mlvl_cls_tensor_list, dim=0) + cat_mlvl_bbox_pred = torch.cat(mlvl_bbox_tensor_list, dim=0) + cls_score_list.append(cat_mlvl_cls_score) + bbox_pred_list.append(cat_mlvl_bbox_pred) + return (anchor_list_, valid_flag_list_, num_level_anchors_list, + cls_score_list, bbox_pred_list, batch_gt_instances_ignore) + + def get_cls_targets(self, + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + num_level_anchors_list: List[int], + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get cls targets for DDOD head. + + This method is almost the same as `AnchorHead.get_targets()`. + Besides returning the targets as the parent method does, + it also returns the anchors as the first element of the + returned tuple. + + Args: + anchor_list (list[Tensor]): anchors of each image. + valid_flag_list (list[Tensor]): Valid flags of each image. + num_level_anchors_list (list[Tensor]): Number of anchors of each + scale level of all image. + cls_score_list (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + bbox_pred_list (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Return: + tuple[Tensor]: A tuple of cls targets components. 
+ """ + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + cls_score_list, + bbox_pred_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs, + is_cls_assigner=True) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0]) + labels_list = images_to_levels(all_labels, num_level_anchors_list[0]) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors_list[0]) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors_list[0]) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors_list[0]) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def get_reg_targets(self, + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + num_level_anchors_list: List[int], + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Get reg targets for DDOD head. + + This method is almost the same as `AnchorHead.get_targets()` when + is_cls_assigner is False. Besides returning the targets as the parent + method does, it also returns the anchors as the first element of the + returned tuple. + + Args: + anchor_list (list[Tensor]): anchors of each image. + valid_flag_list (list[Tensor]): Valid flags of each image. + num_level_anchors_list (list[Tensor]): Number of anchors of each + scale level of all image. + cls_score_list (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + bbox_pred_list (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Return: + tuple[Tensor]: A tuple of reg targets components. + """ + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + cls_score_list, + bbox_pred_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs, + is_cls_assigner=False) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. 
+ # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors_list[0]) + labels_list = images_to_levels(all_labels, num_level_anchors_list[0]) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors_list[0]) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors_list[0]) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors_list[0]) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + cls_scores: Tensor, + bbox_preds: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True, + is_cls_assigner: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, + which are concatenated into a single tensor of shape + (num_base_priors, 4). + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_base_priors,). + cls_scores (Tensor): Classification scores for all scale + levels of the image. + bbox_preds (Tensor): Box energies / deltas for all scale + levels of the image. + num_level_anchors (List[int]): Number of anchors of each + scale level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + is_cls_assigner (bool): Classification or regression. + Defaults to True. + + Returns: + tuple: N is the number of total anchors in the image. + - anchors (Tensor): all anchors in the image with shape (N, 4). + - labels (Tensor): Labels of all anchors in the image with \ + shape (N, ). + - label_weights (Tensor): Label weights of all anchor in the \ + image with shape (N, ). + - bbox_targets (Tensor): BBox targets of all anchors in the \ + image with shape (N, 4). + - bbox_weights (Tensor): BBox weights of all anchors in the \ + image with shape (N, 4) + - pos_inds (Tensor): Indices of positive anchor with shape \ + (num_pos, ). + - neg_inds (Tensor): Indices of negative anchor with shape \ + (num_neg, ). + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + bbox_preds_valid = bbox_preds[inside_flags, :] + cls_scores_valid = cls_scores[inside_flags, :] + + assigner = self.cls_assigner if is_cls_assigner else self.reg_assigner + + # decode prediction out of assigner + bbox_preds_valid = self.bbox_coder.decode(anchors, bbox_preds_valid) + pred_instances = InstanceData( + priors=anchors, bboxes=bbox_preds_valid, scores=cls_scores_valid) + + assign_result = assigner.assign( + pred_instances=pred_instances, + num_level_priors=num_level_anchors_inside, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def get_num_level_anchors_inside(self, num_level_anchors: List[int], + inside_flags: Tensor) -> List[int]: + """Get the anchors of each scale level inside. + + Args: + num_level_anchors (list[int]): Number of anchors of each + scale level. + inside_flags (Tensor): Multi level inside flags of the image, + which are concatenated into a single tensor of + shape (num_base_priors,). + + Returns: + list[int]: Number of anchors of each scale level inside. + """ + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/ddq_detr_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/ddq_detr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..0580653ac264ea0a597eec76624ab7eb3c7f6a10 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/ddq_detr_head.py @@ -0,0 +1,550 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
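+# Editor's note: a minimal, hypothetical registry-based construction of the
+# head defined below (the cfg values are illustrative only, and any required
+# DINOHead arguments must also be supplied):
+#
+#   from mmdet.registry import MODELS
+#   cfg = dict(type='DDQDETRHead', num_classes=80, aux_num_pos=4)
+#   head = MODELS.build(cfg)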
+import copy
+from typing import Dict, List, Tuple
+
+import torch
+from mmengine.model import bias_init_with_prob, constant_init
+from torch import Tensor, nn
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_cxcywh_to_xyxy
+from mmdet.utils import InstanceList, OptInstanceList, reduce_mean
+from ..layers import inverse_sigmoid
+from ..losses import DDQAuxLoss
+from ..utils import multi_apply
+from .dino_head import DINOHead
+
+
+@MODELS.register_module()
+class DDQDETRHead(DINOHead):
+    r"""Head of DDQDETR: Dense Distinct Query for
+    End-to-End Object Detection.
+
+    Code is modified from the `official github repo
+    `_.
+
+    More details can be found in the `paper
+    `_ .
+
+    Args:
+        aux_num_pos (int): Number of positive targets assigned to a
+            predicted object. Defaults to 4.
+    """
+
+    def __init__(self, *args, aux_num_pos=4, **kwargs):
+        super(DDQDETRHead, self).__init__(*args, **kwargs)
+        self.aux_loss_for_dense = DDQAuxLoss(
+            train_cfg=dict(
+                assigner=dict(type='TopkHungarianAssigner', topk=aux_num_pos),
+                alpha=1,
+                beta=6))
+
+    def _init_layers(self) -> None:
+        """Initialize classification branch and regression branch of aux head
+        for dense queries."""
+        super(DDQDETRHead, self)._init_layers()
+        # If decoder `num_layers` = 6 and `as_two_stage` = True, then:
+        # 1) 6 main heads are required for
+        #    each decoder output of distinct queries.
+        # 2) 1 main head is required for `output_memory` of distinct queries.
+        # 3) 1 aux head is required for `output_memory` of dense queries,
+        #    which is done by code below this comment.
+        # So 8 heads are required in sum.
+        # aux head for dense queries on encoder feature map
+        self.cls_branches.append(copy.deepcopy(self.cls_branches[-1]))
+        self.reg_branches.append(copy.deepcopy(self.reg_branches[-1]))
+
+        # If decoder `num_layers` = 6 and `as_two_stage` = True, then:
+        # 6 aux heads are required for each decoder output of dense queries.
+        # So 8 + 6 = 14 heads are required in sum.
+        # self.num_pred_layer is 7
+        # aux head for dense queries in decoder
+        self.aux_cls_branches = nn.ModuleList([
+            copy.deepcopy(self.cls_branches[-1])
+            for _ in range(self.num_pred_layer - 1)
+        ])
+        self.aux_reg_branches = nn.ModuleList([
+            copy.deepcopy(self.reg_branches[-1])
+            for _ in range(self.num_pred_layer - 1)
+        ])
+
+    def init_weights(self) -> None:
+        """Initialize weights of the DDQ DETR head."""
+        bias_init = bias_init_with_prob(0.01)
+        for m in self.cls_branches:
+            nn.init.constant_(m.bias, bias_init)
+        for m in self.aux_cls_branches:
+            nn.init.constant_(m.bias, bias_init)
+        for m in self.reg_branches:
+            constant_init(m[-1], 0, bias=0)
+            nn.init.constant_(m[-1].bias.data[2:], 0.0)
+
+        for m in self.aux_reg_branches:
+            constant_init(m[-1], 0, bias=0)
+            nn.init.constant_(m[-1].bias.data[2:], 0.0)
+
+    def forward(self, hidden_states: Tensor,
+                references: List[Tensor]) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, bs, num_queries_total,
+                dim), where `num_queries_total` is the sum of
+                `num_denoising_queries`, `num_queries` and `num_dense_queries`
+                when `self.training` is `True`, else `num_queries`.
+            references (list[Tensor]): List of the reference from the decoder.
+ The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). Each reference has shape (bs, + num_queries_total, 4) with the last dimension arranged as + (cx, cy, w, h). + + Returns: + tuple[Tensor]: results of head containing the following tensors. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries_total, 4) + with the last dimension arranged as (cx, cy, w, h). + """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + if self.training: + num_dense = self.cache_dict['num_dense_queries'] + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + hidden_state = hidden_states[layer_id] + if self.training: + dense_hidden_state = hidden_state[:, -num_dense:] + hidden_state = hidden_state[:, :-num_dense] + + outputs_class = self.cls_branches[layer_id](hidden_state) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if self.training: + dense_outputs_class = self.aux_cls_branches[layer_id]( + dense_hidden_state) + dense_tmp_reg_preds = self.aux_reg_branches[layer_id]( + dense_hidden_state) + outputs_class = torch.cat([outputs_class, dense_outputs_class], + dim=1) + tmp_reg_preds = torch.cat([tmp_reg_preds, dense_tmp_reg_preds], + dim=1) + + if reference.shape[-1] == 4: + tmp_reg_preds += reference + else: + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def loss(self, + hidden_states: Tensor, + references: List[Tensor], + enc_outputs_class: Tensor, + enc_outputs_coord: Tensor, + batch_data_samples: SampleList, + dn_meta: Dict[str, int], + aux_enc_outputs_class=None, + aux_enc_outputs_coord=None) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries`, `num_queries` and `num_dense_queries` + when `self.training` is `True`, else `num_queries`. + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). Each reference has shape (bs, + num_queries_total, 4) with the last dimension arranged as + (cx, cy, w, h). + enc_outputs_class (Tensor): The top k classification score of + each point on encoder feature map, has shape (bs, num_queries, + cls_out_channels). + enc_outputs_coord (Tensor): The proposal generated from points + with top k score, has shape (bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. 
It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + aux_enc_outputs_class (Tensor): The `dense_topk` classification + score of each point on encoder feature map, has shape (bs, + num_dense_queries, cls_out_channels). + It is `None` when `self.training` is `False`. + aux_enc_outputs_coord (Tensor): The proposal generated from points + with `dense_topk` score, has shape (bs, num_dense_queries, 4) + with the last dimension arranged as (cx, cy, w, h). + It is `None` when `self.training` is `False`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas, dn_meta) + losses = self.loss_by_feat(*loss_inputs) + + aux_enc_outputs_coord = bbox_cxcywh_to_xyxy(aux_enc_outputs_coord) + aux_enc_outputs_coord_list = [] + for img_id in range(len(aux_enc_outputs_coord)): + det_bboxes = aux_enc_outputs_coord[img_id] + img_shape = batch_img_metas[img_id]['img_shape'] + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + aux_enc_outputs_coord_list.append(det_bboxes) + aux_enc_outputs_coord = torch.stack(aux_enc_outputs_coord_list) + aux_loss = self.aux_loss_for_dense.loss( + aux_enc_outputs_class.sigmoid(), aux_enc_outputs_coord, + [item.bboxes for item in batch_gt_instances], + [item.labels for item in batch_gt_instances], batch_img_metas) + for k, v in aux_loss.items(): + losses[f'aux_enc_{k}'] = v + + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + all_layers_bbox_preds (Tensor): Bbox coordinates of all decoder + layers. Each has shape (num_decoder_layers, bs, + num_queries_total, 4) with normalized coordinate format + (cx, cy, w, h). + enc_cls_scores (Tensor): The top k score of each point on + encoder feature map, has shape (bs, num_queries, + cls_out_channels). + enc_bbox_preds (Tensor): The proposal generated from points + with top k score, has shape (bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \ + self.split_outputs( + all_layers_cls_scores, all_layers_bbox_preds, dn_meta) + + num_dense_queries = dn_meta['num_dense_queries'] + num_layer = all_layers_matching_bbox_preds.size(0) + dense_all_layers_matching_cls_scores = all_layers_matching_cls_scores[:, :, # noqa: E501 + -num_dense_queries:] # noqa: E501 + dense_all_layers_matching_bbox_preds = all_layers_matching_bbox_preds[:, :, # noqa: E501 + -num_dense_queries:] # noqa: E501 + + all_layers_matching_cls_scores = all_layers_matching_cls_scores[:, :, : # noqa: E501 + -num_dense_queries] # noqa: E501 + all_layers_matching_bbox_preds = all_layers_matching_bbox_preds[:, :, : # noqa: E501 + -num_dense_queries] # noqa: E501 + + loss_dict = self.loss_for_distinct_queries( + all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + batch_gt_instances, batch_img_metas, batch_gt_instances_ignore) + + if enc_cls_scores is not None: + + enc_loss_cls, enc_losses_bbox, enc_losses_iou = \ + self.loss_by_feat_single( + enc_cls_scores, enc_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + loss_dict['enc_loss_iou'] = enc_losses_iou + + if all_layers_denoising_cls_scores is not None: + dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn( + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta) + loss_dict['dn_loss_cls'] = dn_losses_cls[-1] + loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1] + loss_dict['dn_loss_iou'] = dn_losses_iou[-1] + for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \ + enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1], + dn_losses_iou[:-1])): + loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i + + for l_id in range(num_layer): + cls_scores = dense_all_layers_matching_cls_scores[l_id].sigmoid() + bbox_preds = dense_all_layers_matching_bbox_preds[l_id] + + bbox_preds = bbox_cxcywh_to_xyxy(bbox_preds) + bbox_preds_list = [] + for img_id in range(len(bbox_preds)): + det_bboxes = bbox_preds[img_id] + img_shape = batch_img_metas[img_id]['img_shape'] + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + bbox_preds_list.append(det_bboxes) + bbox_preds = torch.stack(bbox_preds_list) + aux_loss = self.aux_loss_for_dense.loss( + cls_scores, bbox_preds, + [item.bboxes for item in batch_gt_instances], + [item.labels for item in batch_gt_instances], batch_img_metas) + for k, v in aux_loss.items(): + loss_dict[f'{l_id}_aux_{k}'] = v + + return loss_dict + + def loss_for_distinct_queries( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss of distinct queries, that is, excluding denoising + and dense 
queries. Only select the distinct queries in decoder for + loss. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + all_layers_bbox_preds (Tensor): Bbox coordinates of all decoder + layers. It has shape (num_decoder_layers, bs, + num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert batch_gt_instances_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + 'for batch_gt_instances_ignore setting to None.' + + losses_cls, losses_bbox, losses_iou = multi_apply( + self._loss_for_distinct_queries_single, + all_layers_cls_scores, + all_layers_bbox_preds, + [i for i in range(len(all_layers_bbox_preds))], + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_iou'] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in \ + zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def _loss_for_distinct_queries_single(self, cls_scores, bbox_preds, l_id, + batch_gt_instances, batch_img_metas): + """Calculate the loss for outputs from a single decoder layer of + distinct queries, that is, excluding denoising and dense queries. Only + select the distinct queries in decoder for loss. + + Args: + cls_scores (Tensor): Classification scores of a single + decoder layer, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Bbox coordinates of a single decoder + layer. It has shape (bs, num_queries, 4) with the last + dimension arranged as (cx, cy, w, h). + l_id (int): Decoder layer index for these outputs. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
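+
+        Note:
+            For decoder layers after the first one, only queries flagged in
+            ``self.cache_dict['distinct_query_mask']`` (indexed per image,
+            one mask shared across attention heads) contribute to the loss;
+            the first layer keeps all queries.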
+ """ + num_imgs = cls_scores.size(0) + if 0 < l_id: + batch_mask = [ + self.cache_dict['distinct_query_mask'][l_id - 1][ + img_id * self.cache_dict['num_heads']][0] + for img_id in range(num_imgs) + ] + else: + batch_mask = [ + torch.ones(len(cls_scores[i]), + device=cls_scores.device).bool() + for i in range(num_imgs) + ] + # only select the distinct queries in decoder for loss + cls_scores_list = [ + cls_scores[i][batch_mask[i]] for i in range(num_imgs) + ] + bbox_preds_list = [ + bbox_preds[i][batch_mask[i]] for i in range(num_imgs) + ] + cls_scores = torch.cat(cls_scores_list) + + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + batch_gt_instances, batch_img_metas) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds_list): + img_h, img_w, = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = torch.cat(bbox_preds_list) + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def predict_by_feat(self, + layer_cls_scores: Tensor, + layer_bbox_preds: Tensor, + batch_img_metas: List[dict], + rescale: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + layer_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + layer_bbox_preds (Tensor): Bbox coordinates of all decoder layers. + Each has shape (num_decoder_layers, bs, num_queries, 4) + with normalized coordinate format (cx, cy, w, h). + batch_img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If `True`, return boxes in original + image space. Default `False`. 
+ + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + cls_scores = layer_cls_scores[-1] + bbox_preds = layer_bbox_preds[-1] + + num_imgs = cls_scores.size(0) + # -1 is last layer input query mask + + batch_mask = [ + self.cache_dict['distinct_query_mask'][-1][ + img_id * self.cache_dict['num_heads']][0] + for img_id in range(num_imgs) + ] + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id][batch_mask[img_id]] + bbox_pred = bbox_preds[img_id][batch_mask[img_id]] + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + img_meta, rescale) + result_list.append(results) + return result_list diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/deformable_detr_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/deformable_detr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..adedd4aa6b533bcfece618eed4045c95bf0fdebb --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/deformable_detr_head.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Linear +from mmengine.model import bias_init_with_prob, constant_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList, OptInstanceList +from ..layers import inverse_sigmoid +from .detr_head import DETRHead + + +@MODELS.register_module() +class DeformableDETRHead(DETRHead): + r"""Head of DeformDETR: Deformable DETR: Deformable Transformers for + End-to-End Object Detection. + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + + Args: + share_pred_layer (bool): Whether to share parameters for all the + prediction layers. Defaults to `False`. + num_pred_layer (int): The number of the prediction layers. + Defaults to 6. + as_two_stage (bool, optional): Whether to generate the proposal + from the outputs of encoder. Defaults to `False`. 
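+
+    Examples:
+        Editor's sketch of a schematic config for this head (the values are
+        illustrative and omit the detector-level settings that normally
+        accompany them):
+
+        >>> cfg = dict(
+        ...     type='DeformableDETRHead',
+        ...     num_classes=80,
+        ...     share_pred_layer=False,
+        ...     num_pred_layer=7,  # 6 decoder layers + 1 encoder-output head
+        ...     as_two_stage=True)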
+ """ + + def __init__(self, + *args, + share_pred_layer: bool = False, + num_pred_layer: int = 6, + as_two_stage: bool = False, + **kwargs) -> None: + self.share_pred_layer = share_pred_layer + self.num_pred_layer = num_pred_layer + self.as_two_stage = as_two_stage + + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize classification branch and regression branch of head.""" + fc_cls = Linear(self.embed_dims, self.cls_out_channels) + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, 4)) + reg_branch = nn.Sequential(*reg_branch) + + if self.share_pred_layer: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(self.num_pred_layer)]) + else: + self.cls_branches = nn.ModuleList( + [copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList([ + copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer) + ]) + + def init_weights(self) -> None: + """Initialize weights of the Deformable DETR head.""" + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + if hasattr(m, 'bias') and m.bias is not None: + nn.init.constant_(m.bias, bias_init) + for m in self.reg_branches: + constant_init(m[-1], 0, bias=0) + nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0) + if self.as_two_stage: + for m in self.reg_branches: + nn.init.constant_(m[-1].bias.data[2:], 0.0) + + def forward(self, hidden_states: Tensor, + references: List[Tensor]) -> Tuple[Tensor, Tensor]: + """Forward function. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries, dim). + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + # NOTE The last reference will not be used. + hidden_state = hidden_states[layer_id] + outputs_class = self.cls_branches[layer_id](hidden_state) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if reference.shape[-1] == 4: + # When `layer` is 0 and `as_two_stage` of the detector + # is `True`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `True`. 
+ tmp_reg_preds += reference + else: + # When `layer` is 0 and `as_two_stage` of the detector + # is `False`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `False`. + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def loss(self, hidden_states: Tensor, references: List[Tensor], + enc_outputs_class: Tensor, enc_outputs_coord: Tensor, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, num_queries, bs, dim). + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + enc_outputs_class (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + Only when `as_two_stage` is `True` it would be passed in, + otherwise it would be `None`. + enc_outputs_coord (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). Only when `as_two_stage` + is `True` it would be passed in, otherwise it would be `None`. + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). 
+            enc_cls_scores (Tensor): The score of each point on encode
+                feature map, has shape (bs, num_feat_points, cls_out_channels).
+                Only when `as_two_stage` is `True` it would be passed in,
+                otherwise, it would be `None`.
+            enc_bbox_preds (Tensor): The proposal generated from the encode
+                feature map, has shape (bs, num_feat_points, 4) with the last
+                dimension arranged as (cx, cy, w, h). Only when `as_two_stage`
+                is `True` it would be passed in, otherwise it would be `None`.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        loss_dict = super().loss_by_feat(all_layers_cls_scores,
+                                         all_layers_bbox_preds,
+                                         batch_gt_instances, batch_img_metas,
+                                         batch_gt_instances_ignore)
+
+        # loss of proposal generated from encode feature map.
+        if enc_cls_scores is not None:
+            proposal_gt_instances = copy.deepcopy(batch_gt_instances)
+            for i in range(len(proposal_gt_instances)):
+                proposal_gt_instances[i].labels = torch.zeros_like(
+                    proposal_gt_instances[i].labels)
+            enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
+                self.loss_by_feat_single(
+                    enc_cls_scores, enc_bbox_preds,
+                    batch_gt_instances=proposal_gt_instances,
+                    batch_img_metas=batch_img_metas)
+            loss_dict['enc_loss_cls'] = enc_loss_cls
+            loss_dict['enc_loss_bbox'] = enc_losses_bbox
+            loss_dict['enc_loss_iou'] = enc_losses_iou
+        return loss_dict
+
+    def predict(self,
+                hidden_states: Tensor,
+                references: List[Tensor],
+                batch_data_samples: SampleList,
+                rescale: bool = True) -> InstanceList:
+        """Perform forward propagation of the detection head and predict
+        detection results on the queries of the upstream network.
+
+        Args:
+            hidden_states (Tensor): Hidden states output from each decoder
+                layer, has shape (num_decoder_layers, num_queries, bs, dim).
+            references (list[Tensor]): List of the reference from the decoder.
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers(6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries, 4) when `as_two_stage` of the detector is `True`,
+                otherwise (bs, num_queries, 2). Each `inter_reference` has
+                shape (bs, num_queries, 4) when `with_box_refine` of the
+                detector is `True`, otherwise (bs, num_queries, 2). The
+                coordinates are arranged as (cx, cy) when the last dimension is
+                2, and (cx, cy, w, h) when it is 4.
+            batch_data_samples (list[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool, optional): If `True`, return boxes in original
+                image space. Defaults to `True`.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image
+            after the post process.
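+
+        Note:
+            This method only unpacks ``batch_data_samples`` into image meta
+            dicts and delegates to :meth:`predict_by_feat`, which consumes
+            the last decoder layer's outputs from :meth:`forward`.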
+ """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + outs = self(hidden_states, references) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + return predictions + + def predict_by_feat(self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + batch_img_metas: List[Dict], + rescale: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries, + 4) with the last dimension arranged as (cx, cy, w, h). + batch_img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If `True`, return boxes in original + image space. Default `False`. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + cls_scores = all_layers_cls_scores[-1] + bbox_preds = all_layers_bbox_preds[-1] + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + img_meta, rescale) + result_list.append(results) + return result_list diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/dense_test_mixins.py b/head_extractor/build/lib/mmdet/models/dense_heads/dense_test_mixins.py new file mode 100644 index 0000000000000000000000000000000000000000..a7526d48430d6bc6b82777980d0bef418e80b91c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/dense_test_mixins.py @@ -0,0 +1,215 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import sys +import warnings +from inspect import signature + +import torch +from mmcv.ops import batched_nms +from mmengine.structures import InstanceData + +from mmdet.structures.bbox import bbox_mapping_back +from ..test_time_augs import merge_aug_proposals + +if sys.version_info >= (3, 7): + from mmdet.utils.contextmanagers import completed + + +class BBoxTestMixin(object): + """Mixin class for testing det bboxes via DenseHead.""" + + def simple_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes without test-time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each + image after the post process. \ + Each item usually contains following keys. \ + + - scores (Tensor): Classification scores, has a shape + (num_instance,) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances,). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + warnings.warn('You are calling `simple_test_bboxes` in ' + '`dense_test_mixins`, but the `dense_test_mixins`' + 'will be deprecated soon. 
Please use ' + '`simple_test` instead.') + outs = self.forward(feats) + results_list = self.get_results( + *outs, img_metas=img_metas, rescale=rescale) + return results_list + + def aug_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes with test time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,). The length of list should always be 1. + """ + + warnings.warn('You are calling `aug_test_bboxes` in ' + '`dense_test_mixins`, but the `dense_test_mixins`' + 'will be deprecated soon. Please use ' + '`aug_test` instead.') + # check with_nms argument + gb_sig = signature(self.get_results) + gb_args = [p.name for p in gb_sig.parameters.values()] + gbs_sig = signature(self._get_results_single) + gbs_args = [p.name for p in gbs_sig.parameters.values()] + assert ('with_nms' in gb_args) and ('with_nms' in gbs_args), \ + f'{self.__class__.__name__}' \ + ' does not support test-time augmentation' + + aug_bboxes = [] + aug_scores = [] + aug_labels = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + outs = self.forward(x) + bbox_outputs = self.get_results( + *outs, + img_metas=img_meta, + cfg=self.test_cfg, + rescale=False, + with_nms=False)[0] + aug_bboxes.append(bbox_outputs.bboxes) + aug_scores.append(bbox_outputs.scores) + if len(bbox_outputs) >= 3: + aug_labels.append(bbox_outputs.labels) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = self.merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas) + merged_labels = torch.cat(aug_labels, dim=0) if aug_labels else None + + if merged_bboxes.numel() == 0: + det_bboxes = torch.cat([merged_bboxes, merged_scores[:, None]], -1) + return [ + (det_bboxes, merged_labels), + ] + + det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores, + merged_labels, self.test_cfg.nms) + det_bboxes = det_bboxes[:self.test_cfg.max_per_img] + det_labels = merged_labels[keep_idxs][:self.test_cfg.max_per_img] + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= det_bboxes.new_tensor( + img_metas[0][0]['scale_factor']) + + results = InstanceData() + results.bboxes = _det_bboxes[:, :4] + results.scores = _det_bboxes[:, 4] + results.labels = det_labels + return [results] + + def aug_test_rpn(self, feats, img_metas): + """Test with augmentation for only for ``RPNHead`` and its variants, + e.g., ``GARPNHead``, etc. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Proposals of each image, each item has shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). 
+ """ + samples_per_gpu = len(img_metas[0]) + aug_proposals = [[] for _ in range(samples_per_gpu)] + for x, img_meta in zip(feats, img_metas): + results_list = self.simple_test_rpn(x, img_meta) + for i, results in enumerate(results_list): + proposals = torch.cat( + [results.bboxes, results.scores[:, None]], dim=-1) + aug_proposals[i].append(proposals) + # reorganize the order of 'img_metas' to match the dimensions + # of 'aug_proposals' + aug_img_metas = [] + for i in range(samples_per_gpu): + aug_img_meta = [] + for j in range(len(img_metas)): + aug_img_meta.append(img_metas[j][i]) + aug_img_metas.append(aug_img_meta) + # after merging, proposals will be rescaled to the original image size + + merged_proposals = [] + for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas): + merged_proposal = merge_aug_proposals(proposals, aug_img_meta, + self.test_cfg) + results = InstanceData() + results.bboxes = merged_proposal[:, :4] + results.scores = merged_proposal[:, 4] + merged_proposals.append(results) + return merged_proposals + + if sys.version_info >= (3, 7): + + async def async_simple_test_rpn(self, x, img_metas): + sleep_interval = self.test_cfg.pop('async_sleep_interval', 0.025) + async with completed( + __name__, 'rpn_head_forward', + sleep_interval=sleep_interval): + rpn_outs = self(x) + + proposal_list = self.get_results(*rpn_outs, img_metas=img_metas) + return proposal_list + + def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + + Returns: + tuple[Tensor]: ``bboxes`` with shape (n,4), where + 4 represent (tl_x, tl_y, br_x, br_y) + and ``scores`` with shape (n,). + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, + flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/detr_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/detr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9daeb4740057c1f07095ffbf97b73ea40fc93106 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/detr_head.py @@ -0,0 +1,634 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Linear +from mmcv.cnn.bricks.transformer import FFN +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcywh) +from mmdet.utils import (ConfigType, InstanceList, OptInstanceList, + OptMultiConfig, reduce_mean) +from ..losses import QualityFocalLoss +from ..utils import multi_apply + + +@MODELS.register_module() +class DETRHead(BaseModule): + r"""Head of DETR. DETR:End-to-End Object Detection with Transformers. 
+ + More details can be found in the `paper + `_ . + + Args: + num_classes (int): Number of categories excluding the background. + embed_dims (int): The dims of Transformer embedding. + num_reg_fcs (int): Number of fully-connected layers used in `FFN`, + which is then used for the regression head. Defaults to 2. + sync_cls_avg_factor (bool): Whether to sync the `avg_factor` of + all ranks. Default to `False`. + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_bbox (:obj:`ConfigDict` or dict): Config of the regression bbox + loss. Defaults to `L1Loss`. + loss_iou (:obj:`ConfigDict` or dict): Config of the regression iou + loss. Defaults to `GIoULoss`. + train_cfg (:obj:`ConfigDict` or dict): Training config of transformer + head. + test_cfg (:obj:`ConfigDict` or dict): Testing config of transformer + head. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + _version = 2 + + def __init__( + self, + num_classes: int, + embed_dims: int = 256, + num_reg_fcs: int = 2, + sync_cls_avg_factor: bool = False, + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox: ConfigType = dict(type='L1Loss', loss_weight=5.0), + loss_iou: ConfigType = dict(type='GIoULoss', loss_weight=2.0), + train_cfg: ConfigType = dict( + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=1.), + dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + dict(type='IoUCost', iou_mode='giou', weight=2.0) + ])), + test_cfg: ConfigType = dict(max_per_img=100), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.bg_cls_weight = 0 + self.sync_cls_avg_factor = sync_cls_avg_factor + class_weight = loss_cls.get('class_weight', None) + if class_weight is not None and (self.__class__ is DETRHead): + assert isinstance(class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(class_weight)}.' + # NOTE following the official DETR repo, bg_cls_weight means + # relative classification weight of the no-object class. + bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) + assert isinstance(bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(bg_cls_weight)}.' + class_weight = torch.ones(num_classes + 1) * class_weight + # set background class as the last indice + class_weight[num_classes] = bg_cls_weight + loss_cls.update({'class_weight': class_weight}) + if 'bg_cls_weight' in loss_cls: + loss_cls.pop('bg_cls_weight') + self.bg_cls_weight = bg_cls_weight + + if train_cfg: + assert 'assigner' in train_cfg, 'assigner should be provided ' \ + 'when train_cfg is set.' 
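+            # NOTE the default assigner above performs one-to-one
+            # Hungarian matching; its cost weights (1.0 cls, 5.0 L1,
+            # 2.0 GIoU) mirror the loss weights of loss_cls, loss_bbox
+            # and loss_iou.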
+ assigner = train_cfg['assigner'] + self.assigner = TASK_UTILS.build(assigner) + if train_cfg.get('sampler', None) is not None: + raise RuntimeError('DETR do not build sampler.') + self.num_classes = num_classes + self.embed_dims = embed_dims + self.num_reg_fcs = num_reg_fcs + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_iou = MODELS.build(loss_iou) + + if self.loss_cls.use_sigmoid: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the transformer head.""" + # cls branch + self.fc_cls = Linear(self.embed_dims, self.cls_out_channels) + # reg branch + self.activate = nn.ReLU() + self.reg_ffn = FFN( + self.embed_dims, + self.embed_dims, + self.num_reg_fcs, + dict(type='ReLU', inplace=True), + dropout=0.0, + add_residual=False) + # NOTE the activations of reg_branch here is the same as + # those in transformer, but they are actually different + # in DAB-DETR (prelu in transformer and relu in reg_branch) + self.fc_reg = Linear(self.embed_dims, 4) + + def forward(self, hidden_states: Tensor) -> Tuple[Tensor]: + """"Forward function. + + Args: + hidden_states (Tensor): Features from transformer decoder. If + `return_intermediate_dec` in detr.py is True output has shape + (num_decoder_layers, bs, num_queries, dim), else has shape + (1, bs, num_queries, dim) which only contains the last layer + outputs. + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - layers_cls_scores (Tensor): Outputs from the classification head, + shape (num_decoder_layers, bs, num_queries, cls_out_channels). + Note cls_out_channels should include background. + - layers_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h), has shape + (num_decoder_layers, bs, num_queries, 4). + """ + layers_cls_scores = self.fc_cls(hidden_states) + layers_bbox_preds = self.fc_reg( + self.activate(self.reg_ffn(hidden_states))).sigmoid() + return layers_cls_scores, layers_bbox_preds + + def loss(self, hidden_states: Tensor, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network. + + Args: + hidden_states (Tensor): Feature from the transformer decoder, has + shape (num_decoder_layers, bs, num_queries, cls_out_channels) + or (num_decoder_layers, num_queries, bs, cls_out_channels). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """"Loss function. + + Only outputs from the last feature level are used for computing + losses by default. 
+ + Args: + all_layers_cls_scores (Tensor): Classification outputs + of each decoder layers. Each is a 4D-tensor, has shape + (num_decoder_layers, bs, num_queries, cls_out_channels). + all_layers_bbox_preds (Tensor): Sigmoid regression + outputs of each decoder layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + (num_decoder_layers, bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert batch_gt_instances_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + 'for batch_gt_instances_ignore setting to None.' + + losses_cls, losses_bbox, losses_iou = multi_apply( + self.loss_by_feat_single, + all_layers_cls_scores, + all_layers_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_iou'] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in \ + zip(losses_cls[:-1], losses_bbox[:-1], losses_iou[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape (bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
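+
+        Note:
+            ``cls_avg_factor`` follows the official DETR repo:
+            ``num_total_pos + num_total_neg * bg_cls_weight``, clamped
+            to at least 1. With hypothetical counts of 5 positives and
+            95 negatives and ``bg_cls_weight=0.1``, it is
+            ``5 + 9.5 = 14.5``.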
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + batch_gt_instances, batch_img_metas) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if isinstance(self.loss_cls, QualityFocalLoss): + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + scores = label_weights.new_zeros(labels.shape) + pos_bbox_targets = bbox_targets[pos_inds] + pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets) + pos_bbox_pred = bbox_preds.reshape(-1, 4)[pos_inds] + pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred) + scores[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + loss_cls = self.loss_cls( + cls_scores, (labels, scores), + label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): + img_h, img_w, = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_targets(self, cls_scores_list: List[Tensor], + bbox_preds_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> tuple: + """Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image, has shape [num_queries, + cls_out_channels]. 
+ bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_queries, 4]. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pos_inds_list, + neg_inds_list) = multi_apply(self._get_targets_single, + cls_scores_list, bbox_preds_list, + batch_gt_instances, batch_img_metas) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> tuple: + """Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
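+
+        Note:
+            Queries left unmatched by the assigner are labelled
+            ``self.num_classes`` (the background index). With
+            hypothetical numbers, 4 queries, 80 classes and query 2
+            assigned to a gt of class 7 give
+            ``labels = [80, 80, 7, 80]``.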
+ """ + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + num_bboxes = bbox_pred.size(0) + # convert bbox_pred from xywh, normalized to xyxy, unnormalized + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_pred = bbox_pred * factor + + pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred) + # assigner and sampler + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # label targets + labels = gt_bboxes.new_full((num_bboxes, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights[pos_inds] = 1.0 + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + pos_gt_bboxes_normalized = pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + def loss_and_predict( + self, hidden_states: Tuple[Tensor], + batch_data_samples: SampleList) -> Tuple[dict, InstanceList]: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples. Over-write because + img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (tuple[Tensor]): Feature from the transformer + decoder, has shape (num_decoder_layers, bs, num_queries, dim). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns: + tuple: the return value is a tuple contains: + + - losses: (dict[str, Tensor]): A dictionary of loss components. + - predictions (list[:obj:`InstanceData`]): Detection + results of each image after the post process. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat(*loss_inputs) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas) + return losses, predictions + + def predict(self, + hidden_states: Tuple[Tensor], + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network. Over-write + because img_metas are needed as inputs for bbox_head. + + Args: + hidden_states (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. 
It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool, optional): Whether to rescale the results. + Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image + after the post process. + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + last_layer_hidden_state = hidden_states[-1].unsqueeze(0) + outs = self(last_layer_hidden_state) + + predictions = self.predict_by_feat( + *outs, batch_img_metas=batch_img_metas, rescale=rescale) + + return predictions + + def predict_by_feat(self, + layer_cls_scores: Tensor, + layer_bbox_preds: Tensor, + batch_img_metas: List[dict], + rescale: bool = True) -> InstanceList: + """Transform network outputs for a batch into bbox predictions. + + Args: + layer_cls_scores (Tensor): Classification outputs of the last or + all decoder layer. Each is a 4D-tensor, has shape + (num_decoder_layers, bs, num_queries, cls_out_channels). + layer_bbox_preds (Tensor): Sigmoid regression outputs of the last + or all decoder layer. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and shape + (num_decoder_layers, bs, num_queries, 4). + batch_img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If `True`, return boxes in original + image space. Defaults to `True`. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + # NOTE only using outputs from the last feature level, + # and only the outputs from the last decoder layer is used. + cls_scores = layer_cls_scores[-1] + bbox_preds = layer_bbox_preds[-1] + + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + img_meta, rescale) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = True) -> InstanceData: + """Transform outputs from the last decoder layer into bbox predictions + for each image. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_queries, 4]. + img_meta (dict): Image meta info. + rescale (bool): If True, return boxes in original image + space. Default True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
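+
+        Note:
+            In the sigmoid branch the top-k runs over the flattened
+            (query, class) scores, so an index decodes as
+            ``label = index % num_classes`` and
+            ``query = index // num_classes``. A minimal sketch with a
+            hypothetical index:
+
+            >>> import torch
+            >>> indexes = torch.tensor([163])
+            >>> indexes % 80, indexes // 80  # label 3 of query 2
+            (tensor([3]), tensor([2]))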
+ """ + assert len(cls_score) == len(bbox_pred) # num_queries + max_per_img = self.test_cfg.get('max_per_img', len(cls_score)) + img_shape = img_meta['img_shape'] + # exclude background + if self.loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + scores, indexes = cls_score.view(-1).topk(max_per_img) + det_labels = indexes % self.num_classes + bbox_index = indexes // self.num_classes + bbox_pred = bbox_pred[bbox_index] + else: + scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) + scores, bbox_index = scores.topk(max_per_img) + bbox_pred = bbox_pred[bbox_index] + det_labels = det_labels[bbox_index] + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + assert img_meta.get('scale_factor') is not None + det_bboxes /= det_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + results = InstanceData() + results.bboxes = det_bboxes + results.scores = scores + results.labels = det_labels + return results diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/dino_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/dino_head.py new file mode 100644 index 0000000000000000000000000000000000000000..54f46d1474f97f2d183926a6dc68a0be79f7cef1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/dino_head.py @@ -0,0 +1,479 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import (bbox_cxcywh_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcywh) +from mmdet.utils import InstanceList, OptInstanceList, reduce_mean +from ..losses import QualityFocalLoss +from ..utils import multi_apply +from .deformable_detr_head import DeformableDETRHead + + +@MODELS.register_module() +class DINOHead(DeformableDETRHead): + r"""Head of the DINO: DETR with Improved DeNoising Anchor Boxes + for End-to-End Object Detection + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + """ + + def loss(self, hidden_states: Tensor, references: List[Tensor], + enc_outputs_class: Tensor, enc_outputs_coord: Tensor, + batch_data_samples: SampleList, dn_meta: Dict[str, int]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries` when + `self.training` is `True`, else `num_matching_queries`. + references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries_total, 4) and each `inter_reference` has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + enc_outputs_class (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). 
+ enc_outputs_coord (Tensor): The proposal generate from the + encode feature map, has shape (bs, num_feat_points, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references) + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas, dn_meta) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat( + self, + all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + enc_cls_scores: Tensor, + enc_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels), where + `num_queries_total` is the sum of `num_denoising_queries` + and `num_matching_queries`. + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + enc_cls_scores (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + enc_bbox_preds (Tensor): The proposal generate from the encode + feature map, has shape (bs, num_feat_points, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + # extract denoising and matching part of outputs + (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, all_layers_denoising_bbox_preds) = \ + self.split_outputs( + all_layers_cls_scores, all_layers_bbox_preds, dn_meta) + + loss_dict = super(DeformableDETRHead, self).loss_by_feat( + all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + batch_gt_instances, batch_img_metas, batch_gt_instances_ignore) + # NOTE DETRHead.loss_by_feat but not DeformableDETRHead.loss_by_feat + # is called, because the encoder loss calculations are different + # between DINO and DeformableDETR. 
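+        # The encoder proposals below are treated like the outputs of a
+        # single extra decoder layer, so `loss_by_feat_single` is
+        # reused for them instead of a dedicated encoder loss.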
+ + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + # NOTE The enc_loss calculation of the DINO is + # different from that of Deformable DETR. + enc_loss_cls, enc_losses_bbox, enc_losses_iou = \ + self.loss_by_feat_single( + enc_cls_scores, enc_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + loss_dict['enc_loss_iou'] = enc_losses_iou + + if all_layers_denoising_cls_scores is not None: + # calculate denoising loss from all decoder layers + dn_losses_cls, dn_losses_bbox, dn_losses_iou = self.loss_dn( + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta) + # collate denoising loss + loss_dict['dn_loss_cls'] = dn_losses_cls[-1] + loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1] + loss_dict['dn_loss_iou'] = dn_losses_iou[-1] + for num_dec_layer, (loss_cls_i, loss_bbox_i, loss_iou_i) in \ + enumerate(zip(dn_losses_cls[:-1], dn_losses_bbox[:-1], + dn_losses_iou[:-1])): + loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.dn_loss_iou'] = loss_iou_i + return loss_dict + + def loss_dn(self, all_layers_denoising_cls_scores: Tensor, + all_layers_denoising_bbox_preds: Tensor, + batch_gt_instances: InstanceList, batch_img_metas: List[dict], + dn_meta: Dict[str, int]) -> Tuple[List[Tensor]]: + """Calculate denoising loss. + + Args: + all_layers_denoising_cls_scores (Tensor): Classification scores of + all decoder layers in denoising part, has shape ( + num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). + all_layers_denoising_bbox_preds (Tensor): Regression outputs of all + decoder layers in denoising part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[List[Tensor]]: The loss_dn_cls, loss_dn_bbox, and loss_dn_iou + of each decoder layers. + """ + return multi_apply( + self._loss_dn_single, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + dn_meta=dn_meta) + + def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int]) -> Tuple[Tensor]: + """Denoising loss for outputs from a single decoder layer. + + Args: + dn_cls_scores (Tensor): Classification scores of a single decoder + layer in denoising part, has shape (bs, num_denoising_queries, + cls_out_channels). + dn_bbox_preds (Tensor): Regression outputs of a single decoder + layer in denoising part. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and has shape + (bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. 
It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. + """ + cls_reg_targets = self.get_dn_targets(batch_gt_instances, + batch_img_metas, dn_meta) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = dn_cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = \ + num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if len(cls_scores) > 0: + if isinstance(self.loss_cls, QualityFocalLoss): + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + scores = label_weights.new_zeros(labels.shape) + pos_bbox_targets = bbox_targets[pos_inds] + pos_decode_bbox_targets = bbox_cxcywh_to_xyxy(pos_bbox_targets) + pos_bbox_pred = dn_bbox_preds.reshape(-1, 4)[pos_inds] + pos_decode_bbox_pred = bbox_cxcywh_to_xyxy(pos_bbox_pred) + scores[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + loss_cls = self.loss_cls( + cls_scores, (labels, scores), + weight=label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = torch.zeros( + 1, dtype=cls_scores.dtype, device=cls_scores.device) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds): + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. 
So here + # we need to re-scale them for calculating IoU loss + bbox_preds = dn_bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_dn_targets(self, batch_gt_instances: InstanceList, + batch_img_metas: dict, dn_meta: Dict[str, + int]) -> tuple: + """Get targets in denoising part for a batch of images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all images. + - bbox_targets_list (list[Tensor]): BBox targets for all images. + - bbox_weights_list (list[Tensor]): BBox weights for all images. + - num_total_pos (int): Number of positive samples in all images. + - num_total_neg (int): Number of negative samples in all images. + """ + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._get_dn_targets_single, + batch_gt_instances, + batch_img_metas, + dn_meta=dn_meta) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def _get_dn_targets_single(self, gt_instances: InstanceData, + img_meta: dict, dn_meta: Dict[str, + int]) -> tuple: + """Get targets in denoising part for one image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
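+
+        Note:
+            Each denoising group packs the positive queries in its
+            first half and the negative ones in its second half. A
+            minimal sketch with hypothetical sizes (2 gts, 3 groups,
+            so ``num_queries_each_group=4``):
+
+            >>> import torch
+            >>> t = torch.arange(2).unsqueeze(0).repeat(3, 1)
+            >>> pos = (torch.arange(3).unsqueeze(1) * 4 + t).flatten()
+            >>> pos, pos + 4 // 2  # (pos_inds, neg_inds)
+            (tensor([0, 1, 4, 5, 8, 9]), tensor([ 2,  3,  6,  7, 10, 11]))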
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_groups = dn_meta['num_denoising_groups'] + num_denoising_queries = dn_meta['num_denoising_queries'] + num_queries_each_group = int(num_denoising_queries / num_groups) + device = gt_bboxes.device + + if len(gt_labels) > 0: + t = torch.arange(len(gt_labels), dtype=torch.long, device=device) + t = t.unsqueeze(0).repeat(num_groups, 1) + pos_assigned_gt_inds = t.flatten() + pos_inds = torch.arange( + num_groups, dtype=torch.long, device=device) + pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t + pos_inds = pos_inds.flatten() + else: + pos_inds = pos_assigned_gt_inds = \ + gt_bboxes.new_tensor([], dtype=torch.long) + + neg_inds = pos_inds + num_queries_each_group // 2 + + # label targets + labels = gt_bboxes.new_full((num_denoising_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_denoising_queries) + + # bbox targets + bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights[pos_inds] = 1.0 + img_h, img_w = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + gt_bboxes_normalized = gt_bboxes / factor + gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) + bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + @staticmethod + def split_outputs(all_layers_cls_scores: Tensor, + all_layers_bbox_preds: Tensor, + dn_meta: Dict[str, int]) -> Tuple[Tensor]: + """Split outputs of the denoising part and the matching part. + + For the total outputs of `num_queries_total` length, the former + `num_denoising_queries` outputs are from denoising queries, and + the rest `num_matching_queries` ones are from matching queries, + where `num_queries_total` is the sum of `num_denoising_queries` and + `num_matching_queries`. + + Args: + all_layers_cls_scores (Tensor): Classification scores of all + decoder layers, has shape (num_decoder_layers, bs, + num_queries_total, cls_out_channels). + all_layers_bbox_preds (Tensor): Regression outputs of all decoder + layers. Each is a 4D-tensor with normalized coordinate format + (cx, cy, w, h) and has shape (num_decoder_layers, bs, + num_queries_total, 4). + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. + + Returns: + Tuple[Tensor]: a tuple containing the following outputs. + + - all_layers_matching_cls_scores (Tensor): Classification scores + of all decoder layers in matching part, has shape + (num_decoder_layers, bs, num_matching_queries, cls_out_channels). + - all_layers_matching_bbox_preds (Tensor): Regression outputs of + all decoder layers in matching part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_matching_queries, 4). + - all_layers_denoising_cls_scores (Tensor): Classification scores + of all decoder layers in denoising part, has shape + (num_decoder_layers, bs, num_denoising_queries, + cls_out_channels). 
+ - all_layers_denoising_bbox_preds (Tensor): Regression outputs of + all decoder layers in denoising part. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and has shape + (num_decoder_layers, bs, num_denoising_queries, 4). + """ + num_denoising_queries = dn_meta['num_denoising_queries'] + if dn_meta is not None: + all_layers_denoising_cls_scores = \ + all_layers_cls_scores[:, :, : num_denoising_queries, :] + all_layers_denoising_bbox_preds = \ + all_layers_bbox_preds[:, :, : num_denoising_queries, :] + all_layers_matching_cls_scores = \ + all_layers_cls_scores[:, :, num_denoising_queries:, :] + all_layers_matching_bbox_preds = \ + all_layers_bbox_preds[:, :, num_denoising_queries:, :] + else: + all_layers_denoising_cls_scores = None + all_layers_denoising_bbox_preds = None + all_layers_matching_cls_scores = all_layers_cls_scores + all_layers_matching_bbox_preds = all_layers_bbox_preds + return (all_layers_matching_cls_scores, all_layers_matching_bbox_preds, + all_layers_denoising_cls_scores, + all_layers_denoising_bbox_preds) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/embedding_rpn_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/embedding_rpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..97e84fa83b892c0274615d582fe43a6693541617 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/embedding_rpn_head.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.structures.det_data_sample import SampleList +from mmdet.utils import InstanceList, OptConfigType + + +@MODELS.register_module() +class EmbeddingRPNHead(BaseModule): + """RPNHead in the `Sparse R-CNN `_ . + + Unlike traditional RPNHead, this module does not need FPN input, but just + decode `init_proposal_bboxes` and expand the first dimension of + `init_proposal_bboxes` and `init_proposal_features` to the batch_size. + + Args: + num_proposals (int): Number of init_proposals. Defaults to 100. + proposal_feature_channel (int): Channel number of + init_proposal_feature. Defaults to 256. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. Defaults to None. + """ + + def __init__(self, + num_proposals: int = 100, + proposal_feature_channel: int = 256, + init_cfg: OptConfigType = None, + **kwargs) -> None: + # `**kwargs` is necessary to avoid some potential error. + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.num_proposals = num_proposals + self.proposal_feature_channel = proposal_feature_channel + self._init_layers() + + def _init_layers(self) -> None: + """Initialize a sparse set of proposal boxes and proposal features.""" + self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4) + self.init_proposal_features = nn.Embedding( + self.num_proposals, self.proposal_feature_channel) + + def init_weights(self) -> None: + """Initialize the init_proposal_bboxes as normalized. + + [c_x, c_y, w, h], and we initialize it to the size of the entire + image. 
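+
+        Note:
+            Concretely, every proposal starts as the full image in
+            normalized cxcywh format, which ``_decode_init_proposals``
+            later converts and scales to pixel-space xyxy boxes:
+
+            >>> import torch
+            >>> from mmdet.structures.bbox import bbox_cxcywh_to_xyxy
+            >>> bbox_cxcywh_to_xyxy(torch.tensor([[0.5, 0.5, 1.0, 1.0]]))
+            tensor([[0., 0., 1., 1.]])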
+ """ + super().init_weights() + nn.init.constant_(self.init_proposal_bboxes.weight[:, :2], 0.5) + nn.init.constant_(self.init_proposal_bboxes.weight[:, 2:], 1) + + def _decode_init_proposals(self, x: List[Tensor], + batch_data_samples: SampleList) -> InstanceList: + """Decode init_proposal_bboxes according to the size of images and + expand dimension of init_proposal_features to batch_size. + + Args: + x (list[Tensor]): List of FPN features. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + List[:obj:`InstanceData`:] Detection results of each image. + Each item usually contains following keys. + + - proposals: Decoded proposal bboxes, + has shape (num_proposals, 4). + - features: init_proposal_features, expanded proposal + features, has shape + (num_proposals, proposal_feature_channel). + - imgs_whwh: Tensor with shape + (num_proposals, 4), the dimension means + [img_width, img_height, img_width, img_height]. + """ + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + + proposals = self.init_proposal_bboxes.weight.clone() + proposals = bbox_cxcywh_to_xyxy(proposals) + imgs_whwh = [] + for meta in batch_img_metas: + h, w = meta['img_shape'][:2] + imgs_whwh.append(x[0].new_tensor([[w, h, w, h]])) + imgs_whwh = torch.cat(imgs_whwh, dim=0) + imgs_whwh = imgs_whwh[:, None, :] + proposals = proposals * imgs_whwh + + rpn_results_list = [] + for idx in range(len(batch_img_metas)): + rpn_results = InstanceData() + rpn_results.bboxes = proposals[idx] + rpn_results.imgs_whwh = imgs_whwh[idx].repeat( + self.num_proposals, 1) + rpn_results.features = self.init_proposal_features.weight.clone() + rpn_results_list.append(rpn_results) + return rpn_results_list + + def loss(self, *args, **kwargs): + """Perform forward propagation and loss calculation of the detection + head on the features of the upstream network.""" + raise NotImplementedError( + 'EmbeddingRPNHead does not have `loss`, please use ' + '`predict` or `loss_and_predict` instead.') + + def predict(self, x: List[Tensor], batch_data_samples: SampleList, + **kwargs) -> InstanceList: + """Perform forward propagation of the detection head and predict + detection results on the features of the upstream network.""" + # `**kwargs` is necessary to avoid some potential error. + return self._decode_init_proposals( + x=x, batch_data_samples=batch_data_samples) + + def loss_and_predict(self, x: List[Tensor], batch_data_samples: SampleList, + **kwargs) -> tuple: + """Perform forward propagation of the head, then calculate loss and + predictions from the features and data samples.""" + # `**kwargs` is necessary to avoid some potential error. + predictions = self._decode_init_proposals( + x=x, batch_data_samples=batch_data_samples) + + return dict(), predictions diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/fcos_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/fcos_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ba4d4640010c7e8e7c6a4db3e0fce887b4105217 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/fcos_head.py @@ -0,0 +1,476 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import Scale +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers import NormedConv2d +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, + OptInstanceList, RangeType, reduce_mean) +from ..utils import multi_apply +from .anchor_free_head import AnchorFreeHead + +INF = 1e8 + + +@MODELS.register_module() +class FCOSHead(AnchorFreeHead): + """Anchor-free head used in `FCOS `_. + + The FCOS head does not use anchor boxes. Instead bounding boxes are + predicted at each pixel and a centerness measure is used to suppress + low-quality predictions. + Here norm_on_bbox, centerness_on_reg, dcn_on_last_conv are training + tricks used in official repo, which will bring remarkable mAP gains + of up to 4.9. Please see https://github.com/tianzhi0549/FCOS for + more detail. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + strides (Sequence[int] or Sequence[Tuple[int, int]]): Strides of points + in multiple feature levels. Defaults to (4, 8, 16, 32, 64). + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. + Defaults to False. + center_sample_radius (float): Radius of center sampling. + Defaults to 1.5. + norm_on_bbox (bool): If true, normalize the regression targets with + FPN strides. Defaults to False. + centerness_on_reg (bool): If true, position centerness on the + regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. + Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_centerness (:obj:`ConfigDict`, or dict): Config of centerness + loss. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. Defaults to + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + cls_predictor_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config conv_cls. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. 
+ + Example: + >>> self = FCOSHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred, centerness = self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ # noqa: E501 + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((-1, 64), (64, 128), (128, 256), + (256, 512), (512, INF)), + center_sampling: bool = False, + center_sample_radius: float = 1.5, + norm_on_bbox: bool = False, + centerness_on_reg: bool = False, + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox: ConfigType = dict(type='IoULoss', loss_weight=1.0), + loss_centerness: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + cls_predictor_cfg=None, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='conv_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.regress_ranges = regress_ranges + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + self.norm_on_bbox = norm_on_bbox + self.centerness_on_reg = centerness_on_reg + self.cls_predictor_cfg = cls_predictor_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.loss_centerness = MODELS.build(loss_centerness) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super()._init_layers() + self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + if self.cls_predictor_cfg is not None: + self.cls_predictor_cfg.pop('type') + self.conv_cls = NormedConv2d( + self.feat_channels, + self.cls_out_channels, + 1, + padding=0, + **self.cls_predictor_cfg) + + def forward( + self, x: Tuple[Tensor] + ) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of each level outputs. + + - cls_scores (list[Tensor]): Box scores for each scale level, \ + each is a 4D-tensor, the channel number is \ + num_points * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for each \ + scale level, each is a 4D-tensor, the channel number is \ + num_points * 4. + - centernesses (list[Tensor]): centerness for each scale level, \ + each is a 4D-tensor, the channel number is num_points * 1. + """ + return multi_apply(self.forward_single, x, self.scales, self.strides) + + def forward_single(self, x: Tensor, scale: Scale, + stride: int) -> Tuple[Tensor, Tensor, Tensor]: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj:`mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox predictions and centerness + predictions of input feature maps. 
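+
+        Note:
+            With ``norm_on_bbox=True`` the predicted distances are
+            clamped to be non-negative and, at inference only,
+            multiplied by the level stride; otherwise ``exp()`` keeps
+            them positive.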
+ """ + cls_score, bbox_pred, cls_feat, reg_feat = super().forward_single(x) + if self.centerness_on_reg: + centerness = self.conv_centerness(reg_feat) + else: + centerness = self.conv_centerness(cls_feat) + # scale the bbox_pred of different level + # float to avoid overflow when enabling FP16 + bbox_pred = scale(bbox_pred).float() + if self.norm_on_bbox: + # bbox_pred needed for gradient computation has been modified + # by F.relu(bbox_pred) when run with PyTorch 1.10. So replace + # F.relu(bbox_pred) with bbox_pred.clamp(min=0) + bbox_pred = bbox_pred.clamp(min=0) + if not self.training: + bbox_pred *= stride + else: + bbox_pred = bbox_pred.exp() + return cls_score, bbox_pred, centerness + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + centernesses: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + centernesses (list[Tensor]): centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
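+
+        Example:
+            >>> # Illustrative only: the centerness target that weights the
+            >>> # IoU loss, for one location with hypothetical l, t, r, b
+            >>> # distances of 20, 30, 30, 20.
+            >>> ltrb = torch.tensor([[20., 30., 30., 20.]])
+            >>> lr, tb = ltrb[:, [0, 2]], ltrb[:, [1, 3]]
+            >>> c = torch.sqrt((lr.min(-1)[0] / lr.max(-1)[0]) *
+            ...                (tb.min(-1)[0] / tb.max(-1)[0]))
+            >>> torch.allclose(c, torch.tensor([2. / 3.]))
+            True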
+ """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + labels, bbox_targets = self.get_targets(all_level_points, + batch_gt_instances) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + losses = dict() + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < bg_class_ind)).nonzero().reshape(-1) + num_pos = torch.tensor( + len(pos_inds), dtype=torch.float, device=bbox_preds[0].device) + num_pos = max(reduce_mean(num_pos), 1.0) + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos) + + if getattr(self.loss_cls, 'custom_accuracy', False): + acc = self.loss_cls.get_accuracy(flatten_cls_scores, + flatten_labels) + losses.update(acc) + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_centerness_targets = self.centerness_target(pos_bbox_targets) + # centerness weighted iou loss + centerness_denorm = max( + reduce_mean(pos_centerness_targets.sum().detach()), 1e-6) + + if len(pos_inds) > 0: + pos_points = flatten_points[pos_inds] + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + loss_bbox = self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds, + weight=pos_centerness_targets, + avg_factor=centerness_denorm) + loss_centerness = self.loss_centerness( + pos_centerness, pos_centerness_targets, avg_factor=num_pos) + else: + loss_bbox = pos_bbox_preds.sum() + loss_centerness = pos_centerness.sum() + + losses['loss_cls'] = loss_cls + losses['loss_bbox'] = loss_bbox + losses['loss_centerness'] = loss_centerness + + return losses + + def get_targets( + self, points: List[Tensor], batch_gt_instances: InstanceList + ) -> Tuple[List[Tensor], List[Tensor]]: + """Compute regression, classification and centerness targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Targets of each level. + + - concat_lvl_labels (list[Tensor]): Labels of each level. + - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. 
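+
+        Example:
+            >>> # Sketch of the level-assignment rule used below: a point
+            >>> # is kept at a level only if its max regression distance
+            >>> # falls inside that level's regress_range. Distances and
+            >>> # ranges here are hypothetical.
+            >>> max_dist = torch.tensor([50., 100., 200.])
+            >>> ranges = torch.tensor([[-1., 64.], [64., 128.], [128., 256.]])
+            >>> inside = ((max_dist[:, None] >= ranges[None, :, 0]) &
+            ...           (max_dist[:, None] <= ranges[None, :, 1]))
+            >>> inside.nonzero()[:, 1].tolist()
+            [0, 1, 2]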
+ """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + # get labels and bbox_targets of each image + labels_list, bbox_targets_list = multi_apply( + self._get_targets_single, + batch_gt_instances, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_list = [labels.split(num_points, 0) for labels in labels_list] + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + + # concat per level image + concat_lvl_labels = [] + concat_lvl_bbox_targets = [] + for i in range(num_levels): + concat_lvl_labels.append( + torch.cat([labels[i] for labels in labels_list])) + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + if self.norm_on_bbox: + bbox_targets = bbox_targets / self.strides[i] + concat_lvl_bbox_targets.append(bbox_targets) + return concat_lvl_labels, concat_lvl_bbox_targets + + def _get_targets_single( + self, gt_instances: InstanceData, points: Tensor, + regress_ranges: Tensor, + num_points_per_lvl: List[int]) -> Tuple[Tensor, Tensor]: + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = len(gt_instances) + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + if num_gts == 0: + return gt_labels.new_full((num_points,), self.num_classes), \ + gt_bboxes.new_zeros((num_points, 4)) + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # TODO: figure out why these two are different + # areas = areas[None].expand(num_points, num_gts) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, :].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + if self.center_sampling: + # condition1: inside a `center bbox` + radius = self.center_sample_radius + center_xs = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) / 2 + center_ys = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) / 2 + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + x_mins = center_xs - stride + y_mins = center_ys - stride + x_maxs = center_xs + stride + y_maxs = center_ys + stride + center_gts[..., 0] = torch.where(x_mins > gt_bboxes[..., 0], + x_mins, gt_bboxes[..., 0]) + center_gts[..., 1] = torch.where(y_mins > gt_bboxes[..., 1], + y_mins, gt_bboxes[..., 1]) + center_gts[..., 2] = torch.where(x_maxs > 
gt_bboxes[..., 2], + gt_bboxes[..., 2], x_maxs) + center_gts[..., 3] = torch.where(y_maxs > gt_bboxes[..., 3], + gt_bboxes[..., 3], y_maxs) + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + else: + # condition1: inside a gt bbox + inside_gt_bbox_mask = bbox_targets.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # if there are still more than one objects for a location, + # we choose the one with minimal area + areas[inside_gt_bbox_mask == 0] = INF + areas[inside_regress_range == 0] = INF + min_area, min_area_inds = areas.min(dim=1) + + labels = gt_labels[min_area_inds] + labels[min_area == INF] = self.num_classes # set as BG + bbox_targets = bbox_targets[range(num_points), min_area_inds] + + return labels, bbox_targets + + def centerness_target(self, pos_bbox_targets: Tensor) -> Tensor: + """Compute centerness targets. + + Args: + pos_bbox_targets (Tensor): BBox targets of positive bboxes in shape + (num_pos, 4) + + Returns: + Tensor: Centerness target. + """ + # only calculate pos centerness targets, otherwise there may be nan + left_right = pos_bbox_targets[:, [0, 2]] + top_bottom = pos_bbox_targets[:, [1, 3]] + if len(left_right) == 0: + centerness_targets = left_right[..., 0] + else: + centerness_targets = ( + left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * ( + top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(centerness_targets) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/fovea_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/fovea_head.py new file mode 100644 index 0000000000000000000000000000000000000000..89353deac7f0189c1e464288521ee8e4238f0107 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/fovea_head.py @@ -0,0 +1,509 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import DeformConv2d +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig +from ..utils import filter_scores_and_topk, multi_apply +from .anchor_free_head import AnchorFreeHead + +INF = 1e8 + + +class FeatureAlign(BaseModule): + """Feature Align Module. + + Feature Align Module is implemented based on DCN v1. + It uses anchor shape prediction rather than feature map to + predict offsets of deform conv layer. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels in the output feature map. + kernel_size (int): Size of the convolution kernel. + ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``. + deform_groups: (int): Group number of DCN in + FeatureAdaption module. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. 
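+
+    Example:
+        >>> # Hedged sketch: DeformConv2d requires a compiled mmcv build
+        >>> # (typically with CUDA); channel sizes here are illustrative.
+        >>> import torch
+        >>> align = FeatureAlign(16, 16, kernel_size=3, deform_groups=4)
+        >>> feat = torch.rand(1, 16, 8, 8)
+        >>> shape_pred = torch.rand(1, 4, 8, 8)  # mirrors bbox_pred.exp()
+        >>> out = align(feat, shape_pred)
+        >>> assert out.shape == (1, 16, 8, 8)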
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + deform_groups: int = 4, + init_cfg: OptMultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.1, + override=dict(type='Normal', name='conv_adaption', std=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + offset_channels = kernel_size * kernel_size * 2 + self.conv_offset = nn.Conv2d( + 4, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + deform_groups=deform_groups) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor, shape: Tensor) -> Tensor: + """Forward function of feature align module. + + Args: + x (Tensor): Features from the upstream network. + shape (Tensor): Exponential of bbox predictions. + + Returns: + x (Tensor): The aligned features. + """ + offset = self.conv_offset(shape) + x = self.relu(self.conv_adaption(x, offset)) + return x + + +@MODELS.register_module() +class FoveaHead(AnchorFreeHead): + """Detection Head of `FoveaBox: Beyond Anchor-based Object Detector. + + `_. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + base_edge_list (list[int]): List of edges. + scale_ranges (list[tuple]): Range of scales. + sigma (float): Super parameter of ``FoveaHead``. + with_deform (bool): Whether use deform conv. + deform_groups (int): Deformable conv group size. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + base_edge_list: List[int] = (16, 32, 64, 128, 256), + scale_ranges: List[tuple] = ((8, 32), (16, 64), (32, 128), + (64, 256), (128, 512)), + sigma: float = 0.4, + with_deform: bool = False, + deform_groups: int = 4, + init_cfg: OptMultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='conv_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.base_edge_list = base_edge_list + self.scale_ranges = scale_ranges + self.sigma = sigma + self.with_deform = with_deform + self.deform_groups = deform_groups + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + # box branch + super()._init_reg_convs() + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + # cls branch + if not self.with_deform: + super()._init_cls_convs() + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + else: + self.cls_convs = nn.ModuleList() + self.cls_convs.append( + ConvModule( + self.feat_channels, (self.feat_channels * 4), + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.cls_convs.append( + ConvModule((self.feat_channels * 4), (self.feat_channels * 4), + 1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + self.feature_adaption = FeatureAlign( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.conv_cls = nn.Conv2d( + int(self.feat_channels * 4), + self.cls_out_channels, + 3, + padding=1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward features of a single scale 
level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: scores for each class and bbox predictions of input + feature maps. + """ + cls_feat = x + reg_feat = x + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + if self.with_deform: + cls_feat = self.feature_adaption(cls_feat, bbox_pred.exp()) + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + return cls_score, bbox_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=bbox_preds[0].dtype, + device=bbox_preds[0].device) + num_imgs = cls_scores[0].size(0) + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + for bbox_pred in bbox_preds + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_labels, flatten_bbox_targets = self.get_targets( + batch_gt_instances, featmap_sizes, priors) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((flatten_labels >= 0) + & (flatten_labels < self.num_classes)).nonzero().view(-1) + num_pos = len(pos_inds) + + loss_cls = self.loss_cls( + flatten_cls_scores, flatten_labels, avg_factor=num_pos + num_imgs) + if num_pos > 0: + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_weights = pos_bbox_targets.new_ones(pos_bbox_targets.size()) + loss_bbox = self.loss_bbox( + pos_bbox_preds, + pos_bbox_targets, + pos_weights, + avg_factor=num_pos) + else: + loss_bbox = torch.tensor( + 0, + dtype=flatten_bbox_preds.dtype, + device=flatten_bbox_preds.device) + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) + + def get_targets( + self, batch_gt_instances: InstanceList, featmap_sizes: List[tuple], + priors_list: List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Compute regression and classification for priors in multiple images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + featmap_sizes (list[tuple]): Size tuple of feature maps. 
+ priors_list (list[Tensor]): Priors list of each fpn level, each has + shape (num_priors, 2). + + Returns: + tuple: Targets of each level. + + - flatten_labels (list[Tensor]): Labels of each level. + - flatten_bbox_targets (list[Tensor]): BBox targets of each + level. + """ + label_list, bbox_target_list = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_size_list=featmap_sizes, + priors_list=priors_list) + flatten_labels = [ + torch.cat([ + labels_level_img.flatten() for labels_level_img in labels_level + ]) for labels_level in zip(*label_list) + ] + flatten_bbox_targets = [ + torch.cat([ + bbox_targets_level_img.reshape(-1, 4) + for bbox_targets_level_img in bbox_targets_level + ]) for bbox_targets_level in zip(*bbox_target_list) + ] + flatten_labels = torch.cat(flatten_labels) + flatten_bbox_targets = torch.cat(flatten_bbox_targets) + return flatten_labels, flatten_bbox_targets + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_size_list: List[tuple] = None, + priors_list: List[Tensor] = None) -> tuple: + """Compute regression and classification targets for a single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + featmap_size_list (list[tuple]): Size tuple of feature maps. + priors_list (list[Tensor]): Priors of each fpn level, each has + shape (num_priors, 2). + + Returns: + tuple: + + - label_list (list[Tensor]): Labels of all anchors in the image. + - box_target_list (list[Tensor]): BBox targets of all anchors in + the image. + """ + gt_bboxes_raw = gt_instances.bboxes + gt_labels_raw = gt_instances.labels + gt_areas = torch.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * + (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) + label_list = [] + bbox_target_list = [] + # for each pyramid, find the cls and box target + for base_len, (lower_bound, upper_bound), stride, featmap_size, \ + priors in zip(self.base_edge_list, self.scale_ranges, + self.strides, featmap_size_list, priors_list): + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + priors = priors.view(*featmap_size, 2) + x, y = priors[..., 0], priors[..., 1] + labels = gt_labels_raw.new_full(featmap_size, self.num_classes) + bbox_targets = gt_bboxes_raw.new_ones(featmap_size[0], + featmap_size[1], 4) + # scale assignment + hit_indices = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero().flatten() + if len(hit_indices) == 0: + label_list.append(labels) + bbox_target_list.append(torch.log(bbox_targets)) + continue + _, hit_index_order = torch.sort(-gt_areas[hit_indices]) + hit_indices = hit_indices[hit_index_order] + gt_bboxes = gt_bboxes_raw[hit_indices, :] / stride + gt_labels = gt_labels_raw[hit_indices] + half_w = 0.5 * (gt_bboxes[:, 2] - gt_bboxes[:, 0]) + half_h = 0.5 * (gt_bboxes[:, 3] - gt_bboxes[:, 1]) + # valid fovea area: left, right, top, down + pos_left = torch.ceil( + gt_bboxes[:, 0] + (1 - self.sigma) * half_w - 0.5).long(). \ + clamp(0, featmap_size[1] - 1) + pos_right = torch.floor( + gt_bboxes[:, 0] + (1 + self.sigma) * half_w - 0.5).long(). \ + clamp(0, featmap_size[1] - 1) + pos_top = torch.ceil( + gt_bboxes[:, 1] + (1 - self.sigma) * half_h - 0.5).long(). \ + clamp(0, featmap_size[0] - 1) + pos_down = torch.floor( + gt_bboxes[:, 1] + (1 + self.sigma) * half_h - 0.5).long(). 
\ + clamp(0, featmap_size[0] - 1) + for px1, py1, px2, py2, label, (gt_x1, gt_y1, gt_x2, gt_y2) in \ + zip(pos_left, pos_top, pos_right, pos_down, gt_labels, + gt_bboxes_raw[hit_indices, :]): + labels[py1:py2 + 1, px1:px2 + 1] = label + bbox_targets[py1:py2 + 1, px1:px2 + 1, 0] = \ + (x[py1:py2 + 1, px1:px2 + 1] - gt_x1) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 1] = \ + (y[py1:py2 + 1, px1:px2 + 1] - gt_y1) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 2] = \ + (gt_x2 - x[py1:py2 + 1, px1:px2 + 1]) / base_len + bbox_targets[py1:py2 + 1, px1:px2 + 1, 3] = \ + (gt_y2 - y[py1:py2 + 1, px1:px2 + 1]) / base_len + bbox_targets = bbox_targets.clamp(min=1. / 16, max=16.) + label_list.append(labels) + bbox_target_list.append(torch.log(bbox_targets)) + return label_list, bbox_target_list + + # Same as base_dense_head/_predict_by_feat_single except self._bbox_decode + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, has shape + (num_priors, 2). + img_meta (dict): Image meta info. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_score_list) == len(bbox_pred_list) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for level_idx, (cls_score, bbox_pred, stride, base_len, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, self.strides, + self.base_edge_list, mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. 
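+            # Note: filter_scores_and_topk first drops entries below
+            # cfg.score_thr, then keeps at most `nms_pre` of the remaining
+            # (score, class) pairs and filters the auxiliary dict
+            # (bbox_pred, priors) with the same kept indices.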
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(bbox_pred=bbox_pred, priors=priors))
+            scores, labels, _, filtered_results = results
+
+            bbox_pred = filtered_results['bbox_pred']
+            priors = filtered_results['priors']
+
+            bboxes = self._bbox_decode(priors, bbox_pred, base_len, img_shape)
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_labels.append(labels)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.labels = torch.cat(mlvl_labels)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
+
+    def _bbox_decode(self, priors: Tensor, bbox_pred: Tensor, base_len: int,
+                     max_shape: Tuple[int, int]) -> Tensor:
+        """Function to decode bbox.
+
+        Args:
+            priors (Tensor): Center priors of an image, has shape
+                (num_instances, 2).
+            bbox_pred (Tensor): Box energies / deltas for all instances,
+                has shape (num_instances, 4).
+            base_len (int): The base length.
+            max_shape (Tuple[int, int]): The (h, w) shape used to clamp the
+                decoded boxes.
+
+        Returns:
+            Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has
+            shape (num_instances, 4).
+        """
+        bbox_pred = bbox_pred.exp()
+
+        y = priors[:, 1]
+        x = priors[:, 0]
+        x1 = (x - base_len * bbox_pred[:, 0]). \
+            clamp(min=0, max=max_shape[1] - 1)
+        y1 = (y - base_len * bbox_pred[:, 1]). \
+            clamp(min=0, max=max_shape[0] - 1)
+        x2 = (x + base_len * bbox_pred[:, 2]). \
+            clamp(min=0, max=max_shape[1] - 1)
+        y2 = (y + base_len * bbox_pred[:, 3]). \
+            clamp(min=0, max=max_shape[0] - 1)
+        decoded_bboxes = torch.stack([x1, y1, x2, y2], -1)
+        return decoded_bboxes
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/free_anchor_retina_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/free_anchor_retina_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..df6fb9202c32735121bf7738e332fbfc5ac7e6bd
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/free_anchor_retina_head.py
@@ -0,0 +1,312 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import InstanceList, OptConfigType, OptInstanceList
+from ..utils import multi_apply
+from .retina_head import RetinaHead
+
+EPS = 1e-12
+
+
+@MODELS.register_module()
+class FreeAnchorRetinaHead(RetinaHead):
+    """FreeAnchor RetinaHead used in https://arxiv.org/abs/1909.02466.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Defaults to 4.
+        conv_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to
+            construct and config conv layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to
+            construct and config norm layer. Defaults to None.
+        pre_anchor_topk (int): Number of boxes taken in each bag.
+            Defaults to 50.
+        bbox_thr (float): The threshold of the saturated linear function.
+            It is usually the same as the IoU threshold used in NMS.
+            Defaults to 0.6.
+        gamma (float): Gamma parameter in focal loss. Defaults to 2.0.
+        alpha (float): Alpha parameter in focal loss. Defaults to 0.5.
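+
+    Example:
+        >>> # Sketch of the saturated linear function that turns IoU into
+        >>> # a matching probability in positive_loss_single below; the
+        >>> # IoU values are hypothetical.
+        >>> import torch
+        >>> iou = torch.tensor([[0.55, 0.70, 0.85]])    # one gt, 3 anchors
+        >>> t1 = 0.6                                    # bbox_thr
+        >>> t2 = iou.max(dim=1, keepdim=True).values    # per-gt max IoU
+        >>> ((iou - t1) / (t2 - t1)).clamp(min=0, max=1)
+        tensor([[0.0000, 0.4000, 1.0000]])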
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + pre_anchor_topk: int = 50, + bbox_thr: float = 0.6, + gamma: float = 2.0, + alpha: float = 0.5, + **kwargs) -> None: + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + stacked_convs=stacked_convs, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs) + + self.pre_anchor_topk = pre_anchor_topk + self.bbox_thr = bbox_thr + self.gamma = gamma + self.alpha = alpha + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, _ = self.get_anchors( + featmap_sizes=featmap_sizes, + batch_img_metas=batch_img_metas, + device=device) + concat_anchor_list = [torch.cat(anchor) for anchor in anchor_list] + + # concatenate each level + cls_scores = [ + cls.permute(0, 2, 3, + 1).reshape(cls.size(0), -1, self.cls_out_channels) + for cls in cls_scores + ] + bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(bbox_pred.size(0), -1, 4) + for bbox_pred in bbox_preds + ] + cls_scores = torch.cat(cls_scores, dim=1) + cls_probs = torch.sigmoid(cls_scores) + bbox_preds = torch.cat(bbox_preds, dim=1) + + box_probs, positive_losses, num_pos_list = multi_apply( + self.positive_loss_single, cls_probs, bbox_preds, + concat_anchor_list, batch_gt_instances) + + num_pos = sum(num_pos_list) + positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) + + # box_prob: P{a_{j} \in A_{+}} + box_probs = torch.stack(box_probs, dim=0) + + # negative_loss: + # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B|| + negative_loss = self.negative_bag_loss(cls_probs, box_probs).sum() / \ + max(1, num_pos * self.pre_anchor_topk) + + # avoid the absence of gradients in regression subnet + # when no ground-truth in a batch + if num_pos == 0: + positive_loss = bbox_preds.sum() * 0 + + losses = { + 'positive_bag_loss': positive_loss, + 'negative_bag_loss': negative_loss + } + return losses + + def positive_loss_single(self, cls_prob: Tensor, bbox_pred: Tensor, + flat_anchors: Tensor, + gt_instances: InstanceData) -> tuple: + """Compute positive loss. + + Args: + cls_prob (Tensor): Classification probability of shape + (num_anchors, num_classes). + bbox_pred (Tensor): Box probability of shape (num_anchors, 4). 
+ flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + + - box_prob (Tensor): Box probability of shape (num_anchors, 4). + - positive_loss (Tensor): Positive loss of shape (num_pos, ). + - num_pos (int): positive samples indexes. + """ + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + with torch.no_grad(): + if len(gt_bboxes) == 0: + image_box_prob = torch.zeros( + flat_anchors.size(0), + self.cls_out_channels).type_as(bbox_pred) + else: + # box_localization: a_{j}^{loc}, shape: [j, 4] + pred_boxes = self.bbox_coder.decode(flat_anchors, bbox_pred) + + # object_box_iou: IoU_{ij}^{loc}, shape: [i, j] + object_box_iou = bbox_overlaps(gt_bboxes, pred_boxes) + + # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j] + t1 = self.bbox_thr + t2 = object_box_iou.max( + dim=1, keepdim=True).values.clamp(min=t1 + 1e-12) + object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp( + min=0, max=1) + + # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j] + num_obj = gt_labels.size(0) + indices = torch.stack( + [torch.arange(num_obj).type_as(gt_labels), gt_labels], + dim=0) + object_cls_box_prob = torch.sparse_coo_tensor( + indices, object_box_prob) + + # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j] + """ + from "start" to "end" implement: + image_box_iou = torch.sparse.max(object_cls_box_prob, + dim=0).t() + + """ + # start + box_cls_prob = torch.sparse.sum( + object_cls_box_prob, dim=0).to_dense() + + indices = torch.nonzero(box_cls_prob, as_tuple=False).t_() + if indices.numel() == 0: + image_box_prob = torch.zeros( + flat_anchors.size(0), + self.cls_out_channels).type_as(object_box_prob) + else: + nonzero_box_prob = torch.where( + (gt_labels.unsqueeze(dim=-1) == indices[0]), + object_box_prob[:, indices[1]], + torch.tensor( + [0]).type_as(object_box_prob)).max(dim=0).values + + # upmap to shape [j, c] + image_box_prob = torch.sparse_coo_tensor( + indices.flip([0]), + nonzero_box_prob, + size=(flat_anchors.size(0), + self.cls_out_channels)).to_dense() + # end + box_prob = image_box_prob + + # construct bags for objects + match_quality_matrix = bbox_overlaps(gt_bboxes, flat_anchors) + _, matched = torch.topk( + match_quality_matrix, self.pre_anchor_topk, dim=1, sorted=False) + del match_quality_matrix + + # matched_cls_prob: P_{ij}^{cls} + matched_cls_prob = torch.gather( + cls_prob[matched], 2, + gt_labels.view(-1, 1, 1).repeat(1, self.pre_anchor_topk, + 1)).squeeze(2) + + # matched_box_prob: P_{ij}^{loc} + matched_anchors = flat_anchors[matched] + matched_object_targets = self.bbox_coder.encode( + matched_anchors, + gt_bboxes.unsqueeze(dim=1).expand_as(matched_anchors)) + loss_bbox = self.loss_bbox( + bbox_pred[matched], + matched_object_targets, + reduction_override='none').sum(-1) + matched_box_prob = torch.exp(-loss_bbox) + + # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )} + num_pos = len(gt_bboxes) + positive_loss = self.positive_bag_loss(matched_cls_prob, + matched_box_prob) + + return box_prob, positive_loss, num_pos + + def positive_bag_loss(self, matched_cls_prob: Tensor, + matched_box_prob: Tensor) -> Tensor: + """Compute positive bag loss. + + :math:`-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )`. + + :math:`P_{ij}^{cls}`: matched_cls_prob, classification probability of matched samples. 
+
+        :math:`P_{ij}^{loc}`: matched_box_prob, box probability of matched samples.
+
+        Args:
+            matched_cls_prob (Tensor): Classification probability of matched
+                samples in shape (num_gt, pre_anchor_topk).
+            matched_box_prob (Tensor): BBox probability of matched samples,
+                in shape (num_gt, pre_anchor_topk).
+
+        Returns:
+            Tensor: Positive bag loss in shape (num_gt,).
+        """  # noqa: E501, W605
+        # bag_prob = Mean-max(matched_prob)
+        matched_prob = matched_cls_prob * matched_box_prob
+        weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
+        weight /= weight.sum(dim=1).unsqueeze(dim=-1)
+        bag_prob = (weight * matched_prob).sum(dim=1)
+        # positive_bag_loss = -self.alpha * log(bag_prob)
+        return self.alpha * F.binary_cross_entropy(
+            bag_prob, torch.ones_like(bag_prob), reduction='none')
+
+    def negative_bag_loss(self, cls_prob: Tensor, box_prob: Tensor) -> Tensor:
+        """Compute negative bag loss.
+
+        :math:`FL((1 - P_{a_{j} \in A_{+}}) * (1 - P_{j}^{bg}))`.
+
+        :math:`P_{a_{j} \in A_{+}}`: Box probability of matched samples.
+
+        :math:`P_{j}^{bg}`: Classification probability of negative samples.
+
+        Args:
+            cls_prob (Tensor): Classification probability, in shape
+                (num_img, num_anchors, num_classes).
+            box_prob (Tensor): Box probability, in shape
+                (num_img, num_anchors, num_classes).
+
+        Returns:
+            Tensor: Negative bag loss in shape (num_img, num_anchors,
+            num_classes).
+        """  # noqa: E501, W605
+        prob = cls_prob * (1 - box_prob)
+        # There are some cases when neg_prob = 0.
+        # This will cause the neg_prob.log() to be inf without clamp.
+        prob = prob.clamp(min=EPS, max=1 - EPS)
+        negative_bag_loss = prob**self.gamma * F.binary_cross_entropy(
+            prob, torch.zeros_like(prob), reduction='none')
+        return (1 - self.alpha) * negative_bag_loss
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/fsaf_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/fsaf_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a01c487406693253eb17b883cac9ed06cf95802
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/fsaf_head.py
@@ -0,0 +1,458 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList, OptInstanceList, OptMultiConfig
+from ..losses.accuracy import accuracy
+from ..losses.utils import weight_reduce_loss
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import images_to_levels, multi_apply, unmap
+from .retina_head import RetinaHead
+
+
+@MODELS.register_module()
+class FSAFHead(RetinaHead):
+    """Anchor-free head used in `FSAF <https://arxiv.org/abs/1903.00621>`_.
+
+    The head contains two subnetworks. The first classifies anchor boxes and
+    the second regresses deltas for the anchors (num_anchors is 1 for
+    anchor-free methods).
+
+    Args:
+        *args: Same as its base class in :class:`RetinaHead`
+        score_threshold (float, optional): The score threshold used to
+            calculate positive recall. If given, predictions with scores
+            lower than this value are counted as incorrect. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+ **kwargs: Same as its base class in :class:`RetinaHead` + + Example: + >>> import torch + >>> self = FSAFHead(11, 7) + >>> x = torch.rand(1, 7, 32, 32) + >>> cls_score, bbox_pred = self.forward_single(x) + >>> # Each anchor predicts a score for each class except background + >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors + >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors + >>> assert cls_per_anchor == self.num_classes + >>> assert box_per_anchor == 4 + """ + + def __init__(self, + *args, + score_threshold: Optional[float] = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + # The positive bias in self.retina_reg conv is to prevent predicted \ + # bbox with 0 area + if init_cfg is None: + init_cfg = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=[ + dict( + type='Normal', + name='retina_cls', + std=0.01, + bias_prob=0.01), + dict( + type='Normal', name='retina_reg', std=0.01, bias=0.25) + ]) + super().__init__(*args, init_cfg=init_cfg, **kwargs) + self.score_threshold = score_threshold + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature map of a single scale level. + + Args: + x (Tensor): Feature map of a single scale level. + + Returns: + tuple[Tensor, Tensor]: + + - cls_score (Tensor): Box scores for each scale level Has \ + shape (N, num_points * num_classes, H, W). + - bbox_pred (Tensor): Box energies / deltas for each scale \ + level with shape (N, num_points * 4, H, W). + """ + cls_score, bbox_pred = super().forward_single(x) + # relu: TBLR encoder only accepts positive bbox_pred + return cls_score, self.relu(bbox_pred) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in a + single image. + + Most of the codes are the same with the base class :obj: `AnchorHead`, + except that it also collects and returns the matched gt index in the + image (from 0 to num_gt-1). If the anchor bbox is not matched to any + gt, the corresponding value in pos_gt_inds is -1. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors, ). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # Assign gt and sample anchors + anchors = flat_anchors[inside_flags.type(torch.bool), :] + + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros( + (num_valid_anchors, self.cls_out_channels), dtype=torch.float) + pos_gt_inds = anchors.new_full((num_valid_anchors, ), + -1, + dtype=torch.long) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, both + # the predicted boxes and regression targets should be with + # absolute coordinate format. + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + # The assigned gt_index for each anchor. (0-based) + pos_gt_inds[pos_inds] = sampling_result.pos_assigned_gt_inds + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # shadowed_labels is a tensor composed of tuples + # (anchor_inds, class_label) that indicate those anchors lying in the + # outer region of a gt or overlapped by another gt with a smaller + # area. + # + # Therefore, only the shadowed labels are ignored for loss calculation. + # the key `shadowed_labels` is defined in :obj:`CenterRegionAssigner` + shadowed_labels = assign_result.get_extra_property('shadowed_labels') + if shadowed_labels is not None and shadowed_labels.numel(): + if len(shadowed_labels.shape) == 2: + idx_, label_ = shadowed_labels[:, 0], shadowed_labels[:, 1] + assert (labels[idx_] != label_).all(), \ + 'One label cannot be both positive and ignored' + label_weights[idx_, label_] = 0 + else: + label_weights[shadowed_labels] = 0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + pos_gt_inds = unmap( + pos_gt_inds, num_total_anchors, inside_flags, fill=-1) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result, pos_gt_inds) + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W). 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + for i in range(len(bbox_preds)): # loop over fpn level + # avoid 0 area of the predicted bbox + bbox_preds[i] = bbox_preds[i].clamp(min=1e-4) + # TODO: It may directly use the base-class loss function. + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + batch_size = len(batch_img_metas) + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results_list, + pos_assigned_gt_inds_list) = cls_reg_targets + + num_gts = np.array(list(map(len, batch_gt_instances))) + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + + # `pos_assigned_gt_inds_list` (length: fpn_levels) stores the assigned + # gt index of each anchor bbox in each fpn level. + cum_num_gts = list(np.cumsum(num_gts)) # length of batch_size + for i, assign in enumerate(pos_assigned_gt_inds_list): + # loop over fpn levels + for j in range(1, batch_size): + # loop over batch size + # Convert gt indices in each img to those in the batch + assign[j][assign[j] >= 0] += int(cum_num_gts[j - 1]) + pos_assigned_gt_inds_list[i] = assign.flatten() + labels_list[i] = labels_list[i].flatten() + num_gts = num_gts.sum() # total number of gt in the batch + # The unique label index of each gt in the batch + label_sequence = torch.arange(num_gts, device=device) + # Collect the average loss of each gt in each level + with torch.no_grad(): + loss_levels, = multi_apply( + self.collect_loss_level_single, + losses_cls, + losses_bbox, + pos_assigned_gt_inds_list, + labels_seq=label_sequence) + # Shape: (fpn_levels, num_gts). Loss of each gt at each fpn level + loss_levels = torch.stack(loss_levels, dim=0) + # Locate the best fpn level for loss back-propagation + if loss_levels.numel() == 0: # zero gt + argmin = loss_levels.new_empty((num_gts, ), dtype=torch.long) + else: + _, argmin = loss_levels.min(dim=0) + + # Reweight the loss of each (anchor, label) pair, so that only those + # at the best gt level are back-propagated. 
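+        # `argmin` is FSAF's online feature selection result: for every gt,
+        # the FPN level whose summed cls + reg loss is smallest. Anchors
+        # assigned to a gt at any other level get zero weight inside
+        # reweight_loss_single below.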
+ losses_cls, losses_bbox, pos_inds = multi_apply( + self.reweight_loss_single, + losses_cls, + losses_bbox, + pos_assigned_gt_inds_list, + labels_list, + list(range(len(losses_cls))), + min_levels=argmin) + num_pos = torch.cat(pos_inds, 0).sum().float() + pos_recall = self.calculate_pos_recall(cls_scores, labels_list, + pos_inds) + + if num_pos == 0: # No gt + num_total_neg = sum( + [results.num_neg for results in sampling_results_list]) + avg_factor = num_pos + num_total_neg + else: + avg_factor = num_pos + for i in range(len(losses_cls)): + losses_cls[i] /= avg_factor + losses_bbox[i] /= avg_factor + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + num_pos=num_pos / batch_size, + pos_recall=pos_recall) + + def calculate_pos_recall(self, cls_scores: List[Tensor], + labels_list: List[Tensor], + pos_inds: List[Tensor]) -> Tensor: + """Calculate positive recall with score threshold. + + Args: + cls_scores (list[Tensor]): Classification scores at all fpn levels. + Each tensor is in shape (N, num_classes * num_anchors, H, W) + labels_list (list[Tensor]): The label that each anchor is assigned + to. Shape (N * H * W * num_anchors, ) + pos_inds (list[Tensor]): List of bool tensors indicating whether + the anchor is assigned to a positive label. + Shape (N * H * W * num_anchors, ) + + Returns: + Tensor: A single float number indicating the positive recall. + """ + with torch.no_grad(): + num_class = self.num_classes + scores = [ + cls.permute(0, 2, 3, 1).reshape(-1, num_class)[pos] + for cls, pos in zip(cls_scores, pos_inds) + ] + labels = [ + label.reshape(-1)[pos] + for label, pos in zip(labels_list, pos_inds) + ] + scores = torch.cat(scores, dim=0) + labels = torch.cat(labels, dim=0) + if self.use_sigmoid_cls: + scores = scores.sigmoid() + else: + scores = scores.softmax(dim=1) + + return accuracy(scores, labels, thresh=self.score_threshold) + + def collect_loss_level_single(self, cls_loss: Tensor, reg_loss: Tensor, + assigned_gt_inds: Tensor, + labels_seq: Tensor) -> Tensor: + """Get the average loss in each FPN level w.r.t. each gt label. + + Args: + cls_loss (Tensor): Classification loss of each feature map pixel, + shape (num_anchor, num_class) + reg_loss (Tensor): Regression loss of each feature map pixel, + shape (num_anchor, 4) + assigned_gt_inds (Tensor): It indicates which gt the prior is + assigned to (0-based, -1: no assignment). shape (num_anchor), + labels_seq: The rank of labels. shape (num_gt) + + Returns: + Tensor: shape (num_gt), average loss of each gt in this level + """ + if len(reg_loss.shape) == 2: # iou loss has shape (num_prior, 4) + reg_loss = reg_loss.sum(dim=-1) # sum loss in tblr dims + if len(cls_loss.shape) == 2: + cls_loss = cls_loss.sum(dim=-1) # sum loss in class dims + loss = cls_loss + reg_loss + assert loss.size(0) == assigned_gt_inds.size(0) + # Default loss value is 1e6 for a layer where no anchor is positive + # to ensure it will not be chosen to back-propagate gradient + losses_ = loss.new_full(labels_seq.shape, 1e6) + for i, l in enumerate(labels_seq): + match = assigned_gt_inds == l + if match.any(): + losses_[i] = loss[match].mean() + return losses_, + + def reweight_loss_single(self, cls_loss: Tensor, reg_loss: Tensor, + assigned_gt_inds: Tensor, labels: Tensor, + level: int, min_levels: Tensor) -> tuple: + """Reweight loss values at each level. + + Reassign loss values at each level by masking those where the + pre-calculated loss is too large. Then return the reduced losses. 
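+
+        For example, if ``min_levels`` marks level 3 as a gt's best level,
+        then at ``level`` 2 every anchor assigned to that gt has its
+        regression weight zeroed and its gt-class column zeroed in the
+        classification weight, so only level 3 back-propagates for that gt.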
+ + Args: + cls_loss (Tensor): Element-wise classification loss. + Shape: (num_anchors, num_classes) + reg_loss (Tensor): Element-wise regression loss. + Shape: (num_anchors, 4) + assigned_gt_inds (Tensor): The gt indices that each anchor bbox + is assigned to. -1 denotes a negative anchor, otherwise it is the + gt index (0-based). Shape: (num_anchors, ), + labels (Tensor): Label assigned to anchors. Shape: (num_anchors, ). + level (int): The current level index in the pyramid + (0-4 for RetinaNet) + min_levels (Tensor): The best-matching level for each gt. + Shape: (num_gts, ), + + Returns: + tuple: + + - cls_loss: Reduced corrected classification loss. Scalar. + - reg_loss: Reduced corrected regression loss. Scalar. + - pos_flags (Tensor): Corrected bool tensor indicating the \ + final positive anchors. Shape: (num_anchors, ). + """ + loc_weight = torch.ones_like(reg_loss) + cls_weight = torch.ones_like(cls_loss) + pos_flags = assigned_gt_inds >= 0 # positive pixel flag + pos_indices = torch.nonzero(pos_flags, as_tuple=False).flatten() + + if pos_flags.any(): # pos pixels exist + pos_assigned_gt_inds = assigned_gt_inds[pos_flags] + zeroing_indices = (min_levels[pos_assigned_gt_inds] != level) + neg_indices = pos_indices[zeroing_indices] + + if neg_indices.numel(): + pos_flags[neg_indices] = 0 + loc_weight[neg_indices] = 0 + # Only the weight corresponding to the label is + # zeroed out if not selected + zeroing_labels = labels[neg_indices] + assert (zeroing_labels >= 0).all() + cls_weight[neg_indices, zeroing_labels] = 0 + + # Weighted loss for both cls and reg loss + cls_loss = weight_reduce_loss(cls_loss, cls_weight, reduction='sum') + reg_loss = weight_reduce_loss(reg_loss, loc_weight, reduction='sum') + + return cls_loss, reg_loss, pos_flags diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/ga_retina_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/ga_retina_head.py new file mode 100644 index 0000000000000000000000000000000000000000..569910b365126e90638256f0d10addfa230fd141 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/ga_retina_head.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
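+# GARetinaHead below predicts four per-level outputs: cls_score and
+# bbox_pred (as in RetinaHead, but computed with MaskedConv2d), shape_pred
+# (num_anchors * 2 channels of guided-anchor w/h deltas) and loc_pred (a
+# 1-channel anchor-location map). At test time, cells whose
+# loc_pred.sigmoid() falls below loc_filter_thr are masked out, so the
+# final convs skip likely-background locations.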
+from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import MaskedConv2d +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig +from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead + + +@MODELS.register_module() +class GARetinaHead(GuidedAnchorHead): + """Guided-Anchor-based RetinaNet head.""" + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + if init_cfg is None: + init_cfg = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=[ + dict( + type='Normal', + name='conv_loc', + std=0.01, + bias_prob=0.01), + dict( + type='Normal', + name='retina_cls', + std=0.01, + bias_prob=0.01) + ]) + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + + self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1) + num_anchors = self.square_anchor_generator.num_base_priors[0] + self.conv_shape = nn.Conv2d(self.feat_channels, num_anchors * 2, 1) + self.feature_adaption_cls = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.feature_adaption_reg = FeatureAdaption( + self.feat_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.retina_cls = MaskedConv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.retina_reg = MaskedConv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor]: + """Forward feature map of a single scale level.""" + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + + loc_pred = self.conv_loc(cls_feat) + shape_pred = self.conv_shape(reg_feat) + + cls_feat = self.feature_adaption_cls(cls_feat, shape_pred) + reg_feat = self.feature_adaption_reg(reg_feat, shape_pred) + + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.retina_cls(cls_feat, mask) + bbox_pred = self.retina_reg(reg_feat, mask) + return cls_score, bbox_pred, shape_pred, loc_pred diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/ga_rpn_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/ga_rpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9614463165533358b8465420a87dfa47e7de1177 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/ga_rpn_head.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
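+# GARPNHead below adapts GuidedAnchorHead to proposal generation: a shared
+# 3x3 rpn_conv precedes the guided-anchor branches, the losses are renamed
+# with an `rpn_` prefix, and _predict_by_feat_single only supports naive
+# NMS, keeping at most `cfg.nms_pre` scoring locations per level.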
+import copy
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.ops import nms
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList
+from .guided_anchor_head import GuidedAnchorHead
+
+
+@MODELS.register_module()
+class GARPNHead(GuidedAnchorHead):
+    """Guided-Anchor-based RPN head."""
+
+    def __init__(self,
+                 in_channels: int,
+                 num_classes: int = 1,
+                 init_cfg: MultiConfig = dict(
+                     type='Normal',
+                     layer='Conv2d',
+                     std=0.01,
+                     override=dict(
+                         type='Normal',
+                         name='conv_loc',
+                         std=0.01,
+                         bias_prob=0.01)),
+                 **kwargs) -> None:
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.rpn_conv = nn.Conv2d(
+            self.in_channels, self.feat_channels, 3, padding=1)
+        super(GARPNHead, self)._init_layers()
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward feature of a single scale level."""
+
+        x = self.rpn_conv(x)
+        x = F.relu(x, inplace=True)
+        (cls_score, bbox_pred, shape_pred,
+         loc_pred) = super().forward_single(x)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def loss_by_feat(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            shape_preds: List[Tensor],
+            loc_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> dict:
+        """Calculate the loss based on the features extracted by the detection
+        head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level
+                has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            shape_preds (list[Tensor]): shape predictions for each scale
+                level with shape (N, num_anchors * 2, H, W).
+            loc_preds (list[Tensor]): location predictions for each scale
+                level with shape (N, 1, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            dict: A dictionary of loss components.
+        """
+        losses = super().loss_by_feat(
+            cls_scores,
+            bbox_preds,
+            shape_preds,
+            loc_preds,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore)
+        return dict(
+            loss_rpn_cls=losses['loss_cls'],
+            loss_rpn_bbox=losses['loss_bbox'],
+            loss_anchor_shape=losses['loss_shape'],
+            loss_anchor_loc=losses['loss_loc'])
+
+    def _predict_by_feat_single(self,
+                                cls_scores: List[Tensor],
+                                bbox_preds: List[Tensor],
+                                mlvl_anchors: List[Tensor],
+                                mlvl_masks: List[Tensor],
+                                img_meta: dict,
+                                cfg: ConfigType,
+                                rescale: bool = False) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+ bbox_preds (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + mlvl_anchors (list[Tensor]): Each element in the list is + the anchors of a single level in feature pyramid. it has + shape (num_priors, 4). + mlvl_masks (list[Tensor]): Each element in the list is location + masks of a single level. + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict` or dict): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last + dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + assert cfg.nms.get('type', 'nms') == 'nms', 'GARPNHead only support ' \ + 'naive nms.' + + mlvl_proposals = [] + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + anchors = mlvl_anchors[idx] + mask = mlvl_masks[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = rpn_cls_score.softmax(dim=1)[:, :-1] + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, + 4)[mask, :] + if scores.dim() == 0: + rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0) + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. scores + if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: + _, topk_inds = scores.topk(cfg.nms_pre) + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + scores = scores[topk_inds] + # get proposals w.r.t. 
anchors and rpn_bbox_pred + proposals = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_meta['img_shape']) + # filter out too small bboxes + if cfg.min_bbox_size >= 0: + w = proposals[:, 2] - proposals[:, 0] + h = proposals[:, 3] - proposals[:, 1] + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + proposals = proposals[valid_mask] + scores = scores[valid_mask] + + # NMS in current level + proposals, _ = nms(proposals, scores, cfg.nms.iou_threshold) + proposals = proposals[:cfg.nms_post, :] + mlvl_proposals.append(proposals) + proposals = torch.cat(mlvl_proposals, 0) + if cfg.get('nms_across_levels', False): + # NMS across multi levels + proposals, _ = nms(proposals[:, :4], proposals[:, -1], + cfg.nms.iou_threshold) + proposals = proposals[:cfg.max_per_img, :] + else: + scores = proposals[:, 4] + num = min(cfg.max_per_img, proposals.shape[0]) + _, topk_inds = scores.topk(num) + proposals = proposals[topk_inds, :] + + bboxes = proposals[:, :-1] + scores = proposals[:, -1] + if rescale: + assert img_meta.get('scale_factor') is not None + bboxes /= bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + + results = InstanceData() + results.bboxes = bboxes + results.scores = scores + results.labels = scores.new_zeros(scores.size(0), dtype=torch.long) + return results diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/gfl_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/gfl_head.py new file mode 100644 index 0000000000000000000000000000000000000000..be43d9b4da39da602b3b87bd3c9739c67367615b --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/gfl_head.py @@ -0,0 +1,667 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import anchor_inside_flags +from ..task_modules.samplers import PseudoSampler +from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply, + unmap) +from .anchor_head import AnchorHead + + +class Integral(nn.Module): + """A fixed layer for calculating integral result from distribution. + + This layer calculates the target location by :math: ``sum{P(y_i) * y_i}``, + P(y_i) denotes the softmax vector that represents the discrete distribution + y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max} + + Args: + reg_max (int): The maximal value of the discrete set. Defaults to 16. + You may want to reset it according to your new dataset or related + settings. + """ + + def __init__(self, reg_max: int = 16) -> None: + super().__init__() + self.reg_max = reg_max + self.register_buffer('project', + torch.linspace(0, self.reg_max, self.reg_max + 1)) + + def forward(self, x: Tensor) -> Tensor: + """Forward feature from the regression head to get integral result of + bounding box location. + + Args: + x (Tensor): Features of the regression head, shape (N, 4*(n+1)), + n is self.reg_max. + + Returns: + x (Tensor): Integral result of box locations, i.e., distance + offsets from the box center in four directions, shape (N, 4). 
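        Example (editor's addition; assumes the ``Integral`` definition
        above — the layer turns per-side distribution logits into four
        expected distances):

            >>> import torch
            >>> layer = Integral(reg_max=16)
            >>> logits = torch.rand(8, 4 * (16 + 1))
            >>> layer(logits).shape
            torch.Size([8, 4])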
+ """ + x = F.softmax(x.reshape(-1, self.reg_max + 1), dim=1) + x = F.linear(x, self.project.type_as(x)).reshape(-1, 4) + return x + + +@MODELS.register_module() +class GFLHead(AnchorHead): + """Generalized Focal Loss: Learning Qualified and Distributed Bounding + Boxes for Dense Object Detection. + + GFL head structure is similar with ATSS, however GFL uses + 1) joint representation for classification and localization quality, and + 2) flexible General distribution for bounding box locations, + which are supervised by + Quality Focal Loss (QFL) and Distribution Focal Loss (DFL), respectively + + https://arxiv.org/abs/2006.04388 + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Defaults to 4. + conv_cfg (:obj:`ConfigDict` or dict, optional): dictionary to construct + and config conv layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. Default: dict(type='GN', num_groups=32, + requires_grad=True). + loss_qfl (:obj:`ConfigDict` or dict): Config of Quality Focal Loss + (QFL). + bbox_coder (:obj:`ConfigDict` or dict): Config of bbox coder. Defaults + to 'DistancePointBBoxCoder'. + reg_max (int): Max value of integral set :math: ``{0, ..., reg_max}`` + in QFL setting. Defaults to 16. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. + Example: + >>> self = GFLHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_quality_score, bbox_pred = self.forward(feats) + >>> assert len(cls_quality_score) == len(self.scales) + """ + + def __init__(self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + loss_dfl: ConfigType = dict( + type='DistributionFocalLoss', loss_weight=0.25), + bbox_coder: ConfigType = dict(type='DistancePointBBoxCoder'), + reg_max: int = 16, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='gfl_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.reg_max = reg_max + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + bbox_coder=bbox_coder, + init_cfg=init_cfg, + **kwargs) + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + self.integral = Integral(self.reg_max) + self.loss_dfl = MODELS.build(loss_dfl) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU() + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + assert self.num_anchors == 1, 'anchor 
free version' + self.gfl_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.gfl_reg = nn.Conv2d( + self.feat_channels, 4 * (self.reg_max + 1), 3, padding=1) + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + + - cls_scores (list[Tensor]): Classification and quality (IoU) + joint scores for all scale levels, each is a 4D-tensor, + the channel number is num_classes. + - bbox_preds (list[Tensor]): Box distribution logits for all + scale levels, each is a 4D-tensor, the channel number is + 4*(n+1), n is max value of integral set. + """ + return multi_apply(self.forward_single, x, self.scales) + + def forward_single(self, x: Tensor, scale: Scale) -> Sequence[Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + + Returns: + tuple: + + - cls_score (Tensor): Cls and quality joint scores for a single + scale level the channel number is num_classes. + - bbox_pred (Tensor): Box distribution logits for a single scale + level, the channel number is 4*(n+1), n is max value of + integral set. + """ + cls_feat = x + reg_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + reg_feat = reg_conv(reg_feat) + cls_score = self.gfl_cls(cls_feat) + bbox_pred = scale(self.gfl_reg(reg_feat)).float() + return cls_score, bbox_pred + + def anchor_center(self, anchors: Tensor) -> Tensor: + """Get anchor centers from anchors. + + Args: + anchors (Tensor): Anchor list with shape (N, 4), ``xyxy`` format. + + Returns: + Tensor: Anchor centers with shape (N, 2), ``xy`` format. + """ + anchors_cx = (anchors[..., 2] + anchors[..., 0]) / 2 + anchors_cy = (anchors[..., 3] + anchors[..., 1]) / 2 + return torch.stack([anchors_cx, anchors_cy], dim=-1) + + def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + stride: Tuple[int], avg_factor: int) -> dict: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + cls_score (Tensor): Cls and quality joint scores for each scale + level has shape (N, num_classes, H, W). + bbox_pred (Tensor): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + stride (Tuple[int]): Stride in this scale level. + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, 4 * (self.reg_max + 1)) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + score = label_weights.new_zeros(labels.shape) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0] + + weight_targets = cls_score.detach().sigmoid() + weight_targets = weight_targets.max(dim=1)[0][pos_inds] + pos_bbox_pred_corners = self.integral(pos_bbox_pred) + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchor_centers, pos_bbox_pred_corners) + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + score[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1) + target_corners = self.bbox_coder.encode(pos_anchor_centers, + pos_decode_bbox_targets, + self.reg_max).reshape(-1) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=weight_targets, + avg_factor=1.0) + + # dfl loss + loss_dfl = self.loss_dfl( + pred_corners, + target_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + else: + loss_bbox = bbox_pred.sum() * 0 + loss_dfl = bbox_pred.sum() * 0 + weight_targets = bbox_pred.new_tensor(0) + + # cls (qfl) loss + loss_cls = self.loss_cls( + cls_score, (labels, score), + weight=label_weights, + avg_factor=avg_factor) + + return loss_cls, loss_bbox, loss_dfl, weight_targets.sum() + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Cls and quality scores for each scale + level has shape (N, num_classes, H, W). + bbox_preds (list[Tensor]): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + losses_cls, losses_bbox, losses_dfl,\ + avg_factor = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + self.prior_generator.strides, + avg_factor=avg_factor) + + avg_factor = sum(avg_factor) + avg_factor = reduce_mean(avg_factor).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / avg_factor, losses_bbox)) + losses_dfl = list(map(lambda x: x / avg_factor, losses_dfl)) + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dfl=losses_dfl) + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image. GFL head does not need this value. + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (:obj: `ConfigDict`): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape + [num_bboxes, 5], where the first 4 columns are bounding + box positions (tl_x, tl_y, br_x, br_y) and the 5-th + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding + box with shape [num_bboxes]. 
+ """ + cfg = self.test_cfg if cfg is None else cfg + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for level_idx, (cls_score, bbox_pred, stride, priors) in enumerate( + zip(cls_score_list, bbox_pred_list, + self.prior_generator.strides, mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert stride[0] == stride[1] + + bbox_pred = bbox_pred.permute(1, 2, 0) + bbox_pred = self.integral(bbox_pred) * stride[0] + + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, _, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + + bboxes = self.bbox_coder.decode( + self.anchor_center(priors), bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def get_targets(self, + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs=True) -> tuple: + """Get targets for GFL head. + + This method is almost the same as `AnchorHead.get_targets()`. Besides + returning the targets as the parent method does, it also returns the + anchors as the first element of the returned tuple. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_results_list) = multi_apply( + self._get_targets_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + # Get `avg_factor` of all images, which calculate in `SamplingResult`. + # When using sampling method, avg_factor is usually the sum of + # positive and negative priors. When using `PseudoSampler`, + # `avg_factor` is usually equal to the number of positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. 
multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, bbox_weights_list, avg_factor) + + def _get_targets_single(self, + flat_anchors: Tensor, + valid_flags: Tensor, + num_level_anchors: List[int], + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors, 4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + num_level_anchors (list[int]): Number of anchors of each scale + level. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: N is the number of total anchors in the image. + + - anchors (Tensor): All anchors in the image with shape (N, 4). + - labels (Tensor): Labels of all anchors in the image with + shape (N,). + - label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + - bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + - bbox_weights (Tensor): BBox weights of all anchors in the + image with shape (N, 4). + - pos_inds (Tensor): Indices of positive anchor with shape + (num_pos,). + - neg_inds (Tensor): Indices of negative anchor with shape + (num_neg,). + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. 
Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + num_level_anchors_inside = self.get_num_level_anchors_inside( + num_level_anchors, inside_flags) + pred_instances = InstanceData(priors=anchors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + num_level_priors=num_level_anchors_inside, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (anchors, labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds, sampling_result) + + def get_num_level_anchors_inside(self, num_level_anchors: List[int], + inside_flags: Tensor) -> List[int]: + """Get the number of valid anchors in every level.""" + + split_inside_flags = torch.split(inside_flags, num_level_anchors) + num_level_anchors_inside = [ + int(flags.sum()) for flags in split_inside_flags + ] + return num_level_anchors_inside diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/grounding_dino_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/grounding_dino_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8088322546f24ae6f3e60aff1378d5c2feefdcf0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/grounding_dino_head.py @@ -0,0 +1,774 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import Linear +from mmengine.model import constant_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.losses import QualityFocalLoss +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmdet.utils import InstanceList, reduce_mean +from ..layers import inverse_sigmoid +from .atss_vlfusion_head import convert_grounding_to_cls_scores +from .dino_head import DINOHead + + +class ContrastiveEmbed(nn.Module): + """text visual ContrastiveEmbed layer. 
+
+    Args:
+        max_text_len (int, optional): Maximum length of text.
+        log_scale (Optional[Union[str, float]]): The initial value of a
+            learnable parameter to multiply with the similarity
+            matrix to normalize the output. Defaults to None.
+
+            - If set to 'auto', the similarity matrix will be normalized by
+              a fixed value ``sqrt(d_c)`` where ``d_c`` is the channel number.
+            - If set to 'none' or ``None``, there is no normalization applied.
+            - If set to a float number, the similarity matrix will be
+              multiplied by ``exp(log_scale)``, where ``log_scale`` is
+              learnable.
+        bias (bool, optional): Whether to add bias to the output.
+            If set to ``True``, a learnable bias that is initialized as -4.6
+            will be added to the output. Useful when training from scratch.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 max_text_len: int = 256,
+                 log_scale: Optional[Union[str, float]] = None,
+                 bias: bool = False):
+        super().__init__()
+        self.max_text_len = max_text_len
+        self.log_scale = log_scale
+        if isinstance(log_scale, float):
+            self.log_scale = nn.Parameter(
+                torch.Tensor([float(log_scale)]), requires_grad=True)
+        elif log_scale not in ['auto', 'none', None]:
+            raise ValueError(f'log_scale should be one of '
+                             f'"auto", "none", None, but got {log_scale}')
+
+        self.bias = None
+        if bias:
+            bias_value = -math.log((1 - 0.01) / 0.01)
+            self.bias = nn.Parameter(
+                torch.Tensor([bias_value]), requires_grad=True)
+
+    def forward(self, visual_feat: Tensor, text_feat: Tensor,
+                text_token_mask: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            visual_feat (Tensor): Visual features.
+            text_feat (Tensor): Text features.
+            text_token_mask (Tensor): A mask used for text feats.
+
+        Returns:
+            Tensor: Classification score.
+        """
+        res = visual_feat @ text_feat.transpose(-1, -2)
+        if isinstance(self.log_scale, nn.Parameter):
+            res = res * self.log_scale.exp()
+        elif self.log_scale == 'auto':
+            # NOTE: similar to the normalizer in self-attention
+            res = res / math.sqrt(visual_feat.shape[-1])
+        if self.bias is not None:
+            res = res + self.bias
+        res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))
+
+        new_res = torch.full((*res.shape[:-1], self.max_text_len),
+                             float('-inf'),
+                             device=res.device)
+        new_res[..., :res.shape[-1]] = res
+
+        return new_res
+
+
+@MODELS.register_module()
+class GroundingDINOHead(DINOHead):
+    """Head of the Grounding DINO: Marrying DINO with Grounded Pre-Training
+    for Open-Set Object Detection.
+
+    Args:
+        contrastive_cfg (dict, optional): Contrastive config that contains
+            keys like ``max_text_len``. Defaults to dict(max_text_len=256).
+    """
+
+    def __init__(self, contrastive_cfg=dict(max_text_len=256), **kwargs):
+        self.contrastive_cfg = contrastive_cfg
+        self.max_text_len = contrastive_cfg.get('max_text_len', 256)
+        super().__init__(**kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize classification branch and regression branch of head."""
+        fc_cls = ContrastiveEmbed(**self.contrastive_cfg)
+        reg_branch = []
+        for _ in range(self.num_reg_fcs):
+            reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+            reg_branch.append(nn.ReLU())
+        reg_branch.append(Linear(self.embed_dims, 4))
+        reg_branch = nn.Sequential(*reg_branch)
+
+        # NOTE: because fc_cls is a contrastive embedding with no trainable
+        # parameters, there is no need to deep-copy it; see the branch below.
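# [Editor's note] Toy-tensor sketch of the scoring that ContrastiveEmbed.forward
# above performs: query-to-token similarity, optionally scaled (the 'auto'
# branch divides by sqrt(d) like self-attention), masked at padded tokens and
# right-padded with -inf up to `max_text_len`. All sizes here are made up.
import math
import torch

bs, num_queries, d, len_text, max_text_len = 2, 5, 256, 4, 8
visual_feat = torch.randn(bs, num_queries, d)
text_feat = torch.randn(bs, len_text, d)
text_token_mask = torch.tensor([[1, 1, 1, 0],
                                [1, 1, 0, 0]], dtype=torch.bool)

res = visual_feat @ text_feat.transpose(-1, -2)    # (bs, num_queries, len_text)
res = res / math.sqrt(d)                           # log_scale == 'auto' branch
res.masked_fill_(~text_token_mask[:, None, :], float('-inf'))
scores = torch.full((bs, num_queries, max_text_len), float('-inf'))
scores[..., :len_text] = res                       # (bs, num_queries, 8)
# (_init_layers continues below.)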
+ if self.share_pred_layer: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(self.num_pred_layer)]) + else: + self.cls_branches = nn.ModuleList( + [copy.deepcopy(fc_cls) for _ in range(self.num_pred_layer)]) + self.reg_branches = nn.ModuleList([ + copy.deepcopy(reg_branch) for _ in range(self.num_pred_layer) + ]) + + def init_weights(self) -> None: + """Initialize weights of the Deformable DETR head.""" + for m in self.reg_branches: + constant_init(m[-1], 0, bias=0) + nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0) + if self.as_two_stage: + for m in self.reg_branches: + nn.init.constant_(m[-1].bias.data[2:], 0.0) + + def _get_targets_single(self, cls_score: Tensor, bbox_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> tuple: + """Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_queries, 4]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + num_bboxes = bbox_pred.size(0) + # convert bbox_pred from xywh, normalized to xyxy, unnormalized + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_pred = bbox_pred * factor + + pred_instances = InstanceData(scores=cls_score, bboxes=bbox_pred) + # assigner and sampler + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + gt_bboxes = gt_instances.bboxes + + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds.long(), :] + + # Major changes. The labels are 0-1 binary labels for each bbox + # and text tokens. + labels = gt_bboxes.new_full((num_bboxes, self.max_text_len), + 0, + dtype=torch.float32) + labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights = torch.zeros_like(bbox_pred, dtype=gt_bboxes.dtype) + bbox_weights[pos_inds] = 1.0 + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. 
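# [Editor's note] Worked example of the conversion described in the comment
# above, with bbox_xyxy_to_cxcywh reimplemented inline for illustration.
import torch

img_w, img_h = 640.0, 480.0
factor = torch.tensor([[img_w, img_h, img_w, img_h]])
box_xyxy = torch.tensor([[100.0, 50.0, 300.0, 250.0]])
x1, y1, x2, y2 = (box_xyxy / factor).unbind(-1)    # normalize to [0, 1]
box_cxcywh = torch.stack(
    [(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1], dim=-1)
# box_cxcywh is approximately [[0.3125, 0.3125, 0.3125, 0.4167]]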
+ pos_gt_bboxes_normalized = pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + def forward( + self, + hidden_states: Tensor, + references: List[Tensor], + memory_text: Tensor, + text_token_mask: Tensor, + ) -> Tuple[Tensor]: + """Forward function. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries, dim). + references (List[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries, 4) when `as_two_stage` of the detector is `True`, + otherwise (bs, num_queries, 2). Each `inter_reference` has + shape (bs, num_queries, 4) when `with_box_refine` of the + detector is `True`, otherwise (bs, num_queries, 2). The + coordinates are arranged as (cx, cy) when the last dimension is + 2, and (cx, cy, w, h) when it is 4. + memory_text (Tensor): Memory text. It has shape (bs, len_text, + text_embed_dims). + text_token_mask (Tensor): Text token mask. It has shape (bs, + len_text). + + Returns: + tuple[Tensor]: results of head containing the following tensor. + + - all_layers_outputs_classes (Tensor): Outputs from the + classification head, has shape (num_decoder_layers, bs, + num_queries, cls_out_channels). + - all_layers_outputs_coords (Tensor): Sigmoid outputs from the + regression head with normalized coordinate format (cx, cy, w, + h), has shape (num_decoder_layers, bs, num_queries, 4) with the + last dimension arranged as (cx, cy, w, h). + """ + all_layers_outputs_classes = [] + all_layers_outputs_coords = [] + + for layer_id in range(hidden_states.shape[0]): + reference = inverse_sigmoid(references[layer_id]) + # NOTE The last reference will not be used. + hidden_state = hidden_states[layer_id] + outputs_class = self.cls_branches[layer_id](hidden_state, + memory_text, + text_token_mask) + tmp_reg_preds = self.reg_branches[layer_id](hidden_state) + if reference.shape[-1] == 4: + # When `layer` is 0 and `as_two_stage` of the detector + # is `True`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `True`. + tmp_reg_preds += reference + else: + # When `layer` is 0 and `as_two_stage` of the detector + # is `False`, or when `layer` is greater than 0 and + # `with_box_refine` of the detector is `False`. + assert reference.shape[-1] == 2 + tmp_reg_preds[..., :2] += reference + outputs_coord = tmp_reg_preds.sigmoid() + all_layers_outputs_classes.append(outputs_class) + all_layers_outputs_coords.append(outputs_coord) + + all_layers_outputs_classes = torch.stack(all_layers_outputs_classes) + all_layers_outputs_coords = torch.stack(all_layers_outputs_coords) + + return all_layers_outputs_classes, all_layers_outputs_coords + + def predict(self, + hidden_states: Tensor, + references: List[Tensor], + memory_text: Tensor, + text_token_mask: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> InstanceList: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, num_queries, bs, dim). + references (List[Tensor]): List of the reference from the decoder. 
+                The first reference is the `init_reference` (initial) and the
+                other num_decoder_layers (6) references are `inter_references`
+                (intermediate). The `init_reference` has shape (bs,
+                num_queries, 4) when `as_two_stage` of the detector is `True`,
+                otherwise (bs, num_queries, 2). Each `inter_reference` has
+                shape (bs, num_queries, 4) when `with_box_refine` of the
+                detector is `True`, otherwise (bs, num_queries, 2). The
+                coordinates are arranged as (cx, cy) when the last dimension
+                is 2, and (cx, cy, w, h) when it is 4.
+            memory_text (Tensor): Memory text. It has shape (bs, len_text,
+                text_embed_dims).
+            text_token_mask (Tensor): Text token mask. It has shape (bs,
+                len_text).
+            batch_data_samples (SampleList): The Data Samples. It usually
+                includes information such as `gt_instance`, `gt_panoptic_seg`
+                and `gt_sem_seg`.
+            rescale (bool, optional): If `True`, return boxes in original
+                image space. Defaults to `True`.
+
+        Returns:
+            InstanceList: Detection results of each image
+            after the post process.
+        """
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+        batch_token_positive_maps = [
+            data_samples.token_positive_map
+            for data_samples in batch_data_samples
+        ]
+
+        outs = self(hidden_states, references, memory_text, text_token_mask)
+
+        predictions = self.predict_by_feat(
+            *outs,
+            batch_img_metas=batch_img_metas,
+            batch_token_positive_maps=batch_token_positive_maps,
+            rescale=rescale)
+        return predictions
+
+    def predict_by_feat(self,
+                        all_layers_cls_scores: Tensor,
+                        all_layers_bbox_preds: Tensor,
+                        batch_img_metas: List[Dict],
+                        batch_token_positive_maps: Optional[List[dict]] = None,
+                        rescale: bool = False) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        Args:
+            all_layers_cls_scores (Tensor): Classification scores of all
+                decoder layers, has shape (num_decoder_layers, bs, num_queries,
+                cls_out_channels).
+            all_layers_bbox_preds (Tensor): Regression outputs of all decoder
+                layers. Each is a 4D-tensor with normalized coordinate format
+                (cx, cy, w, h) and shape (num_decoder_layers, bs, num_queries,
+                4) with the last dimension arranged as (cx, cy, w, h).
+            batch_img_metas (List[Dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_token_positive_maps (list[dict], Optional): Batch token
+                positive map. Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Object detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
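        Example (editor's illustration of the flattened top-k decode that
        ``_predict_by_feat_single`` below applies per image; values are toy):

            >>> import torch
            >>> cls_score = torch.tensor([[0.1, 0.9], [0.8, 0.2]])
            >>> scores, indexes = cls_score.view(-1).topk(2)
            >>> num_classes = cls_score.shape[-1]
            >>> (indexes % num_classes).tolist()   # det_labels
            [1, 0]
            >>> (indexes // num_classes).tolist()  # bbox_index
            [0, 1]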
+ """ + cls_scores = all_layers_cls_scores[-1] + bbox_preds = all_layers_bbox_preds[-1] + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_meta = batch_img_metas[img_id] + token_positive_maps = batch_token_positive_maps[img_id] + results = self._predict_by_feat_single(cls_score, bbox_pred, + token_positive_maps, + img_meta, rescale) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score: Tensor, + bbox_pred: Tensor, + token_positive_maps: dict, + img_meta: dict, + rescale: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_queries, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_queries, 4]. + token_positive_maps (dict): Token positive map. + img_meta (dict): Image meta info. + rescale (bool, optional): If True, return boxes in original image + space. Default True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_score) == len(bbox_pred) # num_queries + max_per_img = self.test_cfg.get('max_per_img', len(cls_score)) + img_shape = img_meta['img_shape'] + + if token_positive_maps is not None: + cls_score = convert_grounding_to_cls_scores( + logits=cls_score.sigmoid()[None], + positive_maps=[token_positive_maps])[0] + scores, indexes = cls_score.view(-1).topk(max_per_img) + num_classes = cls_score.shape[-1] + det_labels = indexes % num_classes + bbox_index = indexes // num_classes + bbox_pred = bbox_pred[bbox_index] + else: + cls_score = cls_score.sigmoid() + scores, _ = cls_score.max(-1) + scores, indexes = scores.topk(max_per_img) + bbox_pred = bbox_pred[indexes] + det_labels = scores.new_zeros(scores.shape, dtype=torch.long) + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + assert img_meta.get('scale_factor') is not None + det_bboxes /= det_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + results = InstanceData() + results.bboxes = det_bboxes + results.scores = scores + results.labels = det_labels + return results + + def loss(self, hidden_states: Tensor, references: List[Tensor], + memory_text: Tensor, text_token_mask: Tensor, + enc_outputs_class: Tensor, enc_outputs_coord: Tensor, + batch_data_samples: SampleList, dn_meta: Dict[str, int]) -> dict: + """Perform forward propagation and loss calculation of the detection + head on the queries of the upstream network. + + Args: + hidden_states (Tensor): Hidden states output from each decoder + layer, has shape (num_decoder_layers, bs, num_queries_total, + dim), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries` when + `self.training` is `True`, else `num_matching_queries`. 
+ references (list[Tensor]): List of the reference from the decoder. + The first reference is the `init_reference` (initial) and the + other num_decoder_layers(6) references are `inter_references` + (intermediate). The `init_reference` has shape (bs, + num_queries_total, 4) and each `inter_reference` has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + memory_text (Tensor): Memory text. It has shape (bs, len_text, + text_embed_dims). + enc_outputs_class (Tensor): The score of each point on encode + feature map, has shape (bs, num_feat_points, cls_out_channels). + enc_outputs_coord (Tensor): The proposal generate from the + encode feature map, has shape (bs, num_feat_points, 4) with the + last dimension arranged as (cx, cy, w, h). + batch_data_samples (list[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + dict: A dictionary of loss components. + """ + batch_gt_instances = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + + outs = self(hidden_states, references, memory_text, text_token_mask) + self.text_masks = text_token_mask + loss_inputs = outs + (enc_outputs_class, enc_outputs_coord, + batch_gt_instances, batch_img_metas, dn_meta) + losses = self.loss_by_feat(*loss_inputs) + return losses + + def loss_by_feat_single(self, cls_scores: Tensor, bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images, has shape (bs, num_queries, cls_out_channels). + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape (bs, num_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + with torch.no_grad(): + cls_reg_targets = self.get_targets(cls_scores_list, + bbox_preds_list, + batch_gt_instances, + batch_img_metas) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.stack(labels_list, 0) + label_weights = torch.stack(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # ===== this change ===== + # Loss is not computed for the padded regions of the text. 
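# [Editor's note] Minimal sketch of the text-mask selection performed just
# below: scores at padded token positions are dropped via masked_select, so
# they never enter the classification loss. Sizes are illustrative.
import torch

bs, num_queries, max_text_len = 1, 2, 4
cls_scores_toy = torch.randn(bs, num_queries, max_text_len)
text_masks_toy = torch.tensor([[1, 1, 0, 0]], dtype=torch.bool)  # 2 real tokens
text_mask_toy = text_masks_toy.unsqueeze(1).repeat(1, num_queries, 1)
kept = torch.masked_select(cls_scores_toy, text_mask_toy)  # flattened 1-D
assert kept.numel() == bs * num_queries * 2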
+ assert (self.text_masks.dim() == 2) + text_masks = self.text_masks.new_zeros( + (self.text_masks.size(0), self.max_text_len)) + text_masks[:, :self.text_masks.size(1)] = self.text_masks + text_mask = (text_masks > 0).unsqueeze(1) + text_mask = text_mask.repeat(1, cls_scores.size(1), 1) + cls_scores = torch.masked_select(cls_scores, text_mask).contiguous() + + labels = torch.masked_select(labels, text_mask) + label_weights = label_weights[..., + None].repeat(1, 1, text_mask.size(-1)) + label_weights = torch.masked_select(label_weights, text_mask) + + # classification loss + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if isinstance(self.loss_cls, QualityFocalLoss): + raise NotImplementedError( + 'QualityFocalLoss for GroundingDINOHead is not supported yet.') + else: + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, bbox_preds): + img_h, img_w, = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def _loss_dn_single(self, dn_cls_scores: Tensor, dn_bbox_preds: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + dn_meta: Dict[str, int]) -> Tuple[Tensor]: + """Denoising loss for outputs from a single decoder layer. + + Args: + dn_cls_scores (Tensor): Classification scores of a single decoder + layer in denoising part, has shape (bs, num_denoising_queries, + cls_out_channels). + dn_bbox_preds (Tensor): Regression outputs of a single decoder + layer in denoising part. Each is a 4D-tensor with normalized + coordinate format (cx, cy, w, h) and has shape + (bs, num_denoising_queries, 4). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + Tuple[Tensor]: A tuple including `loss_cls`, `loss_box` and + `loss_iou`. 
+ """ + cls_reg_targets = self.get_dn_targets(batch_gt_instances, + batch_img_metas, dn_meta) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.stack(labels_list, 0) + label_weights = torch.stack(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + # ===== this change ===== + # Loss is not computed for the padded regions of the text. + assert (self.text_masks.dim() == 2) + text_masks = self.text_masks.new_zeros( + (self.text_masks.size(0), self.max_text_len)) + text_masks[:, :self.text_masks.size(1)] = self.text_masks + text_mask = (text_masks > 0).unsqueeze(1) + text_mask = text_mask.repeat(1, dn_cls_scores.size(1), 1) + cls_scores = torch.masked_select(dn_cls_scores, text_mask).contiguous() + labels = torch.masked_select(labels, text_mask) + label_weights = label_weights[..., + None].repeat(1, 1, text_mask.size(-1)) + label_weights = torch.masked_select(label_weights, text_mask) + # ======================= + + # classification loss + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = \ + num_total_pos * 1.0 + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + if len(cls_scores) > 0: + if isinstance(self.loss_cls, QualityFocalLoss): + raise NotImplementedError('QualityFocalLoss is not supported') + else: + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) + else: + loss_cls = torch.zeros( + 1, dtype=cls_scores.dtype, device=cls_scores.device) + + # Compute the average number of gt boxes across all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(batch_img_metas, dn_bbox_preds): + img_h, img_w = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = dn_bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def _get_dn_targets_single(self, gt_instances: InstanceData, + img_meta: dict, dn_meta: Dict[str, + int]) -> tuple: + """Get targets in denoising part for one image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for one image. + dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. 
It will be used for split outputs of + denoising and matching parts and loss calculation. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_groups = dn_meta['num_denoising_groups'] + num_denoising_queries = dn_meta['num_denoising_queries'] + num_queries_each_group = int(num_denoising_queries / num_groups) + device = gt_bboxes.device + + if len(gt_labels) > 0: + t = torch.arange(len(gt_labels), dtype=torch.long, device=device) + t = t.unsqueeze(0).repeat(num_groups, 1) + pos_assigned_gt_inds = t.flatten() + pos_inds = torch.arange( + num_groups, dtype=torch.long, device=device) + pos_inds = pos_inds.unsqueeze(1) * num_queries_each_group + t + pos_inds = pos_inds.flatten() + else: + pos_inds = pos_assigned_gt_inds = \ + gt_bboxes.new_tensor([], dtype=torch.long) + + neg_inds = pos_inds + num_queries_each_group // 2 + # label targets + # this change + labels = gt_bboxes.new_full((num_denoising_queries, self.max_text_len), + 0, + dtype=torch.float32) + labels[pos_inds] = gt_instances.positive_maps[pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_denoising_queries) + + # bbox targets + bbox_targets = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights = torch.zeros(num_denoising_queries, 4, device=device) + bbox_weights[pos_inds] = 1.0 + img_h, img_w = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + gt_bboxes_normalized = gt_bboxes / factor + gt_bboxes_targets = bbox_xyxy_to_cxcywh(gt_bboxes_normalized) + bbox_targets[pos_inds] = gt_bboxes_targets.repeat([num_groups, 1]) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/guided_anchor_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/guided_anchor_head.py new file mode 100644 index 0000000000000000000000000000000000000000..59f6dd3336e66065dc88b702e925965d4089c72f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/guided_anchor_head.py @@ -0,0 +1,994 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv.ops import DeformConv2d, MaskedConv2d +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList) +from ..layers import multiclass_nms +from ..task_modules.prior_generators import anchor_inside_flags, calc_region +from ..task_modules.samplers import PseudoSampler +from ..utils import images_to_levels, multi_apply, unmap +from .anchor_head import AnchorHead + + +class FeatureAdaption(BaseModule): + """Feature Adaption Module. + + Feature Adaption Module is implemented based on DCN v1. 
+ It uses anchor shape prediction rather than feature map to + predict offsets of deform conv layer. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels in the output feature map. + kernel_size (int): Deformable conv kernel size. Defaults to 3. + deform_groups (int): Deformable conv group size. Defaults to 4. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict], optional): Initialization config dict. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + deform_groups: int = 4, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.1, + override=dict(type='Normal', name='conv_adaption', std=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + offset_channels = kernel_size * kernel_size * 2 + self.conv_offset = nn.Conv2d( + 2, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + deform_groups=deform_groups) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor, shape: Tensor) -> Tensor: + offset = self.conv_offset(shape.detach()) + x = self.relu(self.conv_adaption(x, offset)) + return x + + +@MODELS.register_module() +class GuidedAnchorHead(AnchorHead): + """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.). + + This GuidedAnchorHead will predict high-quality feature guided + anchors and locations where anchors will be kept in inference. + There are mainly 3 categories of bounding-boxes. + + - Sampled 9 pairs for target assignment. (approxes) + - The square boxes where the predicted anchors are based on. (squares) + - Guided anchors. + + Please refer to https://arxiv.org/abs/1901.03278 for more details. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Defaults to 256. + approx_anchor_generator (:obj:`ConfigDict` or dict): Config dict + for approx generator + square_anchor_generator (:obj:`ConfigDict` or dict): Config dict + for square generator + anchor_coder (:obj:`ConfigDict` or dict): Config dict for anchor coder + bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Defaults to False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + deform_groups: (int): Group number of DCN in FeatureAdaption module. + Defaults to 4. + loc_filter_thr (float): Threshold to filter out unconcerned regions. + Defaults to 0.01. + loss_loc (:obj:`ConfigDict` or dict): Config of location loss. + loss_shape (:obj:`ConfigDict` or dict): Config of anchor shape loss. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of bbox regression loss. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict], optional): Initialization config dict. 
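+
+    Example:
+        >>> # A minimal construction sketch; the sizes below are
+        >>> # illustrative, not required by the head.
+        >>> import torch
+        >>> self = GuidedAnchorHead(num_classes=9, in_channels=256)
+        >>> feats = [torch.rand(1, 256, s, s) for s in [64, 32, 16, 8, 4]]
+        >>> cls_scores, bbox_preds, shape_preds, loc_preds = self(feats)
+        >>> assert len(cls_scores) == len(feats)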
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + approx_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + reg_decoded_bbox: bool = False, + deform_groups: int = 4, + loc_filter_thr: float = 0.01, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + loss_loc: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape: ConfigType = dict( + type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1.0), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='conv_loc', std=0.01, lbias_prob=0.01)) + ) -> None: + super(AnchorHead, self).__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.deform_groups = deform_groups + self.loc_filter_thr = loc_filter_thr + + # build approx_anchor_generator and square_anchor_generator + assert (approx_anchor_generator['octave_base_scale'] == + square_anchor_generator['scales'][0]) + assert (approx_anchor_generator['strides'] == + square_anchor_generator['strides']) + self.approx_anchor_generator = TASK_UTILS.build( + approx_anchor_generator) + self.square_anchor_generator = TASK_UTILS.build( + square_anchor_generator) + self.approxs_per_octave = self.approx_anchor_generator \ + .num_base_priors[0] + + self.reg_decoded_bbox = reg_decoded_bbox + + # one anchor per location + self.num_base_priors = self.square_anchor_generator.num_base_priors[0] + + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.loc_focal_loss = loss_loc['type'] in ['FocalLoss'] + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + + # build bbox_coder + self.anchor_coder = TASK_UTILS.build(anchor_coder) + self.bbox_coder = TASK_UTILS.build(bbox_coder) + + # build losses + self.loss_loc = MODELS.build(loss_loc) + self.loss_shape = MODELS.build(loss_shape) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + # use PseudoSampler when no sampler in train_cfg + if train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler() + + self.ga_assigner = TASK_UTILS.build(self.train_cfg['ga_assigner']) + if train_cfg.get('ga_sampler', None) is not None: + self.ga_sampler = TASK_UTILS.build( + self.train_cfg['ga_sampler'], + default_args=dict(context=self)) + else: + self.ga_sampler = PseudoSampler() + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of 
the head.""" + self.relu = nn.ReLU(inplace=True) + self.conv_loc = nn.Conv2d(self.in_channels, 1, 1) + self.conv_shape = nn.Conv2d(self.in_channels, self.num_base_priors * 2, + 1) + self.feature_adaption = FeatureAdaption( + self.in_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.conv_cls = MaskedConv2d( + self.feat_channels, self.num_base_priors * self.cls_out_channels, + 1) + self.conv_reg = MaskedConv2d(self.feat_channels, + self.num_base_priors * 4, 1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor]: + """Forward feature of a single scale level.""" + loc_pred = self.conv_loc(x) + shape_pred = self.conv_shape(x) + x = self.feature_adaption(x, shape_pred) + # masked conv is only used during inference for speed-up + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.conv_cls(x, mask) + bbox_pred = self.conv_reg(x, mask) + return cls_score, bbox_pred, shape_pred, loc_pred + + def forward(self, x: List[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network.""" + return multi_apply(self.forward_single, x) + + def get_sampled_approxs(self, + featmap_sizes: List[Tuple[int, int]], + batch_img_metas: List[dict], + device: str = 'cuda') -> tuple: + """Get sampled approxs and inside flags according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (str): device for returned tensors + + Returns: + tuple: approxes of each image, inside flags of each image + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # approxes for one time + multi_level_approxs = self.approx_anchor_generator.grid_priors( + featmap_sizes, device=device) + approxs_list = [multi_level_approxs for _ in range(num_imgs)] + + # for each image, we compute inside flags of multi level approxes + inside_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = [] + multi_level_approxs = approxs_list[img_id] + + # obtain valid flags for each approx first + multi_level_approx_flags = self.approx_anchor_generator \ + .valid_flags(featmap_sizes, + img_meta['pad_shape'], + device=device) + + for i, flags in enumerate(multi_level_approx_flags): + approxs = multi_level_approxs[i] + inside_flags_list = [] + for j in range(self.approxs_per_octave): + split_valid_flags = flags[j::self.approxs_per_octave] + split_approxs = approxs[j::self.approxs_per_octave, :] + inside_flags = anchor_inside_flags( + split_approxs, split_valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + inside_flags_list.append(inside_flags) + # inside_flag for a position is true if any anchor in this + # position is true + inside_flags = ( + torch.stack(inside_flags_list, 0).sum(dim=0) > 0) + multi_level_flags.append(inside_flags) + inside_flag_list.append(multi_level_flags) + return approxs_list, inside_flag_list + + def get_anchors(self, + featmap_sizes: List[Tuple[int, int]], + shape_preds: List[Tensor], + loc_preds: List[Tensor], + batch_img_metas: List[dict], + use_loc_filter: bool = False, + device: str = 'cuda') -> tuple: + """Get squares according to feature map sizes and guided anchors. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + shape_preds (list[tensor]): Multi-level shape predictions. + loc_preds (list[tensor]): Multi-level location predictions. 
+ batch_img_metas (list[dict]): Image meta info. + use_loc_filter (bool): Use loc filter or not. Defaults to False + device (str): device for returned tensors. + Defaults to `cuda`. + + Returns: + tuple: square approxs of each image, guided anchors of each image, + loc masks of each image. + """ + num_imgs = len(batch_img_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # squares for one time + multi_level_squares = self.square_anchor_generator.grid_priors( + featmap_sizes, device=device) + squares_list = [multi_level_squares for _ in range(num_imgs)] + + # for each image, we compute multi level guided anchors + guided_anchors_list = [] + loc_mask_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_guided_anchors = [] + multi_level_loc_mask = [] + for i in range(num_levels): + squares = squares_list[img_id][i] + shape_pred = shape_preds[i][img_id] + loc_pred = loc_preds[i][img_id] + guided_anchors, loc_mask = self._get_guided_anchors_single( + squares, + shape_pred, + loc_pred, + use_loc_filter=use_loc_filter) + multi_level_guided_anchors.append(guided_anchors) + multi_level_loc_mask.append(loc_mask) + guided_anchors_list.append(multi_level_guided_anchors) + loc_mask_list.append(multi_level_loc_mask) + return squares_list, guided_anchors_list, loc_mask_list + + def _get_guided_anchors_single( + self, + squares: Tensor, + shape_pred: Tensor, + loc_pred: Tensor, + use_loc_filter: bool = False) -> Tuple[Tensor]: + """Get guided anchors and loc masks for a single level. + + Args: + squares (tensor): Squares of a single level. + shape_pred (tensor): Shape predictions of a single level. + loc_pred (tensor): Loc predictions of a single level. + use_loc_filter (list[tensor]): Use loc filter or not. + Defaults to False. + + Returns: + tuple: guided anchors, location masks + """ + # calculate location filtering mask + loc_pred = loc_pred.sigmoid().detach() + if use_loc_filter: + loc_mask = loc_pred >= self.loc_filter_thr + else: + loc_mask = loc_pred >= 0.0 + mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_base_priors) + mask = mask.contiguous().view(-1) + # calculate guided anchors + squares = squares[mask] + anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view( + -1, 2).detach()[mask] + bbox_deltas = anchor_deltas.new_full(squares.size(), 0) + bbox_deltas[:, 2:] = anchor_deltas + guided_anchors = self.anchor_coder.decode( + squares, bbox_deltas, wh_ratio_clip=1e-6) + return guided_anchors, mask + + def ga_loc_targets(self, batch_gt_instances: InstanceList, + featmap_sizes: List[Tuple[int, int]]) -> tuple: + """Compute location targets for guided anchoring. + + Each feature map is divided into positive, negative and ignore regions. + - positive regions: target 1, weight 1 + - ignore regions: target 0, weight 0 + - negative regions: target 0, weight 0.1 + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + featmap_sizes (list[tuple]): Multi level sizes of each feature + maps. + + Returns: + tuple: Returns a tuple containing location targets. + """ + anchor_scale = self.approx_anchor_generator.octave_base_scale + anchor_strides = self.approx_anchor_generator.strides + # Currently only supports same stride in x and y direction. 
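+        # e.g. the default strides [(4, 4), (8, 8), (16, 16), (32, 32),
+        # (64, 64)] all pass; an anisotropic stride such as (4, 8) would
+        # trip the assert below.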
+ for stride in anchor_strides: + assert (stride[0] == stride[1]) + anchor_strides = [stride[0] for stride in anchor_strides] + + center_ratio = self.train_cfg['center_ratio'] + ignore_ratio = self.train_cfg['ignore_ratio'] + img_per_gpu = len(batch_gt_instances) + num_lvls = len(featmap_sizes) + r1 = (1 - center_ratio) / 2 + r2 = (1 - ignore_ratio) / 2 + all_loc_targets = [] + all_loc_weights = [] + all_ignore_map = [] + for lvl_id in range(num_lvls): + h, w = featmap_sizes[lvl_id] + loc_targets = torch.zeros( + img_per_gpu, + 1, + h, + w, + device=batch_gt_instances[0].bboxes.device, + dtype=torch.float32) + loc_weights = torch.full_like(loc_targets, -1) + ignore_map = torch.zeros_like(loc_targets) + all_loc_targets.append(loc_targets) + all_loc_weights.append(loc_weights) + all_ignore_map.append(ignore_map) + for img_id in range(img_per_gpu): + gt_bboxes = batch_gt_instances[img_id].bboxes + scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + min_anchor_size = scale.new_full( + (1, ), float(anchor_scale * anchor_strides[0])) + # assign gt bboxes to different feature levels w.r.t. their scales + target_lvls = torch.floor( + torch.log2(scale) - torch.log2(min_anchor_size) + 0.5) + target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long() + for gt_id in range(gt_bboxes.size(0)): + lvl = target_lvls[gt_id].item() + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl] + # calculate ignore regions + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[lvl]) + # calculate positive (center) regions + ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region( + gt_, r1, featmap_sizes[lvl]) + all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 0 + all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + # calculate ignore map on nearby low level feature + if lvl > 0: + d_lvl = lvl - 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[d_lvl]) + all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + # calculate ignore map on nearby high level feature + if lvl < num_lvls - 1: + u_lvl = lvl + 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[u_lvl]) + all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + for lvl_id in range(num_lvls): + # ignore negative regions w.r.t. ignore map + all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0) + & (all_ignore_map[lvl_id] > 0)] = 0 + # set negative regions with weight 0.1 + all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1 + # loc average factor to balance loss + loc_avg_factor = sum( + [t.size(0) * t.size(-1) * t.size(-2) + for t in all_loc_targets]) / 200 + return all_loc_targets, all_loc_weights, loc_avg_factor + + def _ga_shape_target_single(self, + flat_approxs: Tensor, + inside_flags: Tensor, + flat_squares: Tensor, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData], + img_meta: dict, + unmap_outputs: bool = True) -> tuple: + """Compute guided anchoring targets. 
+ + This function returns sampled anchors and gt bboxes directly + rather than calculates regression targets. + + Args: + flat_approxs (Tensor): flat approxs of a single image, + shape (n, 4) + inside_flags (Tensor): inside flags of a single image, + shape (n, ). + flat_squares (Tensor): flat squares of a single image, + shape (approxs_per_octave * n, 4) + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + img_meta (dict): Meta info of a single image. + unmap_outputs (bool): unmap outputs or not. + + Returns: + tuple: Returns a tuple containing shape targets of each image. + """ + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + num_square = flat_squares.size(0) + approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4) + approxs = approxs[inside_flags, ...] + squares = flat_squares[inside_flags, :] + + pred_instances = InstanceData() + pred_instances.priors = squares + pred_instances.approxs = approxs + + assign_result = self.ga_assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + sampling_result = self.ga_sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + + bbox_anchors = torch.zeros_like(squares) + bbox_gts = torch.zeros_like(squares) + bbox_weights = torch.zeros_like(squares) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes + bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes + bbox_weights[pos_inds, :] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_squares.size(0) + bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags) + bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds, + sampling_result) + + def ga_shape_targets(self, + approx_list: List[List[Tensor]], + inside_flag_list: List[List[Tensor]], + square_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Compute guided anchoring targets. + + Args: + approx_list (list[list[Tensor]]): Multi level approxs of each + image. + inside_flag_list (list[list[Tensor]]): Multi level inside flags + of each image. + square_list (list[list[Tensor]]): Multi level squares of each + image. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): unmap outputs or not. Defaults to None. 
+ + Returns: + tuple: Returns a tuple containing shape targets. + """ + num_imgs = len(batch_img_metas) + assert len(approx_list) == len(inside_flag_list) == len( + square_list) == num_imgs + # anchor number of multi levels + num_level_squares = [squares.size(0) for squares in square_list[0]] + # concat all level anchors and flags to a single tensor + inside_flag_flat_list = [] + approx_flat_list = [] + square_flat_list = [] + for i in range(num_imgs): + assert len(square_list[i]) == len(inside_flag_list[i]) + inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) + approx_flat_list.append(torch.cat(approx_list[i])) + square_flat_list.append(torch.cat(square_list[i])) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None for _ in range(num_imgs)] + (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list, + neg_inds_list, sampling_results_list) = multi_apply( + self._ga_shape_target_single, + approx_flat_list, + inside_flag_flat_list, + square_flat_list, + batch_gt_instances, + batch_gt_instances_ignore, + batch_img_metas, + unmap_outputs=unmap_outputs) + # sampled anchors of all images + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + # split targets to a list w.r.t. multiple levels + bbox_anchors_list = images_to_levels(all_bbox_anchors, + num_level_squares) + bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_squares) + return (bbox_anchors_list, bbox_gts_list, bbox_weights_list, + avg_factor) + + def loss_shape_single(self, shape_pred: Tensor, bbox_anchors: Tensor, + bbox_gts: Tensor, anchor_weights: Tensor, + avg_factor: int) -> Tensor: + """Compute shape loss in single level.""" + shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2) + bbox_anchors = bbox_anchors.contiguous().view(-1, 4) + bbox_gts = bbox_gts.contiguous().view(-1, 4) + anchor_weights = anchor_weights.contiguous().view(-1, 4) + bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0) + bbox_deltas[:, 2:] += shape_pred + # filter out negative samples to speed-up weighted_bounded_iou_loss + inds = torch.nonzero( + anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1) + bbox_deltas_ = bbox_deltas[inds] + bbox_anchors_ = bbox_anchors[inds] + bbox_gts_ = bbox_gts[inds] + anchor_weights_ = anchor_weights[inds] + pred_anchors_ = self.anchor_coder.decode( + bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6) + loss_shape = self.loss_shape( + pred_anchors_, bbox_gts_, anchor_weights_, avg_factor=avg_factor) + return loss_shape + + def loss_loc_single(self, loc_pred: Tensor, loc_target: Tensor, + loc_weight: Tensor, avg_factor: float) -> Tensor: + """Compute location loss in single level.""" + loss_loc = self.loss_loc( + loc_pred.reshape(-1, 1), + loc_target.reshape(-1).long(), + loc_weight.reshape(-1), + avg_factor=avg_factor) + return loss_loc + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + shape_preds: List[Tensor], + loc_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). 
+ shape_preds (list[Tensor]): shape predictions for each scale + level with shape (N, 1, H, W). + loc_preds (list[Tensor]): location predictions for each scale + level with shape (N, num_anchors * 2, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.approx_anchor_generator.num_levels + + device = cls_scores[0].device + + # get loc targets + loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets( + batch_gt_instances, featmap_sizes) + + # get sampled approxes + approxs_list, inside_flag_list = self.get_sampled_approxs( + featmap_sizes, batch_img_metas, device=device) + # get squares and guided anchors + squares_list, guided_anchors_list, _ = self.get_anchors( + featmap_sizes, + shape_preds, + loc_preds, + batch_img_metas, + device=device) + + # get shape targets + shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list, + squares_list, batch_gt_instances, + batch_img_metas) + (bbox_anchors_list, bbox_gts_list, anchor_weights_list, + ga_avg_factor) = shape_targets + + # get anchor targets + cls_reg_targets = self.get_targets( + guided_anchors_list, + inside_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor) = cls_reg_targets + + # anchor number of multi levels + num_level_anchors = [ + anchors.size(0) for anchors in guided_anchors_list[0] + ] + # concat all level anchors to a single tensor + concat_anchor_list = [] + for i in range(len(guided_anchors_list)): + concat_anchor_list.append(torch.cat(guided_anchors_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + # get classification and bbox regression losses + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + + # get anchor location loss + losses_loc = [] + for i in range(len(loc_preds)): + loss_loc = self.loss_loc_single( + loc_preds[i], + loc_targets[i], + loc_weights[i], + avg_factor=loc_avg_factor) + losses_loc.append(loss_loc) + + # get anchor shape loss + losses_shape = [] + for i in range(len(shape_preds)): + loss_shape = self.loss_shape_single( + shape_preds[i], + bbox_anchors_list[i], + bbox_gts_list[i], + anchor_weights_list[i], + avg_factor=ga_avg_factor) + losses_shape.append(loss_shape) + + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_shape=losses_shape, + loss_loc=losses_loc) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + shape_preds: List[Tensor], + loc_preds: List[Tensor], + batch_img_metas: List[dict], + cfg: OptConfigType = None, + rescale: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. 
+ + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + shape_preds (list[Tensor]): shape predictions for each scale + level with shape (N, 1, H, W). + loc_preds (list[Tensor]): location predictions for each scale + level with shape (N, num_anchors * 2, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last + dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len( + loc_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + device = cls_scores[0].device + # get guided anchors + _, guided_anchors, loc_masks = self.get_anchors( + featmap_sizes, + shape_preds, + loc_preds, + batch_img_metas, + use_loc_filter=not self.training, + device=device) + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + guided_anchor_list = [ + guided_anchors[img_id][i].detach() for i in range(num_levels) + ] + loc_mask_list = [ + loc_masks[img_id][i].detach() for i in range(num_levels) + ] + proposals = self._predict_by_feat_single( + cls_scores=cls_score_list, + bbox_preds=bbox_pred_list, + mlvl_anchors=guided_anchor_list, + mlvl_masks=loc_mask_list, + img_meta=batch_img_metas[img_id], + cfg=cfg, + rescale=rescale) + result_list.append(proposals) + return result_list + + def _predict_by_feat_single(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + mlvl_anchors: List[Tensor], + mlvl_masks: List[Tensor], + img_meta: dict, + cfg: ConfigType, + rescale: bool = False) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + mlvl_anchors (list[Tensor]): Each element in the list is + the anchors of a single level in feature pyramid. it has + shape (num_priors, 4). + mlvl_masks (list[Tensor]): Each element in the list is location + masks of a single level. + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict` or dict): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. 
+ + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last + dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bbox_preds = [] + mlvl_valid_anchors = [] + mlvl_scores = [] + for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds, + mlvl_anchors, + mlvl_masks): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + # reshape scores and bbox_pred + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask, :] + bbox_pred = bbox_pred[mask, :] + if scores.dim() == 0: + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + bbox_pred = bbox_pred.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. scores + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_anchors.append(anchors) + mlvl_scores.append(scores) + + mlvl_bbox_preds = torch.cat(mlvl_bbox_preds) + mlvl_anchors = torch.cat(mlvl_valid_anchors) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_bboxes = self.bbox_coder.decode( + mlvl_anchors, mlvl_bbox_preds, max_shape=img_meta['img_shape']) + + if rescale: + assert img_meta.get('scale_factor') is not None + mlvl_bboxes /= mlvl_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + # multi class NMS + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + + results = InstanceData() + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/lad_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/lad_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d1218e1f88206704d4f414d151ccd34a189ac5d0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/lad_head.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
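+# Usage sketch (see the `LAD` detector for the actual wiring): a teacher
+# head first runs `get_label_assignment(...)` on its own predictions, and
+# the returned tuple is passed unchanged to the student head's
+# `loss(x, label_assignment_results, batch_data_samples)` defined below.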
+from typing import List, Optional
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import InstanceList, OptInstanceList
+from ..utils import levels_to_images, multi_apply, unpack_gt_instances
+from .paa_head import PAAHead
+
+
+@MODELS.register_module()
+class LADHead(PAAHead):
+    """Label Assignment Head from the paper: `Improving Object Detection by
+    Label Assignment Distillation <https://arxiv.org/abs/2108.10520>`_"""
+
+    def get_label_assignment(
+            self,
+            cls_scores: List[Tensor],
+            bbox_preds: List[Tensor],
+            iou_preds: List[Tensor],
+            batch_gt_instances: InstanceList,
+            batch_img_metas: List[dict],
+            batch_gt_instances_ignore: OptInstanceList = None) -> tuple:
+        """Get label assignment (from teacher).
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+
+        Returns:
+            tuple: Returns a tuple containing label assignment variables.
+
+            - labels (Tensor): Labels of all anchors, each with
+              shape (num_anchors,).
+            - labels_weight (Tensor): Label weights of all anchors,
+              each with shape (num_anchors,).
+            - bboxes_target (Tensor): BBox targets of all anchors,
+              each with shape (num_anchors, 4).
+            - bboxes_weight (Tensor): BBox weights of all anchors,
+              each with shape (num_anchors, 4).
+            - pos_inds_flatten (Tensor): Indices of all positive
+              samples among all anchors.
+            - pos_anchors (Tensor): Positive anchors.
+            - num_pos (int): Number of positive anchors.
+ """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + ) + (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds, + pos_gt_index) = cls_reg_targets + cls_scores = levels_to_images(cls_scores) + cls_scores = [ + item.reshape(-1, self.cls_out_channels) for item in cls_scores + ] + bbox_preds = levels_to_images(bbox_preds) + bbox_preds = [item.reshape(-1, 4) for item in bbox_preds] + pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list, + cls_scores, bbox_preds, labels, + labels_weight, bboxes_target, + bboxes_weight, pos_inds) + + with torch.no_grad(): + reassign_labels, reassign_label_weight, \ + reassign_bbox_weights, num_pos = multi_apply( + self.paa_reassign, + pos_losses_list, + labels, + labels_weight, + bboxes_weight, + pos_inds, + pos_gt_index, + anchor_list) + num_pos = sum(num_pos) + # convert all tensor list to a flatten tensor + labels = torch.cat(reassign_labels, 0).view(-1) + flatten_anchors = torch.cat( + [torch.cat(item, 0) for item in anchor_list]) + labels_weight = torch.cat(reassign_label_weight, 0).view(-1) + bboxes_target = torch.cat(bboxes_target, + 0).view(-1, bboxes_target[0].size(-1)) + + pos_inds_flatten = ((labels >= 0) + & + (labels < self.num_classes)).nonzero().reshape(-1) + + if num_pos: + pos_anchors = flatten_anchors[pos_inds_flatten] + else: + pos_anchors = None + + label_assignment_results = (labels, labels_weight, bboxes_target, + bboxes_weight, pos_inds_flatten, + pos_anchors, num_pos) + return label_assignment_results + + def loss(self, x: List[Tensor], label_assignment_results: tuple, + batch_data_samples: SampleList) -> dict: + """Forward train with the available label assignment (student receives + from teacher). + + Args: + x (list[Tensor]): Features from FPN. + label_assignment_results (tuple): As the outputs defined in the + function `self.get_label_assignment`. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + losses: (dict[str, Tensor]): A dictionary of loss components. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + outs = self(x) + loss_inputs = outs + (batch_gt_instances, batch_img_metas) + losses = self.loss_by_feat( + *loss_inputs, + batch_gt_instances_ignore=batch_gt_instances_ignore, + label_assignment_results=label_assignment_results) + return losses + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + iou_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + label_assignment_results: Optional[tuple] = None) -> dict: + """Compute losses of the head. 
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                Has shape (N, num_anchors * num_classes, H, W).
+            bbox_preds (list[Tensor]): Box energies / deltas for each scale
+                level with shape (N, num_anchors * 4, H, W).
+            iou_preds (list[Tensor]): iou_preds for each scale
+                level with shape (N, num_anchors * 1, H, W).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            label_assignment_results (tuple, optional): As the outputs defined
+                in the function `self.get_label_assignment`.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+
+        (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds_flatten,
+         pos_anchors, num_pos) = label_assignment_results
+
+        cls_scores = levels_to_images(cls_scores)
+        cls_scores = [
+            item.reshape(-1, self.cls_out_channels) for item in cls_scores
+        ]
+        bbox_preds = levels_to_images(bbox_preds)
+        bbox_preds = [item.reshape(-1, 4) for item in bbox_preds]
+        iou_preds = levels_to_images(iou_preds)
+        iou_preds = [item.reshape(-1, 1) for item in iou_preds]
+
+        # convert all tensor list to a flatten tensor
+        cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1))
+        bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1))
+        iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1))
+
+        losses_cls = self.loss_cls(
+            cls_scores,
+            labels,
+            labels_weight,
+            avg_factor=max(num_pos, len(batch_img_metas)))  # avoid num_pos=0
+        if num_pos:
+            pos_bbox_pred = self.bbox_coder.decode(
+                pos_anchors, bbox_preds[pos_inds_flatten])
+            pos_bbox_target = bboxes_target[pos_inds_flatten]
+            iou_target = bbox_overlaps(
+                pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True)
+            losses_iou = self.loss_centerness(
+                iou_preds[pos_inds_flatten],
+                iou_target.unsqueeze(-1),
+                avg_factor=num_pos)
+            losses_bbox = self.loss_bbox(
+                pos_bbox_pred, pos_bbox_target, avg_factor=num_pos)
+
+        else:
+            losses_iou = iou_preds.sum() * 0
+            losses_bbox = bbox_preds.sum() * 0
+
+        return dict(
+            loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou)
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/ld_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/ld_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2558fac97ee26ff89c5fa1b386f5ce68c3ad384d
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/ld_head.py
@@ -0,0 +1,257 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox_overlaps
+from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
+from ..utils import multi_apply, unpack_gt_instances
+from .gfl_head import GFLHead
+
+
+@MODELS.register_module()
+class LDHead(GFLHead):
+    """Localization distillation Head.
+
+    It utilizes the learned bbox distributions to transfer the localization
+    dark knowledge from teacher to student. Original paper: `Localization
+    Distillation for Object Detection.
+    <https://arxiv.org/abs/2102.12252>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        loss_ld (:obj:`ConfigDict` or dict): Config of Localization
+            Distillation Loss (LD), T is the temperature for distillation.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 in_channels: int,
+                 loss_ld: ConfigType = dict(
+                     type='LocalizationDistillationLoss',
+                     loss_weight=0.25,
+                     T=10),
+                 **kwargs) -> None:
+
+        super().__init__(
+            num_classes=num_classes, in_channels=in_channels, **kwargs)
+        self.loss_ld = MODELS.build(loss_ld)
+
+    def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor,
+                            bbox_pred: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_targets: Tensor,
+                            stride: Tuple[int], soft_targets: Tensor,
+                            avg_factor: int):
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            anchors (Tensor): Box reference for each scale level with shape
+                (N, num_total_anchors, 4).
+            cls_score (Tensor): Cls and quality joint scores for each scale
+                level has shape (N, num_classes, H, W).
+            bbox_pred (Tensor): Box distribution logits for each scale
+                level with shape (N, 4*(n+1), H, W), n is max value of
+                integral set.
+            labels (Tensor): Labels of each anchor with shape
+                (N, num_total_anchors).
+            label_weights (Tensor): Label weights of each anchor with shape
+                (N, num_total_anchors).
+            bbox_targets (Tensor): BBox regression targets of each anchor
+                with shape (N, num_total_anchors, 4).
+            stride (tuple): Stride in this scale level.
+            soft_targets (Tensor): Soft BBox regression targets.
+            avg_factor (int): Average factor that is used to average
+                the loss. When using sampling method, avg_factor is usually
+                the sum of positive and negative priors. When using
+                `PseudoSampler`, `avg_factor` is usually equal to the number
+                of positive priors.
+
+        Returns:
+            tuple[Tensor]: Loss components and weight targets.
+        """
+        assert stride[0] == stride[1], 'h stride is not equal to w stride!'
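+        # Flatten the per-level maps so that each row corresponds to one
+        # anchor: cls logits -> (num_anchors, cls_out_channels) and
+        # distribution logits -> (num_anchors, 4 * (reg_max + 1)); the
+        # teacher's soft_targets get the same layout so that student and
+        # teacher corners align row by row below.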
+ anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, 4 * (self.reg_max + 1)) + soft_targets = soft_targets.permute(0, 2, 3, + 1).reshape(-1, + 4 * (self.reg_max + 1)) + + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + score = label_weights.new_zeros(labels.shape) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + pos_anchor_centers = self.anchor_center(pos_anchors) / stride[0] + + weight_targets = cls_score.detach().sigmoid() + weight_targets = weight_targets.max(dim=1)[0][pos_inds] + pos_bbox_pred_corners = self.integral(pos_bbox_pred) + pos_decode_bbox_pred = self.bbox_coder.decode( + pos_anchor_centers, pos_bbox_pred_corners) + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + score[pos_inds] = bbox_overlaps( + pos_decode_bbox_pred.detach(), + pos_decode_bbox_targets, + is_aligned=True) + pred_corners = pos_bbox_pred.reshape(-1, self.reg_max + 1) + pos_soft_targets = soft_targets[pos_inds] + soft_corners = pos_soft_targets.reshape(-1, self.reg_max + 1) + + target_corners = self.bbox_coder.encode(pos_anchor_centers, + pos_decode_bbox_targets, + self.reg_max).reshape(-1) + + # regression loss + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=weight_targets, + avg_factor=1.0) + + # dfl loss + loss_dfl = self.loss_dfl( + pred_corners, + target_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + + # ld loss + loss_ld = self.loss_ld( + pred_corners, + soft_corners, + weight=weight_targets[:, None].expand(-1, 4).reshape(-1), + avg_factor=4.0) + + else: + loss_ld = bbox_pred.sum() * 0 + loss_bbox = bbox_pred.sum() * 0 + loss_dfl = bbox_pred.sum() * 0 + weight_targets = bbox_pred.new_tensor(0) + + # cls (qfl) loss + loss_cls = self.loss_cls( + cls_score, (labels, score), + weight=label_weights, + avg_factor=avg_factor) + + return loss_cls, loss_bbox, loss_dfl, loss_ld, weight_targets.sum() + + def loss(self, x: List[Tensor], out_teacher: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + """ + Args: + x (list[Tensor]): Features from FPN. + out_teacher (tuple[Tensor]): The output of teacher. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + tuple[dict, list]: The loss components and proposals of each image. + + - losses (dict[str, Tensor]): A dictionary of loss components. + - proposal_list (list[Tensor]): Proposals of each image. 
+ """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + outs = self(x) + soft_targets = out_teacher[1] + loss_inputs = outs + (batch_gt_instances, batch_img_metas, + soft_targets) + losses = self.loss_by_feat( + *loss_inputs, batch_gt_instances_ignore=batch_gt_instances_ignore) + + return losses + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + soft_targets: List[Tensor], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Cls and quality scores for each scale + level has shape (N, num_classes, H, W). + bbox_preds (list[Tensor]): Box distribution logits for each scale + level with shape (N, 4*(n+1), H, W), n is max value of integral + set. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + soft_targets (list[Tensor]): Soft BBox regression targets. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + losses_cls, losses_bbox, losses_dfl, losses_ld, \ + avg_factor = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + self.prior_generator.strides, + soft_targets, + avg_factor=avg_factor) + + avg_factor = sum(avg_factor) + 1e-6 + avg_factor = reduce_mean(avg_factor).item() + losses_bbox = [x / avg_factor for x in losses_bbox] + losses_dfl = [x / avg_factor for x in losses_dfl] + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_dfl=losses_dfl, + loss_ld=losses_ld) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/mask2former_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/mask2former_head.py new file mode 100644 index 0000000000000000000000000000000000000000..12d47c655255f92819646b8ea304b9736ec30660 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/mask2former_head.py @@ -0,0 +1,459 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d +from mmcv.ops import point_sample +from mmengine.model import ModuleList, caffe2_xavier_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig, reduce_mean +from ..layers import Mask2FormerTransformerDecoder, SinePositionalEncoding +from ..utils import get_uncertain_point_coords_with_randomness +from .anchor_free_head import AnchorFreeHead +from .maskformer_head import MaskFormerHead + + +@MODELS.register_module() +class Mask2FormerHead(MaskFormerHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer decoder. + pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel + decoder. Defaults to None. + enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of tranformer encoder in + pixel decoder to the embed_dim of transformer decoder. + Defaults to False. + transformer_decoder (:obj:`ConfigDict` or dict): Config for + transformer decoder. Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer decoder position encoding. Defaults to + dict(num_feats=128, normalize=True). + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to None. + loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss. + Defaults to None. + loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss. + Defaults to None. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + Mask2Former head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + Mask2Former head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
+ """ + + def __init__(self, + in_channels: List[int], + feat_channels: int, + out_channels: int, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + num_queries: int = 100, + num_transformer_feat_level: int = 3, + pixel_decoder: ConfigType = ..., + enforce_decoder_input_project: bool = False, + transformer_decoder: ConfigType = ..., + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * 133 + [0.1]), + loss_mask: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice: ConfigType = dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(AnchorFreeHead, self).__init__(init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.layer_cfg.cross_attn_cfg.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.layer_cfg. \ + self_attn_cfg.num_levels == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = MODELS.build(pixel_decoder_) + self.transformer_decoder = Mask2FormerTransformerDecoder( + **transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if (self.decoder_embed_dims != feat_channels + or enforce_decoder_input_project): + self.decoder_input_projs.append( + Conv2d( + feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = SinePositionalEncoding( + **positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, + feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + self.num_points = self.train_cfg.get('num_points', 12544) + self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) + self.importance_sample_ratio = self.train_cfg.get( + 'importance_sample_ratio', 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.loss_dice = MODELS.build(loss_dice) + + def 
init_weights(self) -> None: + for m in self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> Tuple[Tensor]: + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + img_meta (dict): Image informtation. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each \ + image. + - neg_inds (Tensor): Sampled negative indices for each \ + image. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + gt_labels = gt_instances.labels + gt_masks = gt_instances.masks + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + # shape (num_queries, num_points) + mask_points_pred = point_sample( + mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, + 1)).squeeze(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample( + gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, + 1)).squeeze(1) + + sampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_points_masks) + sampled_pred_instances = InstanceData( + scores=cls_score, masks=mask_points_pred) + # assign and sample + assign_result = self.assigner.assign( + pred_instances=sampled_pred_instances, + gt_instances=sampled_gt_instances, + img_meta=img_meta) + pred_instances = InstanceData(scores=cls_score, masks=mask_pred) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries, )) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds, sampling_result) + + def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. 
+ mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single \ + decoder layer. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + avg_factor) = self.get_targets(cls_scores_list, mask_preds_list, + batch_gt_instances, batch_img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.unsqueeze(1), None, self.num_points, + self.oversample_ratio, self.importance_sample_ratio) + # shape (num_total_gts, h, w) -> (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.unsqueeze(1).float(), points_coords).squeeze(1) + # shape (num_queries, h, w) -> (num_queries, num_points) + mask_point_preds = point_sample( + mask_preds.unsqueeze(1), points_coords).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_point_preds, mask_point_targets, avg_factor=num_total_masks) + + # mask loss + # shape (num_queries, num_points) -> (num_queries * num_points, ) + mask_point_preds = mask_point_preds.reshape(-1) + # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) + mask_point_targets = mask_point_targets.reshape(-1) + loss_mask = self.loss_mask( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * self.num_points) + + return loss_cls, loss_mask, loss_dice + + def _forward_head(self, decoder_out: Tensor, mask_feature: Tensor, + attn_mask_target_size: Tuple[int, int]) -> Tuple[Tensor]: + """Forward for head part which is called after every decoder layer. + + Args: + decoder_out (Tensor): in shape (batch_size, num_queries, c). + mask_feature (Tensor): in shape (batch_size, c, h, w). + attn_mask_target_size (tuple[int, int]): target attention + mask size. + + Returns: + tuple: A tuple contain three elements. + + - cls_pred (Tensor): Classification scores in shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should includes background. + - mask_pred (Tensor): Mask scores in shape \ + (batch_size, num_queries,h, w). 
+            - attn_mask (Tensor): Attention mask in shape \
+                (batch_size * num_heads, num_queries, h, w).
+        """
+        decoder_out = self.transformer_decoder.post_norm(decoder_out)
+        # shape (batch_size, num_queries, c)
+        cls_pred = self.cls_embed(decoder_out)
+        # shape (batch_size, num_queries, c)
+        mask_embed = self.mask_embed(decoder_out)
+        # shape (batch_size, num_queries, h, w)
+        mask_pred = torch.einsum('bqc,bchw->bqhw', mask_embed, mask_feature)
+        attn_mask = F.interpolate(
+            mask_pred,
+            attn_mask_target_size,
+            mode='bilinear',
+            align_corners=False)
+        # shape (batch_size, num_queries, h, w) ->
+        # (batch_size * num_heads, num_queries, h*w)
+        attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat(
+            (1, self.num_heads, 1, 1)).flatten(0, 1)
+        attn_mask = attn_mask.sigmoid() < 0.5
+        attn_mask = attn_mask.detach()
+
+        return cls_pred, mask_pred, attn_mask
+
+    def forward(self, x: List[Tensor],
+                batch_data_samples: SampleList) -> Tuple[List[Tensor]]:
+        """Forward function.
+
+        Args:
+            x (list[Tensor]): Multi-scale features from the
+                upstream network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+        Returns:
+            tuple[list[Tensor]]: A tuple containing two elements.
+
+            - cls_pred_list (list[Tensor]): Classification logits \
+                for each decoder layer. Each is a 3D-tensor with shape \
+                (batch_size, num_queries, cls_out_channels). \
+                Note `cls_out_channels` should include background.
+            - mask_pred_list (list[Tensor]): Mask logits for each \
+                decoder layer. Each with shape (batch_size, num_queries, \
+                h, w).
+        """
+        batch_size = x[0].shape[0]
+        mask_features, multi_scale_memorys = self.pixel_decoder(x)
+        # multi_scale_memorys (from low resolution to high resolution)
+        decoder_inputs = []
+        decoder_positional_encodings = []
+        for i in range(self.num_transformer_feat_level):
+            decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i])
+            # shape (batch_size, c, h, w) -> (batch_size, h*w, c)
+            decoder_input = decoder_input.flatten(2).permute(0, 2, 1)
+            level_embed = self.level_embed.weight[i].view(1, 1, -1)
+            decoder_input = decoder_input + level_embed
+            # padding mask of shape (batch_size, h, w)
+            mask = decoder_input.new_zeros(
+                (batch_size, ) + multi_scale_memorys[i].shape[-2:],
+                dtype=torch.bool)
+            decoder_positional_encoding = self.decoder_positional_encoding(
+                mask)
+            decoder_positional_encoding = decoder_positional_encoding.flatten(
+                2).permute(0, 2, 1)
+            decoder_inputs.append(decoder_input)
+            decoder_positional_encodings.append(decoder_positional_encoding)
+        # shape (num_queries, c) -> (batch_size, num_queries, c)
+        query_feat = self.query_feat.weight.unsqueeze(0).repeat(
+            (batch_size, 1, 1))
+        query_embed = self.query_embed.weight.unsqueeze(0).repeat(
+            (batch_size, 1, 1))
+
+        cls_pred_list = []
+        mask_pred_list = []
+        cls_pred, mask_pred, attn_mask = self._forward_head(
+            query_feat, mask_features, multi_scale_memorys[0].shape[-2:])
+        cls_pred_list.append(cls_pred)
+        mask_pred_list.append(mask_pred)
+
+        for i in range(self.num_transformer_decoder_layers):
+            level_idx = i % self.num_transformer_feat_level
+            # if a mask is all True (all background), then set it all False.
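+            # A row of attn_mask that is entirely True would mask out every
+            # key for its query and make the cross-attention softmax
+            # degenerate, so such rows are flipped back to all False below,
+            # letting the query attend to the whole feature map instead.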
+ mask_sum = (attn_mask.sum(-1) != attn_mask.shape[-1]).unsqueeze(-1) + attn_mask = attn_mask & mask_sum + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + cross_attn_mask=attn_mask, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None) + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[ + (i + 1) % self.num_transformer_feat_level].shape[-2:]) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/maskformer_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/maskformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..24c0655ee1c36e0110cf6578d1c095c50a297d81 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/maskformer_head.py @@ -0,0 +1,601 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d +from mmengine.model import caffe2_xavier_init +from mmengine.structures import InstanceData, PixelData +from torch import Tensor + +from mmdet.models.layers.pixel_decoder import PixelDecoder +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptMultiConfig, reduce_mean) +from ..layers import DetrTransformerDecoder, SinePositionalEncoding +from ..utils import multi_apply, preprocess_panoptic_gt +from .anchor_free_head import AnchorFreeHead + + +@MODELS.register_module() +class MaskFormerHead(AnchorFreeHead): + """Implements the MaskFormer head. + + See `Per-Pixel Classification is Not All You Need for Semantic + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for feature. + out_channels (int): Number of channels for output. + num_things_classes (int): Number of things. + num_stuff_classes (int): Number of stuff. + num_queries (int): Number of query in Transformer. + pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel + decoder. + enforce_decoder_input_project (bool): Whether to add a layer + to change the embed_dim of transformer encoder in pixel decoder to + the embed_dim of transformer decoder. Defaults to False. + transformer_decoder (:obj:`ConfigDict` or dict): Config for + transformer decoder. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer decoder position encoding. + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss. + Defaults to `FocalLoss`. + loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss. + Defaults to `DiceLoss`. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + MaskFormer head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + MaskFormer head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
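+
+    Example:
+        >>> # Illustrative sketch (editor's addition): the per-pixel mask
+        >>> # prediction step, projecting query embeddings of every decoder
+        >>> # layer onto the pixel decoder's mask features via einsum.
+        >>> import torch
+        >>> num_dec, bs, nq, c, h, w = 6, 1, 100, 256, 32, 32
+        >>> mask_embed = torch.rand(num_dec, bs, nq, c)
+        >>> mask_features = torch.rand(bs, c, h, w)
+        >>> torch.einsum('lbqc,bchw->lbqhw', mask_embed, mask_features).shape
+        torch.Size([6, 1, 100, 32, 32])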
+ """ + + def __init__(self, + in_channels: List[int], + feat_channels: int, + out_channels: int, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + num_queries: int = 100, + pixel_decoder: ConfigType = ..., + enforce_decoder_input_project: bool = False, + transformer_decoder: ConfigType = ..., + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=[1.0] * 133 + [0.1]), + loss_mask: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=20.0), + loss_dice: ConfigType = dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + naive_dice=True, + loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(AnchorFreeHead, self).__init__(init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = self.num_things_classes + self.num_stuff_classes + self.num_queries = num_queries + + pixel_decoder.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = MODELS.build(pixel_decoder) + self.transformer_decoder = DetrTransformerDecoder( + **transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + if type(self.pixel_decoder) == PixelDecoder and ( + self.decoder_embed_dims != in_channels[-1] + or enforce_decoder_input_project): + self.decoder_input_proj = Conv2d( + in_channels[-1], self.decoder_embed_dims, kernel_size=1) + else: + self.decoder_input_proj = nn.Identity() + self.decoder_pe = SinePositionalEncoding(**positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, out_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = TASK_UTILS.build(train_cfg['assigner']) + self.sampler = TASK_UTILS.build( + train_cfg['sampler'], default_args=dict(context=self)) + + self.class_weight = loss_cls.class_weight + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.loss_dice = MODELS.build(loss_dice) + + def init_weights(self) -> None: + if isinstance(self.decoder_input_proj, Conv2d): + caffe2_xavier_init(self.decoder_input_proj, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def preprocess_gt( + self, batch_gt_instances: InstanceList, + batch_gt_semantic_segs: List[Optional[PixelData]]) -> InstanceList: + """Preprocess the ground truth for all images. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + ground truth labels of each bbox, with shape (num_gts, ) + and ``masks``, each is ground truth masks of each instances + of a image, shape (num_gts, h, w). + gt_semantic_seg (list[Optional[PixelData]]): Ground truth of + semantic segmentation, each with the shape (1, h, w). + [0, num_thing_class - 1] means things, + [num_thing_class, num_class-1] means stuff, + 255 means VOID. It's None when training instance segmentation. 
+ + Returns: + list[obj:`InstanceData`]: each contains the following keys + + - labels (Tensor): Ground truth class indices\ + for a image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in a image. + - masks (Tensor): Ground truth mask for a\ + image, with shape (n, h, w). + """ + num_things_list = [self.num_things_classes] * len(batch_gt_instances) + num_stuff_list = [self.num_stuff_classes] * len(batch_gt_instances) + gt_labels_list = [ + gt_instances['labels'] for gt_instances in batch_gt_instances + ] + gt_masks_list = [ + gt_instances['masks'] for gt_instances in batch_gt_instances + ] + gt_semantic_segs = [ + None if gt_semantic_seg is None else gt_semantic_seg.sem_seg + for gt_semantic_seg in batch_gt_semantic_segs + ] + targets = multi_apply(preprocess_panoptic_gt, gt_labels_list, + gt_masks_list, gt_semantic_segs, num_things_list, + num_stuff_list) + labels, masks = targets + batch_gt_instances = [ + InstanceData(labels=label, masks=mask) + for label, mask in zip(labels, masks) + ] + return batch_gt_instances + + def get_targets( + self, + cls_scores_list: List[Tensor], + mask_preds_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + return_sampling_results: bool = False + ) -> Tuple[List[Union[Tensor, int]]]: + """Compute classification and mask targets for all images for a decoder + layer. + + Args: + cls_scores_list (list[Tensor]): Mask score logits from a single + decoder layer for all images. Each with shape (num_queries, + cls_out_channels). + mask_preds_list (list[Tensor]): Mask logits from a single decoder + layer for all images. Each with shape (num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels of all images.\ + Each with shape (num_queries, ). + - label_weights_list (list[Tensor]): Label weights\ + of all images. Each with shape (num_queries, ). + - mask_targets_list (list[Tensor]): Mask targets of\ + all images. Each with shape (num_queries, h, w). + - mask_weights_list (list[Tensor]): Mask weights of\ + all images. Each with shape (num_queries, ). + - avg_factor (int): Average factor that is used to average\ + the loss. When using sampling method, avg_factor is + usually the sum of positive and negative priors. When + using `MaskPseudoSampler`, `avg_factor` is usually equal + to the number of positive priors. + + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end. 
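+
+        Example:
+            >>> # Illustrative sketch (editor's addition) of the
+            >>> # map-and-transpose collation that ``multi_apply`` performs
+            >>> # over per-image results (toy function, not real targets).
+            >>> def toy_single(score, meta):
+            ...     return score * 2, meta['id']
+            >>> results = map(toy_single, [1, 2], [{'id': 'a'}, {'id': 'b'}])
+            >>> tuple(map(list, zip(*results)))
+            ([2, 4], ['a', 'b'])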
+ """ + results = multi_apply(self._get_targets_single, cls_scores_list, + mask_preds_list, batch_gt_instances, + batch_img_metas) + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + pos_inds_list, neg_inds_list, sampling_results_list) = results[:7] + rest_results = list(results[7:]) + + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + + res = (labels_list, label_weights_list, mask_targets_list, + mask_weights_list, avg_factor) + if return_sampling_results: + res = res + (sampling_results_list) + + return res + tuple(rest_results) + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> Tuple[Tensor]: + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + img_meta (dict): Image informtation. + + Returns: + tuple: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + gt_masks = gt_instances.masks + gt_labels = gt_instances.labels + + target_shape = mask_pred.shape[-2:] + if gt_masks.shape[0] > 0: + gt_masks_downsampled = F.interpolate( + gt_masks.unsqueeze(1).float(), target_shape, + mode='nearest').squeeze(1).long() + else: + gt_masks_downsampled = gt_masks + + pred_instances = InstanceData(scores=cls_score, masks=mask_pred) + downsampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_masks_downsampled) + # assign and sample + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=downsampled_gt_instances, + img_meta=img_meta) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones(self.num_queries) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds, sampling_result) + + def loss_by_feat(self, all_cls_scores: Tensor, all_mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_cls_scores (Tensor): Classification scores for all decoder + layers with shape (num_decoder, batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. 
+ all_mask_preds (Tensor): Mask scores for all decoder layers with + shape (num_decoder, batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_dec_layers = len(all_cls_scores) + batch_gt_instances_list = [ + batch_gt_instances for _ in range(num_dec_layers) + ] + img_metas_list = [batch_img_metas for _ in range(num_dec_layers)] + losses_cls, losses_mask, losses_dice = multi_apply( + self._loss_by_feat_single, all_cls_scores, all_mask_preds, + batch_gt_instances_list, img_metas_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_mask'] = losses_mask[-1] + loss_dict['loss_dice'] = losses_dice[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_mask_i, loss_dice_i in zip( + losses_cls[:-1], losses_mask[:-1], losses_dice[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_mask'] = loss_mask_i + loss_dict[f'd{num_dec_layer}.loss_dice'] = loss_dice_i + num_dec_layer += 1 + return loss_dict + + def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single decoder\ + layer. 
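+
+        Note:
+            With a single mask channel, the sigmoid ``FocalLoss`` used here
+            treats class index 0 as the positive (foreground) class, which
+            is why the body below feeds ``1 - mask_targets`` as the target;
+            see the in-line comment there.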
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + avg_factor) = self.get_targets(cls_scores_list, mask_preds_list, + batch_gt_instances, batch_img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + target_shape = mask_targets.shape[-2:] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + # upsample to shape of target + # shape (num_total_gts, h, w) + mask_preds = F.interpolate( + mask_preds.unsqueeze(1), + target_shape, + mode='bilinear', + align_corners=False).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_preds, mask_targets, avg_factor=num_total_masks) + + # mask loss + # FocalLoss support input of shape (n, num_class) + h, w = mask_preds.shape[-2:] + # shape (num_total_gts, h, w) -> (num_total_gts * h * w, 1) + mask_preds = mask_preds.reshape(-1, 1) + # shape (num_total_gts, h, w) -> (num_total_gts * h * w) + mask_targets = mask_targets.reshape(-1) + # target is (1 - mask_targets) !!! + loss_mask = self.loss_mask( + mask_preds, 1 - mask_targets, avg_factor=num_total_masks * h * w) + + return loss_cls, loss_mask, loss_dice + + def forward(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> Tuple[Tensor]: + """Forward function. + + Args: + x (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + tuple[Tensor]: a tuple contains two elements. + + - all_cls_scores (Tensor): Classification scores for each\ + scale level. Each is a 4D-tensor with shape\ + (num_decoder, batch_size, num_queries, cls_out_channels).\ + Note `cls_out_channels` should includes background. + - all_mask_preds (Tensor): Mask scores for each decoder\ + layer. Each with shape (num_decoder, batch_size,\ + num_queries, h, w). 
+ """ + batch_img_metas = [ + data_sample.metainfo for data_sample in batch_data_samples + ] + batch_size = x[0].shape[0] + input_img_h, input_img_w = batch_img_metas[0]['batch_input_shape'] + padding_mask = x[-1].new_ones((batch_size, input_img_h, input_img_w), + dtype=torch.float32) + for i in range(batch_size): + img_h, img_w = batch_img_metas[i]['img_shape'] + padding_mask[i, :img_h, :img_w] = 0 + padding_mask = F.interpolate( + padding_mask.unsqueeze(1), size=x[-1].shape[-2:], + mode='nearest').to(torch.bool).squeeze(1) + # when backbone is swin, memory is output of last stage of swin. + # when backbone is r50, memory is output of tranformer encoder. + mask_features, memory = self.pixel_decoder(x, batch_img_metas) + pos_embed = self.decoder_pe(padding_mask) + memory = self.decoder_input_proj(memory) + # shape (batch_size, c, h, w) -> (batch_size, h*w, c) + memory = memory.flatten(2).permute(0, 2, 1) + pos_embed = pos_embed.flatten(2).permute(0, 2, 1) + # shape (batch_size, h * w) + padding_mask = padding_mask.flatten(1) + # shape = (num_queries, embed_dims) + query_embed = self.query_embed.weight + # shape = (batch_size, num_queries, embed_dims) + query_embed = query_embed.unsqueeze(0).repeat(batch_size, 1, 1) + target = torch.zeros_like(query_embed) + # shape (num_decoder, num_queries, batch_size, embed_dims) + out_dec = self.transformer_decoder( + query=target, + key=memory, + value=memory, + query_pos=query_embed, + key_pos=pos_embed, + key_padding_mask=padding_mask) + + # cls_scores + all_cls_scores = self.cls_embed(out_dec) + + # mask_preds + mask_embed = self.mask_embed(out_dec) + all_mask_preds = torch.einsum('lbqc,bchw->lbqhw', mask_embed, + mask_features) + + return all_cls_scores, all_mask_preds + + def loss( + self, + x: Tuple[Tensor], + batch_data_samples: SampleList, + ) -> Dict[str, Tensor]: + """Perform forward propagation and loss calculation of the panoptic + head on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + batch_img_metas = [] + batch_gt_instances = [] + batch_gt_semantic_segs = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + if 'gt_sem_seg' in data_sample: + batch_gt_semantic_segs.append(data_sample.gt_sem_seg) + else: + batch_gt_semantic_segs.append(None) + + # forward + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + + # preprocess ground truth + batch_gt_instances = self.preprocess_gt(batch_gt_instances, + batch_gt_semantic_segs) + + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> Tuple[Tensor]: + """Test without augmentaton. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two tensors. + + - mask_cls_results (Tensor): Mask classification logits,\ + shape (batch_size, num_queries, cls_out_channels). 
+                Note `cls_out_channels` should include background.
+            - mask_pred_results (Tensor): Mask logits, shape \
+                (batch_size, num_queries, h, w).
+        """
+        batch_img_metas = [
+            data_sample.metainfo for data_sample in batch_data_samples
+        ]
+        all_cls_scores, all_mask_preds = self(x, batch_data_samples)
+        mask_cls_results = all_cls_scores[-1]
+        mask_pred_results = all_mask_preds[-1]
+
+        # upsample masks
+        img_shape = batch_img_metas[0]['batch_input_shape']
+        mask_pred_results = F.interpolate(
+            mask_pred_results,
+            size=(img_shape[0], img_shape[1]),
+            mode='bilinear',
+            align_corners=False)
+
+        return mask_cls_results, mask_pred_results
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/nasfcos_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/nasfcos_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..14ee62a7910d90a108fefb2acef00c91ab83ecc8
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/nasfcos_head.py
@@ -0,0 +1,114 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Scale
+
+from mmdet.models.dense_heads.fcos_head import FCOSHead
+from mmdet.registry import MODELS
+from mmdet.utils import OptMultiConfig
+
+
+@MODELS.register_module()
+class NASFCOSHead(FCOSHead):
+    """Anchor-free head used in `NASFCOS <https://arxiv.org/abs/1906.04423>`_.
+
+    It is quite similar to the FCOS head, except for the searched structure
+    of the classification branch and bbox regression branch, where a
+    structure of "dconv3x3, conv3x3, dconv3x3, conv1x1" is utilized instead.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        strides (Sequence[int] or Sequence[Tuple[int, int]]): Strides of
+            points in multiple feature levels. Defaults to
+            (4, 8, 16, 32, 64).
+        regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple
+            level points.
+        center_sampling (bool): If true, use center sampling.
+            Defaults to False.
+        center_sample_radius (float): Radius of center sampling.
+            Defaults to 1.5.
+        norm_on_bbox (bool): If true, normalize the regression targets with
+            FPN strides. Defaults to False.
+        centerness_on_reg (bool): If true, position centerness on the
+            regress branch. Please refer to
+            https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042.
+            Defaults to False.
+        conv_bias (bool or str): If specified as `auto`, it will be decided by
+            the norm_cfg. Bias of conv will be set as True if `norm_cfg` is
+            None, otherwise False. Defaults to "auto".
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss.
+        loss_centerness (:obj:`ConfigDict`, or dict): Config of centerness
+            loss.
+        norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and
+            config norm layer. Defaults to
+            ``norm_cfg=dict(type='GN', num_groups=32, requires_grad=True)``.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], optional): Initialization config dict.
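+
+    Note:
+        ``_init_layers`` below instantiates the searched branch as
+        dconv3x3 -> conv3x3 -> dconv3x3 -> conv1x1 and appends a deep copy
+        of each module to both the classification and regression towers, so
+        the two branches share the architecture but not the weights.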
+ """ # noqa: E501 + + def __init__(self, + *args, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + if init_cfg is None: + init_cfg = [ + dict(type='Caffe2Xavier', layer=['ConvModule', 'Conv2d']), + dict( + type='Normal', + std=0.01, + override=[ + dict(name='conv_reg'), + dict(name='conv_centerness'), + dict( + name='conv_cls', + type='Normal', + std=0.01, + bias_prob=0.01) + ]), + ] + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + dconv3x3_config = dict( + type='DCNv2', + kernel_size=3, + use_bias=True, + deform_groups=2, + padding=1) + conv3x3_config = dict(type='Conv', kernel_size=3, padding=1) + conv1x1_config = dict(type='Conv', kernel_size=1) + + self.arch_config = [ + dconv3x3_config, conv3x3_config, dconv3x3_config, conv1x1_config + ] + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i, op_ in enumerate(self.arch_config): + op = copy.deepcopy(op_) + chn = self.in_channels if i == 0 else self.feat_channels + assert isinstance(op, dict) + use_bias = op.pop('use_bias', False) + padding = op.pop('padding', 0) + kernel_size = op.pop('kernel_size') + module = ConvModule( + chn, + self.feat_channels, + kernel_size, + stride=1, + padding=padding, + norm_cfg=self.norm_cfg, + bias=use_bias, + conv_cfg=op) + + self.cls_convs.append(copy.deepcopy(module)) + self.reg_convs.append(copy.deepcopy(module)) + + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.conv_centerness = nn.Conv2d(self.feat_channels, 1, 3, padding=1) + + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/paa_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/paa_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3c1f453d2788b354970254e8875068e824c370d4 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/paa_head.py @@ -0,0 +1,730 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import numpy as np +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList) +from ..layers import multiclass_nms +from ..utils import levels_to_images, multi_apply +from . import ATSSHead + +EPS = 1e-12 +try: + import sklearn.mixture as skm +except ImportError: + skm = None + + +@MODELS.register_module() +class PAAHead(ATSSHead): + """Head of PAAAssignment: Probabilistic Anchor Assignment with IoU + Prediction for Object Detection. + + Code is modified from the `official github repo + `_. + + More details can be found in the `paper + `_ . + + Args: + topk (int): Select topk samples with smallest loss in + each level. + score_voting (bool): Whether to use score voting in post-process. + covariance_type : String describing the type of covariance parameters + to be used in :class:`sklearn.mixture.GaussianMixture`. + It must be one of: + + - 'full': each component has its own general covariance matrix + - 'tied': all components share the same general covariance matrix + - 'diag': each component has its own diagonal covariance matrix + - 'spherical': each component has its own single variance + Default: 'diag'. 
From 'full' to 'spherical', the gmm fitting + process is faster yet the performance could be influenced. For most + cases, 'diag' should be a good choice. + """ + + def __init__(self, + *args, + topk: int = 9, + score_voting: bool = True, + covariance_type: str = 'diag', + **kwargs): + # topk used in paa reassign process + self.topk = topk + self.with_score_voting = score_voting + self.covariance_type = covariance_type + super().__init__(*args, **kwargs) + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + iou_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + iou_preds (list[Tensor]): iou_preds for each scale + level with shape (N, num_anchors * 1, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss gmm_assignment. + """ + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + ) + (labels, labels_weight, bboxes_target, bboxes_weight, pos_inds, + pos_gt_index) = cls_reg_targets + cls_scores = levels_to_images(cls_scores) + cls_scores = [ + item.reshape(-1, self.cls_out_channels) for item in cls_scores + ] + bbox_preds = levels_to_images(bbox_preds) + bbox_preds = [item.reshape(-1, 4) for item in bbox_preds] + iou_preds = levels_to_images(iou_preds) + iou_preds = [item.reshape(-1, 1) for item in iou_preds] + pos_losses_list, = multi_apply(self.get_pos_loss, anchor_list, + cls_scores, bbox_preds, labels, + labels_weight, bboxes_target, + bboxes_weight, pos_inds) + + with torch.no_grad(): + reassign_labels, reassign_label_weight, \ + reassign_bbox_weights, num_pos = multi_apply( + self.paa_reassign, + pos_losses_list, + labels, + labels_weight, + bboxes_weight, + pos_inds, + pos_gt_index, + anchor_list) + num_pos = sum(num_pos) + # convert all tensor list to a flatten tensor + cls_scores = torch.cat(cls_scores, 0).view(-1, cls_scores[0].size(-1)) + bbox_preds = torch.cat(bbox_preds, 0).view(-1, bbox_preds[0].size(-1)) + iou_preds = torch.cat(iou_preds, 0).view(-1, iou_preds[0].size(-1)) + labels = torch.cat(reassign_labels, 0).view(-1) + flatten_anchors = torch.cat( + [torch.cat(item, 0) for item in anchor_list]) + labels_weight = torch.cat(reassign_label_weight, 0).view(-1) + bboxes_target = torch.cat(bboxes_target, + 0).view(-1, bboxes_target[0].size(-1)) + + pos_inds_flatten = ((labels >= 0) + & + (labels < 
self.num_classes)).nonzero().reshape(-1) + + losses_cls = self.loss_cls( + cls_scores, + labels, + labels_weight, + avg_factor=max(num_pos, len(batch_img_metas))) # avoid num_pos=0 + if num_pos: + pos_bbox_pred = self.bbox_coder.decode( + flatten_anchors[pos_inds_flatten], + bbox_preds[pos_inds_flatten]) + pos_bbox_target = bboxes_target[pos_inds_flatten] + iou_target = bbox_overlaps( + pos_bbox_pred.detach(), pos_bbox_target, is_aligned=True) + losses_iou = self.loss_centerness( + iou_preds[pos_inds_flatten], + iou_target.unsqueeze(-1), + avg_factor=num_pos) + losses_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_target, + iou_target.clamp(min=EPS), + avg_factor=iou_target.sum()) + else: + losses_iou = iou_preds.sum() * 0 + losses_bbox = bbox_preds.sum() * 0 + + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_iou=losses_iou) + + def get_pos_loss(self, anchors: List[Tensor], cls_score: Tensor, + bbox_pred: Tensor, label: Tensor, label_weight: Tensor, + bbox_target: dict, bbox_weight: Tensor, + pos_inds: Tensor) -> Tensor: + """Calculate loss of all potential positive samples obtained from first + match process. + + Args: + anchors (list[Tensor]): Anchors of each scale. + cls_score (Tensor): Box scores of single image with shape + (num_anchors, num_classes) + bbox_pred (Tensor): Box energies / deltas of single image + with shape (num_anchors, 4) + label (Tensor): classification target of each anchor with + shape (num_anchors,) + label_weight (Tensor): Classification loss weight of each + anchor with shape (num_anchors). + bbox_target (dict): Regression target of each anchor with + shape (num_anchors, 4). + bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + pos_inds (Tensor): Index of all positive samples got from + first assign process. + + Returns: + Tensor: Losses of all positive samples in single image. + """ + if not len(pos_inds): + return cls_score.new([]), + anchors_all_level = torch.cat(anchors, 0) + pos_scores = cls_score[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_label = label[pos_inds] + pos_label_weight = label_weight[pos_inds] + pos_bbox_target = bbox_target[pos_inds] + pos_bbox_weight = bbox_weight[pos_inds] + pos_anchors = anchors_all_level[pos_inds] + pos_bbox_pred = self.bbox_coder.decode(pos_anchors, pos_bbox_pred) + + # to keep loss dimension + loss_cls = self.loss_cls( + pos_scores, + pos_label, + pos_label_weight, + avg_factor=1.0, + reduction_override='none') + + loss_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_target, + pos_bbox_weight, + avg_factor=1.0, # keep same loss weight before reassign + reduction_override='none') + + loss_cls = loss_cls.sum(-1) + pos_loss = loss_bbox + loss_cls + return pos_loss, + + def paa_reassign(self, pos_losses: Tensor, label: Tensor, + label_weight: Tensor, bbox_weight: Tensor, + pos_inds: Tensor, pos_gt_inds: Tensor, + anchors: List[Tensor]) -> tuple: + """Fit loss to GMM distribution and separate positive, ignore, negative + samples again with GMM model. + + Args: + pos_losses (Tensor): Losses of all positive samples in + single image. + label (Tensor): classification target of each anchor with + shape (num_anchors,) + label_weight (Tensor): Classification loss weight of each + anchor with shape (num_anchors). + bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + pos_inds (Tensor): Index of all positive samples got from + first assign process. + pos_gt_inds (Tensor): Gt_index of all positive samples got + from first assign process. 
+ anchors (list[Tensor]): Anchors of each scale. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - label (Tensor): classification target of each anchor after + paa assign, with shape (num_anchors,) + - label_weight (Tensor): Classification loss weight of each + anchor after paa assign, with shape (num_anchors). + - bbox_weight (Tensor): Bbox weight of each anchor with shape + (num_anchors, 4). + - num_pos (int): The number of positive samples after paa + assign. + """ + if not len(pos_inds): + return label, label_weight, bbox_weight, 0 + label = label.clone() + label_weight = label_weight.clone() + bbox_weight = bbox_weight.clone() + num_gt = pos_gt_inds.max() + 1 + num_level = len(anchors) + num_anchors_each_level = [item.size(0) for item in anchors] + num_anchors_each_level.insert(0, 0) + inds_level_interval = np.cumsum(num_anchors_each_level) + pos_level_mask = [] + for i in range(num_level): + mask = (pos_inds >= inds_level_interval[i]) & ( + pos_inds < inds_level_interval[i + 1]) + pos_level_mask.append(mask) + pos_inds_after_paa = [label.new_tensor([])] + ignore_inds_after_paa = [label.new_tensor([])] + for gt_ind in range(num_gt): + pos_inds_gmm = [] + pos_loss_gmm = [] + gt_mask = pos_gt_inds == gt_ind + for level in range(num_level): + level_mask = pos_level_mask[level] + level_gt_mask = level_mask & gt_mask + value, topk_inds = pos_losses[level_gt_mask].topk( + min(level_gt_mask.sum(), self.topk), largest=False) + pos_inds_gmm.append(pos_inds[level_gt_mask][topk_inds]) + pos_loss_gmm.append(value) + pos_inds_gmm = torch.cat(pos_inds_gmm) + pos_loss_gmm = torch.cat(pos_loss_gmm) + # fix gmm need at least two sample + if len(pos_inds_gmm) < 2: + continue + device = pos_inds_gmm.device + pos_loss_gmm, sort_inds = pos_loss_gmm.sort() + pos_inds_gmm = pos_inds_gmm[sort_inds] + pos_loss_gmm = pos_loss_gmm.view(-1, 1).cpu().numpy() + min_loss, max_loss = pos_loss_gmm.min(), pos_loss_gmm.max() + means_init = np.array([min_loss, max_loss]).reshape(2, 1) + weights_init = np.array([0.5, 0.5]) + precisions_init = np.array([1.0, 1.0]).reshape(2, 1, 1) # full + if self.covariance_type == 'spherical': + precisions_init = precisions_init.reshape(2) + elif self.covariance_type == 'diag': + precisions_init = precisions_init.reshape(2, 1) + elif self.covariance_type == 'tied': + precisions_init = np.array([[1.0]]) + if skm is None: + raise ImportError('Please run "pip install sklearn" ' + 'to install sklearn first.') + gmm = skm.GaussianMixture( + 2, + weights_init=weights_init, + means_init=means_init, + precisions_init=precisions_init, + covariance_type=self.covariance_type) + gmm.fit(pos_loss_gmm) + gmm_assignment = gmm.predict(pos_loss_gmm) + scores = gmm.score_samples(pos_loss_gmm) + gmm_assignment = torch.from_numpy(gmm_assignment).to(device) + scores = torch.from_numpy(scores).to(device) + + pos_inds_temp, ignore_inds_temp = self.gmm_separation_scheme( + gmm_assignment, scores, pos_inds_gmm) + pos_inds_after_paa.append(pos_inds_temp) + ignore_inds_after_paa.append(ignore_inds_temp) + + pos_inds_after_paa = torch.cat(pos_inds_after_paa) + ignore_inds_after_paa = torch.cat(ignore_inds_after_paa) + reassign_mask = (pos_inds.unsqueeze(1) != pos_inds_after_paa).all(1) + reassign_ids = pos_inds[reassign_mask] + label[reassign_ids] = self.num_classes + label_weight[ignore_inds_after_paa] = 0 + bbox_weight[reassign_ids] = 0 + num_pos = len(pos_inds_after_paa) + return label, label_weight, bbox_weight, num_pos + + def gmm_separation_scheme(self, gmm_assignment: Tensor, 
scores: Tensor,
+                              pos_inds_gmm: Tensor) -> Tuple[Tensor, Tensor]:
+        """A general separation scheme for gmm model.
+
+        It separates a GMM distribution of candidate samples into three
+        parts, 0, 1 and uncertain areas, and you can implement other
+        separation schemes by rewriting this function.
+
+        Args:
+            gmm_assignment (Tensor): The prediction of GMM which is of shape
+                (num_samples,). The 0/1 value indicates the distribution
+                that each sample comes from.
+            scores (Tensor): The probability of each sample coming from the
+                fitted GMM distribution. The tensor is of shape
+                (num_samples,).
+            pos_inds_gmm (Tensor): All the indices of samples which are used
+                to fit the GMM model. The tensor is of shape (num_samples,)
+
+        Returns:
+            tuple[Tensor, Tensor]: The indices of positive and ignored
+            samples.
+
+            - pos_inds_temp (Tensor): Indices of positive samples.
+            - ignore_inds_temp (Tensor): Indices of ignore samples.
+        """
+        # The implementation is (c) in Fig.3 of the original paper instead
+        # of (b). You can refer to issues such as
+        # https://github.com/kkhoot/PAA/issues/8 and
+        # https://github.com/kkhoot/PAA/issues/9.
+        fgs = gmm_assignment == 0
+        pos_inds_temp = fgs.new_tensor([], dtype=torch.long)
+        ignore_inds_temp = fgs.new_tensor([], dtype=torch.long)
+        if fgs.nonzero().numel():
+            _, pos_thr_ind = scores[fgs].topk(1)
+            pos_inds_temp = pos_inds_gmm[fgs][:pos_thr_ind + 1]
+            ignore_inds_temp = pos_inds_gmm.new_tensor([])
+        return pos_inds_temp, ignore_inds_temp
+
+    def get_targets(self,
+                    anchor_list: List[List[Tensor]],
+                    valid_flag_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs: bool = True) -> tuple:
+        """Get targets for PAA head.
+
+        This method is almost the same as `AnchorHead.get_targets()`. We
+        directly return the results from `_get_targets_single` instead of
+        mapping them to levels with the `images_to_levels` function.
+
+        Args:
+            anchor_list (list[list[Tensor]]): Multi level anchors of each
+                image. The outer list indicates images, and the inner list
+                corresponds to feature levels of the image. Each element of
+                the inner list is a tensor of shape (num_anchors, 4).
+            valid_flag_list (list[list[Tensor]]): Multi level valid flags of
+                each image. The outer list indicates images, and the inner
+                list corresponds to feature levels of the image. Each
+                element of the inner list is a tensor of shape (num_anchors, )
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple: Usually returns a tuple containing learning targets.
+
+            - labels (list[Tensor]): Labels of all anchors, each with
+                shape (num_anchors,).
+            - label_weights (list[Tensor]): Label weights of all anchors,
+                each with shape (num_anchors,).
+            - bbox_targets (list[Tensor]): BBox targets of all anchors,
+                each with shape (num_anchors, 4).
+            - bbox_weights (list[Tensor]): BBox weights of all anchors,
+                each with shape (num_anchors, 4).
+            - pos_inds (list[Tensor]): Contains all indices of positive
+                samples in all anchors.
+            - gt_inds (list[Tensor]): Contains the gt indices of positive
+                samples in all anchors.
+        """
+
+        num_imgs = len(batch_img_metas)
+        assert len(anchor_list) == len(valid_flag_list) == num_imgs
+        concat_anchor_list = []
+        concat_valid_flag_list = []
+        for i in range(num_imgs):
+            assert len(anchor_list[i]) == len(valid_flag_list[i])
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+            concat_valid_flag_list.append(torch.cat(valid_flag_list[i]))
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+        results = multi_apply(
+            self._get_targets_single,
+            concat_anchor_list,
+            concat_valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore,
+            unmap_outputs=unmap_outputs)
+
+        (labels, label_weights, bbox_targets, bbox_weights, valid_pos_inds,
+         valid_neg_inds, sampling_result) = results
+
+        # Due to valid flag of anchors, we have to calculate the real pos_inds
+        # in origin anchor set.
+        pos_inds = []
+        for i, single_labels in enumerate(labels):
+            pos_mask = (0 <= single_labels) & (
+                single_labels < self.num_classes)
+            pos_inds.append(pos_mask.nonzero().view(-1))
+
+        gt_inds = [item.pos_assigned_gt_inds for item in sampling_result]
+        return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+                gt_inds)
+
+    def _get_targets_single(self,
+                            flat_anchors: Tensor,
+                            valid_flags: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        This method is the same as `AnchorHead._get_targets_single()`.
+        """
+        assert unmap_outputs, 'We must map outputs back to the original ' \
+            'set of anchors in PAAHead'
+        return super(ATSSHead, self)._get_targets_single(
+            flat_anchors,
+            valid_flags,
+            gt_instances,
+            img_meta,
+            gt_instances_ignore,
+            unmap_outputs=True)
+
+    def predict_by_feat(self,
+                        cls_scores: List[Tensor],
+                        bbox_preds: List[Tensor],
+                        score_factors: Optional[List[Tensor]] = None,
+                        batch_img_metas: Optional[List[dict]] = None,
+                        cfg: OptConfigType = None,
+                        rescale: bool = False,
+                        with_nms: bool = True) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        bbox results.
+
+        This method is the same as `BaseDenseHead.get_results()`.
+        """
+        assert with_nms, 'PAA only supports "with_nms=True" now and it ' \
+            'means PAAHead does not support ' \
+            'test-time augmentation'
+        return super().predict_by_feat(
+            cls_scores=cls_scores,
+            bbox_preds=bbox_preds,
+            score_factors=score_factors,
+            batch_img_metas=batch_img_metas,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms)
+
+    def _predict_by_feat_single(self,
+                                cls_score_list: List[Tensor],
+                                bbox_pred_list: List[Tensor],
+                                score_factor_list: List[Tensor],
+                                mlvl_priors: List[Tensor],
+                                img_meta: dict,
+                                cfg: OptConfigType = None,
+                                rescale: bool = False,
+                                with_nms: bool = True) -> InstanceData:
+        """Transform a single image's features extracted from the head into
+        bbox results.
+
+        Args:
+            cls_score_list (list[Tensor]): Box scores from all scale
+                levels of a single image, each item has shape
+                (num_priors * num_classes, H, W).
+            bbox_pred_list (list[Tensor]): Box energies / deltas from
+                all scale levels of a single image, each item has shape
+                (num_priors * 4, H, W).
+            score_factor_list (list[Tensor]): Score factors from all scale
+                levels of a single image, each item has shape
+                (num_priors * 1, H, W).
+            mlvl_priors (list[Tensor]): Each element in the list is
+                the priors of a single level in feature pyramid, has shape
+                (num_priors, 4).
+            img_meta (dict): Image meta info.
+            cfg (:obj:`ConfigDict` or dict, optional): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        img_shape = img_meta['img_shape']
+        nms_pre = cfg.get('nms_pre', -1)
+
+        mlvl_bboxes = []
+        mlvl_scores = []
+        mlvl_score_factors = []
+        for level_idx, (cls_score, bbox_pred, score_factor, priors) in \
+                enumerate(zip(cls_score_list, bbox_pred_list,
+                              score_factor_list, mlvl_priors)):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+
+            scores = cls_score.permute(1, 2, 0).reshape(
+                -1, self.cls_out_channels).sigmoid()
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid()
+
+            if 0 < nms_pre < scores.shape[0]:
+                max_scores, _ = (scores *
+                                 score_factor[:, None]).sqrt().max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                priors = priors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+                score_factor = score_factor[topk_inds]
+
+            bboxes = self.bbox_coder.decode(
+                priors, bbox_pred, max_shape=img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_score_factors.append(score_factor)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.score_factors = torch.cat(mlvl_score_factors)
+
+        return self._bbox_post_process(results, cfg, rescale, with_nms,
+                                       img_meta)
+
+    def _bbox_post_process(self,
+                           results: InstanceData,
+                           cfg: ConfigType,
+                           rescale: bool = False,
+                           with_nms: bool = True,
+                           img_meta: Optional[dict] = None):
+        """bbox post-processing method.
+
+        The boxes are rescaled to the original image scale and NMS is
+        applied. Usually ``with_nms=False`` is used for test-time
+        augmentation.
+
+        Args:
+            results (:obj:`InstanceData`): Detection instance results,
+                each item has shape (num_bboxes, ).
+            cfg (:obj:`ConfigDict` or dict): Test / postprocessing
+                configuration, if None, test_cfg would be used.
+            rescale (bool): If True, return boxes in original image space.
+                Default: False.
+            with_nms (bool): If True, do nms before return boxes.
+                Default: True.
+            img_meta (dict, optional): Image meta info. Defaults to None.
+
+        Returns:
+            :obj:`InstanceData`: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
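+
+        Example (an illustrative sketch of the sqrt score fusion used
+        below; the score and score-factor values are made up):
+            >>> import torch
+            >>> scores = torch.tensor([[0.81, 0.04]])
+            >>> score_factors = torch.tensor([0.25])
+            >>> (scores * score_factors[:, None]).sqrt()
+            tensor([[0.4500, 0.1000]])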
+ """ + if rescale: + results.bboxes /= results.bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + padding = results.scores.new_zeros(results.scores.shape[0], 1) + mlvl_scores = torch.cat([results.scores, padding], dim=1) + + mlvl_nms_scores = (mlvl_scores * results.score_factors[:, None]).sqrt() + det_bboxes, det_labels = multiclass_nms( + results.bboxes, + mlvl_nms_scores, + cfg.score_thr, + cfg.nms, + cfg.max_per_img, + score_factors=None) + if self.with_score_voting and len(det_bboxes) > 0: + det_bboxes, det_labels = self.score_voting(det_bboxes, det_labels, + results.bboxes, + mlvl_nms_scores, + cfg.score_thr) + nms_results = InstanceData() + nms_results.bboxes = det_bboxes[:, :-1] + nms_results.scores = det_bboxes[:, -1] + nms_results.labels = det_labels + return nms_results + + def score_voting(self, det_bboxes: Tensor, det_labels: Tensor, + mlvl_bboxes: Tensor, mlvl_nms_scores: Tensor, + score_thr: float) -> Tuple[Tensor, Tensor]: + """Implementation of score voting method works on each remaining boxes + after NMS procedure. + + Args: + det_bboxes (Tensor): Remaining boxes after NMS procedure, + with shape (k, 5), each dimension means + (x1, y1, x2, y2, score). + det_labels (Tensor): The label of remaining boxes, with shape + (k, 1),Labels are 0-based. + mlvl_bboxes (Tensor): All boxes before the NMS procedure, + with shape (num_anchors,4). + mlvl_nms_scores (Tensor): The scores of all boxes which is used + in the NMS procedure, with shape (num_anchors, num_class) + score_thr (float): The score threshold of bboxes. + + Returns: + tuple: Usually returns a tuple containing voting results. + + - det_bboxes_voted (Tensor): Remaining boxes after + score voting procedure, with shape (k, 5), each + dimension means (x1, y1, x2, y2, score). + - det_labels_voted (Tensor): Label of remaining bboxes + after voting, with shape (num_anchors,). 
+ """ + candidate_mask = mlvl_nms_scores > score_thr + candidate_mask_nonzeros = candidate_mask.nonzero(as_tuple=False) + candidate_inds = candidate_mask_nonzeros[:, 0] + candidate_labels = candidate_mask_nonzeros[:, 1] + candidate_bboxes = mlvl_bboxes[candidate_inds] + candidate_scores = mlvl_nms_scores[candidate_mask] + det_bboxes_voted = [] + det_labels_voted = [] + for cls in range(self.cls_out_channels): + candidate_cls_mask = candidate_labels == cls + if not candidate_cls_mask.any(): + continue + candidate_cls_scores = candidate_scores[candidate_cls_mask] + candidate_cls_bboxes = candidate_bboxes[candidate_cls_mask] + det_cls_mask = det_labels == cls + det_cls_bboxes = det_bboxes[det_cls_mask].view( + -1, det_bboxes.size(-1)) + det_candidate_ious = bbox_overlaps(det_cls_bboxes[:, :4], + candidate_cls_bboxes) + for det_ind in range(len(det_cls_bboxes)): + single_det_ious = det_candidate_ious[det_ind] + pos_ious_mask = single_det_ious > 0.01 + pos_ious = single_det_ious[pos_ious_mask] + pos_bboxes = candidate_cls_bboxes[pos_ious_mask] + pos_scores = candidate_cls_scores[pos_ious_mask] + pis = (torch.exp(-(1 - pos_ious)**2 / 0.025) * + pos_scores)[:, None] + voted_box = torch.sum( + pis * pos_bboxes, dim=0) / torch.sum( + pis, dim=0) + voted_score = det_cls_bboxes[det_ind][-1:][None, :] + det_bboxes_voted.append( + torch.cat((voted_box[None, :], voted_score), dim=1)) + det_labels_voted.append(cls) + + det_bboxes_voted = torch.cat(det_bboxes_voted, dim=0) + det_labels_voted = det_labels.new_tensor(det_labels_voted) + return det_bboxes_voted, det_labels_voted diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/pisa_retinanet_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/pisa_retinanet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..85fd54f5be3605d0994c2a2d4d9d7deac4c0f284 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/pisa_retinanet_head.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptInstanceList +from ..losses import carl_loss, isr_p +from ..utils import images_to_levels +from .retina_head import RetinaHead + + +@MODELS.register_module() +class PISARetinaHead(RetinaHead): + """PISA Retinanet Head. + + The head owns the same structure with Retinanet Head, but differs in two + aspects: + 1. Importance-based Sample Reweighting Positive (ISR-P) is applied to + change the positive loss weights. + 2. Classification-aware regression loss is adopted as a third loss. + """ + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. 
+                It includes ``bboxes`` attribute data that is ignored
+                during training and testing. Defaults to None.
+
+        Returns:
+            dict: Loss dict, comprising classification loss, regression loss
+            and CARL loss.
+        """
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == self.prior_generator.num_levels
+
+        device = cls_scores[0].device
+
+        anchor_list, valid_flag_list = self.get_anchors(
+            featmap_sizes, batch_img_metas, device=device)
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = self.get_targets(
+            anchor_list,
+            valid_flag_list,
+            batch_gt_instances,
+            batch_img_metas,
+            batch_gt_instances_ignore=batch_gt_instances_ignore,
+            return_sampling_results=True)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list,
+         bbox_weights_list, avg_factor, sampling_results_list) = \
+            cls_reg_targets
+
+        # anchor number of multi levels
+        num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]]
+        # concat all level anchors and flags to a single tensor
+        concat_anchor_list = []
+        for i in range(len(anchor_list)):
+            concat_anchor_list.append(torch.cat(anchor_list[i]))
+        all_anchor_list = images_to_levels(concat_anchor_list,
+                                           num_level_anchors)
+
+        num_imgs = len(batch_img_metas)
+        flatten_cls_scores = [
+            cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1,
+                                                  label_channels)
+            for cls_score in cls_scores
+        ]
+        flatten_cls_scores = torch.cat(
+            flatten_cls_scores,
+            dim=1).reshape(-1, flatten_cls_scores[0].size(-1))
+        flatten_bbox_preds = [
+            bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4)
+            for bbox_pred in bbox_preds
+        ]
+        flatten_bbox_preds = torch.cat(
+            flatten_bbox_preds,
+            dim=1).view(-1, flatten_bbox_preds[0].size(-1))
+        flatten_labels = torch.cat(labels_list, dim=1).reshape(-1)
+        flatten_label_weights = torch.cat(
+            label_weights_list, dim=1).reshape(-1)
+        flatten_anchors = torch.cat(all_anchor_list, dim=1).reshape(-1, 4)
+        flatten_bbox_targets = torch.cat(
+            bbox_targets_list, dim=1).reshape(-1, 4)
+        flatten_bbox_weights = torch.cat(
+            bbox_weights_list, dim=1).reshape(-1, 4)
+
+        # Apply ISR-P
+        isr_cfg = self.train_cfg.get('isr', None)
+        if isr_cfg is not None:
+            all_targets = (flatten_labels, flatten_label_weights,
+                           flatten_bbox_targets, flatten_bbox_weights)
+            with torch.no_grad():
+                all_targets = isr_p(
+                    flatten_cls_scores,
+                    flatten_bbox_preds,
+                    all_targets,
+                    flatten_anchors,
+                    sampling_results_list,
+                    bbox_coder=self.bbox_coder,
+                    loss_cls=self.loss_cls,
+                    num_class=self.num_classes,
+                    **self.train_cfg['isr'])
+            (flatten_labels, flatten_label_weights, flatten_bbox_targets,
+             flatten_bbox_weights) = all_targets
+
+        # For convenience we compute the loss once instead of separating it
+        # by FPN level, so that we don't need to separate the weights by
+        # level again.
+        # The result should be the same.
+        losses_cls = self.loss_cls(
+            flatten_cls_scores,
+            flatten_labels,
+            flatten_label_weights,
+            avg_factor=avg_factor)
+        losses_bbox = self.loss_bbox(
+            flatten_bbox_preds,
+            flatten_bbox_targets,
+            flatten_bbox_weights,
+            avg_factor=avg_factor)
+        loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+
+        # CARL Loss
+        carl_cfg = self.train_cfg.get('carl', None)
+        if carl_cfg is not None:
+            loss_carl = carl_loss(
+                flatten_cls_scores,
+                flatten_labels,
+                flatten_bbox_preds,
+                flatten_bbox_targets,
+                self.loss_bbox,
+                **self.train_cfg['carl'],
+                avg_factor=avg_factor,
+                sigmoid=True,
+                num_class=self.num_classes)
+            loss_dict.update(loss_carl)
+
+        return loss_dict
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/pisa_ssd_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/pisa_ssd_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec09cb40a9c95d3f9889d736b80dfccef07f6fd1
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/pisa_ssd_head.py
@@ -0,0 +1,182 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import InstanceList, OptInstanceList
+from ..losses import CrossEntropyLoss, SmoothL1Loss, carl_loss, isr_p
+from ..utils import multi_apply
+from .ssd_head import SSDHead
+
+
+# TODO: add loss evaluator for SSD
+@MODELS.register_module()
+class PISASSDHead(SSDHead):
+    """Implementation of `PISA SSD head <https://arxiv.org/abs/1904.04821>`_
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (Sequence[int]): Number of channels in the input feature
+            map.
+        stacked_convs (int): Number of conv layers in cls and reg tower.
+            Defaults to 0.
+        feat_channels (int): Number of hidden channels when stacked_convs
+            > 0. Defaults to 256.
+        use_depthwise (bool): Whether to use DepthwiseSeparableConv.
+            Defaults to False.
+        conv_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to
+            construct and config conv layer. Defaults to None.
+        norm_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to
+            construct and config norm layer. Defaults to None.
+        act_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to
+            construct and config activation layer. Defaults to None.
+        anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor
+            generator.
+        bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder.
+        reg_decoded_bbox (bool): If true, the regression loss would be
+            applied directly on decoded bounding boxes, converting both
+            the predicted boxes and regression targets to absolute
+            coordinates format. Defaults to False. It should be `True` when
+            using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head.
+        train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of
+            anchor head.
+        test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of
+            anchor head.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict], Optional): Initialization config dict.
+    """  # noqa: W605
+
+    def loss_by_feat(
+        self,
+        cls_scores: List[Tensor],
+        bbox_preds: List[Tensor],
+        batch_gt_instances: InstanceList,
+        batch_img_metas: List[dict],
+        batch_gt_instances_ignore: OptInstanceList = None
+    ) -> Dict[str, Union[List[Tensor], Tensor]]:
+        """Compute losses of the head.
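+
+        Besides the standard SSD losses, ISR-P reweighting and a CARL term
+        are applied when ``isr`` / ``carl`` are set in ``train_cfg``.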
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Union[List[Tensor], Tensor]]: A dictionary of loss + components. the dict has components below: + + - loss_cls (list[Tensor]): A list containing each feature map \ + classification loss. + - loss_bbox (list[Tensor]): A list containing each feature map \ + regression loss. + - loss_carl (Tensor): The loss of CARL. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + unmap_outputs=False, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results_list) = cls_reg_targets + + num_images = len(batch_img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, -1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + isr_cfg = self.train_cfg.get('isr', None) + all_targets = (all_labels.view(-1), all_label_weights.view(-1), + all_bbox_targets.view(-1, + 4), all_bbox_weights.view(-1, 4)) + # apply ISR-P + if isr_cfg is not None: + all_targets = isr_p( + all_cls_scores.view(-1, all_cls_scores.size(-1)), + all_bbox_preds.view(-1, 4), + all_targets, + torch.cat(all_anchors), + sampling_results_list, + loss_cls=CrossEntropyLoss(), + bbox_coder=self.bbox_coder, + **self.train_cfg['isr'], + num_class=self.num_classes) + (new_labels, new_label_weights, new_bbox_targets, + new_bbox_weights) = all_targets + all_labels = new_labels.view(all_labels.shape) + all_label_weights = new_label_weights.view(all_label_weights.shape) + all_bbox_targets = new_bbox_targets.view(all_bbox_targets.shape) + all_bbox_weights = new_bbox_weights.view(all_bbox_weights.shape) + + # add CARL loss + carl_loss_cfg = self.train_cfg.get('carl', None) + if carl_loss_cfg is not None: + loss_carl = carl_loss( + all_cls_scores.view(-1, all_cls_scores.size(-1)), + all_targets[0], + all_bbox_preds.view(-1, 4), + all_targets[2], + SmoothL1Loss(beta=1.), + **self.train_cfg['carl'], + avg_factor=avg_factor, + 
num_class=self.num_classes)
+
+        # check NaN and Inf
+        assert torch.isfinite(all_cls_scores).all().item(), \
+            'classification scores become infinite or NaN!'
+        assert torch.isfinite(all_bbox_preds).all().item(), \
+            'bbox predictions become infinite or NaN!'
+
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_by_feat_single,
+            all_cls_scores,
+            all_bbox_preds,
+            all_anchors,
+            all_labels,
+            all_label_weights,
+            all_bbox_targets,
+            all_bbox_weights,
+            avg_factor=avg_factor)
+        loss_dict = dict(loss_cls=losses_cls, loss_bbox=losses_bbox)
+        if carl_loss_cfg is not None:
+            loss_dict.update(loss_carl)
+        return loss_dict
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/reppoints_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/reppoints_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..22f3e3401a4abd9cc35b41d24efe23e5655a905e
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/reppoints_head.py
@@ -0,0 +1,885 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, List, Sequence, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.ops import DeformConv2d
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList
+from ..task_modules.prior_generators import MlvlPointGenerator
+from ..task_modules.samplers import PseudoSampler
+from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply,
+                     unmap)
+from .anchor_free_head import AnchorFreeHead
+
+
+@MODELS.register_module()
+class RepPointsHead(AnchorFreeHead):
+    """RepPoints head.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        point_feat_channels (int): Number of channels of points features.
+        num_points (int): Number of points.
+        gradient_mul (float): The multiplier to gradients from
+            points refinement and recognition.
+        point_strides (Sequence[int]): points strides.
+        point_base_scale (int): bbox scale for assigning labels.
+        loss_cls (:obj:`ConfigDict` or dict): Config of classification loss.
+        loss_bbox_init (:obj:`ConfigDict` or dict): Config of initial points
+            loss.
+        loss_bbox_refine (:obj:`ConfigDict` or dict): Config of points loss
+            in refinement.
+        use_grid_points (bool): If we use the bounding box representation,
+            the reppoints are represented as grid points on the bounding box.
+        center_init (bool): Whether to use center point assignment.
+        transform_method (str): The methods to transform RepPoints to bbox.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
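+
+    Example (an illustrative sketch, not from the original docstring; it
+    only demonstrates the square-number constraint checked in ``__init__``):
+        >>> import numpy as np
+        >>> num_points = 9
+        >>> dcn_kernel = int(np.sqrt(num_points))  # a 3x3 deformable kernel
+        >>> assert dcn_kernel ** 2 == num_points and dcn_kernel % 2 == 1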
+ """ # noqa: W605 + + def __init__(self, + num_classes: int, + in_channels: int, + point_feat_channels: int = 256, + num_points: int = 9, + gradient_mul: float = 0.1, + point_strides: Sequence[int] = [8, 16, 32, 64, 128], + point_base_scale: int = 4, + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_init: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.5), + loss_bbox_refine: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + use_grid_points: bool = False, + center_init: bool = True, + transform_method: str = 'moment', + moment_mul: float = 0.01, + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='reppoints_cls_out', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + self.num_points = num_points + self.point_feat_channels = point_feat_channels + self.use_grid_points = use_grid_points + self.center_init = center_init + + # we use deform conv to extract points features + self.dcn_kernel = int(np.sqrt(num_points)) + self.dcn_pad = int((self.dcn_kernel - 1) / 2) + assert self.dcn_kernel * self.dcn_kernel == num_points, \ + 'The points number should be a square number.' + assert self.dcn_kernel % 2 == 1, \ + 'The points number should be an odd square number.' + dcn_base = np.arange(-self.dcn_pad, + self.dcn_pad + 1).astype(np.float64) + dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) + dcn_base_x = np.tile(dcn_base, self.dcn_kernel) + dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( + (-1)) + self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) + + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + init_cfg=init_cfg, + **kwargs) + + self.gradient_mul = gradient_mul + self.point_base_scale = point_base_scale + self.point_strides = point_strides + self.prior_generator = MlvlPointGenerator( + self.point_strides, offset=0.) 
+
+        if self.train_cfg:
+            self.init_assigner = TASK_UTILS.build(
+                self.train_cfg['init']['assigner'])
+            self.refine_assigner = TASK_UTILS.build(
+                self.train_cfg['refine']['assigner'])
+
+            if self.train_cfg.get('sampler', None) is not None:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        self.transform_method = transform_method
+        if self.transform_method == 'moment':
+            self.moment_transfer = nn.Parameter(
+                data=torch.zeros(2), requires_grad=True)
+            self.moment_mul = moment_mul
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = self.num_classes
+        else:
+            self.cls_out_channels = self.num_classes + 1
+        self.loss_bbox_init = MODELS.build(loss_bbox_init)
+        self.loss_bbox_refine = MODELS.build(loss_bbox_refine)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        pts_out_dim = 4 if self.use_grid_points else 2 * self.num_points
+        self.reppoints_cls_conv = DeformConv2d(self.feat_channels,
+                                               self.point_feat_channels,
+                                               self.dcn_kernel, 1,
+                                               self.dcn_pad)
+        self.reppoints_cls_out = nn.Conv2d(self.point_feat_channels,
+                                           self.cls_out_channels, 1, 1, 0)
+        self.reppoints_pts_init_conv = nn.Conv2d(self.feat_channels,
+                                                 self.point_feat_channels, 3,
+                                                 1, 1)
+        self.reppoints_pts_init_out = nn.Conv2d(self.point_feat_channels,
+                                                pts_out_dim, 1, 1, 0)
+        self.reppoints_pts_refine_conv = DeformConv2d(self.feat_channels,
+                                                      self.point_feat_channels,
+                                                      self.dcn_kernel, 1,
+                                                      self.dcn_pad)
+        self.reppoints_pts_refine_out = nn.Conv2d(self.point_feat_channels,
+                                                  pts_out_dim, 1, 1, 0)
+
+    def points2bbox(self, pts: Tensor, y_first: bool = True) -> Tensor:
+        """Convert a point set into a bounding box.
+
+        Args:
+            pts (Tensor): the input point sets (fields); each point
+                set (field) is represented by 2n scalars.
+            y_first (bool): if y_first=True, the point set is
+                represented as [y1, x1, y2, x2 ... yn, xn], otherwise
+                the point set is represented as
+                [x1, y1, x2, y2 ... xn, yn]. Defaults to True.
+
+        Returns:
+            Tensor: each point set is converted to a bbox [x1, y1, x2, y2].
+        """
+        pts_reshape = pts.view(pts.shape[0], -1, 2, *pts.shape[2:])
+        pts_y = pts_reshape[:, :, 0, ...] if y_first else pts_reshape[:, :, 1,
+                                                                      ...]
+        pts_x = pts_reshape[:, :, 1, ...] if y_first else pts_reshape[:, :, 0,
+                                                                      ...]
+        if self.transform_method == 'minmax':
+            bbox_left = pts_x.min(dim=1, keepdim=True)[0]
+            bbox_right = pts_x.max(dim=1, keepdim=True)[0]
+            bbox_up = pts_y.min(dim=1, keepdim=True)[0]
+            bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
+            bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
+                             dim=1)
+        elif self.transform_method == 'partial_minmax':
+            pts_y = pts_y[:, :4, ...]
+            pts_x = pts_x[:, :4, ...]
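+            # 'partial_minmax' takes the min/max over only these first
+            # four points of each set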
+            bbox_left = pts_x.min(dim=1, keepdim=True)[0]
+            bbox_right = pts_x.max(dim=1, keepdim=True)[0]
+            bbox_up = pts_y.min(dim=1, keepdim=True)[0]
+            bbox_bottom = pts_y.max(dim=1, keepdim=True)[0]
+            bbox = torch.cat([bbox_left, bbox_up, bbox_right, bbox_bottom],
+                             dim=1)
+        elif self.transform_method == 'moment':
+            pts_y_mean = pts_y.mean(dim=1, keepdim=True)
+            pts_x_mean = pts_x.mean(dim=1, keepdim=True)
+            pts_y_std = torch.std(pts_y - pts_y_mean, dim=1, keepdim=True)
+            pts_x_std = torch.std(pts_x - pts_x_mean, dim=1, keepdim=True)
+            moment_transfer = (self.moment_transfer * self.moment_mul) + (
+                self.moment_transfer.detach() * (1 - self.moment_mul))
+            moment_width_transfer = moment_transfer[0]
+            moment_height_transfer = moment_transfer[1]
+            half_width = pts_x_std * torch.exp(moment_width_transfer)
+            half_height = pts_y_std * torch.exp(moment_height_transfer)
+            bbox = torch.cat([
+                pts_x_mean - half_width, pts_y_mean - half_height,
+                pts_x_mean + half_width, pts_y_mean + half_height
+            ],
+                             dim=1)
+        else:
+            raise NotImplementedError
+        return bbox
+
+    def gen_grid_from_reg(self, reg: Tensor,
+                          previous_boxes: Tensor) -> Tuple[Tensor]:
+        """Based on the previous bboxes and regression values, compute the
+        regressed bboxes and generate the grids on the bboxes.
+
+        Args:
+            reg (Tensor): the regression value to previous bboxes.
+            previous_boxes (Tensor): previous bboxes.
+
+        Returns:
+            Tuple[Tensor]: the grids generated on the regressed bboxes.
+        """
+        b, _, h, w = reg.shape
+        bxy = (previous_boxes[:, :2, ...] + previous_boxes[:, 2:, ...]) / 2.
+        bwh = (previous_boxes[:, 2:, ...] -
+               previous_boxes[:, :2, ...]).clamp(min=1e-6)
+        grid_topleft = bxy + bwh * reg[:, :2, ...] - 0.5 * bwh * torch.exp(
+            reg[:, 2:, ...])
+        grid_wh = bwh * torch.exp(reg[:, 2:, ...])
+        grid_left = grid_topleft[:, [0], ...]
+        grid_top = grid_topleft[:, [1], ...]
+        grid_width = grid_wh[:, [0], ...]
+        grid_height = grid_wh[:, [1], ...]
+        interval = torch.linspace(0., 1., self.dcn_kernel).view(
+            1, self.dcn_kernel, 1, 1).type_as(reg)
+        grid_x = grid_left + grid_width * interval
+        grid_x = grid_x.unsqueeze(1).repeat(1, self.dcn_kernel, 1, 1, 1)
+        grid_x = grid_x.view(b, -1, h, w)
+        grid_y = grid_top + grid_height * interval
+        grid_y = grid_y.unsqueeze(2).repeat(1, 1, self.dcn_kernel, 1, 1)
+        grid_y = grid_y.view(b, -1, h, w)
+        grid_yx = torch.stack([grid_y, grid_x], dim=2)
+        grid_yx = grid_yx.view(b, -1, h, w)
+        regressed_bbox = torch.cat([
+            grid_left, grid_top, grid_left + grid_width,
+            grid_top + grid_height
+        ], 1)
+        return grid_yx, regressed_bbox
+
+    def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]:
+        return multi_apply(self.forward_single, feats)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor]:
+        """Forward feature map of a single FPN level."""
+        dcn_base_offset = self.dcn_base_offset.type_as(x)
+        # If we use center_init, the initial reppoints are from center
+        # points. If we use the bounding bbox representation, the initial
+        # reppoints are from a regular grid placed on a pre-defined bbox.
+ if self.use_grid_points or not self.center_init: + scale = self.point_base_scale / 2 + points_init = dcn_base_offset / dcn_base_offset.max() * scale + bbox_init = x.new_tensor([-scale, -scale, scale, + scale]).view(1, 4, 1, 1) + else: + points_init = 0 + cls_feat = x + pts_feat = x + for cls_conv in self.cls_convs: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs: + pts_feat = reg_conv(pts_feat) + # initialize reppoints + pts_out_init = self.reppoints_pts_init_out( + self.relu(self.reppoints_pts_init_conv(pts_feat))) + if self.use_grid_points: + pts_out_init, bbox_out_init = self.gen_grid_from_reg( + pts_out_init, bbox_init.detach()) + else: + pts_out_init = pts_out_init + points_init + # refine and classify reppoints + pts_out_init_grad_mul = (1 - self.gradient_mul) * pts_out_init.detach( + ) + self.gradient_mul * pts_out_init + dcn_offset = pts_out_init_grad_mul - dcn_base_offset + cls_out = self.reppoints_cls_out( + self.relu(self.reppoints_cls_conv(cls_feat, dcn_offset))) + pts_out_refine = self.reppoints_pts_refine_out( + self.relu(self.reppoints_pts_refine_conv(pts_feat, dcn_offset))) + if self.use_grid_points: + pts_out_refine, bbox_out_refine = self.gen_grid_from_reg( + pts_out_refine, bbox_out_init.detach()) + else: + pts_out_refine = pts_out_refine + pts_out_init.detach() + + if self.training: + return cls_out, pts_out_init, pts_out_refine + else: + return cls_out, self.points2bbox(pts_out_refine) + + def get_points(self, featmap_sizes: List[Tuple[int]], + batch_img_metas: List[dict], device: str) -> tuple: + """Get points according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + + Returns: + tuple: points of each image, valid flags of each image + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # points center for one time + multi_level_points = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + points_list = [[point.clone() for point in multi_level_points] + for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level grids + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device=device) + valid_flag_list.append(multi_level_flags) + + return points_list, valid_flag_list + + def centers_to_bboxes(self, point_list: List[Tensor]) -> List[Tensor]: + """Get bboxes according to center points. + + Only used in :class:`MaxIoUAssigner`. 
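+
+        Example (a toy sketch of the expansion done below; the point and
+        the stride values are made up):
+            >>> import torch
+            >>> point = torch.tensor([[16., 16.]])
+            >>> scale = 4 * 8 * 0.5  # point_base_scale * stride * 0.5
+            >>> shift = torch.tensor([-scale, -scale, scale, scale]).view(1, 4)
+            >>> torch.cat([point, point], dim=1) + shift
+            tensor([[ 0.,  0., 32., 32.]])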
+ """ + bbox_list = [] + for i_img, point in enumerate(point_list): + bbox = [] + for i_lvl in range(len(self.point_strides)): + scale = self.point_base_scale * self.point_strides[i_lvl] * 0.5 + bbox_shift = torch.Tensor([-scale, -scale, scale, + scale]).view(1, 4).type_as(point[0]) + bbox_center = torch.cat( + [point[i_lvl][:, :2], point[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + bbox_shift) + bbox_list.append(bbox) + return bbox_list + + def offset_to_pts(self, center_list: List[Tensor], + pred_list: List[Tensor]) -> List[Tensor]: + """Change from point offset to point coordinate.""" + pts_list = [] + for i_lvl in range(len(self.point_strides)): + pts_lvl = [] + for i_img in range(len(center_list)): + pts_center = center_list[i_img][i_lvl][:, :2].repeat( + 1, self.num_points) + pts_shift = pred_list[i_lvl][i_img] + yx_pts_shift = pts_shift.permute(1, 2, 0).view( + -1, 2 * self.num_points) + y_pts_shift = yx_pts_shift[..., 0::2] + x_pts_shift = yx_pts_shift[..., 1::2] + xy_pts_shift = torch.stack([x_pts_shift, y_pts_shift], -1) + xy_pts_shift = xy_pts_shift.view(*yx_pts_shift.shape[:-1], -1) + pts = xy_pts_shift * self.point_strides[i_lvl] + pts_center + pts_lvl.append(pts) + pts_lvl = torch.stack(pts_lvl, 0) + pts_list.append(pts_lvl) + return pts_list + + def _get_targets_single(self, + flat_proposals: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + gt_instances_ignore: InstanceData, + stage: str = 'init', + unmap_outputs: bool = True) -> tuple: + """Compute corresponding GT box and classification targets for + proposals. + + Args: + flat_proposals (Tensor): Multi level points of a image. + valid_flags (Tensor): Multi level valid flags of a image. + gt_instances (InstanceData): It usually includes ``bboxes`` and + ``labels`` attributes. + gt_instances_ignore (InstanceData): It includes ``bboxes`` + attribute data that is ignored during training and testing. + stage (str): 'init' or 'refine'. Generate target for + init stage or refine stage. Defaults to 'init'. + unmap_outputs (bool): Whether to map outputs back to + the original set of anchors. Defaults to True. + + Returns: + tuple: + + - labels (Tensor): Labels of each level. + - label_weights (Tensor): Label weights of each level. + - bbox_targets (Tensor): BBox targets of each level. + - bbox_weights (Tensor): BBox weights of each level. + - pos_inds (Tensor): positive samples indexes. + - neg_inds (Tensor): negative samples indexes. + - sampling_result (:obj:`SamplingResult`): Sampling results. + """ + inside_flags = valid_flags + if not inside_flags.any(): + raise ValueError( + 'There is no valid proposal inside the image boundary. 
Please ' + 'check the image size.') + # assign gt and sample proposals + proposals = flat_proposals[inside_flags, :] + pred_instances = InstanceData(priors=proposals) + + if stage == 'init': + assigner = self.init_assigner + pos_weight = self.train_cfg['init']['pos_weight'] + else: + assigner = self.refine_assigner + pos_weight = self.train_cfg['refine']['pos_weight'] + + assign_result = assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_proposals = proposals.shape[0] + bbox_gt = proposals.new_zeros([num_valid_proposals, 4]) + pos_proposals = torch.zeros_like(proposals) + proposals_weights = proposals.new_zeros([num_valid_proposals, 4]) + labels = proposals.new_full((num_valid_proposals, ), + self.num_classes, + dtype=torch.long) + label_weights = proposals.new_zeros( + num_valid_proposals, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + bbox_gt[pos_inds, :] = sampling_result.pos_gt_bboxes + pos_proposals[pos_inds, :] = proposals[pos_inds, :] + proposals_weights[pos_inds, :] = 1.0 + + labels[pos_inds] = sampling_result.pos_gt_labels + if pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of proposals + if unmap_outputs: + num_total_proposals = flat_proposals.size(0) + labels = unmap( + labels, + num_total_proposals, + inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_proposals, + inside_flags) + bbox_gt = unmap(bbox_gt, num_total_proposals, inside_flags) + pos_proposals = unmap(pos_proposals, num_total_proposals, + inside_flags) + proposals_weights = unmap(proposals_weights, num_total_proposals, + inside_flags) + + return (labels, label_weights, bbox_gt, pos_proposals, + proposals_weights, pos_inds, neg_inds, sampling_result) + + def get_targets(self, + proposals_list: List[Tensor], + valid_flag_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + stage: str = 'init', + unmap_outputs: bool = True, + return_sampling_results: bool = False) -> tuple: + """Compute corresponding GT box and classification targets for + proposals. + + Args: + proposals_list (list[Tensor]): Multi level points/bboxes of each + image. + valid_flag_list (list[Tensor]): Multi level valid flags of each + image. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + stage (str): 'init' or 'refine'. Generate target for init stage or + refine stage. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + return_sampling_results (bool): Whether to return the sampling + results. Defaults to False. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_gt_list (list[Tensor]): Ground truth bbox of each level. 
+            - proposals_list (list[Tensor]): Proposals (points/bboxes) of
+              each level.
+            - proposal_weights_list (list[Tensor]): Proposal weights of
+              each level.
+            - avg_factor (int): Average factor that is used to average
+              the loss. When using sampling method, avg_factor is usually
+              the sum of positive and negative priors. When using
+              `PseudoSampler`, `avg_factor` is usually equal to the number
+              of positive priors.
+        """
+        assert stage in ['init', 'refine']
+        num_imgs = len(batch_img_metas)
+        assert len(proposals_list) == len(valid_flag_list) == num_imgs
+
+        # points number of multi levels
+        num_level_proposals = [points.size(0) for points in proposals_list[0]]
+
+        # concat all level points and flags to a single tensor
+        for i in range(num_imgs):
+            assert len(proposals_list[i]) == len(valid_flag_list[i])
+            proposals_list[i] = torch.cat(proposals_list[i])
+            valid_flag_list[i] = torch.cat(valid_flag_list[i])
+
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None] * num_imgs
+
+        (all_labels, all_label_weights, all_bbox_gt, all_proposals,
+         all_proposal_weights, pos_inds_list, neg_inds_list,
+         sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             proposals_list,
+             valid_flag_list,
+             batch_gt_instances,
+             batch_gt_instances_ignore,
+             stage=stage,
+             unmap_outputs=unmap_outputs)
+
+        # sum of the sampling average factors over all images
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        labels_list = images_to_levels(all_labels, num_level_proposals)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_proposals)
+        bbox_gt_list = images_to_levels(all_bbox_gt, num_level_proposals)
+        proposals_list = images_to_levels(all_proposals, num_level_proposals)
+        proposal_weights_list = images_to_levels(all_proposal_weights,
+                                                 num_level_proposals)
+        res = (labels_list, label_weights_list, bbox_gt_list, proposals_list,
+               proposal_weights_list, avg_factor)
+        if return_sampling_results:
+            res = res + (sampling_results_list, )
+
+        return res
+
+    def loss_by_feat_single(self, cls_score: Tensor, pts_pred_init: Tensor,
+                            pts_pred_refine: Tensor, labels: Tensor,
+                            label_weights: Tensor, bbox_gt_init: Tensor,
+                            bbox_weights_init: Tensor, bbox_gt_refine: Tensor,
+                            bbox_weights_refine: Tensor, stride: int,
+                            avg_factor_init: int,
+                            avg_factor_refine: int) -> Tuple[Tensor]:
+        """Calculate the loss of a single scale level based on the features
+        extracted by the detection head.
+
+        Args:
+            cls_score (Tensor): Box scores for each scale level, has shape
+                (N, num_classes, h_i, w_i).
+            pts_pred_init (Tensor): Points of shape
+                (batch_size, h_i * w_i, num_points * 2).
+            pts_pred_refine (Tensor): Points refined of shape
+                (batch_size, h_i * w_i, num_points * 2).
+            labels (Tensor): Ground truth class indices with shape
+                (batch_size, h_i * w_i).
+            label_weights (Tensor): Label weights of shape
+                (batch_size, h_i * w_i).
+            bbox_gt_init (Tensor): BBox regression targets in the init stage
+                of shape (batch_size, h_i * w_i, 4).
+            bbox_weights_init (Tensor): BBox regression loss weights in the
+                init stage of shape (batch_size, h_i * w_i, 4).
+            bbox_gt_refine (Tensor): BBox regression targets in the refine
+                stage of shape (batch_size, h_i * w_i, 4).
+            bbox_weights_refine (Tensor): BBox regression loss weights in the
+                refine stage of shape (batch_size, h_i * w_i, 4).
+            stride (int): Point stride.
+            avg_factor_init (int): Average factor that is used to average
+                the loss in the init stage.
+ avg_factor_refine (int): Average factor that is used to average + the loss in the refine stage. + + Returns: + Tuple[Tensor]: loss components. + """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + cls_score = cls_score.contiguous() + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor_refine) + + # points loss + bbox_gt_init = bbox_gt_init.reshape(-1, 4) + bbox_weights_init = bbox_weights_init.reshape(-1, 4) + bbox_pred_init = self.points2bbox( + pts_pred_init.reshape(-1, 2 * self.num_points), y_first=False) + bbox_gt_refine = bbox_gt_refine.reshape(-1, 4) + bbox_weights_refine = bbox_weights_refine.reshape(-1, 4) + bbox_pred_refine = self.points2bbox( + pts_pred_refine.reshape(-1, 2 * self.num_points), y_first=False) + normalize_term = self.point_base_scale * stride + loss_pts_init = self.loss_bbox_init( + bbox_pred_init / normalize_term, + bbox_gt_init / normalize_term, + bbox_weights_init, + avg_factor=avg_factor_init) + loss_pts_refine = self.loss_bbox_refine( + bbox_pred_refine / normalize_term, + bbox_gt_refine / normalize_term, + bbox_weights_refine, + avg_factor=avg_factor_refine) + return loss_cls, loss_pts_init, loss_pts_refine + + def loss_by_feat( + self, + cls_scores: List[Tensor], + pts_preds_init: List[Tensor], + pts_preds_refine: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, Tensor]: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, of shape (batch_size, num_classes, h, w). + pts_preds_init (list[Tensor]): Points for each scale level, each is + a 3D-tensor, of shape (batch_size, h_i * w_i, num_points * 2). + pts_preds_refine (list[Tensor]): Points refined for each scale + level, each is a 3D-tensor, of shape + (batch_size, h_i * w_i, num_points * 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + device = cls_scores[0].device + + # target for initial stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + batch_img_metas, device) + pts_coordinate_preds_init = self.offset_to_pts(center_list, + pts_preds_init) + if self.train_cfg['init']['assigner']['type'] == 'PointAssigner': + # Assign target for center list + candidate_list = center_list + else: + # transform center list to bbox list and + # assign target for bbox list + bbox_list = self.centers_to_bboxes(center_list) + candidate_list = bbox_list + cls_reg_targets_init = self.get_targets( + proposals_list=candidate_list, + valid_flag_list=valid_flag_list, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + stage='init', + return_sampling_results=False) + (*_, bbox_gt_list_init, candidate_list_init, bbox_weights_list_init, + avg_factor_init) = cls_reg_targets_init + + # target for refinement stage + center_list, valid_flag_list = self.get_points(featmap_sizes, + batch_img_metas, device) + pts_coordinate_preds_refine = self.offset_to_pts( + center_list, pts_preds_refine) + bbox_list = [] + for i_img, center in enumerate(center_list): + bbox = [] + for i_lvl in range(len(pts_preds_refine)): + bbox_preds_init = self.points2bbox( + pts_preds_init[i_lvl].detach()) + bbox_shift = bbox_preds_init * self.point_strides[i_lvl] + bbox_center = torch.cat( + [center[i_lvl][:, :2], center[i_lvl][:, :2]], dim=1) + bbox.append(bbox_center + + bbox_shift[i_img].permute(1, 2, 0).reshape(-1, 4)) + bbox_list.append(bbox) + cls_reg_targets_refine = self.get_targets( + proposals_list=bbox_list, + valid_flag_list=valid_flag_list, + batch_gt_instances=batch_gt_instances, + batch_img_metas=batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + stage='refine', + return_sampling_results=False) + (labels_list, label_weights_list, bbox_gt_list_refine, + candidate_list_refine, bbox_weights_list_refine, + avg_factor_refine) = cls_reg_targets_refine + + # compute loss + losses_cls, losses_pts_init, losses_pts_refine = multi_apply( + self.loss_by_feat_single, + cls_scores, + pts_coordinate_preds_init, + pts_coordinate_preds_refine, + labels_list, + label_weights_list, + bbox_gt_list_init, + bbox_weights_list_init, + bbox_gt_list_refine, + bbox_weights_list_refine, + self.point_strides, + avg_factor_init=avg_factor_init, + avg_factor_refine=avg_factor_refine) + loss_dict_all = { + 'loss_cls': losses_cls, + 'loss_pts_init': losses_pts_init, + 'loss_pts_refine': losses_pts_refine + } + return loss_dict_all + + # Same as base_dense_head/_get_bboxes_single except self._bbox_decode + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform outputs of a single image into bbox predictions. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image. RepPoints head does not need + this value. 
+ mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, has shape + (num_priors, 2). + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict`): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_score_list) == len(bbox_pred_list) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for level_idx, (cls_score, bbox_pred, priors) in enumerate( + zip(cls_score_list, bbox_pred_list, mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, _, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + + bboxes = self._bbox_decode(priors, bbox_pred, + self.point_strides[level_idx], + img_shape) + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def _bbox_decode(self, points: Tensor, bbox_pred: Tensor, stride: int, + max_shape: Tuple[int, int]) -> Tensor: + """Decode the prediction to bounding box. + + Args: + points (Tensor): shape (h_i * w_i, 2). + bbox_pred (Tensor): shape (h_i * w_i, 4). + stride (int): Stride for bbox_pred in different level. + max_shape (Tuple[int, int]): image shape. + + Returns: + Tensor: Bounding boxes decoded. 
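+
+        Example (a toy decode with made-up values, mirroring the
+        computation below):
+            >>> import torch
+            >>> points = torch.tensor([[8., 8.]])
+            >>> bbox_pred = torch.tensor([[-1., -1., 1., 1.]])
+            >>> stride = 8
+            >>> center = torch.cat([points[:, :2], points[:, :2]], dim=1)
+            >>> bbox_pred * stride + center
+            tensor([[ 0.,  0., 16., 16.]])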
+ """ + bbox_pos_center = torch.cat([points[:, :2], points[:, :2]], dim=1) + bboxes = bbox_pred * stride + bbox_pos_center + x1 = bboxes[:, 0].clamp(min=0, max=max_shape[1]) + y1 = bboxes[:, 1].clamp(min=0, max=max_shape[0]) + x2 = bboxes[:, 2].clamp(min=0, max=max_shape[1]) + y2 = bboxes[:, 3].clamp(min=0, max=max_shape[0]) + decoded_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return decoded_bboxes diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/retina_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/retina_head.py new file mode 100644 index 0000000000000000000000000000000000000000..be3ae74d81ba38609646f0d0406098ecbdcef688 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/retina_head.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmdet.registry import MODELS +from .anchor_head import AnchorHead + + +@MODELS.register_module() +class RetinaHead(AnchorHead): + r"""An anchor-based head used in `RetinaNet + `_. + + The head contains two subnetworks. The first classifies anchor boxes and + the second regresses deltas for the anchors. + + Example: + >>> import torch + >>> self = RetinaHead(11, 7) + >>> x = torch.rand(1, 7, 32, 32) + >>> cls_score, bbox_pred = self.forward_single(x) + >>> # Each anchor predicts a score for each class except background + >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors + >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors + >>> assert cls_per_anchor == (self.num_classes) + >>> assert box_per_anchor == 4 + """ + + def __init__(self, + num_classes, + in_channels, + stacked_convs=4, + conv_cfg=None, + norm_cfg=None, + anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + init_cfg=dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='retina_cls', + std=0.01, + bias_prob=0.01)), + **kwargs): + assert stacked_convs >= 0, \ + '`stacked_convs` must be non-negative integers, ' \ + f'but got {stacked_convs} instead.' + self.stacked_convs = stacked_convs + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super(RetinaHead, self).__init__( + num_classes, + in_channels, + anchor_generator=anchor_generator, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + in_channels = self.in_channels + for i in range(self.stacked_convs): + self.cls_convs.append( + ConvModule( + in_channels, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.reg_convs.append( + ConvModule( + in_channels, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + in_channels = self.feat_channels + self.retina_cls = nn.Conv2d( + in_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + reg_dim = self.bbox_coder.encode_size + self.retina_reg = nn.Conv2d( + in_channels, self.num_base_priors * reg_dim, 3, padding=1) + + def forward_single(self, x): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. 
+                bbox_pred (Tensor): Box energies / deltas for a single scale
+                    level, the number of channels is num_anchors * 4.
+        """
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.retina_cls(cls_feat)
+        bbox_pred = self.retina_reg(reg_feat)
+        return cls_score, bbox_pred
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/retina_sepbn_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/retina_sepbn_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..681a39983a08670adaa3e24a4099c4f26bc967ce
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/retina_sepbn_head.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import bias_init_with_prob, normal_init
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import OptConfigType, OptMultiConfig
+from .anchor_head import AnchorHead
+
+
+@MODELS.register_module()
+class RetinaSepBNHead(AnchorHead):
+    """RetinaHead with separate BN.
+
+    In RetinaHead, conv/norm layers are shared across different FPN levels,
+    while in RetinaSepBNHead, conv layers are shared across different FPN
+    levels, but BN layers are separated.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 num_ins: int,
+                 in_channels: int,
+                 stacked_convs: int = 4,
+                 conv_cfg: OptConfigType = None,
+                 norm_cfg: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None,
+                 **kwargs) -> None:
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+            'behavior, init_cfg is not allowed to be set'
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.num_ins = num_ins
+        super().__init__(
+            num_classes=num_classes,
+            in_channels=in_channels,
+            init_cfg=init_cfg,
+            **kwargs)
+
+    def _init_layers(self) -> None:
+        """Initialize layers of the head."""
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.num_ins):
+            cls_convs = nn.ModuleList()
+            reg_convs = nn.ModuleList()
+            for j in range(self.stacked_convs):
+                chn = self.in_channels if j == 0 else self.feat_channels
+                cls_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+                reg_convs.append(
+                    ConvModule(
+                        chn,
+                        self.feat_channels,
+                        3,
+                        stride=1,
+                        padding=1,
+                        conv_cfg=self.conv_cfg,
+                        norm_cfg=self.norm_cfg))
+            self.cls_convs.append(cls_convs)
+            self.reg_convs.append(reg_convs)
+        for i in range(self.stacked_convs):
+            for j in range(1, self.num_ins):
+                self.cls_convs[j][i].conv = self.cls_convs[0][i].conv
+                self.reg_convs[j][i].conv = self.reg_convs[0][i].conv
+        self.retina_cls = nn.Conv2d(
+            self.feat_channels,
+            self.num_base_priors * self.cls_out_channels,
+            3,
+            padding=1)
+        self.retina_reg = nn.Conv2d(
+            self.feat_channels, self.num_base_priors * 4, 3, padding=1)
+
+    def init_weights(self) -> None:
+        """Initialize weights of the head."""
+        super().init_weights()
+        for m in self.cls_convs[0]:
+            normal_init(m.conv, std=0.01)
+        for m in self.reg_convs[0]:
+            normal_init(m.conv, std=0.01)
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.retina_cls, std=0.01, bias=bias_cls)
+        normal_init(self.retina_reg, std=0.01)
+
+    def forward(self, feats: Tuple[Tensor]) -> tuple:
+        """Forward features from the
upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + + - cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + cls_scores = [] + bbox_preds = [] + for i, x in enumerate(feats): + cls_feat = feats[i] + reg_feat = feats[i] + for cls_conv in self.cls_convs[i]: + cls_feat = cls_conv(cls_feat) + for reg_conv in self.reg_convs[i]: + reg_feat = reg_conv(reg_feat) + cls_score = self.retina_cls(cls_feat) + bbox_pred = self.retina_reg(reg_feat) + cls_scores.append(cls_score) + bbox_preds.append(bbox_pred) + return cls_scores, bbox_preds diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/rpn_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/rpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6b544009d2ffc4c3c9065707a0a8a72c577eb432 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/rpn_head.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.ops import batched_nms +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import (cat_boxes, empty_box_as, get_box_tensor, + get_box_wh, scale_boxes) +from mmdet.utils import InstanceList, MultiConfig, OptInstanceList +from .anchor_head import AnchorHead + + +@MODELS.register_module() +class RPNHead(AnchorHead): + """Implementation of RPN head. + + Args: + in_channels (int): Number of channels in the input feature map. + num_classes (int): Number of categories excluding the background + category. Defaults to 1. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or \ + list[dict]): Initialization config dict. + num_convs (int): Number of convolution layers in the head. + Defaults to 1. + """ # noqa: W605 + + def __init__(self, + in_channels: int, + num_classes: int = 1, + init_cfg: MultiConfig = dict( + type='Normal', layer='Conv2d', std=0.01), + num_convs: int = 1, + **kwargs) -> None: + self.num_convs = num_convs + assert num_classes == 1 + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + if self.num_convs > 1: + rpn_convs = [] + for i in range(self.num_convs): + if i == 0: + in_channels = self.in_channels + else: + in_channels = self.feat_channels + # use ``inplace=False`` to avoid error: one of the variables + # needed for gradient computation has been modified by an + # inplace operation. 
+ rpn_convs.append( + ConvModule( + in_channels, + self.feat_channels, + 3, + padding=1, + inplace=False)) + self.rpn_conv = nn.Sequential(*rpn_convs) + else: + self.rpn_conv = nn.Conv2d( + self.in_channels, self.feat_channels, 3, padding=1) + self.rpn_cls = nn.Conv2d(self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 1) + reg_dim = self.bbox_coder.encode_size + self.rpn_reg = nn.Conv2d(self.feat_channels, + self.num_base_priors * reg_dim, 1) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_base_priors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_base_priors * 4. + """ + x = self.rpn_conv(x) + x = F.relu(x) + rpn_cls_score = self.rpn_cls(x) + rpn_bbox_pred = self.rpn_reg(x) + return rpn_cls_score, rpn_bbox_pred + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) \ + -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[obj:InstanceData]): Batch of gt_instance. + It usually includes ``bboxes`` and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[obj:InstanceData], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + losses = super().loss_by_feat( + cls_scores, + bbox_preds, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + return dict( + loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox']) + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Be compatible with + BaseDenseHead. Not used in RPNHead. + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (ConfigDict, optional): Test / postprocessing configuration, + if None, test_cfg would be used. 
+ rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + level_ids = [] + for level_idx, (cls_score, bbox_pred, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, + mlvl_priors)): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + reg_dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, reg_dim) + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0] since mmdet v2.0 + # BG cat_id: 1 + scores = cls_score.softmax(-1)[:, :-1] + + scores = torch.squeeze(scores) + if 0 < nms_pre < scores.shape[0]: + # sort is faster than topk + # _, topk_inds = scores.topk(cfg.nms_pre) + ranked_scores, rank_inds = scores.sort(descending=True) + topk_inds = rank_inds[:nms_pre] + scores = ranked_scores[:nms_pre] + bbox_pred = bbox_pred[topk_inds, :] + priors = priors[topk_inds] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + + # use level id to implement the separate level nms + level_ids.append( + scores.new_full((scores.size(0), ), + level_idx, + dtype=torch.long)) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_priors) + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.scores = torch.cat(mlvl_scores) + results.level_ids = torch.cat(level_ids) + + return self._bbox_post_process( + results=results, cfg=cfg, rescale=rescale, img_meta=img_meta) + + def _bbox_post_process(self, + results: InstanceData, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + cfg (ConfigDict): Test / postprocessing configuration. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
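Aside on the `nms_pre` selection in `_predict_by_feat_single` above: scores are sorted once in descending order instead of calling `topk`, and each kept box is tagged with its pyramid level so `batched_nms` can suppress boxes per level. A minimal sketch of those two steps (tensor sizes invented for illustration; not part of this patch):

    # Sketch of per-level pre-NMS selection with random scores.
    import torch

    nms_pre = 1000
    scores = torch.rand(25000)        # flattened scores for one FPN level
    bbox_pred = torch.rand(25000, 4)  # matching box deltas

    if 0 < nms_pre < scores.shape[0]:
        # Sorting once and slicing is equivalent to topk here and, as the
        # comment in the diff notes, tends to be faster on large inputs.
        ranked_scores, rank_inds = scores.sort(descending=True)
        topk_inds = rank_inds[:nms_pre]
        scores = ranked_scores[:nms_pre]
        bbox_pred = bbox_pred[topk_inds, :]

    # level_ids lets batched_nms treat each pyramid level as its own
    # "class", so boxes from different levels never suppress each other.
    level_idx = 2
    level_ids = scores.new_full((scores.size(0), ), level_idx,
                                dtype=torch.long)
    assert scores.shape[0] == nms_pre and level_ids.unique().numel() == 1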
+ """ + assert with_nms, '`with_nms` must be True in RPNHead' + if rescale: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.level_ids, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + # TODO: This would unreasonably show the 0th class label + # in visualization + results.labels = results.scores.new_zeros( + len(results), dtype=torch.long) + del results.level_ids + else: + # To avoid some potential error + results_ = InstanceData() + results_.bboxes = empty_box_as(results.bboxes) + results_.scores = results.scores.new_zeros(0) + results_.labels = results.scores.new_zeros(0) + results = results_ + return results diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/rtmdet_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/rtmdet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ae0ee6d2f35a0fa46ba0b8de21054433d0420b65 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/rtmdet_head.py @@ -0,0 +1,692 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Scale, is_norm +from mmengine.model import bias_init_with_prob, constant_init, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean +from ..layers.transformer import inverse_sigmoid +from ..task_modules import anchor_inside_flags +from ..utils import (images_to_levels, multi_apply, sigmoid_geometric_mean, + unmap) +from .atss_head import ATSSHead + + +@MODELS.register_module() +class RTMDetHead(ATSSHead): + """Detection Head of RTMDet. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + with_objectness (bool): Whether to add an objectness branch. + Defaults to True. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. 
+ Default: dict(type='ReLU') + """ + + def __init__(self, + num_classes: int, + in_channels: int, + with_objectness: bool = True, + act_cfg: ConfigType = dict(type='ReLU'), + **kwargs) -> None: + self.act_cfg = act_cfg + self.with_objectness = with_objectness + super().__init__(num_classes, in_channels, **kwargs) + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + + def _init_layers(self): + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + pred_pad_size = self.pred_kernel_size // 2 + self.rtm_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size) + self.rtm_reg = nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size) + if self.with_objectness: + self.rtm_obj = nn.Conv2d( + self.feat_channels, + 1, + self.pred_kernel_size, + padding=pred_pad_size) + + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.rtm_cls, std=0.01, bias=bias_cls) + normal_init(self.rtm_reg, std=0.01) + if self.with_objectness: + normal_init(self.rtm_obj, std=0.01, bias=bias_cls) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + """ + + cls_scores = [] + bbox_preds = [] + for idx, (x, scale, stride) in enumerate( + zip(feats, self.scales, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj(reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + + reg_dist = scale(self.rtm_reg(reg_feat).exp()).float() * stride[0] + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + return tuple(cls_scores), tuple(bbox_preds) + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + labels: Tensor, label_weights: Tensor, + bbox_targets: Tensor, assign_metrics: Tensor, + stride: List[int]): + """Compute loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). 
+ bbox_pred (Tensor): Decoded bboxes for each scale + level with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors). + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + assign_metrics (Tensor): Assign metrics with shape + (N, num_total_anchors). + stride (List[int]): Downsample stride of the feature map. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.reshape(-1, 4) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + assign_metrics = assign_metrics.reshape(-1) + label_weights = label_weights.reshape(-1) + targets = (labels, assign_metrics) + + loss_cls = self.loss_cls( + cls_score, targets, label_weights, avg_factor=1.0) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + + pos_decode_bbox_pred = pos_bbox_pred + pos_decode_bbox_targets = pos_bbox_targets + + # regression loss + pos_bbox_weight = assign_metrics[pos_inds] + + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=pos_bbox_weight, + avg_factor=1.0) + else: + loss_bbox = bbox_pred.sum() * 0 + pos_bbox_weight = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, assign_metrics.sum(), pos_bbox_weight.sum() + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
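Aside on the target layout in `loss_by_feat_single` above: classification targets are the pair `(labels, assign_metrics)`, hard labels plus soft quality scores, as consumed by a quality-aware loss (typically QualityFocalLoss in RTMDet configs), and the same metrics reweight the bbox loss of each positive. A small sketch with made-up numbers; foreground labels live in `[0, num_classes - 1]`, background is `num_classes`:

    import torch

    num_classes = 3
    labels = torch.tensor([0, 2, 3, 3, 1])   # 3 == background here
    assign_metrics = torch.tensor([0.8, 0.5, 0.0, 0.0, 0.9])

    # Positive anchors are exactly those whose label is a foreground class.
    bg_class_ind = num_classes
    pos_inds = ((labels >= 0) & (labels < bg_class_ind)).nonzero().squeeze(1)
    assert pos_inds.tolist() == [0, 1, 4]

    # Hard label + soft quality score per anchor; the metric also becomes
    # the per-positive bbox loss weight.
    targets = (labels, assign_metrics)
    pos_bbox_weight = assign_metrics[pos_inds]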
+ """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1) + decoded_bboxes = [] + for anchor, bbox_pred in zip(anchor_list[0], bbox_preds): + anchor = anchor.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + bbox_pred = distance2bbox(anchor, bbox_pred) + decoded_bboxes.append(bbox_pred) + + flatten_bboxes = torch.cat(decoded_bboxes, 1) + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bboxes, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + assign_metrics_list, sampling_results_list) = cls_reg_targets + + losses_cls, losses_bbox,\ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_by_feat_single, + cls_scores, + decoded_bboxes, + labels_list, + label_weights_list, + bbox_targets_list, + assign_metrics_list, + self.prior_generator.strides) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + def get_targets(self, + cls_scores: Tensor, + bbox_preds: Tensor, + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs=True): + """Compute regression and classification targets for anchors in + multiple images. + + Args: + cls_scores (Tensor): Classification predictions of images, + a 3D-Tensor with shape [num_imgs, num_priors, num_classes]. + bbox_preds (Tensor): Decoded bboxes predictions of one image, + a 3D-Tensor with shape [num_imgs, num_priors, 4] in [tl_x, + tl_y, br_x, br_y] format. + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: a tuple containing learning targets. + + - anchors_list (list[list[Tensor]]): Anchors of each level. 
+ - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - assign_metrics_list (list[Tensor]): alignment metrics of each + level. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + # anchor_list: list(b * [-1, 4]) + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics, sampling_results_list) = multi_apply( + self._get_targets_single, + cls_scores.detach(), + bbox_preds.detach(), + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + assign_metrics_list = images_to_levels(all_assign_metrics, + num_level_anchors) + + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, assign_metrics_list, sampling_results_list) + + def _get_targets_single(self, + cls_scores: Tensor, + bbox_preds: Tensor, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs=True): + """Compute regression, classification targets for anchors in a single + image. + + Args: + cls_scores (list(Tensor)): Box scores for each image. + bbox_preds (list(Tensor)): Box energies / deltas for each image. + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. Defaults to True. + + Returns: + tuple: N is the number of total anchors in the image. + + - anchors (Tensor): All anchors in the image with shape (N, 4). + - labels (Tensor): Labels of all anchors in the image with shape + (N,). + - label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + - bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). 
+ - norm_alignment_metrics (Tensor): Normalized alignment metrics + of all priors in the image with shape (N,). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + pred_instances = InstanceData( + scores=cls_scores[inside_flags, :], + bboxes=bbox_preds[inside_flags, :], + priors=anchors) + + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + assign_metrics = anchors.new_zeros( + num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + # point-based + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + class_assigned_gt_inds = torch.unique( + sampling_result.pos_assigned_gt_inds) + for gt_inds in class_assigned_gt_inds: + gt_class_inds = pos_inds[sampling_result.pos_assigned_gt_inds == + gt_inds] + assign_metrics[gt_class_inds] = assign_result.max_overlaps[ + gt_class_inds] + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + assign_metrics = unmap(assign_metrics, num_total_anchors, + inside_flags) + return (anchors, labels, label_weights, bbox_targets, assign_metrics, + sampling_result) + + def get_anchors(self, + featmap_sizes: List[tuple], + batch_img_metas: List[dict], + device: Union[torch.device, str] = 'cuda') \ + -> Tuple[List[List[Tensor]], List[List[Tensor]]]: + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (torch.device or str): Device for returned tensors. + Defaults to cuda. + + Returns: + tuple: + + - anchor_list (list[list[Tensor]]): Anchors of each image. + - valid_flag_list (list[list[Tensor]]): Valid flags of each + image. 
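Aside on the `unmap` calls in `_get_targets_single` above: targets are computed only for the anchors inside the image, then scattered back to the full anchor set. A simplified 1-D sketch of that helper (`unmap_1d` is an illustrative stand-in, not the `unmap` imported by this file):

    import torch

    def unmap_1d(data, count, inds, fill=0):
        # Scatter per-valid-anchor values back to the full anchor set,
        # filling the filtered-out slots with `fill`.
        ret = data.new_full((count, ), fill)
        ret[inds] = data
        return ret

    num_total_anchors = 6
    inside_flags = torch.tensor([True, True, False, True, False, True])
    labels_inside = torch.tensor([2, 0, 1, 2])  # one entry per inside anchor
    num_classes = 3                             # background label as fill

    labels = unmap_1d(labels_inside, num_total_anchors, inside_flags,
                      fill=num_classes)
    assert labels.tolist() == [2, 0, 3, 1, 3, 2]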
+ """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=device, with_stride=True) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device) + valid_flag_list.append(multi_level_flags) + return anchor_list, valid_flag_list + + +@MODELS.register_module() +class RTMDetSepBNHead(RTMDetHead): + """RTMDetHead with separated BN layers and shared conv layers. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution in + head. Defaults to False. + norm_cfg (:obj:`ConfigDict` or dict)): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict)): Config dict for activation layer. + Defaults to dict(type='SiLU'). + pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. + """ + + def __init__(self, + num_classes: int, + in_channels: int, + share_conv: bool = True, + use_depthwise: bool = False, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + pred_kernel_size: int = 1, + exp_on_reg=False, + **kwargs) -> None: + self.share_conv = share_conv + self.exp_on_reg = exp_on_reg + self.use_depthwise = use_depthwise + super().__init__( + num_classes, + in_channels, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + pred_kernel_size=pred_kernel_size, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + if self.with_objectness: + self.rtm_obj = nn.ModuleList() + for n in range(len(self.prior_generator.strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(reg_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + if self.with_objectness: + self.rtm_obj.append( + nn.Conv2d( + self.feat_channels, + 1, + self.pred_kernel_size, + padding=self.pred_kernel_size // 2)) + + if self.share_conv: + for n in range(len(self.prior_generator.strides)): + for i in range(self.stacked_convs): + 
self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg in zip(self.rtm_cls, self.rtm_reg): + normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01) + if self.with_objectness: + for rtm_obj in self.rtm_obj: + normal_init(rtm_obj, std=0.01, bias=bias_cls) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + + - cls_scores (tuple[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + - bbox_preds (tuple[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * 4. + """ + + cls_scores = [] + bbox_preds = [] + for idx, (x, stride) in enumerate( + zip(feats, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj[idx](reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + if self.exp_on_reg: + reg_dist = self.rtm_reg[idx](reg_feat).exp() * stride[0] + else: + reg_dist = self.rtm_reg[idx](reg_feat) * stride[0] + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + return tuple(cls_scores), tuple(bbox_preds) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/rtmdet_ins_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/rtmdet_ins_head.py new file mode 100644 index 0000000000000000000000000000000000000000..261a57fe485245dcbe41696c9237258f829ca25a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/rtmdet_ins_head.py @@ -0,0 +1,1034 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, is_norm +from mmcv.ops import batched_nms +from mmengine.model import (BaseModule, bias_init_with_prob, constant_init, + normal_init) +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers.transformer import inverse_sigmoid +from mmdet.models.utils import (filter_scores_and_topk, multi_apply, + select_single_mlvl, sigmoid_geometric_mean) +from mmdet.registry import MODELS +from mmdet.structures.bbox import (cat_boxes, distance2bbox, get_box_tensor, + get_box_wh, scale_boxes) +from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean +from .rtmdet_head import RTMDetHead + + +@MODELS.register_module() +class RTMDetInsHead(RTMDetHead): + """Detection Head of RTMDet-Ins. + + Args: + num_prototypes (int): Number of mask prototype features extracted + from the mask head. Defaults to 8. + dyconv_channels (int): Channel of the dynamic conv layers. + Defaults to 8. + num_dyconvs (int): Number of the dynamic convolution layers. + Defaults to 3. 
+ mask_loss_stride (int): Down sample stride of the masks for loss + computation. Defaults to 4. + loss_mask (:obj:`ConfigDict` or dict): Config dict for mask loss. + """ + + def __init__(self, + *args, + num_prototypes: int = 8, + dyconv_channels: int = 8, + num_dyconvs: int = 3, + mask_loss_stride: int = 4, + loss_mask=dict( + type='DiceLoss', + loss_weight=2.0, + eps=5e-6, + reduction='mean'), + **kwargs) -> None: + self.num_prototypes = num_prototypes + self.num_dyconvs = num_dyconvs + self.dyconv_channels = dyconv_channels + self.mask_loss_stride = mask_loss_stride + super().__init__(*args, **kwargs) + self.loss_mask = MODELS.build(loss_mask) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super()._init_layers() + # a branch to predict kernels of dynamic convs + self.kernel_convs = nn.ModuleList() + # calculate num dynamic parameters + weight_nums, bias_nums = [], [] + for i in range(self.num_dyconvs): + if i == 0: + weight_nums.append( + # mask prototype and coordinate features + (self.num_prototypes + 2) * self.dyconv_channels) + bias_nums.append(self.dyconv_channels * 1) + elif i == self.num_dyconvs - 1: + weight_nums.append(self.dyconv_channels * 1) + bias_nums.append(1) + else: + weight_nums.append(self.dyconv_channels * self.dyconv_channels) + bias_nums.append(self.dyconv_channels * 1) + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + pred_pad_size = self.pred_kernel_size // 2 + self.rtm_kernel = nn.Conv2d( + self.feat_channels, + self.num_gen_params, + self.pred_kernel_size, + padding=pred_pad_size) + self.mask_head = MaskFeatModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + stacked_convs=4, + num_levels=len(self.prior_generator.strides), + num_prototypes=self.num_prototypes, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale + levels, each is a 4D-tensor, the channels number is + num_gen_params. + - mask_feat (Tensor): Output feature of the mask head. Each is a + 4D-tensor, the channels number is num_prototypes. 
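Aside on the parameter bookkeeping in `_init_layers` above: `rtm_kernel` has to predict every weight and bias of the per-instance dynamic conv stack, so the channel count is derived up front. A sketch that reproduces the count for the documented defaults (the "+ 2" is the pair of relative-coordinate channels concatenated onto the mask prototypes):

    num_prototypes, dyconv_channels, num_dyconvs = 8, 8, 3

    weight_nums, bias_nums = [], []
    for i in range(num_dyconvs):
        if i == 0:                    # prototypes + 2 coord channels in
            weight_nums.append((num_prototypes + 2) * dyconv_channels)
            bias_nums.append(dyconv_channels)
        elif i == num_dyconvs - 1:    # final 1-channel mask logit out
            weight_nums.append(dyconv_channels * 1)
            bias_nums.append(1)
        else:                         # hidden 1x1 layers
            weight_nums.append(dyconv_channels * dyconv_channels)
            bias_nums.append(dyconv_channels)

    num_gen_params = sum(weight_nums) + sum(bias_nums)
    assert (weight_nums, bias_nums) == ([80, 64, 8], [8, 8, 1])
    assert num_gen_params == 169      # channels predicted by rtm_kernel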
+ """ + mask_feat = self.mask_head(feats) + + cls_scores = [] + bbox_preds = [] + kernel_preds = [] + for idx, (x, scale, stride) in enumerate( + zip(feats, self.scales, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + kernel_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls(cls_feat) + + for kernel_layer in self.kernel_convs: + kernel_feat = kernel_layer(kernel_feat) + kernel_pred = self.rtm_kernel(kernel_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj(reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + + reg_dist = scale(self.rtm_reg(reg_feat)) * stride[0] + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + kernel_preds.append(kernel_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple( + kernel_preds), mask_feat + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + kernel_preds: List[Tensor], + mask_feat: Tensor, + score_factors: Optional[List[Tensor]] = None, + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigType] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + kernel_preds (list[Tensor]): Kernel predictions of dynamic + convs for all scale levels, each is a 4D-tensor, has shape + (batch_size, num_params, H, W). + mask_feat (Tensor): Mask prototype features extracted from the + mask head, has shape (batch_size, num_prototypes, H, W). + score_factors (list[Tensor], optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Defaults to None. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, AutoAssign, etc. 
+ with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + result_list = [] + + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl( + cls_scores, img_id, detach=True) + bbox_pred_list = select_single_mlvl( + bbox_preds, img_id, detach=True) + kernel_pred_list = select_single_mlvl( + kernel_preds, img_id, detach=True) + if with_score_factors: + score_factor_list = select_single_mlvl( + score_factors, img_id, detach=True) + else: + score_factor_list = [None for _ in range(num_levels)] + + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + kernel_pred_list=kernel_pred_list, + mask_feat=mask_feat[img_id], + score_factor_list=score_factor_list, + mlvl_priors=mlvl_priors, + img_meta=img_meta, + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + kernel_pred_list: List[Tensor], + mask_feat: Tensor, + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigType, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox and mask results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + kernel_preds (list[Tensor]): Kernel predictions of dynamic + convs for all scale levels of a single image, each is a + 4D-tensor, has shape (num_params, H, W). + mask_feat (Tensor): Mask prototype features of a single image + extracted from the mask head, has shape (num_prototypes, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. 
+ with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_kernels = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + + for level_idx, (cls_score, bbox_pred, kernel_pred, + score_factor, priors) in \ + enumerate(zip(cls_score_list, bbox_pred_list, kernel_pred_list, + score_factor_list, mlvl_priors)): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + dim = self.bbox_coder.encode_size + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, dim) + if with_score_factors: + score_factor = score_factor.permute(1, 2, + 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + kernel_pred = kernel_pred.permute(1, 2, 0).reshape( + -1, self.num_gen_params) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + score_thr = cfg.get('score_thr', 0) + + results = filter_scores_and_topk( + scores, score_thr, nms_pre, + dict( + bbox_pred=bbox_pred, + priors=priors, + kernel_pred=kernel_pred)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results['bbox_pred'] + priors = filtered_results['priors'] + kernel_pred = filtered_results['kernel_pred'] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + mlvl_kernels.append(kernel_pred) + + if with_score_factors: + mlvl_score_factors.append(score_factor) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = cat_boxes(mlvl_valid_priors) + bboxes = self.bbox_coder.decode( + priors[..., :2], bbox_pred, max_shape=img_shape) + + results = InstanceData() + results.bboxes = bboxes + results.priors = priors + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + results.kernels = torch.cat(mlvl_kernels) + if with_score_factors: + results.score_factors = torch.cat(mlvl_score_factors) + + return self._bbox_mask_post_process( + results=results, + mask_feat=mask_feat, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def _bbox_mask_post_process( + self, + results: InstanceData, + mask_feat, + cfg: ConfigType, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None) -> InstanceData: + """bbox and mask post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + results (:obj:`InstaceData`): Detection instance results, + each item has shape (num_bboxes, ). + cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default to False. + with_nms (bool): If True, do nms before return boxes. + Default to True. + img_meta (dict, optional): Image meta info. Defaults to None. 
+ + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, h, w). + """ + stride = self.prior_generator.strides[0][0] + if rescale: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + results.bboxes = scale_boxes(results.bboxes, scale_factor) + + if hasattr(results, 'score_factors'): + # TODO: Add sqrt operation in order to be consistent with + # the paper. + score_factors = results.pop('score_factors') + results.scores = results.scores * score_factors + + # filter small size bboxes + if cfg.get('min_bbox_size', -1) >= 0: + w, h = get_box_wh(results.bboxes) + valid_mask = (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size) + if not valid_mask.all(): + results = results[valid_mask] + + # TODO: deal with `with_nms` and `nms_cfg=None` in test_cfg + assert with_nms, 'with_nms must be True for RTMDet-Ins' + if results.bboxes.numel() > 0: + bboxes = get_box_tensor(results.bboxes) + det_bboxes, keep_idxs = batched_nms(bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms would reweight the score, such as softnms + results.scores = det_bboxes[:, -1] + results = results[:cfg.max_per_img] + + # process masks + mask_logits = self._mask_predict_by_feat_single( + mask_feat, results.kernels, results.priors) + + mask_logits = F.interpolate( + mask_logits.unsqueeze(0), scale_factor=stride, mode='bilinear') + if rescale: + ori_h, ori_w = img_meta['ori_shape'][:2] + mask_logits = F.interpolate( + mask_logits, + size=[ + math.ceil(mask_logits.shape[-2] * scale_factor[0]), + math.ceil(mask_logits.shape[-1] * scale_factor[1]) + ], + mode='bilinear', + align_corners=False)[..., :ori_h, :ori_w] + masks = mask_logits.sigmoid().squeeze(0) + masks = masks > cfg.mask_thr_binary + results.masks = masks + else: + h, w = img_meta['ori_shape'][:2] if rescale else img_meta[ + 'img_shape'][:2] + results.masks = torch.zeros( + size=(results.bboxes.shape[0], h, w), + dtype=torch.bool, + device=results.bboxes.device) + + return results + + def parse_dynamic_params(self, flatten_kernels: Tensor) -> tuple: + """split kernel head prediction to conv weight and bias.""" + n_inst = flatten_kernels.size(0) + n_layers = len(self.weight_nums) + params_splits = list( + torch.split_with_sizes( + flatten_kernels, self.weight_nums + self.bias_nums, dim=1)) + weight_splits = params_splits[:n_layers] + bias_splits = params_splits[n_layers:] + for i in range(n_layers): + if i < n_layers - 1: + weight_splits[i] = weight_splits[i].reshape( + n_inst * self.dyconv_channels, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst * + self.dyconv_channels) + else: + weight_splits[i] = weight_splits[i].reshape(n_inst, -1, 1, 1) + bias_splits[i] = bias_splits[i].reshape(n_inst) + + return weight_splits, bias_splits + + def _mask_predict_by_feat_single(self, mask_feat: Tensor, kernels: Tensor, + priors: Tensor) -> Tensor: + """Generate mask logits from mask features with dynamic convs. + + Args: + mask_feat (Tensor): Mask prototype features. + Has shape (num_prototypes, H, W). + kernels (Tensor): Kernel parameters for each instance. 
+                Has shape (num_instance, num_params).
+            priors (Tensor): Center priors for each instance.
+                Has shape (num_instance, 4).
+        Returns:
+            Tensor: Instance segmentation masks for each instance.
+                Has shape (num_instance, H, W).
+        """
+        num_inst = priors.shape[0]
+        h, w = mask_feat.size()[-2:]
+        if num_inst < 1:
+            return torch.empty(
+                size=(num_inst, h, w),
+                dtype=mask_feat.dtype,
+                device=mask_feat.device)
+        if len(mask_feat.shape) < 4:
+            # `unsqueeze` is not in-place, so the result must be assigned
+            # back; otherwise this branch is a no-op.
+            mask_feat = mask_feat.unsqueeze(0)
+
+        coord = self.prior_generator.single_level_grid_priors(
+            (h, w), level_idx=0, device=mask_feat.device).reshape(1, -1, 2)
+        points = priors[:, :2].reshape(-1, 1, 2)
+        strides = priors[:, 2:].reshape(-1, 1, 2)
+        relative_coord = (points - coord).permute(0, 2, 1) / (
+            strides[..., 0].reshape(-1, 1, 1) * 8)
+        relative_coord = relative_coord.reshape(num_inst, 2, h, w)
+
+        mask_feat = torch.cat(
+            [relative_coord,
+             mask_feat.repeat(num_inst, 1, 1, 1)], dim=1)
+        weights, biases = self.parse_dynamic_params(kernels)
+
+        n_layers = len(weights)
+        x = mask_feat.reshape(1, -1, h, w)
+        for i, (weight, bias) in enumerate(zip(weights, biases)):
+            x = F.conv2d(
+                x, weight, bias=bias, stride=1, padding=0, groups=num_inst)
+            if i < n_layers - 1:
+                x = F.relu(x)
+        x = x.reshape(num_inst, h, w)
+        return x
+
+    def loss_mask_by_feat(self, mask_feats: Tensor, flatten_kernels: Tensor,
+                          sampling_results_list: list,
+                          batch_gt_instances: InstanceList) -> Tensor:
+        """Compute instance segmentation loss.
+
+        Args:
+            mask_feats (Tensor): Mask prototype features extracted from
+                the mask head. Has shape (N, num_prototypes, H, W).
+            flatten_kernels (Tensor): Kernels of the dynamic conv layers.
+                Has shape (N, num_instances, num_params).
+            sampling_results_list (list[:obj:`SamplingResults`]): Batch of
+                assignment results.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+
+        Returns:
+            Tensor: The mask loss tensor.
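Aside on `_mask_predict_by_feat_single` above: all instances are convolved in one `F.conv2d` call by folding them into the channel dimension and setting `groups=num_inst`, so instance i's filters only ever see instance i's channels. A self-contained sketch of that trick (sizes invented):

    import torch
    import torch.nn.functional as F

    num_inst, channels, h, w = 3, 10, 20, 20
    dyconv_channels = 8

    # Per-instance inputs folded into the channel dim: (1, num_inst*C, h, w)
    x = torch.rand(1, num_inst * channels, h, w)
    # One weight block per instance, stacked along dim 0.
    weight = torch.rand(num_inst * dyconv_channels, channels, 1, 1)
    bias = torch.rand(num_inst * dyconv_channels)

    out = F.conv2d(x, weight, bias=bias, stride=1, padding=0,
                   groups=num_inst)
    assert out.shape == (1, num_inst * dyconv_channels, h, w)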
+ """ + batch_pos_mask_logits = [] + pos_gt_masks = [] + for idx, (mask_feat, kernels, sampling_results, + gt_instances) in enumerate( + zip(mask_feats, flatten_kernels, sampling_results_list, + batch_gt_instances)): + pos_priors = sampling_results.pos_priors + pos_inds = sampling_results.pos_inds + pos_kernels = kernels[pos_inds] # n_pos, num_gen_params + pos_mask_logits = self._mask_predict_by_feat_single( + mask_feat, pos_kernels, pos_priors) + if gt_instances.masks.numel() == 0: + gt_masks = torch.empty_like(gt_instances.masks) + else: + gt_masks = gt_instances.masks[ + sampling_results.pos_assigned_gt_inds, :] + batch_pos_mask_logits.append(pos_mask_logits) + pos_gt_masks.append(gt_masks) + + pos_gt_masks = torch.cat(pos_gt_masks, 0) + batch_pos_mask_logits = torch.cat(batch_pos_mask_logits, 0) + + # avg_factor + num_pos = batch_pos_mask_logits.shape[0] + num_pos = reduce_mean(mask_feats.new_tensor([num_pos + ])).clamp_(min=1).item() + + if batch_pos_mask_logits.shape[0] == 0: + return mask_feats.sum() * 0 + + scale = self.prior_generator.strides[0][0] // self.mask_loss_stride + # upsample pred masks + batch_pos_mask_logits = F.interpolate( + batch_pos_mask_logits.unsqueeze(0), + scale_factor=scale, + mode='bilinear', + align_corners=False).squeeze(0) + # downsample gt masks + pos_gt_masks = pos_gt_masks[:, self.mask_loss_stride // + 2::self.mask_loss_stride, + self.mask_loss_stride // + 2::self.mask_loss_stride] + + loss_mask = self.loss_mask( + batch_pos_mask_logits, + pos_gt_masks, + weight=None, + avg_factor=num_pos) + + return loss_mask + + def loss_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + kernel_preds: List[Tensor], + mask_feat: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1) + flatten_kernels = torch.cat([ + kernel_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_gen_params) + for kernel_pred in kernel_preds + ], 1) + decoded_bboxes = [] + for anchor, bbox_pred in zip(anchor_list[0], bbox_preds): + anchor = anchor.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + bbox_pred = distance2bbox(anchor, bbox_pred) + decoded_bboxes.append(bbox_pred) + + flatten_bboxes = torch.cat(decoded_bboxes, 1) + for gt_instances in batch_gt_instances: + gt_instances.masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device) + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bboxes, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + assign_metrics_list, sampling_results_list) = cls_reg_targets + + losses_cls, losses_bbox,\ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_by_feat_single, + cls_scores, + decoded_bboxes, + labels_list, + label_weights_list, + bbox_targets_list, + assign_metrics_list, + self.prior_generator.strides) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + + loss_mask = self.loss_mask_by_feat(mask_feat, flatten_kernels, + sampling_results_list, + batch_gt_instances) + loss = dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_mask=loss_mask) + return loss + + +class MaskFeatModule(BaseModule): + """Mask feature head used in RTMDet-Ins. + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + num_levels (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + num_prototypes (int): Number of output channel of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + stacked_convs (int): Number of convs in mask feature branch. + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True) + norm_cfg (dict): Config dict for normalization layer. Default: None. 
+ """ + + def __init__( + self, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + num_levels: int = 3, + num_prototypes: int = 8, + act_cfg: ConfigType = dict(type='ReLU', inplace=True), + norm_cfg: ConfigType = dict(type='BN') + ) -> None: + super().__init__(init_cfg=None) + self.num_levels = num_levels + self.fusion_conv = nn.Conv2d(num_levels * in_channels, in_channels, 1) + convs = [] + for i in range(stacked_convs): + in_c = in_channels if i == 0 else feat_channels + convs.append( + ConvModule( + in_c, + feat_channels, + 3, + padding=1, + act_cfg=act_cfg, + norm_cfg=norm_cfg)) + self.stacked_convs = nn.Sequential(*convs) + self.projection = nn.Conv2d( + feat_channels, num_prototypes, kernel_size=1) + + def forward(self, features: Tuple[Tensor, ...]) -> Tensor: + # multi-level feature fusion + fusion_feats = [features[0]] + size = features[0].shape[-2:] + for i in range(1, self.num_levels): + f = F.interpolate(features[i], size=size, mode='bilinear') + fusion_feats.append(f) + fusion_feats = torch.cat(fusion_feats, dim=1) + fusion_feats = self.fusion_conv(fusion_feats) + # pred mask feats + mask_features = self.stacked_convs(fusion_feats) + mask_features = self.projection(mask_features) + return mask_features + + +@MODELS.register_module() +class RTMDetInsSepBNHead(RTMDetInsHead): + """Detection Head of RTMDet-Ins with sep-bn layers. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + share_conv (bool): Whether to share conv layers between stages. + Defaults to True. + norm_cfg (:obj:`ConfigDict` or dict)): Config dict for normalization + layer. Defaults to dict(type='BN'). + act_cfg (:obj:`ConfigDict` or dict)): Config dict for activation layer. + Defaults to dict(type='SiLU', inplace=True). + pred_kernel_size (int): Kernel size of prediction layer. Defaults to 1. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + share_conv: bool = True, + with_objectness: bool = False, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict(type='SiLU', inplace=True), + pred_kernel_size: int = 1, + **kwargs) -> None: + self.share_conv = share_conv + super().__init__( + num_classes, + in_channels, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + pred_kernel_size=pred_kernel_size, + with_objectness=with_objectness, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.kernel_convs = nn.ModuleList() + + self.rtm_cls = nn.ModuleList() + self.rtm_reg = nn.ModuleList() + self.rtm_kernel = nn.ModuleList() + self.rtm_obj = nn.ModuleList() + + # calculate num dynamic parameters + weight_nums, bias_nums = [], [] + for i in range(self.num_dyconvs): + if i == 0: + weight_nums.append( + (self.num_prototypes + 2) * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + elif i == self.num_dyconvs - 1: + weight_nums.append(self.dyconv_channels) + bias_nums.append(1) + else: + weight_nums.append(self.dyconv_channels * self.dyconv_channels) + bias_nums.append(self.dyconv_channels) + self.weight_nums = weight_nums + self.bias_nums = bias_nums + self.num_gen_params = sum(weight_nums) + sum(bias_nums) + pred_pad_size = self.pred_kernel_size // 2 + + for n in range(len(self.prior_generator.strides)): + cls_convs = nn.ModuleList() + reg_convs = nn.ModuleList() + kernel_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.cls_convs.append(cls_convs) + self.reg_convs.append(cls_convs) + self.kernel_convs.append(kernel_convs) + + self.rtm_cls.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_reg.append( + nn.Conv2d( + self.feat_channels, + self.num_base_priors * 4, + self.pred_kernel_size, + padding=pred_pad_size)) + self.rtm_kernel.append( + nn.Conv2d( + self.feat_channels, + self.num_gen_params, + self.pred_kernel_size, + padding=pred_pad_size)) + if self.with_objectness: + self.rtm_obj.append( + nn.Conv2d( + self.feat_channels, + 1, + self.pred_kernel_size, + padding=pred_pad_size)) + + if self.share_conv: + for n in range(len(self.prior_generator.strides)): + for i in range(self.stacked_convs): + self.cls_convs[n][i].conv = self.cls_convs[0][i].conv + self.reg_convs[n][i].conv = self.reg_convs[0][i].conv + + self.mask_head = MaskFeatModule( + in_channels=self.in_channels, + feat_channels=self.feat_channels, + stacked_convs=4, + num_levels=len(self.prior_generator.strides), + num_prototypes=self.num_prototypes, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + 
constant_init(m, 1) + bias_cls = bias_init_with_prob(0.01) + for rtm_cls, rtm_reg, rtm_kernel in zip(self.rtm_cls, self.rtm_reg, + self.rtm_kernel): + normal_init(rtm_cls, std=0.01, bias=bias_cls) + normal_init(rtm_reg, std=0.01, bias=1) + if self.with_objectness: + for rtm_obj in self.rtm_obj: + normal_init(rtm_obj, std=0.01, bias=bias_cls) + + def forward(self, feats: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + - cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - kernel_preds (list[Tensor]): Dynamic conv kernels for all scale + levels, each is a 4D-tensor, the channels number is + num_gen_params. + - mask_feat (Tensor): Output feature of the mask head. Each is a + 4D-tensor, the channels number is num_prototypes. + """ + mask_feat = self.mask_head(feats) + + cls_scores = [] + bbox_preds = [] + kernel_preds = [] + for idx, (x, stride) in enumerate( + zip(feats, self.prior_generator.strides)): + cls_feat = x + reg_feat = x + kernel_feat = x + + for cls_layer in self.cls_convs[idx]: + cls_feat = cls_layer(cls_feat) + cls_score = self.rtm_cls[idx](cls_feat) + + for kernel_layer in self.kernel_convs[idx]: + kernel_feat = kernel_layer(kernel_feat) + kernel_pred = self.rtm_kernel[idx](kernel_feat) + + for reg_layer in self.reg_convs[idx]: + reg_feat = reg_layer(reg_feat) + + if self.with_objectness: + objectness = self.rtm_obj[idx](reg_feat) + cls_score = inverse_sigmoid( + sigmoid_geometric_mean(cls_score, objectness)) + + reg_dist = F.relu(self.rtm_reg[idx](reg_feat)) * stride[0] + + cls_scores.append(cls_score) + bbox_preds.append(reg_dist) + kernel_preds.append(kernel_pred) + return tuple(cls_scores), tuple(bbox_preds), tuple( + kernel_preds), mask_feat diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/sabl_retina_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/sabl_retina_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8cd1b71cc2c80035a0378180da70caddf853375d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/sabl_retina_head.py @@ -0,0 +1,706 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptInstanceList) +from ..task_modules.samplers import PseudoSampler +from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply, + unmap) +from .base_dense_head import BaseDenseHead +from .guided_anchor_head import GuidedAnchorHead + + +@MODELS.register_module() +class SABLRetinaHead(BaseDenseHead): + """Side-Aware Boundary Localization (SABL) for RetinaNet. + + The anchor generation, assigning and sampling in SABLRetinaHead + are the same as GuidedAnchorHead for guided anchoring. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. 
+ + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. + stacked_convs (int): Number of Convs for classification and + regression branches. Defaults to 4. + feat_channels (int): Number of hidden channels. Defaults to 256. + approx_anchor_generator (:obj:`ConfigType` or dict): Config dict for + approx generator. + square_anchor_generator (:obj:`ConfigDict` or dict): Config dict for + square generator. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + ConvModule. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + Norm Layer. Defaults to None. + bbox_coder (:obj:`ConfigDict` or dict): Config dict for bbox coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Default False. It should be ``True`` when + using ``IoULoss``, ``GIoULoss``, or ``DIoULoss`` in the bbox head. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + SABLRetinaHead. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + SABLRetinaHead. + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox_cls (:obj:`ConfigDict` or dict): Config of classification + loss for bbox branch. + loss_bbox_reg (:obj:`ConfigDict` or dict): Config of regression loss + for bbox branch. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__( + self, + num_classes: int, + in_channels: int, + stacked_convs: int = 4, + feat_channels: int = 256, + approx_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + octave_base_scale=4, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + square_anchor_generator: ConfigType = dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[4], + strides=[8, 16, 32, 64, 128]), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + bbox_coder: ConfigType = dict( + type='BucketingBBoxCoder', num_buckets=14, scale_factor=3.0), + reg_decoded_bbox: bool = False, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox_cls: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.5), + loss_bbox_reg: ConfigType = dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.5), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='retina_cls', std=0.01, bias_prob=0.01)) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.num_buckets = bbox_coder['num_buckets'] + self.side_num = int(np.ceil(self.num_buckets / 2)) + + assert (approx_anchor_generator['octave_base_scale'] == + square_anchor_generator['scales'][0]) + assert (approx_anchor_generator['strides'] == + square_anchor_generator['strides']) + + self.approx_anchor_generator = TASK_UTILS.build( + approx_anchor_generator) + self.square_anchor_generator = TASK_UTILS.build( + square_anchor_generator) + self.approxs_per_octave = ( + self.approx_anchor_generator.num_base_priors[0]) + + # one anchor per location + self.num_base_priors = self.square_anchor_generator.num_base_priors[0] + + 
self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        self.reg_decoded_bbox = reg_decoded_bbox
+
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = num_classes
+        else:
+            self.cls_out_channels = num_classes + 1
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox_cls = MODELS.build(loss_bbox_cls)
+        self.loss_bbox_reg = MODELS.build(loss_bbox_reg)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+        if self.train_cfg:
+            self.assigner = TASK_UTILS.build(self.train_cfg['assigner'])
+            # use PseudoSampler when sampling is False
+            if 'sampler' in self.train_cfg:
+                self.sampler = TASK_UTILS.build(
+                    self.train_cfg['sampler'], default_args=dict(context=self))
+            else:
+                self.sampler = PseudoSampler(context=self)
+
+        self._init_layers()
+
+    def _init_layers(self) -> None:
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(
+                    chn,
+                    self.feat_channels,
+                    3,
+                    stride=1,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.retina_cls = nn.Conv2d(
+            self.feat_channels, self.cls_out_channels, 3, padding=1)
+        self.retina_bbox_reg = nn.Conv2d(
+            self.feat_channels, self.side_num * 4, 3, padding=1)
+        self.retina_bbox_cls = nn.Conv2d(
+            self.feat_channels, self.side_num * 4, 3, padding=1)
+
+    def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+        cls_score = self.retina_cls(cls_feat)
+        bbox_cls_pred = self.retina_bbox_cls(reg_feat)
+        bbox_reg_pred = self.retina_bbox_reg(reg_feat)
+        bbox_pred = (bbox_cls_pred, bbox_reg_pred)
+        return cls_score, bbox_pred
+
+    def forward(self, feats: List[Tensor]) -> Tuple[List[Tensor]]:
+        return multi_apply(self.forward_single, feats)
+
+    def get_anchors(
+        self,
+        featmap_sizes: List[tuple],
+        img_metas: List[dict],
+        device: Union[torch.device, str] = 'cuda'
+    ) -> List[List[Tensor]]:
+        """Get squares according to feature map sizes and guided anchors.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+            device (torch.device | str): device for returned tensors
+
+        Returns:
+            list[list[Tensor]]: Multi-level squares of each image.
+        """
+        num_imgs = len(img_metas)
+
+        # since feature map sizes of all images are the same, we only compute
+        # squares for one time
+        multi_level_squares = self.square_anchor_generator.grid_priors(
+            featmap_sizes, device=device)
+        squares_list = [multi_level_squares for _ in range(num_imgs)]
+
+        return squares_list
+
+    def get_targets(self,
+                    approx_list: List[List[Tensor]],
+                    inside_flag_list: List[List[Tensor]],
+                    square_list: List[List[Tensor]],
+                    batch_gt_instances: InstanceList,
+                    batch_img_metas: List[dict],
+                    batch_gt_instances_ignore: OptInstanceList = None,
+                    unmap_outputs: bool = True) -> tuple:
+        """Compute bucketing targets.
+
+        Args:
+            approx_list (list[list[Tensor]]): Multi level approxs of each
+                image.
+            inside_flag_list (list[list[Tensor]]): Multi level inside flags
+                of each image.
+            square_list (list[list[Tensor]]): Multi level squares of each
+                image.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes`` and ``labels``
+                attributes.
+            batch_img_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
+                Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+                data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple: Returns a tuple containing learning targets.
+
+            - labels_list (list[Tensor]): Labels of each level.
+            - label_weights_list (list[Tensor]): Label weights of each level.
+            - bbox_cls_targets_list (list[Tensor]): BBox cls targets of
+              each level.
+            - bbox_cls_weights_list (list[Tensor]): BBox cls weights of
+              each level.
+            - bbox_reg_targets_list (list[Tensor]): BBox reg targets of
+              each level.
+            - bbox_reg_weights_list (list[Tensor]): BBox reg weights of
+              each level.
+            - avg_factor (int): Average factor that is used to average
+              the loss.
+        """
+        num_imgs = len(batch_img_metas)
+        assert len(approx_list) == len(inside_flag_list) == len(
+            square_list) == num_imgs
+        # anchor number of multi levels
+        num_level_squares = [squares.size(0) for squares in square_list[0]]
+        # concat all level anchors and flags to a single tensor
+        inside_flag_flat_list = []
+        approx_flat_list = []
+        square_flat_list = []
+        for i in range(num_imgs):
+            assert len(square_list[i]) == len(inside_flag_list[i])
+            inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
+            approx_flat_list.append(torch.cat(approx_list[i]))
+            square_flat_list.append(torch.cat(square_list[i]))
+
+        # compute targets for each image
+        if batch_gt_instances_ignore is None:
+            batch_gt_instances_ignore = [None for _ in range(num_imgs)]
+        (all_labels, all_label_weights, all_bbox_cls_targets,
+         all_bbox_cls_weights, all_bbox_reg_targets, all_bbox_reg_weights,
+         pos_inds_list, neg_inds_list, sampling_results_list) = multi_apply(
+             self._get_targets_single,
+             approx_flat_list,
+             inside_flag_flat_list,
+             square_flat_list,
+             batch_gt_instances,
+             batch_img_metas,
+             batch_gt_instances_ignore,
+             unmap_outputs=unmap_outputs)
+
+        # sampled anchors of all images
+        avg_factor = sum(
+            [results.avg_factor for results in sampling_results_list])
+        # split targets to a list w.r.t. multiple levels
+        labels_list = images_to_levels(all_labels, num_level_squares)
+        label_weights_list = images_to_levels(all_label_weights,
+                                              num_level_squares)
+        bbox_cls_targets_list = images_to_levels(all_bbox_cls_targets,
+                                                 num_level_squares)
+        bbox_cls_weights_list = images_to_levels(all_bbox_cls_weights,
+                                                 num_level_squares)
+        bbox_reg_targets_list = images_to_levels(all_bbox_reg_targets,
+                                                 num_level_squares)
+        bbox_reg_weights_list = images_to_levels(all_bbox_reg_weights,
+                                                 num_level_squares)
+        return (labels_list, label_weights_list, bbox_cls_targets_list,
+                bbox_cls_weights_list, bbox_reg_targets_list,
+                bbox_reg_weights_list, avg_factor)
+
+    def _get_targets_single(self,
+                            flat_approxs: Tensor,
+                            inside_flags: Tensor,
+                            flat_squares: Tensor,
+                            gt_instances: InstanceData,
+                            img_meta: dict,
+                            gt_instances_ignore: Optional[InstanceData] = None,
+                            unmap_outputs: bool = True) -> tuple:
+        """Compute regression and classification targets for anchors in a
+        single image.
+
+        Args:
+            flat_approxs (Tensor): flat approxs of a single image,
+                shape (approxs_per_octave * n, 4)
+            inside_flags (Tensor): inside flags of a single image,
+                shape (n, ).
+            flat_squares (Tensor): flat squares of a single image,
+                shape (n, 4)
+            gt_instances (:obj:`InstanceData`): Ground truth of instance
+                annotations. It should include ``bboxes`` and ``labels``
+                attributes.
+            img_meta (dict): Meta information for current image.
+            gt_instances_ignore (:obj:`InstanceData`, optional): Instances
+                to be ignored during training. It includes ``bboxes``
+                attribute data that is ignored during training and testing.
+                Defaults to None.
+            unmap_outputs (bool): Whether to map outputs back to the original
+                set of anchors. Defaults to True.
+
+        Returns:
+            tuple:
+
+            - labels (Tensor): Labels in a single image.
+            - label_weights (Tensor): Label weights in a single image.
+            - bbox_cls_targets (Tensor): BBox cls targets in a single image.
+            - bbox_cls_weights (Tensor): BBox cls weights in a single image.
+            - bbox_reg_targets (Tensor): BBox reg targets in a single image.
+            - bbox_reg_weights (Tensor): BBox reg weights in a single image.
+            - pos_inds (Tensor): Indices of positive samples in a single
+              image.
+            - neg_inds (Tensor): Indices of negative samples in a single
+              image.
+            - sampling_result (:obj:`SamplingResult`): Sampling result
+              object.
+        """
+        if not inside_flags.any():
+            raise ValueError(
+                'There is no valid anchor inside the image boundary. Please '
+                'check the image size and anchor sizes, or set '
+                '``allowed_border`` to -1 to skip the condition.')
+        # assign gt and sample anchors
+        num_square = flat_squares.size(0)
+        approxs = flat_approxs.view(num_square, self.approxs_per_octave, 4)
+        approxs = approxs[inside_flags, ...]
+        squares = flat_squares[inside_flags, :]
+
+        pred_instances = InstanceData()
+        pred_instances.priors = squares
+        pred_instances.approxs = approxs
+        assign_result = self.assigner.assign(pred_instances, gt_instances,
+                                             gt_instances_ignore)
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_squares = squares.shape[0]
+        bbox_cls_targets = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_cls_weights = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_reg_targets = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        bbox_reg_weights = squares.new_zeros(
+            (num_valid_squares, self.side_num * 4))
+        labels = squares.new_full((num_valid_squares, ),
+                                  self.num_classes,
+                                  dtype=torch.long)
+        label_weights = squares.new_zeros(
+            num_valid_squares, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            (pos_bbox_reg_targets, pos_bbox_reg_weights, pos_bbox_cls_targets,
+             pos_bbox_cls_weights) = self.bbox_coder.encode(
+                 sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
+
+            bbox_cls_targets[pos_inds, :] = pos_bbox_cls_targets
+            bbox_reg_targets[pos_inds, :] = pos_bbox_reg_targets
+            bbox_cls_weights[pos_inds, :] = pos_bbox_cls_weights
+            bbox_reg_weights[pos_inds, :] = pos_bbox_reg_weights
+            labels[pos_inds] = sampling_result.pos_gt_labels
+            if self.train_cfg['pos_weight'] <= 0:
+                label_weights[pos_inds] = 1.0
+            else:
+                label_weights[pos_inds] = self.train_cfg['pos_weight']
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        # map up to original set of anchors
+        if unmap_outputs:
+            num_total_anchors = flat_squares.size(0)
+            labels = unmap(
+                labels, num_total_anchors, inside_flags,
fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_cls_targets = unmap(bbox_cls_targets, num_total_anchors, + inside_flags) + bbox_cls_weights = unmap(bbox_cls_weights, num_total_anchors, + inside_flags) + bbox_reg_targets = unmap(bbox_reg_targets, num_total_anchors, + inside_flags) + bbox_reg_weights = unmap(bbox_reg_weights, num_total_anchors, + inside_flags) + return (labels, label_weights, bbox_cls_targets, bbox_cls_weights, + bbox_reg_targets, bbox_reg_weights, pos_inds, neg_inds, + sampling_result) + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + labels: Tensor, label_weights: Tensor, + bbox_cls_targets: Tensor, bbox_cls_weights: Tensor, + bbox_reg_targets: Tensor, bbox_reg_weights: Tensor, + avg_factor: float) -> Tuple[Tensor]: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels in a single image. + label_weights (Tensor): Label weights in a single level. + bbox_cls_targets (Tensor): BBox cls targets in a single level. + bbox_cls_weights (Tensor): BBox cls weights in a single level. + bbox_reg_targets (Tensor): BBox reg targets in a single level. + bbox_reg_weights (Tensor): BBox reg weights in a single level. + avg_factor (int): Average factor that is used to average the loss. + + Returns: + tuple: loss components. + """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=avg_factor) + # regression loss + bbox_cls_targets = bbox_cls_targets.reshape(-1, self.side_num * 4) + bbox_cls_weights = bbox_cls_weights.reshape(-1, self.side_num * 4) + bbox_reg_targets = bbox_reg_targets.reshape(-1, self.side_num * 4) + bbox_reg_weights = bbox_reg_weights.reshape(-1, self.side_num * 4) + (bbox_cls_pred, bbox_reg_pred) = bbox_pred + bbox_cls_pred = bbox_cls_pred.permute(0, 2, 3, 1).reshape( + -1, self.side_num * 4) + bbox_reg_pred = bbox_reg_pred.permute(0, 2, 3, 1).reshape( + -1, self.side_num * 4) + loss_bbox_cls = self.loss_bbox_cls( + bbox_cls_pred, + bbox_cls_targets.long(), + bbox_cls_weights, + avg_factor=avg_factor * 4 * self.side_num) + loss_bbox_reg = self.loss_bbox_reg( + bbox_reg_pred, + bbox_reg_targets, + bbox_reg_weights, + avg_factor=avg_factor * 4 * self.bbox_coder.offset_topk) + return loss_cls, loss_bbox_cls, loss_bbox_reg + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.approx_anchor_generator.num_levels + + device = cls_scores[0].device + + # get sampled approxes + approxs_list, inside_flag_list = GuidedAnchorHead.get_sampled_approxs( + self, featmap_sizes, batch_img_metas, device=device) + + square_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = self.get_targets( + approxs_list, + inside_flag_list, + square_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (labels_list, label_weights_list, bbox_cls_targets_list, + bbox_cls_weights_list, bbox_reg_targets_list, bbox_reg_weights_list, + avg_factor) = cls_reg_targets + + losses_cls, losses_bbox_cls, losses_bbox_reg = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_cls_targets_list, + bbox_cls_weights_list, + bbox_reg_targets_list, + bbox_reg_weights_list, + avg_factor=avg_factor) + return dict( + loss_cls=losses_cls, + loss_bbox_cls=losses_bbox_cls, + loss_bbox_reg=losses_bbox_reg) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_img_metas: List[dict], + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + cfg (:obj:`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
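+
+        Example:
+            A worked channel count (using the default
+            ``BucketingBBoxCoder(num_buckets=14)`` from ``__init__``): each
+            box side is described by ``side_num = ceil(14 / 2) = 7`` buckets,
+            so both ``bbox_cls_pred`` and ``bbox_reg_pred`` in each
+            ``bbox_preds`` tuple carry ``7 * 4 = 28`` channels per location,
+            one bucket logit and one within-bucket offset per side.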
+ """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + + device = cls_scores[0].device + mlvl_anchors = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + result_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_cls_pred_list = [ + bbox_preds[i][0][img_id].detach() for i in range(num_levels) + ] + bbox_reg_pred_list = [ + bbox_preds[i][1][img_id].detach() for i in range(num_levels) + ] + proposals = self._predict_by_feat_single( + cls_scores=cls_score_list, + bbox_cls_preds=bbox_cls_pred_list, + bbox_reg_preds=bbox_reg_pred_list, + mlvl_anchors=mlvl_anchors[img_id], + img_meta=batch_img_metas[img_id], + cfg=cfg, + rescale=rescale, + with_nms=with_nms) + result_list.append(proposals) + return result_list + + def _predict_by_feat_single(self, + cls_scores: List[Tensor], + bbox_cls_preds: List[Tensor], + bbox_reg_preds: List[Tensor], + mlvl_anchors: List[Tensor], + img_meta: dict, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + cfg = self.test_cfg if cfg is None else cfg + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_confids = [] + mlvl_labels = [] + assert len(cls_scores) == len(bbox_cls_preds) == len( + bbox_reg_preds) == len(mlvl_anchors) + for cls_score, bbox_cls_pred, bbox_reg_pred, anchors in zip( + cls_scores, bbox_cls_preds, bbox_reg_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_cls_pred.size( + )[-2:] == bbox_reg_pred.size()[-2::] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1)[:, :-1] + bbox_cls_pred = bbox_cls_pred.permute(1, 2, 0).reshape( + -1, self.side_num * 4) + bbox_reg_pred = bbox_reg_pred.permute(1, 2, 0).reshape( + -1, self.side_num * 4) + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. 
+            results = filter_scores_and_topk(
+                scores, cfg.score_thr, nms_pre,
+                dict(
+                    anchors=anchors,
+                    bbox_cls_pred=bbox_cls_pred,
+                    bbox_reg_pred=bbox_reg_pred))
+            scores, labels, _, filtered_results = results
+
+            anchors = filtered_results['anchors']
+            bbox_cls_pred = filtered_results['bbox_cls_pred']
+            bbox_reg_pred = filtered_results['bbox_reg_pred']
+
+            bbox_preds = [
+                bbox_cls_pred.contiguous(),
+                bbox_reg_pred.contiguous()
+            ]
+            bboxes, confids = self.bbox_coder.decode(
+                anchors.contiguous(),
+                bbox_preds,
+                max_shape=img_meta['img_shape'])
+
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+            mlvl_confids.append(confids)
+            mlvl_labels.append(labels)
+
+        results = InstanceData()
+        results.bboxes = torch.cat(mlvl_bboxes)
+        results.scores = torch.cat(mlvl_scores)
+        results.score_factors = torch.cat(mlvl_confids)
+        results.labels = torch.cat(mlvl_labels)
+
+        return self._bbox_post_process(
+            results=results,
+            cfg=cfg,
+            rescale=rescale,
+            with_nms=with_nms,
+            img_meta=img_meta)
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/solo_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/solo_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cf338451358b01899faa4b299d33fafd7262d21
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/solo_head.py
@@ -0,0 +1,1263 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.utils.misc import floordiv
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType
+from ..layers import mask_matrix_nms
+from ..utils import center_of_mass, generate_coordinate, multi_apply
+from .base_mask_head import BaseMaskHead
+
+
+@MODELS.register_module()
+class SOLOHead(BaseMaskHead):
+    """SOLO mask head used in `SOLO: Segmenting Objects by Locations
+    <https://arxiv.org/abs/1912.04488>`_.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of hidden channels. Used in child
+            classes. Defaults to 256.
+        stacked_convs (int): Number of stacking convs of the head.
+            Defaults to 4.
+        strides (tuple): Downsample factor of each feature map.
+        scale_ranges (tuple[tuple[int, int]]): Scale range of multi-level
+            masks, in the format [(min1, max1), (min2, max2), ...]. A range
+            of (16, 64) means that instances whose scale (the square root of
+            the bbox area) lies between 16 and 64 are assigned to this level.
+        pos_scale (float): Constant scale factor to control the center
+            region.
+        num_grids (list[int]): Divide the image into uniform grids; each
+            feature map has a different grid value. The number of output
+            channels is grid ** 2. Defaults to [40, 36, 24, 16, 12].
+        cls_down_index (int): The index of downsample operation in
+            classification branch. Defaults to 0.
+        loss_mask (dict): Config of mask loss.
+        loss_cls (dict): Config of classification loss.
+        norm_cfg (dict): Dictionary to construct and config norm layer.
+            Defaults to norm_cfg=dict(type='GN', num_groups=32,
+            requires_grad=True).
+        train_cfg (dict): Training config of head.
+        test_cfg (dict): Testing config of head.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
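+
+    Example:
+        A toy scale assignment under the default ``scale_ranges``
+        (illustrative): an instance with a 40x40 bbox has scale
+        ``sqrt(40 * 40) = 40``, which falls inside both (16, 64) and
+        (32, 128), so it is assigned to two adjacent FPN levels; on each
+        level the grid cell is chosen from the mask's center of mass, and
+        the level's mask branch predicts ``num_grid**2`` channels, one per
+        grid cell.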
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 4, + strides: tuple = (4, 8, 16, 32, 64), + scale_ranges: tuple = ((8, 32), (16, 64), (32, 128), (64, 256), (128, + 512)), + pos_scale: float = 0.2, + num_grids: list = [40, 36, 24, 16, 12], + cls_down_index: int = 0, + loss_mask: ConfigType = dict( + type='DiceLoss', use_sigmoid=True, loss_weight=3.0), + loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.cls_out_channels = self.num_classes + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.num_grids = num_grids + # number of FPN feats + self.num_levels = len(strides) + assert self.num_levels == len(scale_ranges) == len(num_grids) + self.scale_ranges = scale_ranges + self.pos_scale = pos_scale + + self.cls_down_index = cls_down_index + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.norm_cfg = norm_cfg + self.init_cfg = init_cfg + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.mask_convs = nn.ModuleList() + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels + 2 if i == 0 else self.feat_channels + self.mask_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + self.conv_mask_list = nn.ModuleList() + for num_grid in self.num_grids: + self.conv_mask_list.append( + nn.Conv2d(self.feat_channels, num_grid**2, 1)) + + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def resize_feats(self, x: Tuple[Tensor]) -> List[Tensor]: + """Downsample the first feat and upsample last feat in feats. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + list[Tensor]: Features after resizing, each is a 4D-tensor. + """ + out = [] + for i in range(len(x)): + if i == 0: + out.append( + F.interpolate(x[0], scale_factor=0.5, mode='bilinear')) + elif i == len(x) - 1: + out.append( + F.interpolate( + x[i], size=x[i - 1].shape[-2:], mode='bilinear')) + else: + out.append(x[i]) + return out + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and mask prediction. + + - mlvl_mask_preds (list[Tensor]): Multi-level mask prediction. + Each element in the list has shape + (batch_size, num_grids**2 ,h ,w). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. 
+                Each element in the list has shape
+                (batch_size, num_classes, num_grids, num_grids).
+        """
+        assert len(x) == self.num_levels
+        feats = self.resize_feats(x)
+        mlvl_mask_preds = []
+        mlvl_cls_preds = []
+        for i in range(self.num_levels):
+            x = feats[i]
+            mask_feat = x
+            cls_feat = x
+            # generate and concat the coordinate
+            coord_feat = generate_coordinate(mask_feat.size(),
+                                             mask_feat.device)
+            mask_feat = torch.cat([mask_feat, coord_feat], 1)
+
+            for mask_layer in self.mask_convs:
+                mask_feat = mask_layer(mask_feat)
+
+            mask_feat = F.interpolate(
+                mask_feat, scale_factor=2, mode='bilinear')
+            mask_preds = self.conv_mask_list[i](mask_feat)
+
+            # cls branch
+            for j, cls_layer in enumerate(self.cls_convs):
+                if j == self.cls_down_index:
+                    num_grid = self.num_grids[i]
+                    cls_feat = F.interpolate(
+                        cls_feat, size=num_grid, mode='bilinear')
+                cls_feat = cls_layer(cls_feat)
+
+            cls_pred = self.conv_cls(cls_feat)
+
+            if not self.training:
+                feat_wh = feats[0].size()[-2:]
+                upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2)
+                mask_preds = F.interpolate(
+                    mask_preds.sigmoid(), size=upsampled_size, mode='bilinear')
+                cls_pred = cls_pred.sigmoid()
+                # get local maximum
+                local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1)
+                keep_mask = local_max[:, :, :-1, :-1] == cls_pred
+                cls_pred = cls_pred * keep_mask
+
+            mlvl_mask_preds.append(mask_preds)
+            mlvl_cls_preds.append(cls_pred)
+        return mlvl_mask_preds, mlvl_cls_preds
+
+    def loss_by_feat(self, mlvl_mask_preds: List[Tensor],
+                     mlvl_cls_preds: List[Tensor],
+                     batch_gt_instances: InstanceList,
+                     batch_img_metas: List[dict], **kwargs) -> dict:
+        """Calculate the loss based on the features extracted by the mask
+        head.
+
+        Args:
+            mlvl_mask_preds (list[Tensor]): Multi-level mask prediction.
+                Each element in the list has shape
+                (batch_size, num_grids**2, h, w).
+            mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element
+                in the list has shape
+                (batch_size, num_classes, num_grids, num_grids).
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``masks``,
+                and ``labels`` attributes.
+            batch_img_metas (list[dict]): Meta information of multiple
+                images.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
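+
+        Example:
+            The mask term averages a dice loss over positive grid cells; a
+            minimal sketch of one common dice formulation in pure torch
+            (mmdet's ``DiceLoss`` adds an eps and supports variants, so the
+            numbers are only illustrative):
+
+            >>> import torch
+            >>> pred = torch.tensor([[0.9, 0.1], [0.8, 0.2]])
+            >>> target = torch.tensor([[1., 0.], [1., 0.]])
+            >>> num = 2 * (pred * target).sum()
+            >>> den = (pred * pred).sum() + (target * target).sum()
+            >>> round((1 - num / den).item(), 4)
+            0.0286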
+ """ + num_levels = self.num_levels + num_imgs = len(batch_img_metas) + + featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds] + + # `BoolTensor` in `pos_masks` represent + # whether the corresponding point is + # positive + pos_mask_targets, labels, pos_masks = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_sizes=featmap_sizes) + + # change from the outside list meaning multi images + # to the outside list meaning multi levels + mlvl_pos_mask_targets = [[] for _ in range(num_levels)] + mlvl_pos_mask_preds = [[] for _ in range(num_levels)] + mlvl_pos_masks = [[] for _ in range(num_levels)] + mlvl_labels = [[] for _ in range(num_levels)] + for img_id in range(num_imgs): + assert num_levels == len(pos_mask_targets[img_id]) + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl].append( + pos_mask_targets[img_id][lvl]) + mlvl_pos_mask_preds[lvl].append( + mlvl_mask_preds[lvl][img_id, pos_masks[img_id][lvl], ...]) + mlvl_pos_masks[lvl].append(pos_masks[img_id][lvl].flatten()) + mlvl_labels[lvl].append(labels[img_id][lvl].flatten()) + + # cat multiple image + temp_mlvl_cls_preds = [] + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl] = torch.cat( + mlvl_pos_mask_targets[lvl], dim=0) + mlvl_pos_mask_preds[lvl] = torch.cat( + mlvl_pos_mask_preds[lvl], dim=0) + mlvl_pos_masks[lvl] = torch.cat(mlvl_pos_masks[lvl], dim=0) + mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0) + temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute( + 0, 2, 3, 1).reshape(-1, self.cls_out_channels)) + + num_pos = sum(item.sum() for item in mlvl_pos_masks) + # dice loss + loss_mask = [] + for pred, target in zip(mlvl_pos_mask_preds, mlvl_pos_mask_targets): + if pred.size()[0] == 0: + loss_mask.append(pred.sum().unsqueeze(0)) + continue + loss_mask.append( + self.loss_mask(pred, target, reduction_override='none')) + if num_pos > 0: + loss_mask = torch.cat(loss_mask).sum() / num_pos + else: + loss_mask = torch.cat(loss_mask).mean() + + flatten_labels = torch.cat(mlvl_labels) + flatten_cls_preds = torch.cat(temp_mlvl_cls_preds) + loss_cls = self.loss_cls( + flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1) + return dict(loss_mask=loss_mask, loss_cls=loss_cls) + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_sizes: Optional[list] = None) -> tuple: + """Compute targets for predictions of single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + featmap_sizes (list[:obj:`torch.size`]): Size of each + feature map from feature pyramid, each element + means (feat_h, feat_w). Defaults to None. + + Returns: + Tuple: Usually returns a tuple containing targets for predictions. + + - mlvl_pos_mask_targets (list[Tensor]): Each element represent + the binary mask targets for positive points in this + level, has shape (num_pos, out_h, out_w). + - mlvl_labels (list[Tensor]): Each element is + classification labels for all + points in this level, has shape + (num_grid, num_grid). + - mlvl_pos_masks (list[Tensor]): Each element is + a `BoolTensor` to represent whether the + corresponding point in single level + is positive, has shape (num_grid **2). 
+ """ + gt_labels = gt_instances.labels + device = gt_labels.device + + gt_bboxes = gt_instances.bboxes + gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device) + + mlvl_pos_mask_targets = [] + mlvl_labels = [] + mlvl_pos_masks = [] + for (lower_bound, upper_bound), stride, featmap_size, num_grid \ + in zip(self.scale_ranges, self.strides, + featmap_sizes, self.num_grids): + + mask_target = torch.zeros( + [num_grid**2, featmap_size[0], featmap_size[1]], + dtype=torch.uint8, + device=device) + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + labels = torch.zeros([num_grid, num_grid], + dtype=torch.int64, + device=device) + self.num_classes + pos_mask = torch.zeros([num_grid**2], + dtype=torch.bool, + device=device) + + gt_inds = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero().flatten() + if len(gt_inds) == 0: + mlvl_pos_mask_targets.append( + mask_target.new_zeros(0, featmap_size[0], featmap_size[1])) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + continue + hit_gt_bboxes = gt_bboxes[gt_inds] + hit_gt_labels = gt_labels[gt_inds] + hit_gt_masks = gt_masks[gt_inds, ...] + + pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] - + hit_gt_bboxes[:, 0]) * self.pos_scale + pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] - + hit_gt_bboxes[:, 1]) * self.pos_scale + + # Make sure hit_gt_masks has a value + valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0 + output_stride = stride / 2 + + for gt_mask, gt_label, pos_h_range, pos_w_range, \ + valid_mask_flag in \ + zip(hit_gt_masks, hit_gt_labels, pos_h_ranges, + pos_w_ranges, valid_mask_flags): + if not valid_mask_flag: + continue + upsampled_size = (featmap_sizes[0][0] * 4, + featmap_sizes[0][1] * 4) + center_h, center_w = center_of_mass(gt_mask) + + coord_w = int( + floordiv((center_w / upsampled_size[1]), (1. / num_grid), + rounding_mode='trunc')) + coord_h = int( + floordiv((center_h / upsampled_size[0]), (1. / num_grid), + rounding_mode='trunc')) + + # left, top, right, down + top_box = max( + 0, + int( + floordiv( + (center_h - pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + down_box = min( + num_grid - 1, + int( + floordiv( + (center_h + pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + left_box = max( + 0, + int( + floordiv( + (center_w - pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + right_box = min( + num_grid - 1, + int( + floordiv( + (center_w + pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + + top = max(top_box, coord_h - 1) + down = min(down_box, coord_h + 1) + left = max(coord_w - 1, left_box) + right = min(right_box, coord_w + 1) + + labels[top:(down + 1), left:(right + 1)] = gt_label + # ins + gt_mask = np.uint8(gt_mask.cpu().numpy()) + # Follow the original implementation, F.interpolate is + # different from cv2 and opencv + gt_mask = mmcv.imrescale(gt_mask, scale=1. / output_stride) + gt_mask = torch.from_numpy(gt_mask).to(device=device) + + for i in range(top, down + 1): + for j in range(left, right + 1): + index = int(i * num_grid + j) + mask_target[index, :gt_mask.shape[0], :gt_mask. 
+ shape[1]] = gt_mask + pos_mask[index] = True + mlvl_pos_mask_targets.append(mask_target[pos_mask]) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + return mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks + + def predict_by_feat(self, mlvl_mask_preds: List[Tensor], + mlvl_cls_scores: List[Tensor], + batch_img_metas: List[dict], **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mlvl_mask_preds (list[Tensor]): Multi-level mask prediction. + Each element in the list has shape + (batch_size, num_grids**2 ,h ,w). + mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids ,num_grids). + batch_img_metas (list[dict]): Meta information of all images. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + mlvl_cls_scores = [ + item.permute(0, 2, 3, 1) for item in mlvl_cls_scores + ] + assert len(mlvl_mask_preds) == len(mlvl_cls_scores) + num_levels = len(mlvl_cls_scores) + + results_list = [] + for img_id in range(len(batch_img_metas)): + cls_pred_list = [ + mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels) + for lvl in range(num_levels) + ] + mask_pred_list = [ + mlvl_mask_preds[lvl][img_id] for lvl in range(num_levels) + ] + + cls_pred_list = torch.cat(cls_pred_list, dim=0) + mask_pred_list = torch.cat(mask_pred_list, dim=0) + img_meta = batch_img_metas[img_id] + + results = self._predict_by_feat_single( + cls_pred_list, mask_pred_list, img_meta=img_meta) + results_list.append(results) + + return results_list + + def _predict_by_feat_single(self, + cls_scores: Tensor, + mask_preds: Tensor, + img_meta: dict, + cfg: OptConfigType = None) -> InstanceData: + """Transform a single image's features extracted from the head into + mask results. + + Args: + cls_scores (Tensor): Classification score of all points + in single image, has shape (num_points, num_classes). + mask_preds (Tensor): Mask prediction of all points in + single image, has shape (num_points, feat_h, feat_w). + img_meta (dict): Meta information of corresponding image. + cfg (dict, optional): Config used in test phase. + Defaults to None. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). 
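+
+        Example:
+            The maskness rescoring step used below, in pure torch
+            (illustrative values):
+
+            >>> import torch
+            >>> mask_pred = torch.tensor([[0.9, 0.6], [0.2, 0.1]])
+            >>> mask = mask_pred > 0.5
+            >>> maskness = (mask_pred * mask).sum() / mask.sum()
+            >>> round(maskness.item(), 2)
+            0.75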
+ """ + + def empty_results(cls_scores, ori_shape): + """Generate a empty results.""" + results = InstanceData() + results.scores = cls_scores.new_ones(0) + results.masks = cls_scores.new_zeros(0, *ori_shape) + results.labels = cls_scores.new_ones(0) + results.bboxes = cls_scores.new_zeros(0, 4) + return results + + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(mask_preds) + + featmap_size = mask_preds.size()[-2:] + + h, w = img_meta['img_shape'][:2] + upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4) + + score_mask = (cls_scores > cfg.score_thr) + cls_scores = cls_scores[score_mask] + if len(cls_scores) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + + inds = score_mask.nonzero() + cls_labels = inds[:, 1] + + # Filter the mask mask with an area is smaller than + # stride of corresponding feature level + lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0) + strides = cls_scores.new_ones(lvl_interval[-1]) + strides[:lvl_interval[0]] *= self.strides[0] + for lvl in range(1, self.num_levels): + strides[lvl_interval[lvl - + 1]:lvl_interval[lvl]] *= self.strides[lvl] + strides = strides[inds[:, 0]] + mask_preds = mask_preds[inds[:, 0]] + + masks = mask_preds > cfg.mask_thr + sum_masks = masks.sum((1, 2)).float() + keep = sum_masks > strides + if keep.sum() == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + masks = masks[keep] + mask_preds = mask_preds[keep] + sum_masks = sum_masks[keep] + cls_scores = cls_scores[keep] + cls_labels = cls_labels[keep] + + # maskness. + mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks + cls_scores *= mask_scores + + scores, labels, _, keep_inds = mask_matrix_nms( + masks, + cls_labels, + cls_scores, + mask_area=sum_masks, + nms_pre=cfg.nms_pre, + max_num=cfg.max_per_img, + kernel=cfg.kernel, + sigma=cfg.sigma, + filter_thr=cfg.filter_thr) + # mask_matrix_nms may return an empty Tensor + if len(keep_inds) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + mask_preds = mask_preds[keep_inds] + mask_preds = F.interpolate( + mask_preds.unsqueeze(0), size=upsampled_size, + mode='bilinear')[:, :, :h, :w] + mask_preds = F.interpolate( + mask_preds, size=img_meta['ori_shape'][:2], + mode='bilinear').squeeze(0) + masks = mask_preds > cfg.mask_thr + + results = InstanceData() + results.masks = masks + results.labels = labels + results.scores = scores + # create an empty bbox in InstanceData to avoid bugs when + # calculating metrics. + results.bboxes = results.scores.new_zeros(len(scores), 4) + return results + + +@MODELS.register_module() +class DecoupledSOLOHead(SOLOHead): + """Decoupled SOLO mask head used in `SOLO: Segmenting Objects by Locations. + + `_ + + Args: + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ + + def __init__(self, + *args, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_x')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_y')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ], + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + def _init_layers(self) -> None: + self.mask_convs_x = nn.ModuleList() + self.mask_convs_y = nn.ModuleList() + self.cls_convs = nn.ModuleList() + + for i in range(self.stacked_convs): + chn = self.in_channels + 1 if i == 0 else self.feat_channels + self.mask_convs_x.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + self.mask_convs_y.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + norm_cfg=self.norm_cfg)) + + self.conv_mask_list_x = nn.ModuleList() + self.conv_mask_list_y = nn.ModuleList() + for num_grid in self.num_grids: + self.conv_mask_list_x.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_mask_list_y.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and mask prediction. + + - mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + - mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. + Each element in the list has shape + (batch_size, num_classes, num_grids ,num_grids). 
+ """ + assert len(x) == self.num_levels + feats = self.resize_feats(x) + mask_preds_x = [] + mask_preds_y = [] + cls_preds = [] + for i in range(self.num_levels): + x = feats[i] + mask_feat = x + cls_feat = x + # generate and concat the coordinate + coord_feat = generate_coordinate(mask_feat.size(), + mask_feat.device) + mask_feat_x = torch.cat([mask_feat, coord_feat[:, 0:1, ...]], 1) + mask_feat_y = torch.cat([mask_feat, coord_feat[:, 1:2, ...]], 1) + + for mask_layer_x, mask_layer_y in \ + zip(self.mask_convs_x, self.mask_convs_y): + mask_feat_x = mask_layer_x(mask_feat_x) + mask_feat_y = mask_layer_y(mask_feat_y) + + mask_feat_x = F.interpolate( + mask_feat_x, scale_factor=2, mode='bilinear') + mask_feat_y = F.interpolate( + mask_feat_y, scale_factor=2, mode='bilinear') + + mask_pred_x = self.conv_mask_list_x[i](mask_feat_x) + mask_pred_y = self.conv_mask_list_y[i](mask_feat_y) + + # cls branch + for j, cls_layer in enumerate(self.cls_convs): + if j == self.cls_down_index: + num_grid = self.num_grids[i] + cls_feat = F.interpolate( + cls_feat, size=num_grid, mode='bilinear') + cls_feat = cls_layer(cls_feat) + + cls_pred = self.conv_cls(cls_feat) + + if not self.training: + feat_wh = feats[0].size()[-2:] + upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2) + mask_pred_x = F.interpolate( + mask_pred_x.sigmoid(), + size=upsampled_size, + mode='bilinear') + mask_pred_y = F.interpolate( + mask_pred_y.sigmoid(), + size=upsampled_size, + mode='bilinear') + cls_pred = cls_pred.sigmoid() + # get local maximum + local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1) + keep_mask = local_max[:, :, :-1, :-1] == cls_pred + cls_pred = cls_pred * keep_mask + + mask_preds_x.append(mask_pred_x) + mask_preds_y.append(mask_pred_y) + cls_preds.append(cls_pred) + return mask_preds_x, mask_preds_y, cls_preds + + def loss_by_feat(self, mlvl_mask_preds_x: List[Tensor], + mlvl_mask_preds_y: List[Tensor], + mlvl_cls_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids ,num_grids). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + num_levels = self.num_levels + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in mlvl_mask_preds_x] + + pos_mask_targets, labels, xy_pos_indexes = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_sizes=featmap_sizes) + + # change from the outside list meaning multi images + # to the outside list meaning multi levels + mlvl_pos_mask_targets = [[] for _ in range(num_levels)] + mlvl_pos_mask_preds_x = [[] for _ in range(num_levels)] + mlvl_pos_mask_preds_y = [[] for _ in range(num_levels)] + mlvl_labels = [[] for _ in range(num_levels)] + for img_id in range(num_imgs): + + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl].append( + pos_mask_targets[img_id][lvl]) + mlvl_pos_mask_preds_x[lvl].append( + mlvl_mask_preds_x[lvl][img_id, + xy_pos_indexes[img_id][lvl][:, 1]]) + mlvl_pos_mask_preds_y[lvl].append( + mlvl_mask_preds_y[lvl][img_id, + xy_pos_indexes[img_id][lvl][:, 0]]) + mlvl_labels[lvl].append(labels[img_id][lvl].flatten()) + + # cat multiple image + temp_mlvl_cls_preds = [] + for lvl in range(num_levels): + mlvl_pos_mask_targets[lvl] = torch.cat( + mlvl_pos_mask_targets[lvl], dim=0) + mlvl_pos_mask_preds_x[lvl] = torch.cat( + mlvl_pos_mask_preds_x[lvl], dim=0) + mlvl_pos_mask_preds_y[lvl] = torch.cat( + mlvl_pos_mask_preds_y[lvl], dim=0) + mlvl_labels[lvl] = torch.cat(mlvl_labels[lvl], dim=0) + temp_mlvl_cls_preds.append(mlvl_cls_preds[lvl].permute( + 0, 2, 3, 1).reshape(-1, self.cls_out_channels)) + + num_pos = 0. + # dice loss + loss_mask = [] + for pred_x, pred_y, target in \ + zip(mlvl_pos_mask_preds_x, + mlvl_pos_mask_preds_y, mlvl_pos_mask_targets): + num_masks = pred_x.size(0) + if num_masks == 0: + # make sure can get grad + loss_mask.append((pred_x.sum() + pred_y.sum()).unsqueeze(0)) + continue + num_pos += num_masks + pred_mask = pred_y.sigmoid() * pred_x.sigmoid() + loss_mask.append( + self.loss_mask(pred_mask, target, reduction_override='none')) + if num_pos > 0: + loss_mask = torch.cat(loss_mask).sum() / num_pos + else: + loss_mask = torch.cat(loss_mask).mean() + + # cate + flatten_labels = torch.cat(mlvl_labels) + flatten_cls_preds = torch.cat(temp_mlvl_cls_preds) + + loss_cls = self.loss_cls( + flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1) + return dict(loss_mask=loss_mask, loss_cls=loss_cls) + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_sizes: Optional[list] = None) -> tuple: + """Compute targets for predictions of single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + featmap_sizes (list[:obj:`torch.size`]): Size of each + feature map from feature pyramid, each element + means (feat_h, feat_w). Defaults to None. + + Returns: + Tuple: Usually returns a tuple containing targets for predictions. + + - mlvl_pos_mask_targets (list[Tensor]): Each element represent + the binary mask targets for positive points in this + level, has shape (num_pos, out_h, out_w). + - mlvl_labels (list[Tensor]): Each element is + classification labels for all + points in this level, has shape + (num_grid, num_grid). + - mlvl_xy_pos_indexes (list[Tensor]): Each element + in the list contains the index of positive samples in + corresponding level, has shape (num_pos, 2), last + dimension 2 present (index_x, index_y). 
+ """ + mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks = \ + super()._get_targets_single(gt_instances, + featmap_sizes=featmap_sizes) + + mlvl_xy_pos_indexes = [(item - self.num_classes).nonzero() + for item in mlvl_labels] + + return mlvl_pos_mask_targets, mlvl_labels, mlvl_xy_pos_indexes + + def predict_by_feat(self, mlvl_mask_preds_x: List[Tensor], + mlvl_mask_preds_y: List[Tensor], + mlvl_cls_scores: List[Tensor], + batch_img_metas: List[dict], **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids ,h ,w). + mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes ,num_grids ,num_grids). + batch_img_metas (list[dict]): Meta information of all images. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + mlvl_cls_scores = [ + item.permute(0, 2, 3, 1) for item in mlvl_cls_scores + ] + assert len(mlvl_mask_preds_x) == len(mlvl_cls_scores) + num_levels = len(mlvl_cls_scores) + + results_list = [] + for img_id in range(len(batch_img_metas)): + cls_pred_list = [ + mlvl_cls_scores[i][img_id].view( + -1, self.cls_out_channels).detach() + for i in range(num_levels) + ] + mask_pred_list_x = [ + mlvl_mask_preds_x[i][img_id] for i in range(num_levels) + ] + mask_pred_list_y = [ + mlvl_mask_preds_y[i][img_id] for i in range(num_levels) + ] + + cls_pred_list = torch.cat(cls_pred_list, dim=0) + mask_pred_list_x = torch.cat(mask_pred_list_x, dim=0) + mask_pred_list_y = torch.cat(mask_pred_list_y, dim=0) + img_meta = batch_img_metas[img_id] + + results = self._predict_by_feat_single( + cls_pred_list, + mask_pred_list_x, + mask_pred_list_y, + img_meta=img_meta) + results_list.append(results) + return results_list + + def _predict_by_feat_single(self, + cls_scores: Tensor, + mask_preds_x: Tensor, + mask_preds_y: Tensor, + img_meta: dict, + cfg: OptConfigType = None) -> InstanceData: + """Transform a single image's features extracted from the head into + mask results. + + Args: + cls_scores (Tensor): Classification score of all points + in single image, has shape (num_points, num_classes). + mask_preds_x (Tensor): Mask prediction of x branch of + all points in single image, has shape + (sum_num_grids, feat_h, feat_w). + mask_preds_y (Tensor): Mask prediction of y branch of + all points in single image, has shape + (sum_num_grids, feat_h, feat_w). + img_meta (dict): Meta information of corresponding image. + cfg (dict): Config used in test phase. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). 
+ """ + + def empty_results(cls_scores, ori_shape): + """Generate a empty results.""" + results = InstanceData() + results.scores = cls_scores.new_ones(0) + results.masks = cls_scores.new_zeros(0, *ori_shape) + results.labels = cls_scores.new_ones(0) + results.bboxes = cls_scores.new_zeros(0, 4) + return results + + cfg = self.test_cfg if cfg is None else cfg + + featmap_size = mask_preds_x.size()[-2:] + + h, w = img_meta['img_shape'][:2] + upsampled_size = (featmap_size[0] * 4, featmap_size[1] * 4) + + score_mask = (cls_scores > cfg.score_thr) + cls_scores = cls_scores[score_mask] + inds = score_mask.nonzero() + lvl_interval = inds.new_tensor(self.num_grids).pow(2).cumsum(0) + num_all_points = lvl_interval[-1] + lvl_start_index = inds.new_ones(num_all_points) + num_grids = inds.new_ones(num_all_points) + seg_size = inds.new_tensor(self.num_grids).cumsum(0) + mask_lvl_start_index = inds.new_ones(num_all_points) + strides = inds.new_ones(num_all_points) + + lvl_start_index[:lvl_interval[0]] *= 0 + mask_lvl_start_index[:lvl_interval[0]] *= 0 + num_grids[:lvl_interval[0]] *= self.num_grids[0] + strides[:lvl_interval[0]] *= self.strides[0] + + for lvl in range(1, self.num_levels): + lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + lvl_interval[lvl - 1] + mask_lvl_start_index[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + seg_size[lvl - 1] + num_grids[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + self.num_grids[lvl] + strides[lvl_interval[lvl - 1]:lvl_interval[lvl]] *= \ + self.strides[lvl] + + lvl_start_index = lvl_start_index[inds[:, 0]] + mask_lvl_start_index = mask_lvl_start_index[inds[:, 0]] + num_grids = num_grids[inds[:, 0]] + strides = strides[inds[:, 0]] + + y_lvl_offset = (inds[:, 0] - lvl_start_index) // num_grids + x_lvl_offset = (inds[:, 0] - lvl_start_index) % num_grids + y_inds = mask_lvl_start_index + y_lvl_offset + x_inds = mask_lvl_start_index + x_lvl_offset + + cls_labels = inds[:, 1] + mask_preds = mask_preds_x[x_inds, ...] * mask_preds_y[y_inds, ...] + + masks = mask_preds > cfg.mask_thr + sum_masks = masks.sum((1, 2)).float() + keep = sum_masks > strides + if keep.sum() == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + + masks = masks[keep] + mask_preds = mask_preds[keep] + sum_masks = sum_masks[keep] + cls_scores = cls_scores[keep] + cls_labels = cls_labels[keep] + + # maskness. + mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks + cls_scores *= mask_scores + + scores, labels, _, keep_inds = mask_matrix_nms( + masks, + cls_labels, + cls_scores, + mask_area=sum_masks, + nms_pre=cfg.nms_pre, + max_num=cfg.max_per_img, + kernel=cfg.kernel, + sigma=cfg.sigma, + filter_thr=cfg.filter_thr) + # mask_matrix_nms may return an empty Tensor + if len(keep_inds) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + mask_preds = mask_preds[keep_inds] + mask_preds = F.interpolate( + mask_preds.unsqueeze(0), size=upsampled_size, + mode='bilinear')[:, :, :h, :w] + mask_preds = F.interpolate( + mask_preds, size=img_meta['ori_shape'][:2], + mode='bilinear').squeeze(0) + masks = mask_preds > cfg.mask_thr + + results = InstanceData() + results.masks = masks + results.labels = labels + results.scores = scores + # create an empty bbox in InstanceData to avoid bugs when + # calculating metrics. 
+ results.bboxes = results.scores.new_zeros(len(scores), 4) + + return results + + +@MODELS.register_module() +class DecoupledSOLOLightHead(DecoupledSOLOHead): + """Decoupled Light SOLO mask head used in `SOLO: Segmenting Objects by + Locations <https://arxiv.org/abs/1912.04488>`_. + + Args: + dcn_cfg (dict, optional): DCN config for the last layer of + ``mask_convs`` and ``cls_convs``. Defaults to None. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + *args, + dcn_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_x')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_mask_list_y')), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ], + **kwargs) -> None: + assert dcn_cfg is None or isinstance(dcn_cfg, dict) + self.dcn_cfg = dcn_cfg + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + def _init_layers(self) -> None: + self.mask_convs = nn.ModuleList() + self.cls_convs = nn.ModuleList() + + for i in range(self.stacked_convs): + if self.dcn_cfg is not None \ + and i == self.stacked_convs - 1: + conv_cfg = self.dcn_cfg + else: + conv_cfg = None + + chn = self.in_channels + 2 if i == 0 else self.feat_channels + self.mask_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg)) + + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg)) + + self.conv_mask_list_x = nn.ModuleList() + self.conv_mask_list_y = nn.ModuleList() + for num_grid in self.num_grids: + self.conv_mask_list_x.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_mask_list_y.append( + nn.Conv2d(self.feat_channels, num_grid, 3, padding=1)) + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and mask prediction. + + - mlvl_mask_preds_x (list[Tensor]): Multi-level mask prediction + from x branch. Each element in the list has shape + (batch_size, num_grids, h, w). + - mlvl_mask_preds_y (list[Tensor]): Multi-level mask prediction + from y branch. Each element in the list has shape + (batch_size, num_grids, h, w). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. + Each element in the list has shape + (batch_size, num_classes, num_grids, num_grids).
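# -- Illustrative sketch: the index arithmetic in _predict_by_feat_single
# above, which turns a flattened grid index into row/column indices of the
# concatenated y/x mask banks. The num_grids values are made up.
import torch

num_grids = [4, 3]
lvl_interval = torch.tensor(num_grids).pow(2).cumsum(0)  # [16, 25]
seg_size = torch.tensor(num_grids).cumsum(0)             # [4, 7]

idx = 19                                  # a flattened cell in level 1
lvl = int((lvl_interval <= idx).sum())    # -> 1
lvl_start = int(lvl_interval[lvl - 1]) if lvl else 0   # 16
mask_start = int(seg_size[lvl - 1]) if lvl else 0      # 4
num_grid = num_grids[lvl]                              # 3

y_ind = mask_start + (idx - lvl_start) // num_grid     # 5
x_ind = mask_start + (idx - lvl_start) % num_grid      # 4
assert (y_ind, x_ind) == (5, 4)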
+ """ + assert len(x) == self.num_levels + feats = self.resize_feats(x) + mask_preds_x = [] + mask_preds_y = [] + cls_preds = [] + for i in range(self.num_levels): + x = feats[i] + mask_feat = x + cls_feat = x + # generate and concat the coordinate + coord_feat = generate_coordinate(mask_feat.size(), + mask_feat.device) + mask_feat = torch.cat([mask_feat, coord_feat], 1) + + for mask_layer in self.mask_convs: + mask_feat = mask_layer(mask_feat) + + mask_feat = F.interpolate( + mask_feat, scale_factor=2, mode='bilinear') + + mask_pred_x = self.conv_mask_list_x[i](mask_feat) + mask_pred_y = self.conv_mask_list_y[i](mask_feat) + + # cls branch + for j, cls_layer in enumerate(self.cls_convs): + if j == self.cls_down_index: + num_grid = self.num_grids[i] + cls_feat = F.interpolate( + cls_feat, size=num_grid, mode='bilinear') + cls_feat = cls_layer(cls_feat) + + cls_pred = self.conv_cls(cls_feat) + + if not self.training: + feat_wh = feats[0].size()[-2:] + upsampled_size = (feat_wh[0] * 2, feat_wh[1] * 2) + mask_pred_x = F.interpolate( + mask_pred_x.sigmoid(), + size=upsampled_size, + mode='bilinear') + mask_pred_y = F.interpolate( + mask_pred_y.sigmoid(), + size=upsampled_size, + mode='bilinear') + cls_pred = cls_pred.sigmoid() + # get local maximum + local_max = F.max_pool2d(cls_pred, 2, stride=1, padding=1) + keep_mask = local_max[:, :, :-1, :-1] == cls_pred + cls_pred = cls_pred * keep_mask + + mask_preds_x.append(mask_pred_x) + mask_preds_y.append(mask_pred_y) + cls_preds.append(cls_pred) + return mask_preds_x, mask_preds_y, cls_preds diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/solov2_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/solov2_head.py new file mode 100644 index 0000000000000000000000000000000000000000..35b9df0c45148cb18e8afb659b10dd0b9e866b99 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/solov2_head.py @@ -0,0 +1,799 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional, Tuple + +import mmcv +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.utils.misc import floordiv +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType +from ..layers import mask_matrix_nms +from ..utils import center_of_mass, generate_coordinate, multi_apply +from .solo_head import SOLOHead + + +class MaskFeatModule(BaseModule): + """SOLOv2 mask feature map branch used in `SOLOv2: Dynamic and Fast + Instance Segmentation. `_ + + Args: + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels of the mask feature + map branch. + start_level (int): The starting feature map level from RPN that + will be used to predict the mask feature map. + end_level (int): The ending feature map level from rpn that + will be used to predict the mask feature map. + out_channels (int): Number of output channels of the mask feature + map branch. This is the channel count of the mask + feature map that to be dynamically convolved with the predicted + kernel. + mask_stride (int): Downsample factor of the mask feature map output. + Defaults to 4. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__( + self, + in_channels: int, + feat_channels: int, + start_level: int, + end_level: int, + out_channels: int, + mask_stride: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.start_level = start_level + self.end_level = end_level + self.mask_stride = mask_stride + assert start_level >= 0 and end_level >= start_level + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self._init_layers() + self.fp16_enabled = False + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.convs_all_levels = nn.ModuleList() + for i in range(self.start_level, self.end_level + 1): + convs_per_level = nn.Sequential() + if i == 0: + convs_per_level.add_module( + f'conv{i}', + ConvModule( + self.in_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + self.convs_all_levels.append(convs_per_level) + continue + + for j in range(i): + if j == 0: + if i == self.end_level: + chn = self.in_channels + 2 + else: + chn = self.in_channels + convs_per_level.add_module( + f'conv{j}', + ConvModule( + chn, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + convs_per_level.add_module( + f'upsample{j}', + nn.Upsample( + scale_factor=2, + mode='bilinear', + align_corners=False)) + continue + + convs_per_level.add_module( + f'conv{j}', + ConvModule( + self.feat_channels, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + convs_per_level.add_module( + f'upsample{j}', + nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False)) + + self.convs_all_levels.append(convs_per_level) + + self.conv_pred = ConvModule( + self.feat_channels, + self.out_channels, + 1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, x: Tuple[Tensor]) -> Tensor: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + Tensor: The predicted mask feature map. + """ + inputs = x[self.start_level:self.end_level + 1] + assert len(inputs) == (self.end_level - self.start_level + 1) + feature_add_all_level = self.convs_all_levels[0](inputs[0]) + for i in range(1, len(inputs)): + input_p = inputs[i] + if i == len(inputs) - 1: + coord_feat = generate_coordinate(input_p.size(), + input_p.device) + input_p = torch.cat([input_p, coord_feat], 1) + + feature_add_all_level = feature_add_all_level + \ + self.convs_all_levels[i](input_p) + + feature_pred = self.conv_pred(feature_add_all_level) + return feature_pred + + +@MODELS.register_module() +class SOLOV2Head(SOLOHead): + """SOLOv2 mask head used in `SOLOv2: Dynamic and Fast Instance + Segmentation. `_ + + Args: + mask_feature_head (dict): Config of SOLOv2MaskFeatHead. + dynamic_conv_size (int): Dynamic Conv kernel size. Defaults to 1. + dcn_cfg (dict): Dcn conv configurations in kernel_convs and cls_conv. + Defaults to None. + dcn_apply_to_all_conv (bool): Whether to use dcn in every layer of + kernel_convs and cls_convs, or only the last layer. 
It shall be set + `True` for the normal version of SOLOv2 and `False` for the + light-weight version. Defaults to True. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + *args, + mask_feature_head: ConfigType, + dynamic_conv_size: int = 1, + dcn_cfg: OptConfigType = None, + dcn_apply_to_all_conv: bool = True, + init_cfg: MultiConfig = [ + dict(type='Normal', layer='Conv2d', std=0.01), + dict( + type='Normal', + std=0.01, + bias_prob=0.01, + override=dict(name='conv_cls')) + ], + **kwargs) -> None: + assert dcn_cfg is None or isinstance(dcn_cfg, dict) + self.dcn_cfg = dcn_cfg + self.with_dcn = dcn_cfg is not None + self.dcn_apply_to_all_conv = dcn_apply_to_all_conv + self.dynamic_conv_size = dynamic_conv_size + mask_out_channels = mask_feature_head.get('out_channels') + self.kernel_out_channels = \ + mask_out_channels * self.dynamic_conv_size * self.dynamic_conv_size + + super().__init__(*args, init_cfg=init_cfg, **kwargs) + + # update the in_channels of mask_feature_head + if mask_feature_head.get('in_channels', None) is not None: + if mask_feature_head.in_channels != self.in_channels: + warnings.warn('The `in_channels` of SOLOv2MaskFeatHead and ' + 'SOLOv2Head should be same, changing ' + 'mask_feature_head.in_channels to ' + f'{self.in_channels}') + mask_feature_head.update(in_channels=self.in_channels) + else: + mask_feature_head.update(in_channels=self.in_channels) + + self.mask_feature_head = MaskFeatModule(**mask_feature_head) + self.mask_stride = self.mask_feature_head.mask_stride + self.fp16_enabled = False + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.kernel_convs = nn.ModuleList() + conv_cfg = None + for i in range(self.stacked_convs): + if self.with_dcn: + if self.dcn_apply_to_all_conv: + conv_cfg = self.dcn_cfg + elif i == self.stacked_convs - 1: + # light head + conv_cfg = self.dcn_cfg + + chn = self.in_channels + 2 if i == 0 else self.feat_channels + self.kernel_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + + chn = self.in_channels if i == 0 else self.feat_channels + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.norm_cfg is None)) + + self.conv_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + self.conv_kernel = nn.Conv2d( + self.feat_channels, self.kernel_out_channels, 3, padding=1) + + def forward(self, x): + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores, mask prediction, + and mask features. + + - mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel + prediction. The kernel is used to generate instance + segmentation masks by dynamic convolution. Each element in + the list has shape + (batch_size, kernel_out_channels, num_grids, num_grids). + - mlvl_cls_preds (list[Tensor]): Multi-level scores. Each + element in the list has shape + (batch_size, num_classes, num_grids, num_grids). + - mask_feats (Tensor): Unified mask feature map used to + generate instance segmentation masks by dynamic convolution. + Has shape (batch_size, mask_out_channels, h, w). 
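# -- Illustrative sketch: the kernel branch above predicts, per grid cell,
# a flattened dynamic-conv kernel of
# mask_out_channels * dynamic_conv_size ** 2 channels. Values are made up.
mask_out_channels = 256
dynamic_conv_size = 1
kernel_out_channels = mask_out_channels * dynamic_conv_size ** 2  # 256
# With dynamic_conv_size = 3 this becomes 256 * 9 = 2304 channels, i.e. a
# 3x3 kernel over the unified mask feature for every grid cell.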
+ """ + assert len(x) == self.num_levels + mask_feats = self.mask_feature_head(x) + ins_kernel_feats = self.resize_feats(x) + mlvl_kernel_preds = [] + mlvl_cls_preds = [] + for i in range(self.num_levels): + ins_kernel_feat = ins_kernel_feats[i] + # ins branch + # concat coord + coord_feat = generate_coordinate(ins_kernel_feat.size(), + ins_kernel_feat.device) + ins_kernel_feat = torch.cat([ins_kernel_feat, coord_feat], 1) + + # kernel branch + kernel_feat = ins_kernel_feat + kernel_feat = F.interpolate( + kernel_feat, + size=self.num_grids[i], + mode='bilinear', + align_corners=False) + + cate_feat = kernel_feat[:, :-2, :, :] + + kernel_feat = kernel_feat.contiguous() + for i, kernel_conv in enumerate(self.kernel_convs): + kernel_feat = kernel_conv(kernel_feat) + kernel_pred = self.conv_kernel(kernel_feat) + + # cate branch + cate_feat = cate_feat.contiguous() + for i, cls_conv in enumerate(self.cls_convs): + cate_feat = cls_conv(cate_feat) + cate_pred = self.conv_cls(cate_feat) + + mlvl_kernel_preds.append(kernel_pred) + mlvl_cls_preds.append(cate_pred) + + return mlvl_kernel_preds, mlvl_cls_preds, mask_feats + + def _get_targets_single(self, + gt_instances: InstanceData, + featmap_sizes: Optional[list] = None) -> tuple: + """Compute targets for predictions of single image. + + Args: + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + featmap_sizes (list[:obj:`torch.size`]): Size of each + feature map from feature pyramid, each element + means (feat_h, feat_w). Defaults to None. + + Returns: + Tuple: Usually returns a tuple containing targets for predictions. + + - mlvl_pos_mask_targets (list[Tensor]): Each element represent + the binary mask targets for positive points in this + level, has shape (num_pos, out_h, out_w). + - mlvl_labels (list[Tensor]): Each element is + classification labels for all + points in this level, has shape + (num_grid, num_grid). + - mlvl_pos_masks (list[Tensor]): Each element is + a `BoolTensor` to represent whether the + corresponding point in single level + is positive, has shape (num_grid **2). + - mlvl_pos_indexes (list[list]): Each element + in the list contains the positive index in + corresponding level, has shape (num_pos). + """ + gt_labels = gt_instances.labels + device = gt_labels.device + + gt_bboxes = gt_instances.bboxes + gt_areas = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device) + + mlvl_pos_mask_targets = [] + mlvl_pos_indexes = [] + mlvl_labels = [] + mlvl_pos_masks = [] + for (lower_bound, upper_bound), num_grid \ + in zip(self.scale_ranges, self.num_grids): + mask_target = [] + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_index = [] + labels = torch.zeros([num_grid, num_grid], + dtype=torch.int64, + device=device) + self.num_classes + pos_mask = torch.zeros([num_grid**2], + dtype=torch.bool, + device=device) + + gt_inds = ((gt_areas >= lower_bound) & + (gt_areas <= upper_bound)).nonzero().flatten() + if len(gt_inds) == 0: + mlvl_pos_mask_targets.append( + torch.zeros([0, featmap_sizes[0], featmap_sizes[1]], + dtype=torch.uint8, + device=device)) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + mlvl_pos_indexes.append([]) + continue + hit_gt_bboxes = gt_bboxes[gt_inds] + hit_gt_labels = gt_labels[gt_inds] + hit_gt_masks = gt_masks[gt_inds, ...] 
+ + pos_w_ranges = 0.5 * (hit_gt_bboxes[:, 2] - + hit_gt_bboxes[:, 0]) * self.pos_scale + pos_h_ranges = 0.5 * (hit_gt_bboxes[:, 3] - + hit_gt_bboxes[:, 1]) * self.pos_scale + + # Make sure hit_gt_masks has a value + valid_mask_flags = hit_gt_masks.sum(dim=-1).sum(dim=-1) > 0 + + for gt_mask, gt_label, pos_h_range, pos_w_range, \ + valid_mask_flag in \ + zip(hit_gt_masks, hit_gt_labels, pos_h_ranges, + pos_w_ranges, valid_mask_flags): + if not valid_mask_flag: + continue + upsampled_size = (featmap_sizes[0] * self.mask_stride, + featmap_sizes[1] * self.mask_stride) + center_h, center_w = center_of_mass(gt_mask) + + coord_w = int( + floordiv((center_w / upsampled_size[1]), (1. / num_grid), + rounding_mode='trunc')) + coord_h = int( + floordiv((center_h / upsampled_size[0]), (1. / num_grid), + rounding_mode='trunc')) + + # left, top, right, down + top_box = max( + 0, + int( + floordiv( + (center_h - pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + down_box = min( + num_grid - 1, + int( + floordiv( + (center_h + pos_h_range) / upsampled_size[0], + (1. / num_grid), + rounding_mode='trunc'))) + left_box = max( + 0, + int( + floordiv( + (center_w - pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + right_box = min( + num_grid - 1, + int( + floordiv( + (center_w + pos_w_range) / upsampled_size[1], + (1. / num_grid), + rounding_mode='trunc'))) + + top = max(top_box, coord_h - 1) + down = min(down_box, coord_h + 1) + left = max(coord_w - 1, left_box) + right = min(right_box, coord_w + 1) + + labels[top:(down + 1), left:(right + 1)] = gt_label + # ins + gt_mask = np.uint8(gt_mask.cpu().numpy()) + # Follow the original implementation, F.interpolate is + # different from cv2 and opencv + gt_mask = mmcv.imrescale(gt_mask, scale=1. / self.mask_stride) + gt_mask = torch.from_numpy(gt_mask).to(device=device) + + for i in range(top, down + 1): + for j in range(left, right + 1): + index = int(i * num_grid + j) + this_mask_target = torch.zeros( + [featmap_sizes[0], featmap_sizes[1]], + dtype=torch.uint8, + device=device) + this_mask_target[:gt_mask.shape[0], :gt_mask. + shape[1]] = gt_mask + mask_target.append(this_mask_target) + pos_mask[index] = True + pos_index.append(index) + if len(mask_target) == 0: + mask_target = torch.zeros( + [0, featmap_sizes[0], featmap_sizes[1]], + dtype=torch.uint8, + device=device) + else: + mask_target = torch.stack(mask_target, 0) + mlvl_pos_mask_targets.append(mask_target) + mlvl_labels.append(labels) + mlvl_pos_masks.append(pos_mask) + mlvl_pos_indexes.append(pos_index) + return (mlvl_pos_mask_targets, mlvl_labels, mlvl_pos_masks, + mlvl_pos_indexes) + + def loss_by_feat(self, mlvl_kernel_preds: List[Tensor], + mlvl_cls_preds: List[Tensor], mask_feats: Tensor, + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel + prediction. The kernel is used to generate instance + segmentation masks by dynamic convolution. Each element in the + list has shape + (batch_size, kernel_out_channels, num_grids, num_grids). + mlvl_cls_preds (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids, num_grids). + mask_feats (Tensor): Unified mask feature map used to generate + instance segmentation masks by dynamic convolution. Has shape + (batch_size, mask_out_channels, h, w). 
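# -- Illustrative sketch: mapping a mask's centre of mass to grid
# coordinates, as done above with floordiv. The shrunk centre region
# (pos_scale times the box size) then decides which neighbouring cells
# also become positive. Numbers are made up.
num_grid = 12
upsampled_h = upsampled_w = 384
center_h, center_w = 100.0, 220.0

coord_h = int((center_h / upsampled_h) // (1.0 / num_grid))  # 3
coord_w = int((center_w / upsampled_w) // (1.0 / num_grid))  # 6
assert (coord_h, coord_w) == (3, 6)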
+ batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + featmap_sizes = mask_feats.size()[-2:] + + pos_mask_targets, labels, pos_masks, pos_indexes = multi_apply( + self._get_targets_single, + batch_gt_instances, + featmap_sizes=featmap_sizes) + + mlvl_mask_targets = [ + torch.cat(lvl_mask_targets, 0) + for lvl_mask_targets in zip(*pos_mask_targets) + ] + + mlvl_pos_kernel_preds = [] + for lvl_kernel_preds, lvl_pos_indexes in zip(mlvl_kernel_preds, + zip(*pos_indexes)): + lvl_pos_kernel_preds = [] + for img_lvl_kernel_preds, img_lvl_pos_indexes in zip( + lvl_kernel_preds, lvl_pos_indexes): + img_lvl_pos_kernel_preds = img_lvl_kernel_preds.view( + img_lvl_kernel_preds.shape[0], -1)[:, img_lvl_pos_indexes] + lvl_pos_kernel_preds.append(img_lvl_pos_kernel_preds) + mlvl_pos_kernel_preds.append(lvl_pos_kernel_preds) + + # make multilevel mlvl_mask_pred + mlvl_mask_preds = [] + for lvl_pos_kernel_preds in mlvl_pos_kernel_preds: + lvl_mask_preds = [] + for img_id, img_lvl_pos_kernel_pred in enumerate( + lvl_pos_kernel_preds): + if img_lvl_pos_kernel_pred.size()[-1] == 0: + continue + img_mask_feats = mask_feats[[img_id]] + h, w = img_mask_feats.shape[-2:] + num_kernel = img_lvl_pos_kernel_pred.shape[1] + img_lvl_mask_pred = F.conv2d( + img_mask_feats, + img_lvl_pos_kernel_pred.permute(1, 0).view( + num_kernel, -1, self.dynamic_conv_size, + self.dynamic_conv_size), + stride=1).view(-1, h, w) + lvl_mask_preds.append(img_lvl_mask_pred) + if len(lvl_mask_preds) == 0: + lvl_mask_preds = None + else: + lvl_mask_preds = torch.cat(lvl_mask_preds, 0) + mlvl_mask_preds.append(lvl_mask_preds) + # dice loss + num_pos = 0 + for img_pos_masks in pos_masks: + for lvl_img_pos_masks in img_pos_masks: + # Fix `Tensor` object has no attribute `count_nonzero()` + # in PyTorch 1.6, the type of `lvl_img_pos_masks` + # should be `torch.bool`. + num_pos += lvl_img_pos_masks.nonzero().numel() + loss_mask = [] + for lvl_mask_preds, lvl_mask_targets in zip(mlvl_mask_preds, + mlvl_mask_targets): + if lvl_mask_preds is None: + continue + loss_mask.append( + self.loss_mask( + lvl_mask_preds, + lvl_mask_targets, + reduction_override='none')) + if num_pos > 0: + loss_mask = torch.cat(loss_mask).sum() / num_pos + else: + loss_mask = mask_feats.sum() * 0 + + # cate + flatten_labels = [ + torch.cat( + [img_lvl_labels.flatten() for img_lvl_labels in lvl_labels]) + for lvl_labels in zip(*labels) + ] + flatten_labels = torch.cat(flatten_labels) + + flatten_cls_preds = [ + lvl_cls_preds.permute(0, 2, 3, 1).reshape(-1, self.num_classes) + for lvl_cls_preds in mlvl_cls_preds + ] + flatten_cls_preds = torch.cat(flatten_cls_preds) + + loss_cls = self.loss_cls( + flatten_cls_preds, flatten_labels, avg_factor=num_pos + 1) + return dict(loss_mask=loss_mask, loss_cls=loss_cls) + + def predict_by_feat(self, mlvl_kernel_preds: List[Tensor], + mlvl_cls_scores: List[Tensor], mask_feats: Tensor, + batch_img_metas: List[dict], **kwargs) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mlvl_kernel_preds (list[Tensor]): Multi-level dynamic kernel + prediction. The kernel is used to generate instance + segmentation masks by dynamic convolution. Each element in the + list has shape + (batch_size, kernel_out_channels, num_grids, num_grids). 
+ mlvl_cls_scores (list[Tensor]): Multi-level scores. Each element + in the list has shape + (batch_size, num_classes, num_grids, num_grids). + mask_feats (Tensor): Unified mask feature map used to generate + instance segmentation masks by dynamic convolution. Has shape + (batch_size, mask_out_channels, h, w). + batch_img_metas (list[dict]): Meta information of all images. + + Returns: + list[:obj:`InstanceData`]: Processed results of multiple + images.Each :obj:`InstanceData` usually contains + following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + num_levels = len(mlvl_cls_scores) + assert len(mlvl_kernel_preds) == len(mlvl_cls_scores) + + for lvl in range(num_levels): + cls_scores = mlvl_cls_scores[lvl] + cls_scores = cls_scores.sigmoid() + local_max = F.max_pool2d(cls_scores, 2, stride=1, padding=1) + keep_mask = local_max[:, :, :-1, :-1] == cls_scores + cls_scores = cls_scores * keep_mask + mlvl_cls_scores[lvl] = cls_scores.permute(0, 2, 3, 1) + + result_list = [] + for img_id in range(len(batch_img_metas)): + img_cls_pred = [ + mlvl_cls_scores[lvl][img_id].view(-1, self.cls_out_channels) + for lvl in range(num_levels) + ] + img_mask_feats = mask_feats[[img_id]] + img_kernel_pred = [ + mlvl_kernel_preds[lvl][img_id].permute(1, 2, 0).view( + -1, self.kernel_out_channels) for lvl in range(num_levels) + ] + img_cls_pred = torch.cat(img_cls_pred, dim=0) + img_kernel_pred = torch.cat(img_kernel_pred, dim=0) + result = self._predict_by_feat_single( + img_kernel_pred, + img_cls_pred, + img_mask_feats, + img_meta=batch_img_metas[img_id]) + result_list.append(result) + return result_list + + def _predict_by_feat_single(self, + kernel_preds: Tensor, + cls_scores: Tensor, + mask_feats: Tensor, + img_meta: dict, + cfg: OptConfigType = None) -> InstanceData: + """Transform a single image's features extracted from the head into + mask results. + + Args: + kernel_preds (Tensor): Dynamic kernel prediction of all points + in single image, has shape + (num_points, kernel_out_channels). + cls_scores (Tensor): Classification score of all points + in single image, has shape (num_points, num_classes). + mask_feats (Tensor): Mask prediction of all points in + single image, has shape (num_points, feat_h, feat_w). + img_meta (dict): Meta information of corresponding image. + cfg (dict, optional): Config used in test phase. + Defaults to None. + + Returns: + :obj:`InstanceData`: Processed results of single image. + it usually contains following keys. + + - scores (Tensor): Classification scores, has shape + (num_instance,). + - labels (Tensor): Has shape (num_instances,). + - masks (Tensor): Processed mask results, has + shape (num_instances, h, w). + """ + + def empty_results(cls_scores, ori_shape): + """Generate a empty results.""" + results = InstanceData() + results.scores = cls_scores.new_ones(0) + results.masks = cls_scores.new_zeros(0, *ori_shape) + results.labels = cls_scores.new_ones(0) + results.bboxes = cls_scores.new_zeros(0, 4) + return results + + cfg = self.test_cfg if cfg is None else cfg + assert len(kernel_preds) == len(cls_scores) + + featmap_size = mask_feats.size()[-2:] + + # overall info + h, w = img_meta['img_shape'][:2] + upsampled_size = (featmap_size[0] * self.mask_stride, + featmap_size[1] * self.mask_stride) + + # process. 
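# -- Illustrative sketch: the dynamic-convolution decode performed a few
# lines below. Each kept grid cell's predicted kernel is reshaped and
# convolved over the unified mask feature, yielding one mask per instance.
# Shapes are made up; dynamic_conv_size = 1 here.
import torch
import torch.nn.functional as F

num_inst, mask_out_channels, h, w = 5, 256, 100, 152
mask_feats = torch.randn(1, mask_out_channels, h, w)
kernel_preds = torch.randn(num_inst, mask_out_channels)  # one kernel each

kernels = kernel_preds.view(num_inst, mask_out_channels, 1, 1)
mask_preds = F.conv2d(mask_feats, kernels, stride=1).squeeze(0).sigmoid()
assert mask_preds.shape == (num_inst, h, w)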
+ score_mask = (cls_scores > cfg.score_thr) + cls_scores = cls_scores[score_mask] + if len(cls_scores) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + + # cate_labels & kernel_preds + inds = score_mask.nonzero() + cls_labels = inds[:, 1] + kernel_preds = kernel_preds[inds[:, 0]] + + # trans vector. + lvl_interval = cls_labels.new_tensor(self.num_grids).pow(2).cumsum(0) + strides = kernel_preds.new_ones(lvl_interval[-1]) + + strides[:lvl_interval[0]] *= self.strides[0] + for lvl in range(1, self.num_levels): + strides[lvl_interval[lvl - + 1]:lvl_interval[lvl]] *= self.strides[lvl] + strides = strides[inds[:, 0]] + + # mask encoding. + kernel_preds = kernel_preds.view( + kernel_preds.size(0), -1, self.dynamic_conv_size, + self.dynamic_conv_size) + mask_preds = F.conv2d( + mask_feats, kernel_preds, stride=1).squeeze(0).sigmoid() + # mask. + masks = mask_preds > cfg.mask_thr + sum_masks = masks.sum((1, 2)).float() + keep = sum_masks > strides + if keep.sum() == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + masks = masks[keep] + mask_preds = mask_preds[keep] + sum_masks = sum_masks[keep] + cls_scores = cls_scores[keep] + cls_labels = cls_labels[keep] + + # maskness. + mask_scores = (mask_preds * masks).sum((1, 2)) / sum_masks + cls_scores *= mask_scores + + scores, labels, _, keep_inds = mask_matrix_nms( + masks, + cls_labels, + cls_scores, + mask_area=sum_masks, + nms_pre=cfg.nms_pre, + max_num=cfg.max_per_img, + kernel=cfg.kernel, + sigma=cfg.sigma, + filter_thr=cfg.filter_thr) + if len(keep_inds) == 0: + return empty_results(cls_scores, img_meta['ori_shape'][:2]) + mask_preds = mask_preds[keep_inds] + mask_preds = F.interpolate( + mask_preds.unsqueeze(0), + size=upsampled_size, + mode='bilinear', + align_corners=False)[:, :, :h, :w] + mask_preds = F.interpolate( + mask_preds, + size=img_meta['ori_shape'][:2], + mode='bilinear', + align_corners=False).squeeze(0) + masks = mask_preds > cfg.mask_thr + + results = InstanceData() + results.masks = masks + results.labels = labels + results.scores = scores + # create an empty bbox in InstanceData to avoid bugs when + # calculating metrics. + results.bboxes = results.scores.new_zeros(len(scores), 4) + + return results diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/ssd_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/ssd_head.py new file mode 100644 index 0000000000000000000000000000000000000000..950df29110d914cc888bc16c6cbf1856f604a1de --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/ssd_head.py @@ -0,0 +1,362 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptInstanceList +from ..losses import smooth_l1_loss +from ..task_modules.samplers import PseudoSampler +from ..utils import multi_apply +from .anchor_head import AnchorHead + + +# TODO: add loss evaluator for SSD +@MODELS.register_module() +class SSDHead(AnchorHead): + """Implementation of `SSD head `_ + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (Sequence[int]): Number of channels in the input feature + map. + stacked_convs (int): Number of conv layers in cls and reg tower. + Defaults to 0. 
+ feat_channels (int): Number of hidden channels when stacked_convs + > 0. Defaults to 256. + use_depthwise (bool): Whether to use DepthwiseSeparableConv. + Defaults to False. + conv_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config conv layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config norm layer. Defaults to None. + act_cfg (:obj:`ConfigDict` or dict, Optional): Dictionary to construct + and config activation layer. Defaults to None. + anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor + generator. + bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Defaults to False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + train_cfg (:obj:`ConfigDict` or dict, Optional): Training config of + anchor head. + test_cfg (:obj:`ConfigDict` or dict, Optional): Testing config of + anchor head. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], Optional): Initialization config dict. + """ # noqa: W605 + + def __init__( + self, + num_classes: int = 80, + in_channels: Sequence[int] = (512, 1024, 512, 256, 256, 256), + stacked_convs: int = 0, + feat_channels: int = 256, + use_depthwise: bool = False, + conv_cfg: Optional[ConfigType] = None, + norm_cfg: Optional[ConfigType] = None, + act_cfg: Optional[ConfigType] = None, + anchor_generator: ConfigType = dict( + type='SSDAnchorGenerator', + scale_major=False, + input_size=300, + strides=[8, 16, 32, 64, 100, 300], + ratios=([2], [2, 3], [2, 3], [2, 3], [2], [2]), + basesize_ratio_range=(0.1, 0.9)), + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0], + ), + reg_decoded_bbox: bool = False, + train_cfg: Optional[ConfigType] = None, + test_cfg: Optional[ConfigType] = None, + init_cfg: MultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform', bias=0) + ) -> None: + super(AnchorHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.in_channels = in_channels + self.stacked_convs = stacked_convs + self.feat_channels = feat_channels + self.use_depthwise = use_depthwise + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.cls_out_channels = num_classes + 1 # add background class + self.prior_generator = TASK_UTILS.build(anchor_generator) + + # Usually the numbers of anchors for each level are the same + # except SSD detectors. 
So it is an int in the most dense + # heads but a list of int in SSDHead + self.num_base_priors = self.prior_generator.num_base_priors + + self._init_layers() + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.reg_decoded_bbox = reg_decoded_bbox + self.use_sigmoid_cls = False + self.cls_focal_loss = False + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler(context=self) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + # TODO: Use registry to choose ConvModule type + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + + for channel, num_base_priors in zip(self.in_channels, + self.num_base_priors): + cls_layers = [] + reg_layers = [] + in_channel = channel + # build stacked conv tower, not used in default ssd + for i in range(self.stacked_convs): + cls_layers.append( + conv( + in_channel, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_layers.append( + conv( + in_channel, + self.feat_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + in_channel = self.feat_channels + # SSD-Lite head + if self.use_depthwise: + cls_layers.append( + ConvModule( + in_channel, + in_channel, + 3, + padding=1, + groups=in_channel, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + reg_layers.append( + ConvModule( + in_channel, + in_channel, + 3, + padding=1, + groups=in_channel, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + cls_layers.append( + nn.Conv2d( + in_channel, + num_base_priors * self.cls_out_channels, + kernel_size=1 if self.use_depthwise else 3, + padding=0 if self.use_depthwise else 1)) + reg_layers.append( + nn.Conv2d( + in_channel, + num_base_priors * 4, + kernel_size=1 if self.use_depthwise else 3, + padding=0 if self.use_depthwise else 1)) + self.cls_convs.append(nn.Sequential(*cls_layers)) + self.reg_convs.append(nn.Sequential(*reg_layers)) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor], List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple[list[Tensor], list[Tensor]]: A tuple of cls_scores list and + bbox_preds list. + + - cls_scores (list[Tensor]): Classification scores for all scale \ + levels, each is a 4D-tensor, the channels number is \ + num_anchors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all scale \ + levels, each is a 4D-tensor, the channels number is \ + num_anchors * 4. + """ + cls_scores = [] + bbox_preds = [] + for feat, reg_conv, cls_conv in zip(x, self.reg_convs, self.cls_convs): + cls_scores.append(cls_conv(feat)) + bbox_preds.append(reg_conv(feat)) + return cls_scores, bbox_preds + + def loss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchor: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, + avg_factor: int) -> Tuple[Tensor, Tensor]: + """Compute loss of a single image. 
+ + Args: + cls_score (Tensor): Box scores for each image, + has shape (num_total_anchors, num_classes). + bbox_pred (Tensor): Box energies / deltas for each image + level with shape (num_total_anchors, 4). + anchor (Tensor): Box reference for each scale level with shape + (num_total_anchors, 4). + labels (Tensor): Labels of each anchor with shape + (num_total_anchors,). + label_weights (Tensor): Label weights of each anchor with shape + (num_total_anchors,). + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (num_total_anchors, 4). + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + Tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one + feature map. + """ + + loss_cls_all = F.cross_entropy( + cls_score, labels, reduction='none') * label_weights + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero( + as_tuple=False).reshape(-1) + neg_inds = (labels == self.num_classes).nonzero( + as_tuple=False).view(-1) + + num_pos_samples = pos_inds.size(0) + num_neg_samples = self.train_cfg['neg_pos_ratio'] * num_pos_samples + if num_neg_samples > neg_inds.size(0): + num_neg_samples = neg_inds.size(0) + topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) + loss_cls_pos = loss_cls_all[pos_inds].sum() + loss_cls_neg = topk_loss_cls_neg.sum() + loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor + + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IoULoss`, `GIoULoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + bbox_pred = self.bbox_coder.decode(anchor, bbox_pred) + + loss_bbox = smooth_l1_loss( + bbox_pred, + bbox_targets, + bbox_weights, + beta=self.train_cfg['smoothl1_beta'], + avg_factor=avg_factor) + return loss_cls[None], loss_bbox + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None + ) -> Dict[str, List[Tensor]]: + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, list[Tensor]]: A dictionary of loss components. The dict + has components below: + + - loss_cls (list[Tensor]): A list containing each feature map \ + classification loss. + - loss_bbox (list[Tensor]): A list containing each feature map \ + regression loss.
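# -- Illustrative sketch: the hard negative mining above. Negatives are
# ranked by their classification loss and only the hardest
# neg_pos_ratio * num_pos are kept. Values are made up (ratio 3).
import torch

loss_cls_all = torch.tensor([2.0, 0.1, 0.5, 1.2, 0.3, 0.9])
pos_inds = torch.tensor([0])                 # one positive anchor
neg_inds = torch.tensor([1, 2, 3, 4, 5])
num_neg = min(3 * len(pos_inds), len(neg_inds))   # 3 hardest negatives
topk_neg, _ = loss_cls_all[neg_inds].topk(num_neg)
loss_cls = (loss_cls_all[pos_inds].sum() + topk_neg.sum()) / 4
print(loss_cls)  # (2.0 + (1.2 + 0.9 + 0.5)) / 4 = 1.15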
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + unmap_outputs=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor) = cls_reg_targets + + num_images = len(batch_img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, -1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + all_cls_scores, + all_bbox_preds, + all_anchors, + all_labels, + all_label_weights, + all_bbox_targets, + all_bbox_weights, + avg_factor=avg_factor) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/tood_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/tood_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8c59598d89289df6d1a87c7b6fde112429ac8f45 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/tood_head.py @@ -0,0 +1,805 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale +from mmcv.ops import deform_conv2d +from mmengine import MessageHub +from mmengine.config import ConfigDict +from mmengine.model import bias_init_with_prob, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import distance2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, reduce_mean) +from ..task_modules.prior_generators import anchor_inside_flags +from ..utils import (filter_scores_and_topk, images_to_levels, multi_apply, + sigmoid_geometric_mean, unmap) +from .atss_head import ATSSHead + + +class TaskDecomposition(nn.Module): + """Task decomposition module in task-aligned predictor of TOOD. + + Args: + feat_channels (int): Number of feature channels in TOOD head. + stacked_convs (int): Number of conv layers in TOOD head. + la_down_rate (int): Downsample rate of layer attention. + Defaults to 8. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Defaults to None. 
+ """ + + def __init__(self, + feat_channels: int, + stacked_convs: int, + la_down_rate: int = 8, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None) -> None: + super().__init__() + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.in_channels = self.feat_channels * self.stacked_convs + self.norm_cfg = norm_cfg + self.layer_attention = nn.Sequential( + nn.Conv2d(self.in_channels, self.in_channels // la_down_rate, 1), + nn.ReLU(inplace=True), + nn.Conv2d( + self.in_channels // la_down_rate, + self.stacked_convs, + 1, + padding=0), nn.Sigmoid()) + + self.reduction_conv = ConvModule( + self.in_channels, + self.feat_channels, + 1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=norm_cfg is None) + + def init_weights(self) -> None: + """Initialize the parameters.""" + for m in self.layer_attention.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + normal_init(self.reduction_conv.conv, std=0.01) + + def forward(self, + feat: Tensor, + avg_feat: Optional[Tensor] = None) -> Tensor: + """Forward function of task decomposition module.""" + b, c, h, w = feat.shape + if avg_feat is None: + avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + weight = self.layer_attention(avg_feat) + + # here we first compute the product between layer attention weight and + # conv weight, and then compute the convolution between new conv weight + # and feature map, in order to save memory and FLOPs. + conv_weight = weight.reshape( + b, 1, self.stacked_convs, + 1) * self.reduction_conv.conv.weight.reshape( + 1, self.feat_channels, self.stacked_convs, self.feat_channels) + conv_weight = conv_weight.reshape(b, self.feat_channels, + self.in_channels) + feat = feat.reshape(b, self.in_channels, h * w) + feat = torch.bmm(conv_weight, feat).reshape(b, self.feat_channels, h, + w) + if self.norm_cfg is not None: + feat = self.reduction_conv.norm(feat) + feat = self.reduction_conv.activate(feat) + + return feat + + +@MODELS.register_module() +class TOODHead(ATSSHead): + """TOODHead used in `TOOD: Task-aligned One-stage Object Detection. + + `_. + + TOOD uses Task-aligned head (T-head) and is optimized by Task Alignment + Learning (TAL). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + num_dcn (int): Number of deformable convolution in the head. + Defaults to 0. + anchor_type (str): If set to ``anchor_free``, the head will use centers + to regress bboxes. If set to ``anchor_based``, the head will + regress bboxes based on anchors. Defaults to ``anchor_free``. + initial_loss_cls (:obj:`ConfigDict` or dict): Config of initial loss. 
+ + Example: + >>> self = TOODHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred = self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ + + def __init__(self, + num_classes: int, + in_channels: int, + num_dcn: int = 0, + anchor_type: str = 'anchor_free', + initial_loss_cls: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + activated=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + **kwargs) -> None: + assert anchor_type in ['anchor_free', 'anchor_based'] + self.num_dcn = num_dcn + self.anchor_type = anchor_type + super().__init__( + num_classes=num_classes, in_channels=in_channels, **kwargs) + + if self.train_cfg: + self.initial_epoch = self.train_cfg['initial_epoch'] + self.initial_assigner = TASK_UTILS.build( + self.train_cfg['initial_assigner']) + self.initial_loss_cls = MODELS.build(initial_loss_cls) + self.assigner = self.initial_assigner + self.alignment_assigner = TASK_UTILS.build( + self.train_cfg['assigner']) + self.alpha = self.train_cfg['alpha'] + self.beta = self.train_cfg['beta'] + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.inter_convs = nn.ModuleList() + for i in range(self.stacked_convs): + if i < self.num_dcn: + conv_cfg = dict(type='DCNv2', deform_groups=4) + else: + conv_cfg = self.conv_cfg + chn = self.in_channels if i == 0 else self.feat_channels + self.inter_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg)) + + self.cls_decomp = TaskDecomposition(self.feat_channels, + self.stacked_convs, + self.stacked_convs * 8, + self.conv_cfg, self.norm_cfg) + self.reg_decomp = TaskDecomposition(self.feat_channels, + self.stacked_convs, + self.stacked_convs * 8, + self.conv_cfg, self.norm_cfg) + + self.tood_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.tood_reg = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + + self.cls_prob_module = nn.Sequential( + nn.Conv2d(self.feat_channels * self.stacked_convs, + self.feat_channels // 4, 1), nn.ReLU(inplace=True), + nn.Conv2d(self.feat_channels // 4, 1, 3, padding=1)) + self.reg_offset_module = nn.Sequential( + nn.Conv2d(self.feat_channels * self.stacked_convs, + self.feat_channels // 4, 1), nn.ReLU(inplace=True), + nn.Conv2d(self.feat_channels // 4, 4 * 2, 3, padding=1)) + + self.scales = nn.ModuleList( + [Scale(1.0) for _ in self.prior_generator.strides]) + + def init_weights(self) -> None: + """Initialize weights of the head.""" + bias_cls = bias_init_with_prob(0.01) + for m in self.inter_convs: + normal_init(m.conv, std=0.01) + for m in self.cls_prob_module: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.01) + for m in self.reg_offset_module: + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001) + normal_init(self.cls_prob_module[-1], std=0.01, bias=bias_cls) + + self.cls_decomp.init_weights() + self.reg_decomp.init_weights() + + normal_init(self.tood_cls, std=0.01, bias=bias_cls) + normal_init(self.tood_reg, std=0.01) + + def forward(self, feats: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. 
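# [Illustrative sketch, not part of the original patch.] The __init__ above
# registers two assigners and switches at train_cfg['initial_epoch']; the
# current epoch is not an argument anywhere in this head, but is read from
# mmengine's MessageHub in get_targets further below. Whatever drives training
# is therefore expected to publish it, roughly like this:
from mmengine import MessageHub

message_hub = MessageHub.get_current_instance()
message_hub.update_info('epoch', 3)          # e.g. done once per epoch by a hook
assert message_hub.get_info('epoch') == 3    # the value get_targets will see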
+ + Returns: + tuple: Usually a tuple of classification scores and bbox prediction + cls_scores (list[Tensor]): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_anchors * num_classes. + bbox_preds (list[Tensor]): Decoded box for all scale levels, + each is a 4D-tensor, the channels number is + num_anchors * 4. In [tl_x, tl_y, br_x, br_y] format. + """ + cls_scores = [] + bbox_preds = [] + for idx, (x, scale, stride) in enumerate( + zip(feats, self.scales, self.prior_generator.strides)): + b, c, h, w = x.shape + anchor = self.prior_generator.single_level_grid_priors( + (h, w), idx, device=x.device) + anchor = torch.cat([anchor for _ in range(b)]) + # extract task interactive features + inter_feats = [] + for inter_conv in self.inter_convs: + x = inter_conv(x) + inter_feats.append(x) + feat = torch.cat(inter_feats, 1) + + # task decomposition + avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) + cls_feat = self.cls_decomp(feat, avg_feat) + reg_feat = self.reg_decomp(feat, avg_feat) + + # cls prediction and alignment + cls_logits = self.tood_cls(cls_feat) + cls_prob = self.cls_prob_module(feat) + cls_score = sigmoid_geometric_mean(cls_logits, cls_prob) + + # reg prediction and alignment + if self.anchor_type == 'anchor_free': + reg_dist = scale(self.tood_reg(reg_feat).exp()).float() + reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4) + reg_bbox = distance2bbox( + self.anchor_center(anchor) / stride[0], + reg_dist).reshape(b, h, w, 4).permute(0, 3, 1, + 2) # (b, c, h, w) + elif self.anchor_type == 'anchor_based': + reg_dist = scale(self.tood_reg(reg_feat)).float() + reg_dist = reg_dist.permute(0, 2, 3, 1).reshape(-1, 4) + reg_bbox = self.bbox_coder.decode(anchor, reg_dist).reshape( + b, h, w, 4).permute(0, 3, 1, 2) / stride[0] + else: + raise NotImplementedError( + f'Unknown anchor type: {self.anchor_type}.' + f'Please use `anchor_free` or `anchor_based`.') + reg_offset = self.reg_offset_module(feat) + bbox_pred = self.deform_sampling(reg_bbox.contiguous(), + reg_offset.contiguous()) + + # After deform_sampling, some boxes will become invalid (The + # left-top point is at the right or bottom of the right-bottom + # point), which will make the GIoULoss negative. + invalid_bbox_idx = (bbox_pred[:, [0]] > bbox_pred[:, [2]]) | \ + (bbox_pred[:, [1]] > bbox_pred[:, [3]]) + invalid_bbox_idx = invalid_bbox_idx.expand_as(bbox_pred) + bbox_pred = torch.where(invalid_bbox_idx, reg_bbox, bbox_pred) + + cls_scores.append(cls_score) + bbox_preds.append(bbox_pred) + return tuple(cls_scores), tuple(bbox_preds) + + def deform_sampling(self, feat: Tensor, offset: Tensor) -> Tensor: + """Sampling the feature x according to offset. + + Args: + feat (Tensor): Feature + offset (Tensor): Spatial offset for feature sampling + """ + # it is an equivalent implementation of bilinear interpolation + b, c, h, w = feat.shape + weight = feat.new_ones(c, 1, 1, 1) + y = deform_conv2d(feat, offset, weight, 1, 0, 1, c, c) + return y + + def anchor_center(self, anchors: Tensor) -> Tensor: + """Get anchor centers from anchors. + + Args: + anchors (Tensor): Anchor list with shape (N, 4), "xyxy" format. + + Returns: + Tensor: Anchor centers with shape (N, 2), "xy" format. 
+ """ + anchors_cx = (anchors[:, 2] + anchors[:, 0]) / 2 + anchors_cy = (anchors[:, 3] + anchors[:, 1]) / 2 + return torch.stack([anchors_cx, anchors_cy], dim=-1) + + def loss_by_feat_single(self, anchors: Tensor, cls_score: Tensor, + bbox_pred: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + alignment_metrics: Tensor, + stride: Tuple[int, int]) -> dict: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Decoded bboxes for each scale + level with shape (N, num_anchors * 4, H, W). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). + label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors). + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (N, num_total_anchors, 4). + alignment_metrics (Tensor): Alignment metrics with shape + (N, num_total_anchors). + stride (Tuple[int, int]): Downsample stride of the feature map. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert stride[0] == stride[1], 'h stride is not equal to w stride!' + anchors = anchors.reshape(-1, 4) + cls_score = cls_score.permute(0, 2, 3, 1).reshape( + -1, self.cls_out_channels).contiguous() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + alignment_metrics = alignment_metrics.reshape(-1) + label_weights = label_weights.reshape(-1) + targets = labels if self.epoch < self.initial_epoch else ( + labels, alignment_metrics) + cls_loss_func = self.initial_loss_cls \ + if self.epoch < self.initial_epoch else self.loss_cls + + loss_cls = cls_loss_func( + cls_score, targets, label_weights, avg_factor=1.0) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + pos_anchors = anchors[pos_inds] + + pos_decode_bbox_pred = pos_bbox_pred + pos_decode_bbox_targets = pos_bbox_targets / stride[0] + + # regression loss + pos_bbox_weight = self.centerness_target( + pos_anchors, pos_bbox_targets + ) if self.epoch < self.initial_epoch else alignment_metrics[ + pos_inds] + + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=pos_bbox_weight, + avg_factor=1.0) + else: + loss_bbox = bbox_pred.sum() * 0 + pos_bbox_weight = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, alignment_metrics.sum( + ), pos_bbox_weight.sum() + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Decoded box for each scale + level with shape (N, num_anchors * 4, H, W) in + [tl_x, tl_y, br_x, br_y] format. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. 
It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + num_imgs = len(batch_img_metas) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + flatten_cls_scores = torch.cat([ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ], 1) + flatten_bbox_preds = torch.cat([ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) * stride[0] + for bbox_pred, stride in zip(bbox_preds, + self.prior_generator.strides) + ], 1) + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bbox_preds, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + alignment_metrics_list) = cls_reg_targets + + losses_cls, losses_bbox, \ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_by_feat_single, + anchor_list, + cls_scores, + bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + alignment_metrics_list, + self.prior_generator.strides) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + score_factor_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (:obj:`ConfigDict`, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. 
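# [Illustrative sketch, not part of the original patch.] The normalization step
# above divides every per-level loss by a factor that is averaged across GPUs
# (reduce_mean) and clamped to >= 1, so a batch with no positives never divides
# by zero. Single-process version of the clamp-and-divide step (reduce_mean is
# the identity when torch.distributed is not initialized):
import torch

cls_avg_factors = [torch.tensor(12.5), torch.tensor(0.)]  # per-level metric sums
losses_cls = [torch.tensor(4.0), torch.tensor(1.0)]
cls_avg_factor = sum(cls_avg_factors).clamp_(min=1).item()   # -> 12.5
losses_cls = [loss / cls_avg_factor for loss in losses_cls]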
If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + + cfg = self.test_cfg if cfg is None else cfg + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + for cls_score, bbox_pred, priors, stride in zip( + cls_score_list, bbox_pred_list, mlvl_priors, + self.prior_generator.strides): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) * stride[0] + scores = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk( + scores, cfg.score_thr, nms_pre, + dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, keep_idxs, filtered_results = results + + bboxes = filtered_results['bbox_pred'] + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + + results = InstanceData() + results.bboxes = torch.cat(mlvl_bboxes) + results.scores = torch.cat(mlvl_scores) + results.labels = torch.cat(mlvl_labels) + + return self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + + def get_targets(self, + cls_scores: List[List[Tensor]], + bbox_preds: List[List[Tensor]], + anchor_list: List[List[Tensor]], + valid_flag_list: List[List[Tensor]], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in + multiple images. + + Args: + cls_scores (list[list[Tensor]]): Classification predictions of + images, a 3D-Tensor with shape [num_imgs, num_priors, + num_classes]. + bbox_preds (list[list[Tensor]]): Decoded bboxes predictions of one + image, a 3D-Tensor with shape [num_imgs, num_priors, 4] in + [tl_x, tl_y, br_x, br_y] format. + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. 
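# [Illustrative sketch, not part of the original patch.] A simplified,
# self-contained version of what filter_scores_and_topk does in the loop above:
# threshold the (num_priors, num_classes) scores, then keep at most nms_pre of
# the surviving (prior, class) pairs by score. mmdet's helper additionally
# gathers extra per-prior tensors (bbox_pred, priors) through a results dict.
import torch

def filter_and_topk(scores: torch.Tensor, score_thr: float, topk: int):
    valid_mask = scores > score_thr
    flat_scores = scores[valid_mask]
    valid_idxs = valid_mask.nonzero()              # (num_valid, 2): prior, class
    num_topk = min(topk, flat_scores.numel())
    flat_scores, idxs = flat_scores.sort(descending=True)
    topk_idxs = valid_idxs[idxs[:num_topk]]
    keep_idxs, labels = topk_idxs.unbind(dim=1)
    return flat_scores[:num_topk], labels, keep_idxs

scores = torch.rand(100, 80)                       # (num_priors, num_classes)
top_scores, labels, keep_idxs = filter_and_topk(scores, 0.95, 10)
assert top_scores.numel() <= 10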
+ Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: a tuple containing learning targets. + + - anchors_list (list[list[Tensor]]): Anchors of each level. + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - norm_alignment_metrics_list (list[Tensor]): Normalized + alignment metrics of each level. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + num_level_anchors_list = [num_level_anchors] * num_imgs + + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + anchor_list[i] = torch.cat(anchor_list[i]) + valid_flag_list[i] = torch.cat(valid_flag_list[i]) + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + # anchor_list: list(b * [-1, 4]) + + # get epoch information from message hub + message_hub = MessageHub.get_current_instance() + self.epoch = message_hub.get_info('epoch') + + if self.epoch < self.initial_epoch: + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_bbox_weights, pos_inds_list, neg_inds_list, + sampling_result) = multi_apply( + super()._get_targets_single, + anchor_list, + valid_flag_list, + num_level_anchors_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + all_assign_metrics = [ + weight[..., 0] for weight in all_bbox_weights + ] + else: + (all_anchors, all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics) = multi_apply( + self._get_targets_single, + cls_scores, + bbox_preds, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + + # split targets to a list w.r.t. multiple levels + anchors_list = images_to_levels(all_anchors, num_level_anchors) + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + norm_alignment_metrics_list = images_to_levels(all_assign_metrics, + num_level_anchors) + + return (anchors_list, labels_list, label_weights_list, + bbox_targets_list, norm_alignment_metrics_list) + + def _get_targets_single(self, + cls_scores: Tensor, + bbox_preds: Tensor, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression, classification targets for anchors in a single + image. + + Args: + cls_scores (Tensor): Box scores for each image. + bbox_preds (Tensor): Box energies / deltas for each image. + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for current image. 
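# [Illustrative sketch, not part of the original patch.] In _get_targets_single
# below, the positive alignment metrics of each GT are rescaled so that their
# maximum equals the best IoU achieved for that GT. With made-up values:
import torch

assign_metrics = torch.tensor([0.9, 0.6, 0.3])  # t = s**alpha * u**beta per prior
pos_ious = torch.tensor([0.8, 0.7, 0.5])        # IoU of each positive vs. its GT
norm = assign_metrics / (assign_metrics.max() + 10e-8) * pos_ious.max()
# -> tensor([0.8000, 0.5333, 0.2667]); the top-scoring prior now carries 0.8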
+ gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: N is the number of total anchors in the image. + anchors (Tensor): All anchors in the image with shape (N, 4). + labels (Tensor): Labels of all anchors in the image with shape + (N,). + label_weights (Tensor): Label weights of all anchor in the + image with shape (N,). + bbox_targets (Tensor): BBox targets of all anchors in the + image with shape (N, 4). + norm_alignment_metrics (Tensor): Normalized alignment metrics + of all priors in the image with shape (N,). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + pred_instances = InstanceData( + priors=anchors, + scores=cls_scores[inside_flags, :], + bboxes=bbox_preds[inside_flags, :]) + assign_result = self.alignment_assigner.assign(pred_instances, + gt_instances, + gt_instances_ignore, + self.alpha, self.beta) + assign_ious = assign_result.max_overlaps + assign_metrics = assign_result.assign_metrics + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + norm_alignment_metrics = anchors.new_zeros( + num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + # point-based + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + class_assigned_gt_inds = torch.unique( + sampling_result.pos_assigned_gt_inds) + for gt_inds in class_assigned_gt_inds: + gt_class_inds = pos_inds[sampling_result.pos_assigned_gt_inds == + gt_inds] + pos_alignment_metrics = assign_metrics[gt_class_inds] + pos_ious = assign_ious[gt_class_inds] + pos_norm_alignment_metrics = pos_alignment_metrics / ( + pos_alignment_metrics.max() + 10e-8) * pos_ious.max() + norm_alignment_metrics[gt_class_inds] = pos_norm_alignment_metrics + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + anchors = unmap(anchors, num_total_anchors, inside_flags) + labels = unmap( + labels, num_total_anchors, inside_flags, fill=self.num_classes) + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + norm_alignment_metrics = unmap(norm_alignment_metrics, + num_total_anchors, inside_flags) + return (anchors, labels, label_weights, bbox_targets, + norm_alignment_metrics) diff --git 
a/head_extractor/build/lib/mmdet/models/dense_heads/vfnet_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/vfnet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..430b06d085d94760d56a7ea083eaf23bd32b1f53 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/vfnet_head.py @@ -0,0 +1,722 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, Scale +from mmcv.ops import DeformConv2d +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, + OptInstanceList, RangeType, reduce_mean) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..task_modules.samplers import PseudoSampler +from ..utils import multi_apply +from .atss_head import ATSSHead +from .fcos_head import FCOSHead + +INF = 1e8 + + +@MODELS.register_module() +class VFNetHead(ATSSHead, FCOSHead): + """Head of `VarifocalNet (VFNet): An IoU-aware Dense Object + Detector.`_. + + The VFNet predicts IoU-aware classification scores which mix the + object presence confidence and object localization accuracy as the + detection score. It is built on the FCOS architecture and uses ATSS + for defining positive/negative training examples. The VFNet is trained + with Varifocal Loss and employs star-shaped deformable convolution to + extract features for a bbox. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + regress_ranges (Sequence[Tuple[int, int]]): Regress range of multiple + level points. + center_sampling (bool): If true, use center sampling. Defaults to False. + center_sample_radius (float): Radius of center sampling. Defaults to 1.5. + sync_num_pos (bool): If true, synchronize the number of positive + examples across GPUs. Defaults to True. + gradient_mul (float): The multiplier to gradients from bbox refinement + and recognition. Defaults to 0.1. + bbox_norm_type (str): The bbox normalization type, 'reg_denom' or + 'stride'. Defaults to 'reg_denom'. + loss_cls_fl (:obj:`ConfigDict` or dict): Config of focal loss. + use_vfl (bool): If true, use varifocal loss for training. + Defaults to True. + loss_cls (:obj:`ConfigDict` or dict): Config of varifocal loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss, + GIoU Loss. + loss_bbox_refine (:obj:`ConfigDict` or dict): Config of localization + refinement loss, GIoU Loss. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. Defaults to norm_cfg=dict(type='GN', + num_groups=32, requires_grad=True). + use_atss (bool): If true, use ATSS to define positive/negative + examples. Defaults to True. + anchor_generator (:obj:`ConfigDict` or dict): Config of anchor + generator for ATSS. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict.
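# [Illustrative sketch, not part of the original patch.] In __init__ below,
# each level's bbox prediction is normalized by the upper bound of its regress
# range (the 'reg_denom'); the last level's range is open-ended (INF), so it
# reuses twice the previous bound. Worked out for the default ranges:
regress_ranges = ((-1, 64), (64, 128), (128, 256), (256, 512), (512, 1e8))
reg_denoms = [regress_range[-1] for regress_range in regress_ranges]
reg_denoms[-1] = reg_denoms[-2] * 2
print(reg_denoms)   # [64, 128, 256, 512, 1024]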
+ + Example: + >>> self = VFNetHead(11, 7) + >>> feats = [torch.rand(1, 7, s, s) for s in [4, 8, 16, 32, 64]] + >>> cls_score, bbox_pred, bbox_pred_refine= self.forward(feats) + >>> assert len(cls_score) == len(self.scales) + """ # noqa: E501 + + def __init__(self, + num_classes: int, + in_channels: int, + regress_ranges: RangeType = ((-1, 64), (64, 128), (128, 256), + (256, 512), (512, INF)), + center_sampling: bool = False, + center_sample_radius: float = 1.5, + sync_num_pos: bool = True, + gradient_mul: float = 0.1, + bbox_norm_type: str = 'reg_denom', + loss_cls_fl: ConfigType = dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + use_vfl: bool = True, + loss_cls: ConfigType = dict( + type='VarifocalLoss', + use_sigmoid=True, + alpha=0.75, + gamma=2.0, + iou_weighted=True, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='GIoULoss', loss_weight=1.5), + loss_bbox_refine: ConfigType = dict( + type='GIoULoss', loss_weight=2.0), + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + use_atss: bool = True, + reg_decoded_bbox: bool = True, + anchor_generator: ConfigType = dict( + type='AnchorGenerator', + ratios=[1.0], + octave_base_scale=8, + scales_per_octave=1, + center_offset=0.0, + strides=[8, 16, 32, 64, 128]), + init_cfg: MultiConfig = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='vfnet_cls', + std=0.01, + bias_prob=0.01)), + **kwargs) -> None: + # dcn base offsets, adapted from reppoints_head.py + self.num_dconv_points = 9 + self.dcn_kernel = int(np.sqrt(self.num_dconv_points)) + self.dcn_pad = int((self.dcn_kernel - 1) / 2) + dcn_base = np.arange(-self.dcn_pad, + self.dcn_pad + 1).astype(np.float64) + dcn_base_y = np.repeat(dcn_base, self.dcn_kernel) + dcn_base_x = np.tile(dcn_base, self.dcn_kernel) + dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape( + (-1)) + self.dcn_base_offset = torch.tensor(dcn_base_offset).view(1, -1, 1, 1) + + super(FCOSHead, self).__init__( + num_classes=num_classes, + in_channels=in_channels, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.regress_ranges = regress_ranges + self.reg_denoms = [ + regress_range[-1] for regress_range in regress_ranges + ] + self.reg_denoms[-1] = self.reg_denoms[-2] * 2 + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + self.sync_num_pos = sync_num_pos + self.bbox_norm_type = bbox_norm_type + self.gradient_mul = gradient_mul + self.use_vfl = use_vfl + if self.use_vfl: + self.loss_cls = MODELS.build(loss_cls) + else: + self.loss_cls = MODELS.build(loss_cls_fl) + self.loss_bbox = MODELS.build(loss_bbox) + self.loss_bbox_refine = MODELS.build(loss_bbox_refine) + + # for getting ATSS targets + self.use_atss = use_atss + self.reg_decoded_bbox = reg_decoded_bbox + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + + self.anchor_center_offset = anchor_generator['center_offset'] + + self.num_base_priors = self.prior_generator.num_base_priors[0] + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if self.train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], default_args=dict(context=self)) + else: + self.sampler = PseudoSampler() + # only be used in `get_atss_targets` when `use_atss` is True + self.atss_prior_generator = TASK_UTILS.build(anchor_generator) + + self.fcos_prior_generator = MlvlPointGenerator( + anchor_generator['strides'], + 
self.anchor_center_offset if self.use_atss else 0.5) + + # In order to reuse the `get_bboxes` in `BaseDenseHead. + # Only be used in testing phase. + self.prior_generator = self.fcos_prior_generator + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + super(FCOSHead, self)._init_cls_convs() + super(FCOSHead, self)._init_reg_convs() + self.relu = nn.ReLU() + self.vfnet_reg_conv = ConvModule( + self.feat_channels, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias) + self.vfnet_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.scales = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + self.vfnet_reg_refine_dconv = DeformConv2d( + self.feat_channels, + self.feat_channels, + self.dcn_kernel, + 1, + padding=self.dcn_pad) + self.vfnet_reg_refine = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + self.scales_refine = nn.ModuleList([Scale(1.0) for _ in self.strides]) + + self.vfnet_cls_dconv = DeformConv2d( + self.feat_channels, + self.feat_channels, + self.dcn_kernel, + 1, + padding=self.dcn_pad) + self.vfnet_cls = nn.Conv2d( + self.feat_channels, self.cls_out_channels, 3, padding=1) + + def forward(self, x: Tuple[Tensor]) -> Tuple[List[Tensor]]: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + + - cls_scores (list[Tensor]): Box iou-aware scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_classes. + - bbox_preds (list[Tensor]): Box offsets for each + scale level, each is a 4D-tensor, the channel number is + num_points * 4. + - bbox_preds_refine (list[Tensor]): Refined Box offsets for + each scale level, each is a 4D-tensor, the channel + number is num_points * 4. + """ + return multi_apply(self.forward_single, x, self.scales, + self.scales_refine, self.strides, self.reg_denoms) + + def forward_single(self, x: Tensor, scale: Scale, scale_refine: Scale, + stride: int, reg_denom: int) -> tuple: + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + scale_refine (:obj: `mmcv.cnn.Scale`): Learnable scale module to + resize the refined bbox prediction. + stride (int): The corresponding stride for feature maps, + used to normalize the bbox prediction when + bbox_norm_type = 'stride'. + reg_denom (int): The corresponding regression range for feature + maps, only used to normalize the bbox prediction when + bbox_norm_type = 'reg_denom'. + + Returns: + tuple: iou-aware cls scores for each box, bbox predictions and + refined bbox predictions of input feature maps. 
+ """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + + # predict the bbox_pred of different level + reg_feat_init = self.vfnet_reg_conv(reg_feat) + if self.bbox_norm_type == 'reg_denom': + bbox_pred = scale( + self.vfnet_reg(reg_feat_init)).float().exp() * reg_denom + elif self.bbox_norm_type == 'stride': + bbox_pred = scale( + self.vfnet_reg(reg_feat_init)).float().exp() * stride + else: + raise NotImplementedError + + # compute star deformable convolution offsets + # converting dcn_offset to reg_feat.dtype thus VFNet can be + # trained with FP16 + dcn_offset = self.star_dcn_offset(bbox_pred, self.gradient_mul, + stride).to(reg_feat.dtype) + + # refine the bbox_pred + reg_feat = self.relu(self.vfnet_reg_refine_dconv(reg_feat, dcn_offset)) + bbox_pred_refine = scale_refine( + self.vfnet_reg_refine(reg_feat)).float().exp() + bbox_pred_refine = bbox_pred_refine * bbox_pred.detach() + + # predict the iou-aware cls score + cls_feat = self.relu(self.vfnet_cls_dconv(cls_feat, dcn_offset)) + cls_score = self.vfnet_cls(cls_feat) + + if self.training: + return cls_score, bbox_pred, bbox_pred_refine + else: + return cls_score, bbox_pred_refine + + def star_dcn_offset(self, bbox_pred: Tensor, gradient_mul: float, + stride: int) -> Tensor: + """Compute the star deformable conv offsets. + + Args: + bbox_pred (Tensor): Predicted bbox distance offsets (l, r, t, b). + gradient_mul (float): Gradient multiplier. + stride (int): The corresponding stride for feature maps, + used to project the bbox onto the feature map. + + Returns: + Tensor: The offsets for deformable convolution. + """ + dcn_base_offset = self.dcn_base_offset.type_as(bbox_pred) + bbox_pred_grad_mul = (1 - gradient_mul) * bbox_pred.detach() + \ + gradient_mul * bbox_pred + # map to the feature map scale + bbox_pred_grad_mul = bbox_pred_grad_mul / stride + N, C, H, W = bbox_pred.size() + + x1 = bbox_pred_grad_mul[:, 0, :, :] + y1 = bbox_pred_grad_mul[:, 1, :, :] + x2 = bbox_pred_grad_mul[:, 2, :, :] + y2 = bbox_pred_grad_mul[:, 3, :, :] + bbox_pred_grad_mul_offset = bbox_pred.new_zeros( + N, 2 * self.num_dconv_points, H, W) + bbox_pred_grad_mul_offset[:, 0, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 1, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 2, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 4, :, :] = -1.0 * y1 # -y1 + bbox_pred_grad_mul_offset[:, 5, :, :] = x2 # x2 + bbox_pred_grad_mul_offset[:, 7, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 11, :, :] = x2 # x2 + bbox_pred_grad_mul_offset[:, 12, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 13, :, :] = -1.0 * x1 # -x1 + bbox_pred_grad_mul_offset[:, 14, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 16, :, :] = y2 # y2 + bbox_pred_grad_mul_offset[:, 17, :, :] = x2 # x2 + dcn_offset = bbox_pred_grad_mul_offset - dcn_base_offset + + return dcn_offset + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + bbox_preds_refine: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box offsets for each + scale level, each is a 4D-tensor, the channel number is + num_points * 4. 
+ bbox_preds_refine (list[Tensor]): Refined Box offsets for + each scale level, each is a 4D-tensor, the channel + number is num_points * 4. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == len(bbox_preds_refine) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.fcos_prior_generator.grid_priors( + featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) + labels, label_weights, bbox_targets, bbox_weights = self.get_targets( + cls_scores, + all_level_points, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds and bbox_preds_refine + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, + 1).reshape(-1, + self.cls_out_channels).contiguous() + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4).contiguous() + for bbox_pred in bbox_preds + ] + flatten_bbox_preds_refine = [ + bbox_pred_refine.permute(0, 2, 3, 1).reshape(-1, 4).contiguous() + for bbox_pred_refine in bbox_preds_refine + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_bbox_preds_refine = torch.cat(flatten_bbox_preds_refine) + flatten_labels = torch.cat(labels) + flatten_bbox_targets = torch.cat(bbox_targets) + # repeat points to align with bbox_preds + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + + # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = torch.where( + ((flatten_labels >= 0) & (flatten_labels < bg_class_ind)) > 0)[0] + num_pos = len(pos_inds) + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_bbox_preds_refine = flatten_bbox_preds_refine[pos_inds] + pos_labels = flatten_labels[pos_inds] + + # sync num_pos across all gpus + if self.sync_num_pos: + num_pos_avg_per_gpu = reduce_mean( + pos_inds.new_tensor(num_pos).float()).item() + num_pos_avg_per_gpu = max(num_pos_avg_per_gpu, 1.0) + else: + num_pos_avg_per_gpu = num_pos + + pos_bbox_targets = flatten_bbox_targets[pos_inds] + pos_points = flatten_points[pos_inds] + + pos_decoded_bbox_preds = self.bbox_coder.decode( + pos_points, pos_bbox_preds) + pos_decoded_target_preds = self.bbox_coder.decode( + pos_points, pos_bbox_targets) + iou_targets_ini = bbox_overlaps( + pos_decoded_bbox_preds, + pos_decoded_target_preds.detach(), + is_aligned=True).clamp(min=1e-6) + bbox_weights_ini = iou_targets_ini.clone().detach() + bbox_avg_factor_ini = reduce_mean( + bbox_weights_ini.sum()).clamp_(min=1).item() + + pos_decoded_bbox_preds_refine = \ + self.bbox_coder.decode(pos_points, pos_bbox_preds_refine) + iou_targets_rf = bbox_overlaps( + pos_decoded_bbox_preds_refine, + pos_decoded_target_preds.detach(), + is_aligned=True).clamp(min=1e-6) + bbox_weights_rf = iou_targets_rf.clone().detach() + bbox_avg_factor_rf = reduce_mean( + bbox_weights_rf.sum()).clamp_(min=1).item() + + if num_pos > 0: + loss_bbox 
= self.loss_bbox( + pos_decoded_bbox_preds, + pos_decoded_target_preds.detach(), + weight=bbox_weights_ini, + avg_factor=bbox_avg_factor_ini) + + loss_bbox_refine = self.loss_bbox_refine( + pos_decoded_bbox_preds_refine, + pos_decoded_target_preds.detach(), + weight=bbox_weights_rf, + avg_factor=bbox_avg_factor_rf) + + # build IoU-aware cls_score targets + if self.use_vfl: + pos_ious = iou_targets_rf.clone().detach() + cls_iou_targets = torch.zeros_like(flatten_cls_scores) + cls_iou_targets[pos_inds, pos_labels] = pos_ious + else: + loss_bbox = pos_bbox_preds.sum() * 0 + loss_bbox_refine = pos_bbox_preds_refine.sum() * 0 + if self.use_vfl: + cls_iou_targets = torch.zeros_like(flatten_cls_scores) + + if self.use_vfl: + loss_cls = self.loss_cls( + flatten_cls_scores, + cls_iou_targets, + avg_factor=num_pos_avg_per_gpu) + else: + loss_cls = self.loss_cls( + flatten_cls_scores, + flatten_labels, + weight=label_weights, + avg_factor=num_pos_avg_per_gpu) + + return dict( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_bbox_rf=loss_bbox_refine) + + def get_targets( + self, + cls_scores: List[Tensor], + mlvl_points: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> tuple: + """A wrapper for computing ATSS and FCOS targets for points in multiple + images. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level with shape (N, num_points * num_classes, H, W). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights (Tensor/None): Label weights of all levels. + - bbox_targets_list (list[Tensor]): Regression targets of each + level, (l, t, r, b). + - bbox_weights (Tensor/None): Bbox weights of all levels. + """ + if self.use_atss: + return self.get_atss_targets(cls_scores, mlvl_points, + batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + else: + self.norm_on_bbox = False + return self.get_fcos_targets(mlvl_points, batch_gt_instances) + + def _get_targets_single(self, *args, **kwargs): + """Avoid ambiguity in multiple inheritance.""" + if self.use_atss: + return ATSSHead._get_targets_single(self, *args, **kwargs) + else: + return FCOSHead._get_targets_single(self, *args, **kwargs) + + def get_fcos_targets(self, points: List[Tensor], + batch_gt_instances: InstanceList) -> tuple: + """Compute FCOS regression and classification targets for points in + multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + + - labels (list[Tensor]): Labels of each level. + - label_weights: None, to be compatible with ATSS targets. + - bbox_targets (list[Tensor]): BBox targets of each level. + - bbox_weights: None, to be compatible with ATSS targets. 
+ """ + labels, bbox_targets = FCOSHead.get_targets(self, points, + batch_gt_instances) + label_weights = None + bbox_weights = None + return labels, label_weights, bbox_targets, bbox_weights + + def get_anchors(self, + featmap_sizes: List[Tuple], + batch_img_metas: List[dict], + device: str = 'cuda') -> tuple: + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + batch_img_metas (list[dict]): Image meta info. + device (str): Device for returned tensors + + Returns: + tuple: + + - anchor_list (list[Tensor]): Anchors of each image. + - valid_flag_list (list[Tensor]): Valid flags of each image. + """ + num_imgs = len(batch_img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.atss_prior_generator.grid_priors( + featmap_sizes, device=device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + multi_level_flags = self.atss_prior_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device=device) + valid_flag_list.append(multi_level_flags) + + return anchor_list, valid_flag_list + + def get_atss_targets( + self, + cls_scores: List[Tensor], + mlvl_points: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> tuple: + """A wrapper for computing ATSS targets for points in multiple images. + + Args: + cls_scores (list[Tensor]): Box iou-aware scores for each scale + level with shape (N, num_points * num_classes, H, W). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], Optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + tuple: + + - labels_list (list[Tensor]): Labels of each level. + - label_weights (Tensor): Label weights of all levels. + - bbox_targets_list (list[Tensor]): Regression targets of each + level, (l, t, r, b). + - bbox_weights (Tensor): Bbox weights of all levels. 
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len( + featmap_sizes + ) == self.atss_prior_generator.num_levels == \ + self.fcos_prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + cls_reg_targets = ATSSHead.get_targets( + self, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=True) + + (anchor_list, labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, avg_factor) = cls_reg_targets + + bbox_targets_list = [ + bbox_targets.reshape(-1, 4) for bbox_targets in bbox_targets_list + ] + + num_imgs = len(batch_img_metas) + # transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format + bbox_targets_list = self.transform_bbox_targets( + bbox_targets_list, mlvl_points, num_imgs) + + labels_list = [labels.reshape(-1) for labels in labels_list] + label_weights_list = [ + label_weights.reshape(-1) for label_weights in label_weights_list + ] + bbox_weights_list = [ + bbox_weights.reshape(-1) for bbox_weights in bbox_weights_list + ] + label_weights = torch.cat(label_weights_list) + bbox_weights = torch.cat(bbox_weights_list) + return labels_list, label_weights, bbox_targets_list, bbox_weights + + def transform_bbox_targets(self, decoded_bboxes: List[Tensor], + mlvl_points: List[Tensor], + num_imgs: int) -> List[Tensor]: + """Transform bbox_targets (x1, y1, x2, y2) into (l, t, r, b) format. + + Args: + decoded_bboxes (list[Tensor]): Regression targets of each level, + in the form of (x1, y1, x2, y2). + mlvl_points (list[Tensor]): Points of each fpn level, each has + shape (num_points, 2). + num_imgs (int): the number of images in a batch. + + Returns: + bbox_targets (list[Tensor]): Regression targets of each level in + the form of (l, t, r, b). + """ + # TODO: Re-implemented in Class PointCoder + assert len(decoded_bboxes) == len(mlvl_points) + num_levels = len(decoded_bboxes) + mlvl_points = [points.repeat(num_imgs, 1) for points in mlvl_points] + bbox_targets = [] + for i in range(num_levels): + bbox_target = self.bbox_coder.encode(mlvl_points[i], + decoded_bboxes[i]) + bbox_targets.append(bbox_target) + + return bbox_targets + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Override the method in the parent class to avoid changing para's + name.""" + pass diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/yolact_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/yolact_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3390c136a31bee81134667eb28ad8829ddb84cc3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/yolact_head.py @@ -0,0 +1,1193 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +from typing import List, Optional + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList, OptMultiConfig) +from ..layers import fast_nms +from ..utils import images_to_levels, multi_apply, select_single_mlvl +from ..utils.misc import empty_instances +from .anchor_head import AnchorHead +from .base_mask_head import BaseMaskHead + + +@MODELS.register_module() +class YOLACTHead(AnchorHead): + """YOLACT box head used in https://arxiv.org/abs/1904.02689. + + Note that YOLACT head is a light version of RetinaNet head. + Four differences are described as follows: + + 1. YOLACT box head has three-times fewer anchors. + 2. YOLACT box head shares the convs for box and cls branches. + 3. YOLACT box head uses OHEM instead of Focal loss. + 4. YOLACT box head predicts a set of mask coefficients for each box. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + anchor_generator (:obj:`ConfigDict` or dict): Config dict for + anchor generator + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + num_head_convs (int): Number of the conv layers shared by + box and cls branches. + num_protos (int): Number of the mask coefficients. + use_ohem (bool): If true, ``loss_single_OHEM`` will be used for + cls loss calculation. If false, ``loss_single`` will be used. + conv_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to + construct and config conv layer. + norm_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to + construct and config norm layer. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + anchor_generator: ConfigType = dict( + type='AnchorGenerator', + octave_base_scale=3, + scales_per_octave=1, + ratios=[0.5, 1.0, 2.0], + strides=[8, 16, 32, 64, 128]), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + reduction='none', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1.5), + num_head_convs: int = 1, + num_protos: int = 32, + use_ohem: bool = True, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', + distribution='uniform', + bias=0, + layer='Conv2d'), + **kwargs) -> None: + self.num_head_convs = num_head_convs + self.num_protos = num_protos + self.use_ohem = use_ohem + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, + in_channels=in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + anchor_generator=anchor_generator, + init_cfg=init_cfg, + **kwargs) + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.relu = nn.ReLU(inplace=True) + self.head_convs = ModuleList() + for i in range(self.num_head_convs): + chn = self.in_channels if i == 0 else self.feat_channels + self.head_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.conv_cls = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.cls_out_channels, + 3, + padding=1) + self.conv_reg = nn.Conv2d( + self.feat_channels, self.num_base_priors * 4, 3, padding=1) + self.conv_coeff = nn.Conv2d( + self.feat_channels, + self.num_base_priors * self.num_protos, + 3, + padding=1) + + def forward_single(self, x: Tensor) -> tuple: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + + - cls_score (Tensor): Cls scores for a single scale level + the channels number is num_anchors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for a single scale + level, the channels number is num_anchors * 4. + - coeff_pred (Tensor): Mask coefficients for a single scale + level, the channels number is num_anchors * num_protos. + """ + for head_conv in self.head_convs: + x = head_conv(x) + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + coeff_pred = self.conv_coeff(x).tanh() + return cls_score, bbox_pred, coeff_pred + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + coeff_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the bbox head. + + When ``self.use_ohem == True``, it functions like ``SSDHead.loss``, + otherwise, it follows ``AnchorHead.loss``. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + coeff_preds (list[Tensor]): Mask coefficients for each scale + level with shape (N, num_anchors * num_protos, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. 
+ batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.prior_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore, + unmap_outputs=not self.use_ohem, + return_sampling_results=True) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + avg_factor, sampling_results) = cls_reg_targets + + if self.use_ohem: + num_images = len(batch_img_metas) + all_cls_scores = torch.cat([ + s.permute(0, 2, 3, 1).reshape( + num_images, -1, self.cls_out_channels) for s in cls_scores + ], 1) + all_labels = torch.cat(labels_list, -1).view(num_images, -1) + all_label_weights = torch.cat(label_weights_list, + -1).view(num_images, -1) + all_bbox_preds = torch.cat([ + b.permute(0, 2, 3, 1).reshape(num_images, -1, 4) + for b in bbox_preds + ], -2) + all_bbox_targets = torch.cat(bbox_targets_list, + -2).view(num_images, -1, 4) + all_bbox_weights = torch.cat(bbox_weights_list, + -2).view(num_images, -1, 4) + + # concat all level anchors to a single tensor + all_anchors = [] + for i in range(num_images): + all_anchors.append(torch.cat(anchor_list[i])) + + # check NaN and Inf + assert torch.isfinite(all_cls_scores).all().item(), \ + 'classification scores become infinite or NaN!' + assert torch.isfinite(all_bbox_preds).all().item(), \ + 'bbox predictions become infinite or NaN!' + + losses_cls, losses_bbox = multi_apply( + self.OHEMloss_by_feat_single, + all_cls_scores, + all_bbox_preds, + all_anchors, + all_labels, + all_label_weights, + all_bbox_targets, + all_bbox_weights, + avg_factor=avg_factor) + else: + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + losses_cls, losses_bbox = multi_apply( + self.loss_by_feat_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + avg_factor=avg_factor) + losses = dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + # update `_raw_positive_infos`, which will be used when calling + # `get_positive_infos`. + self._raw_positive_infos.update(coeff_preds=coeff_preds) + return losses + + def OHEMloss_by_feat_single(self, cls_score: Tensor, bbox_pred: Tensor, + anchors: Tensor, labels: Tensor, + label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, + avg_factor: int) -> tuple: + """Compute loss of a single image. Similar to + func:``SSDHead.loss_by_feat_single`` + + Args: + cls_score (Tensor): Box scores for each image. + Has shape (num_total_anchors, num_classes). + bbox_pred (Tensor): Box energies / deltas for each image + level with shape (num_total_anchors, 4). + anchors (Tensor): Box reference for each scale level with shape + (num_total_anchors, 4).
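# [Illustrative sketch, not part of the original patch.] The OHEM branch above
# flattens every level from (N, A*C, H, W) to (N, H*W*A, C) and concatenates
# the levels along dim 1, so that each row indexes one prior. Shape check on
# dummy inputs:
import torch

num_images, num_base_priors, num_classes = 2, 3, 80
cls_scores = [
    torch.randn(num_images, num_base_priors * num_classes, h, w)
    for (h, w) in [(10, 10), (5, 5)]
]
all_cls_scores = torch.cat([
    s.permute(0, 2, 3, 1).reshape(num_images, -1, num_classes)
    for s in cls_scores
], 1)
print(all_cls_scores.shape)   # torch.Size([2, 375, 80]); 375 = (100 + 25) * 3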
+ labels (Tensor): Labels of each anchors with shape + (num_total_anchors,). + label_weights (Tensor): Label weights of each anchor with shape + (num_total_anchors,) + bbox_targets (Tensor): BBox regression targets of each anchor with + shape (num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (num_total_anchors, 4). + avg_factor (int): Average factor that is used to average + the loss. When using sampling method, avg_factor is usually + the sum of positive and negative priors. When using + `PseudoSampler`, `avg_factor` is usually equal to the number + of positive priors. + + Returns: + Tuple[Tensor, Tensor]: A tuple of cls loss and bbox loss of one + feature map. + """ + + loss_cls_all = self.loss_cls(cls_score, labels, label_weights) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + pos_inds = ((labels >= 0) & (labels < self.num_classes)).nonzero( + as_tuple=False).reshape(-1) + neg_inds = (labels == self.num_classes).nonzero( + as_tuple=False).view(-1) + + num_pos_samples = pos_inds.size(0) + if num_pos_samples == 0: + num_neg_samples = neg_inds.size(0) + else: + num_neg_samples = self.train_cfg['neg_pos_ratio'] * \ + num_pos_samples + if num_neg_samples > neg_inds.size(0): + num_neg_samples = neg_inds.size(0) + topk_loss_cls_neg, _ = loss_cls_all[neg_inds].topk(num_neg_samples) + loss_cls_pos = loss_cls_all[pos_inds].sum() + loss_cls_neg = topk_loss_cls_neg.sum() + loss_cls = (loss_cls_pos + loss_cls_neg) / avg_factor + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, bbox_targets, bbox_weights, avg_factor=avg_factor) + return loss_cls[None], loss_bbox + + def get_positive_infos(self) -> InstanceList: + """Get positive information from sampling results. + + Returns: + list[:obj:`InstanceData`]: Positive Information of each image, + usually including positive bboxes, positive labels, positive + priors, positive coeffs, etc. + """ + assert len(self._raw_positive_infos) > 0 + sampling_results = self._raw_positive_infos['sampling_results'] + num_imgs = len(sampling_results) + + coeff_pred_list = [] + for coeff_pred_per_level in self._raw_positive_infos['coeff_preds']: + coeff_pred_per_level = \ + coeff_pred_per_level.permute( + 0, 2, 3, 1).reshape(num_imgs, -1, self.num_protos) + coeff_pred_list.append(coeff_pred_per_level) + coeff_preds = torch.cat(coeff_pred_list, dim=1) + + pos_info_list = [] + for idx, sampling_result in enumerate(sampling_results): + pos_info = InstanceData() + coeff_preds_single = coeff_preds[idx] + pos_info.pos_assigned_gt_inds = \ + sampling_result.pos_assigned_gt_inds + pos_info.pos_inds = sampling_result.pos_inds + pos_info.coeffs = coeff_preds_single[sampling_result.pos_inds] + pos_info.bboxes = sampling_result.pos_gt_bboxes + pos_info_list.append(pos_info) + return pos_info_list + + def predict_by_feat(self, + cls_scores, + bbox_preds, + coeff_preds, + batch_img_metas, + cfg=None, + rescale=True, + **kwargs): + """Similar to func:``AnchorHead.get_bboxes``, but additionally + processes coeff_preds. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + with shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + coeff_preds (list[Tensor]): Mask coefficients for each scale + level with shape (N, num_anchors * num_protos, H, W) + batch_img_metas (list[dict]): Batch image meta info. + cfg (:obj:`Config` | None): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - coeffs (Tensor): the predicted mask coefficients of + instance inside the corresponding box has a shape + (n, num_protos). + """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, device=device) + + result_list = [] + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + cls_score_list = select_single_mlvl(cls_scores, img_id) + bbox_pred_list = select_single_mlvl(bbox_preds, img_id) + coeff_pred_list = select_single_mlvl(coeff_preds, img_id) + results = self._predict_by_feat_single( + cls_score_list=cls_score_list, + bbox_pred_list=bbox_pred_list, + coeff_preds_list=coeff_pred_list, + mlvl_priors=mlvl_priors, + img_meta=img_meta, + cfg=cfg, + rescale=rescale) + result_list.append(results) + return result_list + + def _predict_by_feat_single(self, + cls_score_list: List[Tensor], + bbox_pred_list: List[Tensor], + coeff_preds_list: List[Tensor], + mlvl_priors: List[Tensor], + img_meta: dict, + cfg: ConfigType, + rescale: bool = True) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. Similar to func:``AnchorHead._predict_by_feat_single``, + but additionally processes coeff_preds_list and uses fast NMS instead + of traditional NMS. + + Args: + cls_score_list (list[Tensor]): Box scores for a single scale level + Has shape (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas for a single + scale level with shape (num_priors * 4, H, W). + coeff_preds_list (list[Tensor]): Mask coefficients for a single + scale level with shape (num_priors * num_protos, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid, + has shape (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ - coeffs (Tensor): the predicted mask coefficients of + instance inside the corresponding box has a shape + (n, num_protos). + """ + assert len(cls_score_list) == len(bbox_pred_list) == len(mlvl_priors) + + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + img_shape = img_meta['img_shape'] + nms_pre = cfg.get('nms_pre', -1) + + mlvl_bbox_preds = [] + mlvl_valid_priors = [] + mlvl_scores = [] + mlvl_coeffs = [] + for cls_score, bbox_pred, coeff_pred, priors in \ + zip(cls_score_list, bbox_pred_list, + coeff_preds_list, mlvl_priors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + coeff_pred = coeff_pred.permute(1, 2, + 0).reshape(-1, self.num_protos) + + if 0 < nms_pre < scores.shape[0]: + # Get maximum scores for foreground classes. + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + priors = priors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + coeff_pred = coeff_pred[topk_inds, :] + + mlvl_bbox_preds.append(bbox_pred) + mlvl_valid_priors.append(priors) + mlvl_scores.append(scores) + mlvl_coeffs.append(coeff_pred) + + bbox_pred = torch.cat(mlvl_bbox_preds) + priors = torch.cat(mlvl_valid_priors) + multi_bboxes = self.bbox_coder.decode( + priors, bbox_pred, max_shape=img_shape) + + multi_scores = torch.cat(mlvl_scores) + multi_coeffs = torch.cat(mlvl_coeffs) + + return self._bbox_post_process( + multi_bboxes=multi_bboxes, + multi_scores=multi_scores, + multi_coeffs=multi_coeffs, + cfg=cfg, + rescale=rescale, + img_meta=img_meta) + + def _bbox_post_process(self, + multi_bboxes: Tensor, + multi_scores: Tensor, + multi_coeffs: Tensor, + cfg: ConfigType, + rescale: bool = False, + img_meta: Optional[dict] = None, + **kwargs) -> InstanceData: + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + multi_bboxes (Tensor): Predicted bbox that concat all levels. + multi_scores (Tensor): Bbox scores that concat all levels. + multi_coeffs (Tensor): Mask coefficients that concat all levels. + cfg (ConfigDict): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default to False. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - coeffs (Tensor): the predicted mask coefficients of + instance inside the corresponding box has a shape + (n, num_protos). 
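+
+        Example:
+            >>> # Sketch of the sigmoid background-padding step performed in
+            >>> # this method (shapes are illustrative, inputs random): with
+            >>> # sigmoid classification there is no background column, so a
+            >>> # zero column is appended before fast NMS.
+            >>> import torch
+            >>> multi_scores = torch.rand(100, 80)  # (num_priors, num_classes)
+            >>> padding = multi_scores.new_zeros(multi_scores.shape[0], 1)
+            >>> torch.cat([multi_scores, padding], dim=1).shape
+            torch.Size([100, 81])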
+ """ + if rescale: + assert img_meta.get('scale_factor') is not None + multi_bboxes /= multi_bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + # mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + + padding = multi_scores.new_zeros(multi_scores.shape[0], 1) + multi_scores = torch.cat([multi_scores, padding], dim=1) + det_bboxes, det_labels, det_coeffs = fast_nms( + multi_bboxes, multi_scores, multi_coeffs, cfg.score_thr, + cfg.iou_thr, cfg.top_k, cfg.max_per_img) + results = InstanceData() + results.bboxes = det_bboxes[:, :4] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + results.coeffs = det_coeffs + return results + + +@MODELS.register_module() +class YOLACTProtonet(BaseMaskHead): + """YOLACT mask head used in https://arxiv.org/abs/1904.02689. + + This head outputs the mask prototypes for YOLACT. + + Args: + in_channels (int): Number of channels in the input feature map. + proto_channels (tuple[int]): Output channels of protonet convs. + proto_kernel_sizes (tuple[int]): Kernel sizes of protonet convs. + include_last_relu (bool): If keep the last relu of protonet. + num_protos (int): Number of prototypes. + num_classes (int): Number of categories excluding the background + category. + loss_mask_weight (float): Reweight the mask loss by this factor. + max_masks_to_train (int): Maximum number of masks to train for + each image. + with_seg_branch (bool): Whether to apply a semantic segmentation + branch and calculate loss during training to increase + performance with no speed penalty. Defaults to True. + loss_segm (:obj:`ConfigDict` or dict, optional): Config of + semantic segmentation loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config + of head. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + head. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. 
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int = 256, + proto_channels: tuple = (256, 256, 256, None, 256, 32), + proto_kernel_sizes: tuple = (3, 3, 3, -2, 3, 1), + include_last_relu: bool = True, + num_protos: int = 32, + loss_mask_weight: float = 1.0, + max_masks_to_train: int = 100, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + with_seg_branch: bool = True, + loss_segm: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + init_cfg=dict( + type='Xavier', + distribution='uniform', + override=dict(name='protonet')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.proto_channels = proto_channels + self.proto_kernel_sizes = proto_kernel_sizes + self.include_last_relu = include_last_relu + + # Segmentation branch + self.with_seg_branch = with_seg_branch + self.segm_branch = SegmentationModule( + num_classes=num_classes, in_channels=in_channels) \ + if with_seg_branch else None + self.loss_segm = MODELS.build(loss_segm) if with_seg_branch else None + + self.loss_mask_weight = loss_mask_weight + self.num_protos = num_protos + self.num_classes = num_classes + self.max_masks_to_train = max_masks_to_train + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + # Possible patterns: + # ( 256, 3) -> conv + # ( 256,-2) -> deconv + # (None,-2) -> bilinear interpolate + in_channels = self.in_channels + protonets = ModuleList() + for num_channels, kernel_size in zip(self.proto_channels, + self.proto_kernel_sizes): + if kernel_size > 0: + layer = nn.Conv2d( + in_channels, + num_channels, + kernel_size, + padding=kernel_size // 2) + else: + if num_channels is None: + layer = InterpolateModule( + scale_factor=-kernel_size, + mode='bilinear', + align_corners=False) + else: + layer = nn.ConvTranspose2d( + in_channels, + num_channels, + -kernel_size, + padding=kernel_size // 2) + protonets.append(layer) + protonets.append(nn.ReLU(inplace=True)) + in_channels = num_channels if num_channels is not None \ + else in_channels + if not self.include_last_relu: + protonets = protonets[:-1] + self.protonet = nn.Sequential(*protonets) + + def forward(self, x: tuple, positive_infos: InstanceList) -> tuple: + """Forward feature from the upstream network to get prototypes and + linearly combine the prototypes, using masks coefficients, into + instance masks. Finally, crop the instance masks with given bboxes. + + Args: + x (Tuple[Tensor]): Feature from the upstream network, which is + a 4D-tensor. + positive_infos (List[:obj:``InstanceData``]): Positive information + that calculate from detect head. + + Returns: + tuple: Predicted instance segmentation masks and + semantic segmentation map. + """ + # YOLACT used single feature map to get segmentation masks + single_x = x[0] + + # YOLACT segmentation branch, if not training or segmentation branch + # is None, will not process the forward function. 
+ if self.segm_branch is not None and self.training: + segm_preds = self.segm_branch(single_x) + else: + segm_preds = None + # YOLACT mask head + prototypes = self.protonet(single_x) + prototypes = prototypes.permute(0, 2, 3, 1).contiguous() + + num_imgs = single_x.size(0) + + mask_pred_list = [] + for idx in range(num_imgs): + cur_prototypes = prototypes[idx] + pos_coeffs = positive_infos[idx].coeffs + + # Linearly combine the prototypes with the mask coefficients + mask_preds = cur_prototypes @ pos_coeffs.t() + mask_preds = torch.sigmoid(mask_preds) + mask_pred_list.append(mask_preds) + return mask_pred_list, segm_preds + + def loss_by_feat(self, mask_preds: List[Tensor], segm_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], positive_infos: InstanceList, + **kwargs) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (list[Tensor]): List of predicted prototypes, each has + shape (num_classes, H, W). + segm_preds (Tensor): Predicted semantic segmentation map with + shape (N, num_classes, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``masks``, + and ``labels`` attributes. + batch_img_metas (list[dict]): Meta information of multiple images. + positive_infos (List[:obj:``InstanceData``]): Information of + positive samples of each image that are assigned in detection + head. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert positive_infos is not None, \ + 'positive_infos should not be None in `YOLACTProtonet`' + losses = dict() + + # crop + croped_mask_pred = self.crop_mask_preds(mask_preds, batch_img_metas, + positive_infos) + + loss_mask = [] + loss_segm = [] + num_imgs, _, mask_h, mask_w = segm_preds.size() + assert num_imgs == len(croped_mask_pred) + segm_avg_factor = num_imgs * mask_h * mask_w + total_pos = 0 + + if self.segm_branch is not None: + assert segm_preds is not None + + for idx in range(num_imgs): + img_meta = batch_img_metas[idx] + + (mask_preds, pos_mask_targets, segm_targets, num_pos, + gt_bboxes_for_reweight) = self._get_targets_single( + croped_mask_pred[idx], segm_preds[idx], + batch_gt_instances[idx], positive_infos[idx]) + + # segmentation loss + if self.with_seg_branch: + if segm_targets is None: + loss = segm_preds[idx].sum() * 0. + else: + loss = self.loss_segm( + segm_preds[idx], + segm_targets, + avg_factor=segm_avg_factor) + loss_segm.append(loss) + # mask loss + total_pos += num_pos + if num_pos == 0 or pos_mask_targets is None: + loss = mask_preds.sum() * 0. + else: + mask_preds = torch.clamp(mask_preds, 0, 1) + loss = F.binary_cross_entropy( + mask_preds, pos_mask_targets, + reduction='none') * self.loss_mask_weight + + h, w = img_meta['img_shape'][:2] + gt_bboxes_width = (gt_bboxes_for_reweight[:, 2] - + gt_bboxes_for_reweight[:, 0]) / w + gt_bboxes_height = (gt_bboxes_for_reweight[:, 3] - + gt_bboxes_for_reweight[:, 1]) / h + loss = loss.mean(dim=(1, + 2)) / gt_bboxes_width / gt_bboxes_height + loss = torch.sum(loss) + loss_mask.append(loss) + + if total_pos == 0: + total_pos += 1 # avoid nan + loss_mask = [x / total_pos for x in loss_mask] + + losses.update(loss_mask=loss_mask) + if self.with_seg_branch: + losses.update(loss_segm=loss_segm) + + return losses + + def _get_targets_single(self, mask_preds: Tensor, segm_pred: Tensor, + gt_instances: InstanceData, + positive_info: InstanceData): + """Compute targets for predictions of single image. 
+ + Args: + mask_preds (Tensor): Predicted prototypes with shape + (num_classes, H, W). + segm_pred (Tensor): Predicted semantic segmentation map + with shape (num_classes, H, W). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes``, ``labels``, + and ``masks`` attributes. + positive_info (:obj:`InstanceData`): Information of positive + samples that are assigned in detection head. It usually + contains following keys. + + - pos_assigned_gt_inds (Tensor): Assigner GT indexes of + positive proposals, has shape (num_pos, ) + - pos_inds (Tensor): Positive index of image, has + shape (num_pos, ). + - coeffs (Tensor): Positive mask coefficients + with shape (num_pos, num_protos). + - bboxes (Tensor): Positive bboxes with shape + (num_pos, 4) + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - mask_preds (Tensor): Positive predicted mask with shape + (num_pos, mask_h, mask_w). + - pos_mask_targets (Tensor): Positive mask targets with shape + (num_pos, mask_h, mask_w). + - segm_targets (Tensor): Semantic segmentation targets with shape + (num_classes, segm_h, segm_w). + - num_pos (int): Positive numbers. + - gt_bboxes_for_reweight (Tensor): GT bboxes that match to the + positive priors has shape (num_pos, 4). + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + device = gt_bboxes.device + gt_masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=device).float() + if gt_masks.size(0) == 0: + return mask_preds, None, None, 0, None + + # process with semantic segmentation targets + if segm_pred is not None: + num_classes, segm_h, segm_w = segm_pred.size() + with torch.no_grad(): + downsampled_masks = F.interpolate( + gt_masks.unsqueeze(0), (segm_h, segm_w), + mode='bilinear', + align_corners=False).squeeze(0) + downsampled_masks = downsampled_masks.gt(0.5).float() + segm_targets = torch.zeros_like(segm_pred, requires_grad=False) + for obj_idx in range(downsampled_masks.size(0)): + segm_targets[gt_labels[obj_idx] - 1] = torch.max( + segm_targets[gt_labels[obj_idx] - 1], + downsampled_masks[obj_idx]) + else: + segm_targets = None + # process with mask targets + pos_assigned_gt_inds = positive_info.pos_assigned_gt_inds + num_pos = pos_assigned_gt_inds.size(0) + # Since we're producing (near) full image masks, + # it'd take too much vram to backprop on every single mask. + # Thus we select only a subset. + if num_pos > self.max_masks_to_train: + perm = torch.randperm(num_pos) + select = perm[:self.max_masks_to_train] + mask_preds = mask_preds[select] + pos_assigned_gt_inds = pos_assigned_gt_inds[select] + num_pos = self.max_masks_to_train + + gt_bboxes_for_reweight = gt_bboxes[pos_assigned_gt_inds] + + mask_h, mask_w = mask_preds.shape[-2:] + gt_masks = F.interpolate( + gt_masks.unsqueeze(0), (mask_h, mask_w), + mode='bilinear', + align_corners=False).squeeze(0) + gt_masks = gt_masks.gt(0.5).float() + pos_mask_targets = gt_masks[pos_assigned_gt_inds] + + return (mask_preds, pos_mask_targets, segm_targets, num_pos, + gt_bboxes_for_reweight) + + def crop_mask_preds(self, mask_preds: List[Tensor], + batch_img_metas: List[dict], + positive_infos: InstanceList) -> list: + """Crop predicted masks by zeroing out everything not in the predicted + bbox. + + Args: + mask_preds (list[Tensor]): Predicted prototypes with shape + (num_classes, H, W). + batch_img_metas (list[dict]): Meta information of multiple images. 
+        positive_infos (List[:obj:``InstanceData``]): Positive
+            information calculated by the detection head.
+
+        Returns:
+            list: The cropped masks.
+        """
+        croped_mask_preds = []
+        for img_meta, mask_preds, cur_info in zip(batch_img_metas, mask_preds,
+                                                  positive_infos):
+            bboxes_for_cropping = copy.deepcopy(cur_info.bboxes)
+            h, w = img_meta['img_shape'][:2]
+            bboxes_for_cropping[:, 0::2] /= w
+            bboxes_for_cropping[:, 1::2] /= h
+            mask_preds = self.crop_single(mask_preds, bboxes_for_cropping)
+            mask_preds = mask_preds.permute(2, 0, 1).contiguous()
+            croped_mask_preds.append(mask_preds)
+        return croped_mask_preds
+
+    def crop_single(self,
+                    masks: Tensor,
+                    boxes: Tensor,
+                    padding: int = 1) -> Tensor:
+        """Crop a single image's predicted masks by zeroing out everything
+        not in the predicted bbox.
+
+        Args:
+            masks (Tensor): Predicted masks, has shape [H, W, N].
+            boxes (Tensor): Bbox coords in relative point form with
+                shape [N, 4].
+            padding (int): Image padding size.
+
+        Returns:
+            Tensor: The cropped masks.
+        """
+        h, w, n = masks.size()
+        x1, x2 = self.sanitize_coordinates(
+            boxes[:, 0], boxes[:, 2], w, padding, cast=False)
+        y1, y2 = self.sanitize_coordinates(
+            boxes[:, 1], boxes[:, 3], h, padding, cast=False)
+
+        rows = torch.arange(
+            w, device=masks.device, dtype=x1.dtype).view(1, -1,
+                                                         1).expand(h, w, n)
+        cols = torch.arange(
+            h, device=masks.device, dtype=x1.dtype).view(-1, 1,
+                                                         1).expand(h, w, n)
+
+        masks_left = rows >= x1.view(1, 1, -1)
+        masks_right = rows < x2.view(1, 1, -1)
+        masks_up = cols >= y1.view(1, 1, -1)
+        masks_down = cols < y2.view(1, 1, -1)
+
+        crop_mask = masks_left * masks_right * masks_up * masks_down
+
+        return masks * crop_mask.float()
+
+    def sanitize_coordinates(self,
+                             x1: Tensor,
+                             x2: Tensor,
+                             img_size: int,
+                             padding: int = 0,
+                             cast: bool = True) -> tuple:
+        """Sanitizes the input coordinates so that x1 <= x2, x1 >= 0, and
+        x2 <= image_size. Also converts from relative to absolute
+        coordinates and casts the results to long tensors.
+
+        Warning: this does things in-place behind the scenes so
+        copy if necessary.
+
+        Args:
+            x1 (Tensor): shape (N, ).
+            x2 (Tensor): shape (N, ).
+            img_size (int): Size of the input image.
+            padding (int): x1 >= padding, x2 <= image_size-padding.
+            cast (bool): If cast is false, the result won't be cast to longs.
+
+        Returns:
+            tuple:
+
+            - x1 (Tensor): Sanitized x1.
+            - x2 (Tensor): Sanitized x2.
+        """
+        x1 = x1 * img_size
+        x2 = x2 * img_size
+        if cast:
+            x1 = x1.long()
+            x2 = x2.long()
+        # Take min/max from the original pair so that a swapped (x1 > x2)
+        # pair is reordered instead of collapsed onto a single coordinate.
+        x1, x2 = torch.min(x1, x2), torch.max(x1, x2)
+        x1 = torch.clamp(x1 - padding, min=0)
+        x2 = torch.clamp(x2 + padding, max=img_size)
+        return x1, x2
+
+    def predict_by_feat(self,
+                        mask_preds: List[Tensor],
+                        segm_preds: Tensor,
+                        results_list: InstanceList,
+                        batch_img_metas: List[dict],
+                        rescale: bool = True,
+                        **kwargs) -> InstanceList:
+        """Transform a batch of output features extracted from the head into
+        mask results.
+
+        Args:
+            mask_preds (list[Tensor]): Predicted masks of each image, each
+                with shape (H, W, num_instances).
+            segm_preds (Tensor): Predicted semantic segmentation map with
+                shape (N, num_classes, H, W); not used at test time.
+            results_list (List[:obj:``InstanceData``]): BBoxHead results.
+            batch_img_metas (list[dict]): Meta information of all images.
+            rescale (bool, optional): Whether to rescale the results.
+                Defaults to True.
+
+        Returns:
+            list[:obj:`InstanceData`]: Processed results of multiple
+            images. Each :obj:`InstanceData` usually contains
+            following keys.
+
+            - scores (Tensor): Classification scores, has shape
+              (num_instance,).
+            - labels (Tensor): Has shape (num_instances,).
+            - masks (Tensor): Processed mask results, has
+              shape (num_instances, h, w).
+        """
+        assert len(mask_preds) == len(results_list) == len(batch_img_metas)
+
+        croped_mask_pred = self.crop_mask_preds(mask_preds, batch_img_metas,
+                                                results_list)
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            results = results_list[img_id]
+            bboxes = results.bboxes
+            mask_preds = croped_mask_pred[img_id]
+            if bboxes.shape[0] == 0 or mask_preds.shape[0] == 0:
+                results_list[img_id] = empty_instances(
+                    [img_meta],
+                    bboxes.device,
+                    task_type='mask',
+                    instance_results=[results])[0]
+            else:
+                im_mask = self._predict_by_feat_single(
+                    mask_preds=croped_mask_pred[img_id],
+                    bboxes=bboxes,
+                    img_meta=img_meta,
+                    rescale=rescale)
+                results.masks = im_mask
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                mask_preds: Tensor,
+                                bboxes: Tensor,
+                                img_meta: dict,
+                                rescale: bool,
+                                cfg: OptConfigType = None):
+        """Transform a single image's features extracted from the head into
+        mask results.
+
+        Args:
+            mask_preds (Tensor): Cropped predicted masks with shape
+                (num_instances, H, W).
+            bboxes (Tensor): Bbox coords in absolute point form with
+                shape (num_instances, 4).
+            img_meta (dict): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            rescale (bool): If rescale is False, then returned masks will
+                fit the scale of imgs[0].
+            cfg (dict, optional): Config used in test phase.
+                Defaults to None.
+
+        Returns:
+            Tensor: Binary instance masks with shape
+            (num_instances, img_h, img_w).
+        """
+        cfg = self.test_cfg if cfg is None else cfg
+        scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat(
+            (1, 2))
+        img_h, img_w = img_meta['ori_shape'][:2]
+        if rescale:  # in-place rescale the bboxes
+            bboxes /= scale_factor
+        else:
+            w_scale, h_scale = scale_factor[0, 0], scale_factor[0, 1]
+            img_h = np.round(img_h * h_scale.item()).astype(np.int32)
+            img_w = np.round(img_w * w_scale.item()).astype(np.int32)
+
+        masks = F.interpolate(
+            mask_preds.unsqueeze(0), (img_h, img_w),
+            mode='bilinear',
+            align_corners=False).squeeze(0) > cfg.mask_thr
+
+        if cfg.mask_thr_binary < 0:
+            # for visualization and debugging
+            masks = (masks * 255).to(dtype=torch.uint8)
+
+        return masks
+
+
+class SegmentationModule(BaseModule):
+    """YOLACT segmentation branch used in
+    `YOLACT <https://arxiv.org/abs/1904.02689>`_.
+
+    In mmdet v2.x `segm_loss` is calculated in YOLACTSegmHead, while in
+    mmdet v3.x `SegmentationModule` is used to obtain the predicted semantic
+    segmentation map and `segm_loss` is calculated in YOLACTProtonet.
+
+    Args:
+        num_classes (int): Number of categories excluding the background
+            category.
+        in_channels (int): Number of channels in the input feature map.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
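+
+    Example:
+        >>> # Shape-only sketch (random input): a 1x1 conv maps the single
+        >>> # upstream feature map to one logit map per class.
+        >>> import torch
+        >>> segm_branch = SegmentationModule(num_classes=80)
+        >>> segm_branch(torch.rand(2, 256, 69, 92)).shape
+        torch.Size([2, 80, 69, 92])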
+ """ + + def __init__( + self, + num_classes: int, + in_channels: int = 256, + init_cfg: ConfigType = dict( + type='Xavier', + distribution='uniform', + override=dict(name='segm_conv')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self._init_layers() + + def _init_layers(self) -> None: + """Initialize layers of the head.""" + self.segm_conv = nn.Conv2d( + self.in_channels, self.num_classes, kernel_size=1) + + def forward(self, x: Tensor) -> Tensor: + """Forward feature from the upstream network. + + Args: + x (Tensor): Feature from the upstream network, which is + a 4D-tensor. + + Returns: + Tensor: Predicted semantic segmentation map with shape + (N, num_classes, H, W). + """ + return self.segm_conv(x) + + +class InterpolateModule(BaseModule): + """This is a module version of F.interpolate. + + Any arguments you give it just get passed along for the ride. + """ + + def __init__(self, *args, init_cfg=None, **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.args = args + self.kwargs = kwargs + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tensor): Feature from the upstream network, which is + a 4D-tensor. + + Returns: + Tensor: A 4D-tensor feature map. + """ + return F.interpolate(x, *self.args, **self.kwargs) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/yolo_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/yolo_head.py new file mode 100644 index 0000000000000000000000000000000000000000..0f63afbbc94353e16e4c67ec5bc0b6cd1200de07 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/yolo_head.py @@ -0,0 +1,527 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) 2019 Western Digital Corporation or its affiliates. + +import copy +import warnings +from typing import List, Optional, Sequence, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, is_norm +from mmengine.model import bias_init_with_prob, constant_init, normal_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptInstanceList) +from ..task_modules.samplers import PseudoSampler +from ..utils import filter_scores_and_topk, images_to_levels, multi_apply +from .base_dense_head import BaseDenseHead + + +@MODELS.register_module() +class YOLOV3Head(BaseDenseHead): + """YOLOV3Head Paper link: https://arxiv.org/abs/1804.02767. + + Args: + num_classes (int): The number of object classes (w/o background) + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (Sequence[int]): The number of output channels per scale + before the final 1x1 layer. Default: (1024, 512, 256). + anchor_generator (:obj:`ConfigDict` or dict): Config dict for anchor + generator. + bbox_coder (:obj:`ConfigDict` or dict): Config of bounding box coder. + featmap_strides (Sequence[int]): The stride of each scale. + Should be in descending order. Defaults to (32, 16, 8). + one_hot_smoother (float): Set a non-zero value to enable label-smooth + Defaults to 0. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config norm layer. Defaults to dict(type='BN', requires_grad=True). 
+ act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='LeakyReLU', negative_slope=0.1). + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_conf (:obj:`ConfigDict` or dict): Config of confidence loss. + loss_xy (:obj:`ConfigDict` or dict): Config of xy coordinate loss. + loss_wh (:obj:`ConfigDict` or dict): Config of wh coordinate loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + YOLOV3 head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + YOLOV3 head. Defaults to None. + """ + + def __init__(self, + num_classes: int, + in_channels: Sequence[int], + out_channels: Sequence[int] = (1024, 512, 256), + anchor_generator: ConfigType = dict( + type='YOLOAnchorGenerator', + base_sizes=[[(116, 90), (156, 198), (373, 326)], + [(30, 61), (62, 45), (59, 119)], + [(10, 13), (16, 30), (33, 23)]], + strides=[32, 16, 8]), + bbox_coder: ConfigType = dict(type='YOLOBBoxCoder'), + featmap_strides: Sequence[int] = (32, 16, 8), + one_hot_smoother: float = 0., + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict( + type='LeakyReLU', negative_slope=0.1), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_conf: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_xy: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_wh: ConfigType = dict(type='MSELoss', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None) -> None: + super().__init__(init_cfg=None) + # Check params + assert (len(in_channels) == len(out_channels) == len(featmap_strides)) + + self.num_classes = num_classes + self.in_channels = in_channels + self.out_channels = out_channels + self.featmap_strides = featmap_strides + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + if train_cfg.get('sampler', None) is not None: + self.sampler = TASK_UTILS.build( + self.train_cfg['sampler'], context=self) + else: + self.sampler = PseudoSampler() + + self.one_hot_smoother = one_hot_smoother + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + + self.prior_generator = TASK_UTILS.build(anchor_generator) + + self.loss_cls = MODELS.build(loss_cls) + self.loss_conf = MODELS.build(loss_conf) + self.loss_xy = MODELS.build(loss_xy) + self.loss_wh = MODELS.build(loss_wh) + + self.num_base_priors = self.prior_generator.num_base_priors[0] + assert len( + self.prior_generator.num_base_priors) == len(featmap_strides) + self._init_layers() + + @property + def num_levels(self) -> int: + """int: number of feature map levels""" + return len(self.featmap_strides) + + @property + def num_attrib(self) -> int: + """int: number of attributes in pred_map, bboxes (4) + + objectness (1) + num_classes""" + + return 5 + self.num_classes + + def _init_layers(self) -> None: + """initialize conv layers in YOLOv3 head.""" + self.convs_bridge = nn.ModuleList() + self.convs_pred = nn.ModuleList() + for i in range(self.num_levels): + conv_bridge = ConvModule( + self.in_channels[i], + self.out_channels[i], + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + conv_pred = nn.Conv2d(self.out_channels[i], + 
self.num_base_priors * self.num_attrib, 1) + + self.convs_bridge.append(conv_bridge) + self.convs_pred.append(conv_pred) + + def init_weights(self) -> None: + """initialize weights.""" + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + + # Use prior in model initialization to improve stability + for conv_pred, stride in zip(self.convs_pred, self.featmap_strides): + bias = conv_pred.bias.reshape(self.num_base_priors, -1) + # init objectness with prior of 8 objects per feature map + # refer to https://github.com/ultralytics/yolov3 + nn.init.constant_(bias.data[:, 4], + bias_init_with_prob(8 / (608 / stride)**2)) + nn.init.constant_(bias.data[:, 5:], bias_init_with_prob(0.01)) + + def forward(self, x: Tuple[Tensor, ...]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple[Tensor]: A tuple of multi-level predication map, each is a + 4D-tensor of shape (batch_size, 5+num_classes, height, width). + """ + + assert len(x) == self.num_levels + pred_maps = [] + for i in range(self.num_levels): + feat = x[i] + feat = self.convs_bridge[i](feat) + pred_map = self.convs_pred[i](feat) + pred_maps.append(pred_map) + + return tuple(pred_maps), + + def predict_by_feat(self, + pred_maps: Sequence[Tensor], + batch_img_metas: Optional[List[dict]], + cfg: OptConfigType = None, + rescale: bool = False, + with_nms: bool = True) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. It has been accelerated since PR #5991. + + Args: + pred_maps (Sequence[Tensor]): Raw predictions for a batch of + images. + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (:obj:`ConfigDict` or dict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
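+
+        Example:
+            >>> # Layout sketch (illustrative sizes): per anchor, a slice of
+            >>> # a prediction map holds [tx, ty, tw, th, objectness,
+            >>> # cls_0, ..., cls_{num_classes - 1}], i.e. ``num_attrib``
+            >>> # values, so a level map flattens as below.
+            >>> import torch
+            >>> num_classes, num_base_priors = 80, 3
+            >>> pred_map = torch.rand(1, num_base_priors * (5 + num_classes),
+            ...                       10, 10)
+            >>> pred_map.permute(0, 2, 3, 1).reshape(
+            ...     1, -1, 5 + num_classes).shape
+            torch.Size([1, 300, 85])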
+ """ + assert len(pred_maps) == self.num_levels + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + + num_imgs = len(batch_img_metas) + featmap_sizes = [pred_map.shape[-2:] for pred_map in pred_maps] + + mlvl_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=pred_maps[0].device) + flatten_preds = [] + flatten_strides = [] + for pred, stride in zip(pred_maps, self.featmap_strides): + pred = pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.num_attrib) + pred[..., :2].sigmoid_() + flatten_preds.append(pred) + flatten_strides.append( + pred.new_tensor(stride).expand(pred.size(1))) + + flatten_preds = torch.cat(flatten_preds, dim=1) + flatten_bbox_preds = flatten_preds[..., :4] + flatten_objectness = flatten_preds[..., 4].sigmoid() + flatten_cls_scores = flatten_preds[..., 5:].sigmoid() + flatten_anchors = torch.cat(mlvl_anchors) + flatten_strides = torch.cat(flatten_strides) + flatten_bboxes = self.bbox_coder.decode(flatten_anchors, + flatten_bbox_preds, + flatten_strides.unsqueeze(-1)) + results_list = [] + for (bboxes, scores, objectness, + img_meta) in zip(flatten_bboxes, flatten_cls_scores, + flatten_objectness, batch_img_metas): + # Filtering out all predictions with conf < conf_thr + conf_thr = cfg.get('conf_thr', -1) + if conf_thr > 0: + conf_inds = objectness >= conf_thr + bboxes = bboxes[conf_inds, :] + scores = scores[conf_inds, :] + objectness = objectness[conf_inds] + + score_thr = cfg.get('score_thr', 0) + nms_pre = cfg.get('nms_pre', -1) + scores, labels, keep_idxs, _ = filter_scores_and_topk( + scores, score_thr, nms_pre) + + results = InstanceData( + scores=scores, + labels=labels, + bboxes=bboxes[keep_idxs], + score_factors=objectness[keep_idxs], + ) + results = self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta) + results_list.append(results) + return results_list + + def loss_by_feat( + self, + pred_maps: Sequence[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + pred_maps (list[Tensor]): Prediction map for each scale level, + shape (N, num_anchors * num_attrib, H, W) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. 
+ """ + num_imgs = len(batch_img_metas) + device = pred_maps[0][0].device + + featmap_sizes = [ + pred_maps[i].shape[-2:] for i in range(self.num_levels) + ] + mlvl_anchors = self.prior_generator.grid_priors( + featmap_sizes, device=device) + anchor_list = [mlvl_anchors for _ in range(num_imgs)] + + responsible_flag_list = [] + for img_id in range(num_imgs): + responsible_flag_list.append( + self.responsible_flags(featmap_sizes, + batch_gt_instances[img_id].bboxes, + device)) + + target_maps_list, neg_maps_list = self.get_targets( + anchor_list, responsible_flag_list, batch_gt_instances) + + losses_cls, losses_conf, losses_xy, losses_wh = multi_apply( + self.loss_by_feat_single, pred_maps, target_maps_list, + neg_maps_list) + + return dict( + loss_cls=losses_cls, + loss_conf=losses_conf, + loss_xy=losses_xy, + loss_wh=losses_wh) + + def loss_by_feat_single(self, pred_map: Tensor, target_map: Tensor, + neg_map: Tensor) -> tuple: + """Calculate the loss of a single scale level based on the features + extracted by the detection head. + + Args: + pred_map (Tensor): Raw predictions for a single level. + target_map (Tensor): The Ground-Truth target for a single level. + neg_map (Tensor): The negative masks for a single level. + + Returns: + tuple: + loss_cls (Tensor): Classification loss. + loss_conf (Tensor): Confidence loss. + loss_xy (Tensor): Regression loss of x, y coordinate. + loss_wh (Tensor): Regression loss of w, h coordinate. + """ + + num_imgs = len(pred_map) + pred_map = pred_map.permute(0, 2, 3, + 1).reshape(num_imgs, -1, self.num_attrib) + neg_mask = neg_map.float() + pos_mask = target_map[..., 4] + pos_and_neg_mask = neg_mask + pos_mask + pos_mask = pos_mask.unsqueeze(dim=-1) + if torch.max(pos_and_neg_mask) > 1.: + warnings.warn('There is overlap between pos and neg sample.') + pos_and_neg_mask = pos_and_neg_mask.clamp(min=0., max=1.) + + pred_xy = pred_map[..., :2] + pred_wh = pred_map[..., 2:4] + pred_conf = pred_map[..., 4] + pred_label = pred_map[..., 5:] + + target_xy = target_map[..., :2] + target_wh = target_map[..., 2:4] + target_conf = target_map[..., 4] + target_label = target_map[..., 5:] + + loss_cls = self.loss_cls(pred_label, target_label, weight=pos_mask) + loss_conf = self.loss_conf( + pred_conf, target_conf, weight=pos_and_neg_mask) + loss_xy = self.loss_xy(pred_xy, target_xy, weight=pos_mask) + loss_wh = self.loss_wh(pred_wh, target_wh, weight=pos_mask) + + return loss_cls, loss_conf, loss_xy, loss_wh + + def get_targets(self, anchor_list: List[List[Tensor]], + responsible_flag_list: List[List[Tensor]], + batch_gt_instances: List[InstanceData]) -> tuple: + """Compute target maps for anchors in multiple images. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_total_anchors, 4). + responsible_flag_list (list[list[Tensor]]): Multi level responsible + flags of each image. Each element is a tensor of shape + (num_total_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: Usually returns a tuple containing learning targets. + - target_map_list (list[Tensor]): Target map of each level. + - neg_map_list (list[Tensor]): Negative map of each level. 
+ """ + num_imgs = len(anchor_list) + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + + results = multi_apply(self._get_targets_single, anchor_list, + responsible_flag_list, batch_gt_instances) + + all_target_maps, all_neg_maps = results + assert num_imgs == len(all_target_maps) == len(all_neg_maps) + target_maps_list = images_to_levels(all_target_maps, num_level_anchors) + neg_maps_list = images_to_levels(all_neg_maps, num_level_anchors) + + return target_maps_list, neg_maps_list + + def _get_targets_single(self, anchors: List[Tensor], + responsible_flags: List[Tensor], + gt_instances: InstanceData) -> tuple: + """Generate matching bounding box prior and converted GT. + + Args: + anchors (List[Tensor]): Multi-level anchors of the image. + responsible_flags (List[Tensor]): Multi-level responsible flags of + anchors + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should includes ``bboxes`` and ``labels`` + attributes. + + Returns: + tuple: + target_map (Tensor): Predication target map of each + scale level, shape (num_total_anchors, + 5+num_classes) + neg_map (Tensor): Negative map of each scale level, + shape (num_total_anchors,) + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + anchor_strides = [] + for i in range(len(anchors)): + anchor_strides.append( + torch.tensor(self.featmap_strides[i], + device=gt_bboxes.device).repeat(len(anchors[i]))) + concat_anchors = torch.cat(anchors) + concat_responsible_flags = torch.cat(responsible_flags) + + anchor_strides = torch.cat(anchor_strides) + assert len(anchor_strides) == len(concat_anchors) == \ + len(concat_responsible_flags) + pred_instances = InstanceData( + priors=concat_anchors, responsible_flags=concat_responsible_flags) + + assign_result = self.assigner.assign(pred_instances, gt_instances) + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + + target_map = concat_anchors.new_zeros( + concat_anchors.size(0), self.num_attrib) + + target_map[sampling_result.pos_inds, :4] = self.bbox_coder.encode( + sampling_result.pos_priors, sampling_result.pos_gt_bboxes, + anchor_strides[sampling_result.pos_inds]) + + target_map[sampling_result.pos_inds, 4] = 1 + + gt_labels_one_hot = F.one_hot( + gt_labels, num_classes=self.num_classes).float() + if self.one_hot_smoother != 0: # label smooth + gt_labels_one_hot = gt_labels_one_hot * ( + 1 - self.one_hot_smoother + ) + self.one_hot_smoother / self.num_classes + target_map[sampling_result.pos_inds, 5:] = gt_labels_one_hot[ + sampling_result.pos_assigned_gt_inds] + + neg_map = concat_anchors.new_zeros( + concat_anchors.size(0), dtype=torch.uint8) + neg_map[sampling_result.neg_inds] = 1 + + return target_map, neg_map + + def responsible_flags(self, featmap_sizes: List[tuple], gt_bboxes: Tensor, + device: str) -> List[Tensor]: + """Generate responsible anchor flags of grid cells in multiple scales. + + Args: + featmap_sizes (List[tuple]): List of feature map sizes in multiple + feature levels. + gt_bboxes (Tensor): Ground truth boxes, shape (n, 4). + device (str): Device where the anchors will be put on. 
+
+        Returns:
+            List[Tensor]: Responsible flags of anchors in multiple levels.
+        """
+        assert self.num_levels == len(featmap_sizes)
+        multi_level_responsible_flags = []
+        for i in range(self.num_levels):
+            anchor_stride = self.prior_generator.strides[i]
+            feat_h, feat_w = featmap_sizes[i]
+            gt_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device)
+            gt_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device)
+            gt_grid_x = torch.floor(gt_cx / anchor_stride[0]).long()
+            gt_grid_y = torch.floor(gt_cy / anchor_stride[1]).long()
+            # row major indexing
+            gt_bboxes_grid_idx = gt_grid_y * feat_w + gt_grid_x
+
+            responsible_grid = torch.zeros(
+                feat_h * feat_w, dtype=torch.uint8, device=device)
+            responsible_grid[gt_bboxes_grid_idx] = 1
+
+            responsible_grid = responsible_grid[:, None].expand(
+                responsible_grid.size(0),
+                self.prior_generator.num_base_priors[i]).contiguous().view(-1)
+
+            multi_level_responsible_flags.append(responsible_grid)
+        return multi_level_responsible_flags
diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/yolof_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/yolof_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e5e6b7a92861bcd2ba3824df1f94270ba51160
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/dense_heads/yolof_head.py
@@ -0,0 +1,399 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, is_norm
+from mmengine.model import bias_init_with_prob, constant_init, normal_init
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, InstanceList, OptInstanceList, reduce_mean
+from ..task_modules.prior_generators import anchor_inside_flags
+from ..utils import levels_to_images, multi_apply, unmap
+from .anchor_head import AnchorHead
+
+INF = 1e8
+
+
+@MODELS.register_module()
+class YOLOFHead(AnchorHead):
+    """Detection Head of `YOLOF <https://arxiv.org/abs/2103.09460>`_.
+
+    Args:
+        num_classes (int): The number of object classes (w/o background).
+        in_channels (list[int]): The number of input channels per scale.
+        num_cls_convs (int): The number of convolutions of cls branch.
+            Defaults to 2.
+        num_reg_convs (int): The number of convolutions of reg branch.
+            Defaults to 4.
+        norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization
+            layer. Defaults to ``dict(type='BN', requires_grad=True)``.
+ """ + + def __init__(self, + num_classes: int, + in_channels: List[int], + num_cls_convs: int = 2, + num_reg_convs: int = 4, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + **kwargs) -> None: + self.num_cls_convs = num_cls_convs + self.num_reg_convs = num_reg_convs + self.norm_cfg = norm_cfg + super().__init__( + num_classes=num_classes, in_channels=in_channels, **kwargs) + + def _init_layers(self) -> None: + cls_subnet = [] + bbox_subnet = [] + for i in range(self.num_cls_convs): + cls_subnet.append( + ConvModule( + self.in_channels, + self.in_channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg)) + for i in range(self.num_reg_convs): + bbox_subnet.append( + ConvModule( + self.in_channels, + self.in_channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg)) + self.cls_subnet = nn.Sequential(*cls_subnet) + self.bbox_subnet = nn.Sequential(*bbox_subnet) + self.cls_score = nn.Conv2d( + self.in_channels, + self.num_base_priors * self.num_classes, + kernel_size=3, + stride=1, + padding=1) + self.bbox_pred = nn.Conv2d( + self.in_channels, + self.num_base_priors * 4, + kernel_size=3, + stride=1, + padding=1) + self.object_pred = nn.Conv2d( + self.in_channels, + self.num_base_priors, + kernel_size=3, + stride=1, + padding=1) + + def init_weights(self) -> None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + + # Use prior in model initialization to improve stability + bias_cls = bias_init_with_prob(0.01) + torch.nn.init.constant_(self.cls_score.bias, bias_cls) + + def forward_single(self, x: Tensor) -> Tuple[Tensor, Tensor]: + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + normalized_cls_score (Tensor): Normalized Cls scores for a \ + single scale level, the channels number is \ + num_base_priors * num_classes. + bbox_reg (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_base_priors * 4. + """ + cls_score = self.cls_score(self.cls_subnet(x)) + N, _, H, W = cls_score.shape + cls_score = cls_score.view(N, -1, self.num_classes, H, W) + + reg_feat = self.bbox_subnet(x) + bbox_reg = self.bbox_pred(reg_feat) + objectness = self.object_pred(reg_feat) + + # implicit objectness + objectness = objectness.view(N, -1, 1, H, W) + normalized_cls_score = cls_score + objectness - torch.log( + 1. + torch.clamp(cls_score.exp(), max=INF) + + torch.clamp(objectness.exp(), max=INF)) + normalized_cls_score = normalized_cls_score.view(N, -1, H, W) + return normalized_cls_score, bbox_reg + + def loss_by_feat( + self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + has shape (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. 
It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + + Returns: + dict: A dictionary of loss components. + """ + assert len(cls_scores) == 1 + assert self.prior_generator.num_levels == 1 + + device = cls_scores[0].device + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, batch_img_metas, device=device) + + # The output level is always 1 + anchor_list = [anchors[0] for anchors in anchor_list] + valid_flag_list = [valid_flags[0] for valid_flags in valid_flag_list] + + cls_scores_list = levels_to_images(cls_scores) + bbox_preds_list = levels_to_images(bbox_preds) + + cls_reg_targets = self.get_targets( + cls_scores_list, + bbox_preds_list, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore=batch_gt_instances_ignore) + if cls_reg_targets is None: + return None + (batch_labels, batch_label_weights, avg_factor, batch_bbox_weights, + batch_pos_predicted_boxes, batch_target_boxes) = cls_reg_targets + + flatten_labels = batch_labels.reshape(-1) + batch_label_weights = batch_label_weights.reshape(-1) + cls_score = cls_scores[0].permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + + avg_factor = reduce_mean( + torch.tensor(avg_factor, dtype=torch.float, device=device)).item() + + # classification loss + loss_cls = self.loss_cls( + cls_score, + flatten_labels, + batch_label_weights, + avg_factor=avg_factor) + + # regression loss + if batch_pos_predicted_boxes.shape[0] == 0: + # no pos sample + loss_bbox = batch_pos_predicted_boxes.sum() * 0 + else: + loss_bbox = self.loss_bbox( + batch_pos_predicted_boxes, + batch_target_boxes, + batch_bbox_weights.float(), + avg_factor=avg_factor) + + return dict(loss_cls=loss_cls, loss_bbox=loss_bbox) + + def get_targets(self, + cls_scores_list: List[Tensor], + bbox_preds_list: List[Tensor], + anchor_list: List[Tensor], + valid_flag_list: List[Tensor], + batch_gt_instances: InstanceList, + batch_img_metas: List[dict], + batch_gt_instances_ignore: OptInstanceList = None, + unmap_outputs: bool = True): + """Compute regression and classification targets for anchors in + multiple images. + + Args: + cls_scores_list (list[Tensor]): Classification scores of + each image. each is a 4D-tensor, the shape is + (h * w, num_anchors * num_classes). + bbox_preds_list (list[Tensor]): Bbox preds of each image. + each is a 4D-tensor, the shape is (h * w, num_anchors * 4). + anchor_list (list[Tensor]): Anchors of each image. Each element of + is a tensor of shape (h * w * num_anchors, 4). + valid_flag_list (list[Tensor]): Valid flags of each image. Each + element of is a tensor of shape (h * w * num_anchors, ) + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - batch_labels (Tensor): Label of all images. 
Each element \ + is a tensor of shape (batch, h * w * num_anchors). + - batch_label_weights (Tensor): Label weights of all images. \ + Each element is a tensor of shape (batch, h * w * num_anchors). + - avg_factor (int): Averaging factor that is used to average \ + the loss. + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated at the end. + """ + num_imgs = len(batch_img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # compute targets for each image + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + results = multi_apply( + self._get_targets_single, + bbox_preds_list, + anchor_list, + valid_flag_list, + batch_gt_instances, + batch_img_metas, + batch_gt_instances_ignore, + unmap_outputs=unmap_outputs) + (all_labels, all_label_weights, pos_inds, neg_inds, + sampling_results_list) = results[:5] + # Get `avg_factor` of all images, which is calculated in + # `SamplingResult`. When using a sampling method, avg_factor is + # usually the sum of positive and negative priors. When using + # `PseudoSampler`, `avg_factor` is usually equal to the number of + # positive priors. + avg_factor = sum( + [results.avg_factor for results in sampling_results_list]) + rest_results = list(results[5:]) # user-added return values + + batch_labels = torch.stack(all_labels, 0) + batch_label_weights = torch.stack(all_label_weights, 0) + + res = (batch_labels, batch_label_weights, avg_factor) + for i, rests in enumerate(rest_results): # user-added return values + rest_results[i] = torch.cat(rests, 0) + + return res + tuple(rest_results) + + def _get_targets_single(self, + bbox_preds: Tensor, + flat_anchors: Tensor, + valid_flags: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None, + unmap_outputs: bool = True) -> tuple: + """Compute regression and classification targets for anchors in a + single image. + + Args: + bbox_preds (Tensor): Bbox predictions of the image, with + shape (h * w, 4). + flat_anchors (Tensor): Anchors of the image, with shape + (h * w * num_anchors, 4). + valid_flags (Tensor): Valid flags of the image, with shape + (h * w * num_anchors, ). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should include ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for the current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: + labels (Tensor): Labels of the image, with shape + (h * w * num_anchors, ). + label_weights (Tensor): Label weights of the image, with shape + (h * w * num_anchors, ). + pos_inds (Tensor): Indices of positive anchors in the image. + neg_inds (Tensor): Indices of negative anchors in the image. + sampling_result (:obj:`SamplingResult`): Sampling result. + pos_bbox_weights (Tensor): Weights used to calculate + the bbox branch loss, with shape (num, ). + pos_predicted_boxes (Tensor): Predicted boxes used to + calculate the bbox branch loss, with shape + (num, 4).
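The `get_targets` body above leans on `multi_apply`, which maps `_get_targets_single` over the per-image inputs and then transposes the per-image result tuples into per-field lists. A self-contained sketch of that behavior (the toy `per_image` function is hypothetical; mmdet's own helper has the same map-then-transpose shape):

from functools import partial

def multi_apply(func, *args, **kwargs):
    # Apply func to each per-image slice, then transpose the tuples.
    pfunc = partial(func, **kwargs) if kwargs else func
    return tuple(map(list, zip(*map(pfunc, *args))))

def per_image(pred, gt):
    return pred + gt, pred * gt

sums, prods = multi_apply(per_image, [1, 2], [10, 20])
assert sums == [11, 22] and prods == [10, 40]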
+ pos_target_boxes (Tensor): Target boxes used to + calculate the bbox branch loss, with shape + (num, 4). + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg['allowed_border']) + if not inside_flags.any(): + raise ValueError( + 'There is no valid anchor inside the image boundary. Please ' + 'check the image size and anchor sizes, or set ' + '``allowed_border`` to -1 to skip the condition.') + + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + bbox_preds = bbox_preds.reshape(-1, 4) + bbox_preds = bbox_preds[inside_flags, :] + + # decoded bbox + decoder_bbox_preds = self.bbox_coder.decode(anchors, bbox_preds) + pred_instances = InstanceData( + priors=anchors, decoder_priors=decoder_bbox_preds) + assign_result = self.assigner.assign(pred_instances, gt_instances, + gt_instances_ignore) + + pos_bbox_weights = assign_result.get_extra_property('pos_idx') + pos_predicted_boxes = assign_result.get_extra_property( + 'pos_predicted_boxes') + pos_target_boxes = assign_result.get_extra_property('target_boxes') + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + num_valid_anchors = anchors.shape[0] + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + labels[pos_inds] = sampling_result.pos_gt_labels + if self.train_cfg['pos_weight'] <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg['pos_weight'] + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + + return (labels, label_weights, pos_inds, neg_inds, sampling_result, + pos_bbox_weights, pos_predicted_boxes, pos_target_boxes) diff --git a/head_extractor/build/lib/mmdet/models/dense_heads/yolox_head.py b/head_extractor/build/lib/mmdet/models/dense_heads/yolox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..00fe1e42766e4ca0052cf31d2e940dfab73fb200 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/dense_heads/yolox_head.py @@ -0,0 +1,618 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmcv.ops.nms import batched_nms +from mmengine.config import ConfigDict +from mmengine.model import bias_init_with_prob +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import (ConfigType, OptConfigType, OptInstanceList, + OptMultiConfig, reduce_mean) +from ..task_modules.prior_generators import MlvlPointGenerator +from ..task_modules.samplers import PseudoSampler +from ..utils import multi_apply +from .base_dense_head import BaseDenseHead + + +@MODELS.register_module() +class YOLOXHead(BaseDenseHead): + """YOLOXHead used in `YOLOX <https://arxiv.org/abs/2107.08430>`_.
+ + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels in stacking convs. + Defaults to 256. + stacked_convs (int): Number of stacking convs of the head. + Defaults to 2. + strides (Sequence[int]): Downsample factor of each feature map. + Defaults to (8, 16, 32). + use_depthwise (bool): Whether to use depthwise separable convolution + in blocks. Defaults to False. + dcn_on_last_conv (bool): If True, use DCN in the last layer of + towers. Defaults to False. + conv_bias (bool or str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Defaults to "auto". + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (:obj:`ConfigDict` or dict): Config dict for activation layer. + Defaults to dict(type='Swish'). + loss_cls (:obj:`ConfigDict` or dict): Config of classification loss. + loss_bbox (:obj:`ConfigDict` or dict): Config of localization loss. + loss_obj (:obj:`ConfigDict` or dict): Config of objectness loss. + loss_l1 (:obj:`ConfigDict` or dict): Config of L1 loss. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + anchor head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + anchor head. Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__( + self, + num_classes: int, + in_channels: int, + feat_channels: int = 256, + stacked_convs: int = 2, + strides: Sequence[int] = (8, 16, 32), + use_depthwise: bool = False, + dcn_on_last_conv: bool = False, + conv_bias: Union[bool, str] = 'auto', + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='IoULoss', + mode='square', + eps=1e-16, + reduction='sum', + loss_weight=5.0), + loss_obj: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + loss_l1: ConfigType = dict( + type='L1Loss', reduction='sum', loss_weight=1.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.cls_out_channels = num_classes + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.use_depthwise = use_depthwise + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.use_sigmoid_cls = True + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.loss_cls: nn.Module = MODELS.build(loss_cls) + self.loss_bbox: nn.Module = MODELS.build(loss_bbox) + self.loss_obj: nn.Module = MODELS.build(loss_obj) + + self.use_l1 = False # This flag will
be modified by hooks. + self.loss_l1: nn.Module = MODELS.build(loss_l1) + + self.prior_generator = MlvlPointGenerator(strides, offset=0) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + + if self.train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + # YOLOX does not support sampling + self.sampler = PseudoSampler() + + self._init_layers() + + def _init_layers(self) -> None: + """Initialize heads for all level feature maps.""" + self.multi_level_cls_convs = nn.ModuleList() + self.multi_level_reg_convs = nn.ModuleList() + self.multi_level_conv_cls = nn.ModuleList() + self.multi_level_conv_reg = nn.ModuleList() + self.multi_level_conv_obj = nn.ModuleList() + for _ in self.strides: + self.multi_level_cls_convs.append(self._build_stacked_convs()) + self.multi_level_reg_convs.append(self._build_stacked_convs()) + conv_cls, conv_reg, conv_obj = self._build_predictor() + self.multi_level_conv_cls.append(conv_cls) + self.multi_level_conv_reg.append(conv_reg) + self.multi_level_conv_obj.append(conv_obj) + + def _build_stacked_convs(self) -> nn.Sequential: + """Initialize conv layers of a single level head.""" + conv = DepthwiseSeparableConvModule \ + if self.use_depthwise else ConvModule + stacked_convs = [] + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + stacked_convs.append( + conv( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.conv_bias)) + return nn.Sequential(*stacked_convs) + + def _build_predictor(self) -> Tuple[nn.Module, nn.Module, nn.Module]: + """Initialize predictor layers of a single level head.""" + conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) + conv_reg = nn.Conv2d(self.feat_channels, 4, 1) + conv_obj = nn.Conv2d(self.feat_channels, 1, 1) + return conv_cls, conv_reg, conv_obj + + def init_weights(self) -> None: + """Initialize weights of the head.""" + super(YOLOXHead, self).init_weights() + # Use prior in model initialization to improve stability + bias_init = bias_init_with_prob(0.01) + for conv_cls, conv_obj in zip(self.multi_level_conv_cls, + self.multi_level_conv_obj): + conv_cls.bias.data.fill_(bias_init) + conv_obj.bias.data.fill_(bias_init) + + def forward_single(self, x: Tensor, cls_convs: nn.Module, + reg_convs: nn.Module, conv_cls: nn.Module, + conv_reg: nn.Module, + conv_obj: nn.Module) -> Tuple[Tensor, Tensor, Tensor]: + """Forward feature of a single scale level.""" + + cls_feat = cls_convs(x) + reg_feat = reg_convs(x) + + cls_score = conv_cls(cls_feat) + bbox_pred = conv_reg(reg_feat) + objectness = conv_obj(reg_feat) + + return cls_score, bbox_pred, objectness + + def forward(self, x: Tuple[Tensor]) -> Tuple[List]: + """Forward features from the upstream network. + + Args: + x (Tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + Returns: + Tuple[List]: A tuple of multi-level classification scores, bbox + predictions, and objectnesses. 
+ """ + + return multi_apply(self.forward_single, x, self.multi_level_cls_convs, + self.multi_level_reg_convs, + self.multi_level_conv_cls, + self.multi_level_conv_reg, + self.multi_level_conv_obj) + + def predict_by_feat(self, + cls_scores: List[Tensor], + bbox_preds: List[Tensor], + objectnesses: Optional[List[Tensor]], + batch_img_metas: Optional[List[dict]] = None, + cfg: Optional[ConfigDict] = None, + rescale: bool = False, + with_nms: bool = True) -> List[InstanceData]: + """Transform a batch of output features extracted by the head into + bbox results. + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + objectnesses (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_img_metas (list[dict], Optional): Batch image meta info. + Defaults to None. + cfg (ConfigDict, optional): Test / postprocessing + configuration, if None, test_cfg would be used. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before return boxes. + Defaults to True. + + Returns: + list[:obj:`InstanceData`]: Object detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) == len(objectnesses) + cfg = self.test_cfg if cfg is None else cfg + + num_imgs = len(batch_img_metas) + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + # flatten cls_scores, bbox_preds and objectness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_scores = torch.cat(flatten_cls_scores, dim=1).sigmoid() + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1).sigmoid() + flatten_priors = torch.cat(mlvl_priors) + + flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds) + + result_list = [] + for img_id, img_meta in enumerate(batch_img_metas): + max_scores, labels = torch.max(flatten_cls_scores[img_id], 1) + valid_mask = flatten_objectness[ + img_id] * max_scores >= cfg.score_thr + results = InstanceData( + bboxes=flatten_bboxes[img_id][valid_mask], + scores=max_scores[valid_mask] * + flatten_objectness[img_id][valid_mask], + labels=labels[valid_mask]) + + result_list.append( + self._bbox_post_process( + results=results, + cfg=cfg, + rescale=rescale, + with_nms=with_nms, + img_meta=img_meta)) + + return result_list + + def _bbox_decode(self, priors: Tensor, bbox_preds: Tensor) -> Tensor: + """Decode regression results 
(delta_x, delta_y, w, h) to bboxes (tl_x, + tl_y, br_x, br_y). + + Args: + priors (Tensor): Center priors of an image, has shape + (num_priors, 4) in (x, y, stride_w, stride_h) format. + bbox_preds (Tensor): Box energies / deltas for all instances, + has shape (batch_size, num_instances, 4). + + Returns: + Tensor: Decoded bboxes in (tl_x, tl_y, br_x, br_y) format. Has + shape (batch_size, num_instances, 4). + """ + xys = (bbox_preds[..., :2] * priors[:, 2:]) + priors[:, :2] + whs = bbox_preds[..., 2:].exp() * priors[:, 2:] + + tl_x = (xys[..., 0] - whs[..., 0] / 2) + tl_y = (xys[..., 1] - whs[..., 1] / 2) + br_x = (xys[..., 0] + whs[..., 0] / 2) + br_y = (xys[..., 1] + whs[..., 1] / 2) + + decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) + return decoded_bboxes + + def _bbox_post_process(self, + results: InstanceData, + cfg: ConfigDict, + rescale: bool = False, + with_nms: bool = True, + img_meta: Optional[dict] = None) -> InstanceData: + """Bbox post-processing method. + + The boxes are rescaled to the original image scale and the nms + operation is applied. Usually `with_nms` is set to False for aug test. + + Args: + results (:obj:`InstanceData`): Detection instance results, + each item has shape (num_bboxes, ). + cfg (mmengine.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + with_nms (bool): If True, do nms before returning boxes. + Defaults to True. + img_meta (dict, optional): Image meta info. Defaults to None. + + Returns: + :obj:`InstanceData`: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + + if rescale: + assert img_meta.get('scale_factor') is not None + results.bboxes /= results.bboxes.new_tensor( + img_meta['scale_factor']).repeat((1, 2)) + + if with_nms and results.bboxes.numel() > 0: + det_bboxes, keep_idxs = batched_nms(results.bboxes, results.scores, + results.labels, cfg.nms) + results = results[keep_idxs] + # some nms ops (such as softnms) reweight the score + results.scores = det_bboxes[:, -1] + return results + + def loss_by_feat( + self, + cls_scores: Sequence[Tensor], + bbox_preds: Sequence[Tensor], + objectnesses: Sequence[Tensor], + batch_gt_instances: Sequence[InstanceData], + batch_img_metas: Sequence[dict], + batch_gt_instances_ignore: OptInstanceList = None) -> dict: + """Calculate the loss based on the features extracted by the detection + head. + + Args: + cls_scores (Sequence[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_priors * num_classes. + bbox_preds (Sequence[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_priors * 4. + objectnesses (Sequence[Tensor]): Score factors for + all scale levels, each is a 4D-tensor, has shape + (batch_size, 1, H, W). + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes`` and ``labels`` + attributes. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + batch_gt_instances_ignore (list[:obj:`InstanceData`], optional): + Batch of gt_instances_ignore.
It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + dict[str, Tensor]: A dictionary of losses. + """ + num_imgs = len(batch_img_metas) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + + featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, + dtype=cls_scores[0].dtype, + device=cls_scores[0].device, + with_stride=True) + + flatten_cls_preds = [ + cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, + self.cls_out_channels) + for cls_pred in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) + for bbox_pred in bbox_preds + ] + flatten_objectness = [ + objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) + for objectness in objectnesses + ] + + flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) + flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) + flatten_objectness = torch.cat(flatten_objectness, dim=1) + flatten_priors = torch.cat(mlvl_priors) + flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds) + + (pos_masks, cls_targets, obj_targets, bbox_targets, l1_targets, + num_fg_imgs) = multi_apply( + self._get_targets_single, + flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1), + flatten_cls_preds.detach(), flatten_bboxes.detach(), + flatten_objectness.detach(), batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # The experimental results show that 'reduce_mean' can improve + # performance on the COCO dataset. + num_pos = torch.tensor( + sum(num_fg_imgs), + dtype=torch.float, + device=flatten_cls_preds.device) + num_total_samples = max(reduce_mean(num_pos), 1.0) + + pos_masks = torch.cat(pos_masks, 0) + cls_targets = torch.cat(cls_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + bbox_targets = torch.cat(bbox_targets, 0) + if self.use_l1: + l1_targets = torch.cat(l1_targets, 0) + + loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), + obj_targets) / num_total_samples + if num_pos > 0: + loss_cls = self.loss_cls( + flatten_cls_preds.view(-1, self.num_classes)[pos_masks], + cls_targets) / num_total_samples + loss_bbox = self.loss_bbox( + flatten_bboxes.view(-1, 4)[pos_masks], + bbox_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_cls = flatten_cls_preds.sum() * 0 + loss_bbox = flatten_bboxes.sum() * 0 + + loss_dict = dict( + loss_cls=loss_cls, loss_bbox=loss_bbox, loss_obj=loss_obj) + + if self.use_l1: + if num_pos > 0: + loss_l1 = self.loss_l1( + flatten_bbox_preds.view(-1, 4)[pos_masks], + l1_targets) / num_total_samples + else: + # Avoid cls and reg branch not participating in the gradient + # propagation when there is no ground-truth in the images. + # For more details, please refer to + # https://github.com/open-mmlab/mmdetection/issues/7298 + loss_l1 = flatten_bbox_preds.sum() * 0 + loss_dict.update(loss_l1=loss_l1) + + return loss_dict + + @torch.no_grad() + def _get_targets_single( + self, + priors: Tensor, + cls_preds: Tensor, + decoded_bboxes: Tensor, + objectness: Tensor, + gt_instances: InstanceData, + img_meta: dict, + gt_instances_ignore: Optional[InstanceData] = None) -> tuple: + """Compute classification, regression, and objectness targets for + priors in a single image. 
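One detail worth a concrete illustration before the argument list: for its positives, this method builds an IoU-aware classification target by scaling the one-hot label row with the assigned IoU, so the BCE target encodes localization quality. A tiny sketch with made-up values:

import torch
import torch.nn.functional as F

pos_gt_labels = torch.tensor([2, 0])   # labels of two positive priors
pos_ious = torch.tensor([0.8, 0.6])    # max_overlaps at those priors
cls_target = F.one_hot(pos_gt_labels, 4) * pos_ious.unsqueeze(-1)
# tensor([[0.0000, 0.0000, 0.8000, 0.0000],
#         [0.6000, 0.0000, 0.0000, 0.0000]])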
+ + Args: + priors (Tensor): All priors of one image, a 2D-Tensor with shape + [num_priors, 4] in [cx, cy, stride_w, stride_h] format. + cls_preds (Tensor): Classification predictions of one image, + a 2D-Tensor with shape [num_priors, num_classes]. + decoded_bboxes (Tensor): Decoded bbox predictions of one image, + a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y, + br_x, br_y] format. + objectness (Tensor): Objectness predictions of one image, + a 1D-Tensor with shape [num_priors]. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It should include ``bboxes`` and ``labels`` + attributes. + img_meta (dict): Meta information for the current image. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` attribute + data that is ignored during training and testing. + Defaults to None. + Returns: + tuple: + foreground_mask (Tensor): Binary mask of foreground + targets. + cls_target (Tensor): Classification targets of an image. + obj_target (Tensor): Objectness targets of an image. + bbox_target (Tensor): BBox targets of an image. + l1_target (Tensor): BBox L1 targets of an image. + num_pos_per_img (int): Number of positive samples in an image. + """ + + num_priors = priors.size(0) + num_gts = len(gt_instances) + # No target + if num_gts == 0: + cls_target = cls_preds.new_zeros((0, self.num_classes)) + bbox_target = cls_preds.new_zeros((0, 4)) + l1_target = cls_preds.new_zeros((0, 4)) + obj_target = cls_preds.new_zeros((num_priors, 1)) + foreground_mask = cls_preds.new_zeros(num_priors).bool() + return (foreground_mask, cls_target, obj_target, bbox_target, + l1_target, 0) + + # YOLOX uses center priors with a 0.5 offset to assign targets, + # but uses center priors without the offset to regress bboxes + # (see the sketch below).
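A worked example of the two prior representations, using one stride-8 grid point (a sketch; `grid_priors(..., with_stride=True)` stores each prior as (x, y, stride_w, stride_h), and the decode arithmetic mirrors `_bbox_decode` above):

import torch

prior = torch.tensor([[8., 8., 8., 8.]])   # (x, y, stride_w, stride_h), offset=0
offset_prior = torch.cat(
    [prior[:, :2] + prior[:, 2:] * 0.5, prior[:, 2:]], dim=-1)
# tensor([[12., 12., 8., 8.]]) -> shifted to the cell center, assignment only

pred = torch.tensor([[0.5, -0.25, 0., 0.6931]])      # (dx, dy, log_w, log_h)
xy = pred[..., :2] * prior[:, 2:] + prior[:, :2]     # tensor([[12., 6.]])
wh = pred[..., 2:].exp() * prior[:, 2:]              # tensor([[8.0000, 15.9994]])
box = torch.cat([xy - wh / 2, xy + wh / 2], dim=-1)  # ~(8., -2., 16., 14.)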
+ offset_priors = torch.cat( + [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1) + + scores = cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid() + pred_instances = InstanceData( + bboxes=decoded_bboxes, scores=scores.sqrt_(), priors=offset_priors) + assign_result = self.assigner.assign( + pred_instances=pred_instances, + gt_instances=gt_instances, + gt_instances_ignore=gt_instances_ignore) + + sampling_result = self.sampler.sample(assign_result, pred_instances, + gt_instances) + pos_inds = sampling_result.pos_inds + num_pos_per_img = pos_inds.size(0) + + pos_ious = assign_result.max_overlaps[pos_inds] + # IOU aware classification score + cls_target = F.one_hot(sampling_result.pos_gt_labels, + self.num_classes) * pos_ious.unsqueeze(-1) + obj_target = torch.zeros_like(objectness).unsqueeze(-1) + obj_target[pos_inds] = 1 + bbox_target = sampling_result.pos_gt_bboxes + l1_target = cls_preds.new_zeros((num_pos_per_img, 4)) + if self.use_l1: + l1_target = self._get_l1_target(l1_target, bbox_target, + priors[pos_inds]) + foreground_mask = torch.zeros_like(objectness).to(torch.bool) + foreground_mask[pos_inds] = 1 + return (foreground_mask, cls_target, obj_target, bbox_target, + l1_target, num_pos_per_img) + + def _get_l1_target(self, + l1_target: Tensor, + gt_bboxes: Tensor, + priors: Tensor, + eps: float = 1e-8) -> Tensor: + """Convert gt bboxes to center offset and log width height.""" + gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes) + l1_target[:, :2] = (gt_cxcywh[:, :2] - priors[:, :2]) / priors[:, 2:] + l1_target[:, 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) + return l1_target diff --git a/head_extractor/build/lib/mmdet/models/detectors/__init__.py b/head_extractor/build/lib/mmdet/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a06d2813c810504e12592506be9347111d6696 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/__init__.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .atss import ATSS +from .autoassign import AutoAssign +from .base import BaseDetector +from .base_detr import DetectionTransformer +from .boxinst import BoxInst +from .cascade_rcnn import CascadeRCNN +from .centernet import CenterNet +from .condinst import CondInst +from .conditional_detr import ConditionalDETR +from .cornernet import CornerNet +from .crowddet import CrowdDet +from .d2_wrapper import Detectron2Wrapper +from .dab_detr import DABDETR +from .ddod import DDOD +from .ddq_detr import DDQDETR +from .deformable_detr import DeformableDETR +from .detr import DETR +from .dino import DINO +from .fast_rcnn import FastRCNN +from .faster_rcnn import FasterRCNN +from .fcos import FCOS +from .fovea import FOVEA +from .fsaf import FSAF +from .gfl import GFL +from .glip import GLIP +from .grid_rcnn import GridRCNN +from .grounding_dino import GroundingDINO +from .htc import HybridTaskCascade +from .kd_one_stage import KnowledgeDistillationSingleStageDetector +from .lad import LAD +from .mask2former import Mask2Former +from .mask_rcnn import MaskRCNN +from .mask_scoring_rcnn import MaskScoringRCNN +from .maskformer import MaskFormer +from .nasfcos import NASFCOS +from .paa import PAA +from .panoptic_fpn import PanopticFPN +from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor +from .point_rend import PointRend +from .queryinst import QueryInst +from .reppoints_detector import RepPointsDetector +from .retinanet import RetinaNet +from .rpn import RPN +from .rtmdet import RTMDet +from .scnet import SCNet +from .semi_base import SemiBaseDetector +from .single_stage import SingleStageDetector +from .soft_teacher import SoftTeacher +from .solo import SOLO +from .solov2 import SOLOv2 +from .sparse_rcnn import SparseRCNN +from .tood import TOOD +from .trident_faster_rcnn import TridentFasterRCNN +from .two_stage import TwoStageDetector +from .vfnet import VFNet +from .yolact import YOLACT +from .yolo import YOLOV3 +from .yolof import YOLOF +from .yolox import YOLOX + +__all__ = [ + 'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', + 'KnowledgeDistillationSingleStageDetector', 'FastRCNN', 'FasterRCNN', + 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade', 'RetinaNet', 'FCOS', + 'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector', 'FOVEA', 'FSAF', + 'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA', 'YOLOV3', 'YOLACT', + 'VFNet', 'DETR', 'TridentFasterRCNN', 'SparseRCNN', 'SCNet', 'SOLO', + 'SOLOv2', 'DeformableDETR', 'AutoAssign', 'YOLOF', 'CenterNet', 'YOLOX', + 'TwoStagePanopticSegmentor', 'PanopticFPN', 'QueryInst', 'LAD', 'TOOD', + 'MaskFormer', 'DDOD', 'Mask2Former', 'SemiBaseDetector', 'SoftTeacher', + 'RTMDet', 'Detectron2Wrapper', 'CrowdDet', 'CondInst', 'BoxInst', + 'DetectionTransformer', 'ConditionalDETR', 'DINO', 'DABDETR', 'GLIP', + 'DDQDETR', 'GroundingDINO' +] diff --git a/head_extractor/build/lib/mmdet/models/detectors/atss.py b/head_extractor/build/lib/mmdet/models/detectors/atss.py new file mode 100644 index 0000000000000000000000000000000000000000..0bfcc728dc4cc33c0b705a2ab22a4e3f4ad7386d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/atss.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class ATSS(SingleStageDetector): + """Implementation of `ATSS `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. 
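Stepping back from the individual classes for a moment: every detector in this package follows the same registry pattern, registering itself under MODELS and being instantiated from a nested config dict. A hedged sketch of that wiring (the sub-config keys below are illustrative, not a tested config):

from mmdet.registry import MODELS

cfg = dict(
    type='ATSS',  # resolved by name in the MODELS registry
    backbone=dict(type='ResNet', depth=50),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        num_outs=5),
    bbox_head=dict(type='ATSSHead', num_classes=80, in_channels=256))
detector = MODELS.build(cfg)  # ATSS.__init__ then builds the sub-configs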
+ neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of ATSS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of ATSS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/autoassign.py b/head_extractor/build/lib/mmdet/models/detectors/autoassign.py new file mode 100644 index 0000000000000000000000000000000000000000..a0b3570fe6e0c3812a72bc677038bb4e76b05576 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/autoassign.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class AutoAssign(SingleStageDetector): + """Implementation of `AutoAssign: Differentiable Label Assignment for Dense + Object Detection `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of AutoAssign. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of AutoAssign. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/base.py b/head_extractor/build/lib/mmdet/models/detectors/base.py new file mode 100644 index 0000000000000000000000000000000000000000..1a193b0ca9ca3d2b42fda452004d5c97421f426c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/base.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +import torch +from mmengine.model import BaseModel +from torch import Tensor + +from mmdet.structures import DetDataSample, OptSampleList, SampleList +from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig +from ..utils import samplelist_boxtype2tensor + +ForwardResults = Union[Dict[str, torch.Tensor], List[DetDataSample], + Tuple[torch.Tensor], torch.Tensor] + + +class BaseDetector(BaseModel, metaclass=ABCMeta): + """Base class for detectors. + + Args: + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or ConfigDict, optional): the config to control the + initialization. Defaults to None. + """ + + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + @property + def with_neck(self) -> bool: + """bool: whether the detector has a neck""" + return hasattr(self, 'neck') and self.neck is not None + + # TODO: these properties need to be carefully handled + # for both single stage & two stage detectors + @property + def with_shared_head(self) -> bool: + """bool: whether the detector has a shared head in the RoI Head""" + return hasattr(self, 'roi_head') and self.roi_head.with_shared_head + + @property + def with_bbox(self) -> bool: + """bool: whether the detector has a bbox head""" + return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox) + or (hasattr(self, 'bbox_head') and self.bbox_head is not None)) + + @property + def with_mask(self) -> bool: + """bool: whether the detector has a mask head""" + return ((hasattr(self, 'roi_head') and self.roi_head.with_mask) + or (hasattr(self, 'mask_head') and self.mask_head is not None)) + + def forward(self, + inputs: torch.Tensor, + data_samples: OptSampleList = None, + mode: str = 'tensor') -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`DetDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle either back propagation or + parameter update, which are supposed to be done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, ...) in general. + data_samples (list[:obj:`DetDataSample`], optional): A batch of + data samples that contain annotations and predictions. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`DetDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + elif mode == 'tensor': + return self._forward(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode "{mode}". 
' + 'Only supports loss, predict and tensor mode') + + @abstractmethod + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples.""" + pass + + @abstractmethod + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass + + @abstractmethod + def _forward(self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None): + """Network forward process. + + Usually includes backbone, neck and head forward without any post- + processing. + """ + pass + + @abstractmethod + def extract_feat(self, batch_inputs: Tensor): + """Extract features from images.""" + pass + + def add_pred_to_datasample(self, data_samples: SampleList, + results_list: InstanceList) -> SampleList: + """Add predictions to `DetDataSample`. + + Args: + data_samples (list[:obj:`DetDataSample`], optional): A batch of + data samples that contain annotations and predictions. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + for data_sample, pred_instances in zip(data_samples, results_list): + data_sample.pred_instances = pred_instances + samplelist_boxtype2tensor(data_samples) + return data_samples diff --git a/head_extractor/build/lib/mmdet/models/detectors/base_detr.py b/head_extractor/build/lib/mmdet/models/detectors/base_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..88f00ec7408c389a1eb06beac6b383007f80b893 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/base_detr.py @@ -0,0 +1,332 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + + +@MODELS.register_module() +class DetectionTransformer(BaseDetector, metaclass=ABCMeta): + r"""Base class for Detection Transformer. + + In Detection Transformer, an encoder is used to process output features of + neck, then several queries interact with the encoder features using a + decoder and do the regression and classification with the bounding box + head. + + Args: + backbone (:obj:`ConfigDict` or dict): Config of the backbone. + neck (:obj:`ConfigDict` or dict, optional): Config of the neck. + Defaults to None. + encoder (:obj:`ConfigDict` or dict, optional): Config of the + Transformer encoder. Defaults to None. + decoder (:obj:`ConfigDict` or dict, optional): Config of the + Transformer decoder. Defaults to None. + bbox_head (:obj:`ConfigDict` or dict, optional): Config for the + bounding box head module. Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict, optional): Config + of the positional encoding module. Defaults to None. + num_queries (int, optional): Number of decoder query in Transformer. 
+ Defaults to 100. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + the bounding box head module. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + the bounding box head module. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`BaseDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + encoder: OptConfigType = None, + decoder: OptConfigType = None, + bbox_head: OptConfigType = None, + positional_encoding: OptConfigType = None, + num_queries: int = 100, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + # process args + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.encoder = encoder + self.decoder = decoder + self.positional_encoding = positional_encoding + self.num_queries = num_queries + + # init model layers + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + self.bbox_head = MODELS.build(bbox_head) + self._init_layers() + + @abstractmethod + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + pass + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (bs, dim, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components + """ + img_feats = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(img_feats, + batch_data_samples) + losses = self.bbox_head.loss( + **head_inputs_dict, batch_data_samples=batch_data_samples) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the input images. + Each DetDataSample usually contain 'pred_instances'. And the + `pred_instances` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + img_feats = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(img_feats, + batch_data_samples) + results_list = self.bbox_head.predict( + **head_inputs_dict, + rescale=rescale, + batch_data_samples=batch_data_samples) + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + def _forward( + self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs, has shape (bs, dim, H, W). + batch_data_samples (List[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[Tensor]: A tuple of features from ``bbox_head`` forward. + """ + img_feats = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(img_feats, + batch_data_samples) + results = self.bbox_head.forward(**head_inputs_dict) + return results + + def forward_transformer(self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Dict: + """Forward process of Transformer, which includes four steps: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'. We + summarized the parameters flow of the existing DETR-like detector, + which can be illustrated as follow: + + .. code:: text + + img_feats & batch_data_samples + | + V + +-----------------+ + | pre_transformer | + +-----------------+ + | | + | V + | +-----------------+ + | | forward_encoder | + | +-----------------+ + | | + | V + | +---------------+ + | | pre_decoder | + | +---------------+ + | | | + V V | + +-----------------+ | + | forward_decoder | | + +-----------------+ | + | | + V V + head_inputs_dict + + Args: + img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each + feature map has shape (bs, dim, H, W). + batch_data_samples (list[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + dict: The dictionary of bbox_head function inputs, which always + includes the `hidden_states` of the decoder output and may contain + `references` including the initial and intermediate references. + """ + encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( + img_feats, batch_data_samples) + + encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) + + tmp_dec_in, head_inputs_dict = self.pre_decoder(**encoder_outputs_dict) + decoder_inputs_dict.update(tmp_dec_in) + + decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) + head_inputs_dict.update(decoder_outputs_dict) + return head_inputs_dict + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor, has shape (bs, dim, H, W). + + Returns: + tuple[Tensor]: Tuple of feature maps from neck. Each feature map + has shape (bs, dim, H, W). + """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x + + @abstractmethod + def pre_transformer( + self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Tuple[Dict, Dict]: + """Process image features before feeding them to the transformer. + + Args: + img_feats (tuple[Tensor]): Tuple of feature maps from neck. 
Each + feature map has shape (bs, dim, H, W). + batch_data_samples (list[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict, dict]: The first dict contains the inputs of encoder + and the second dict contains the inputs of decoder. + + - encoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_encoder()`, which includes 'feat', 'feat_mask', + 'feat_pos', and other algorithm-specific arguments. + - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'memory_mask', and + other algorithm-specific arguments. + """ + pass + + @abstractmethod + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor, **kwargs) -> Dict: + """Forward with Transformer encoder. + + Args: + feat (Tensor): Sequential features, has shape (bs, num_feat_points, + dim). + feat_mask (Tensor): ByteTensor, the padding mask of the features, + has shape (bs, num_feat_points). + feat_pos (Tensor): The positional embeddings of the features, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of encoder outputs, which includes the + `memory` of the encoder output and other algorithm-specific + arguments. + """ + pass + + @abstractmethod + def pre_decoder(self, memory: Tensor, **kwargs) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`, and `reference_points`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + tuple[dict, dict]: The first dict contains the inputs of decoder + and the second dict contains the inputs of the bbox_head function. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'query_pos', + 'memory', and other algorithm-specific arguments. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which is usually empty, or includes + `enc_outputs_class` and `enc_outputs_class` when the detector + support 'two stage' or 'query selection' strategies. + """ + pass + + @abstractmethod + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + **kwargs) -> Dict: + """Forward with Transformer decoder. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` of the decoder output, `references` including + the initial and intermediate reference_points, and other + algorithm-specific arguments. + """ + pass diff --git a/head_extractor/build/lib/mmdet/models/detectors/boxinst.py b/head_extractor/build/lib/mmdet/models/detectors/boxinst.py new file mode 100644 index 0000000000000000000000000000000000000000..ca6b0bdd90a2a7e78f429a6822dbde6f809426da --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/boxinst.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class BoxInst(SingleStageInstanceSegmentor): + """Implementation of `BoxInst `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + mask_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/cascade_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/cascade_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..ecf733ff104b99436fcc74130b0ccea12a0fa6d0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/cascade_rcnn.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class CascadeRCNN(TwoStageDetector): + r"""Implementation of `Cascade R-CNN: Delving into High Quality Object + Detection `_""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/centernet.py b/head_extractor/build/lib/mmdet/models/detectors/centernet.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6622d6280227ecba9ede4aabf72c22a764e11d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/centernet.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class CenterNet(SingleStageDetector): + """Implementation of CenterNet(Objects as Points) + + . + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/condinst.py b/head_extractor/build/lib/mmdet/models/detectors/condinst.py new file mode 100644 index 0000000000000000000000000000000000000000..ed2dc99eea3faf7b03a3970d46a372d28eb89fe1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/condinst.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class CondInst(SingleStageInstanceSegmentor): + """Implementation of `CondInst `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + mask_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/conditional_detr.py b/head_extractor/build/lib/mmdet/models/detectors/conditional_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..d57868e63a2ece085a7e5b67ee93c921ba334830 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/conditional_detr.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from ..layers import (ConditionalDetrTransformerDecoder, + DetrTransformerEncoder, SinePositionalEncoding) +from .detr import DETR + + +@MODELS.register_module() +class ConditionalDETR(DETR): + r"""Implementation of `Conditional DETR for Fast Training Convergence. + + `_. + + Code is modified from the `official github repo + `_. + """ + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DetrTransformerEncoder(**self.encoder) + self.decoder = ConditionalDetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + # NOTE The embed_dims is typically passed from the inside out. + # For example in DETR, The embed_dims is passed as + # self_attn -> the first encoder layer -> encoder -> detector. + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + memory_mask: Tensor, memory_pos: Tensor) -> Dict: + """Forward with Transformer decoder. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + memory_pos (Tensor): The positional embeddings of memory, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` and `references` of the decoder output. 
+ + - hidden_states (Tensor): Has shape + (num_decoder_layers, bs, num_queries, dim) + - references (Tensor): Has shape + (bs, num_queries, 2) + """ + + hidden_states, references = self.decoder( + query=query, + key=memory, + query_pos=query_pos, + key_pos=memory_pos, + key_padding_mask=memory_mask) + head_inputs_dict = dict( + hidden_states=hidden_states, references=references) + return head_inputs_dict diff --git a/head_extractor/build/lib/mmdet/models/detectors/cornernet.py b/head_extractor/build/lib/mmdet/models/detectors/cornernet.py new file mode 100644 index 0000000000000000000000000000000000000000..946af4dbe6ae339d44f8db265ff7f11b9e02d239 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/cornernet.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class CornerNet(SingleStageDetector): + """CornerNet. + + This detector is the implementation of the paper `CornerNet: Detecting + Objects as Paired Keypoints <https://arxiv.org/abs/1808.01244>`_. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/crowddet.py b/head_extractor/build/lib/mmdet/models/detectors/crowddet.py new file mode 100644 index 0000000000000000000000000000000000000000..4f43bc08aa95756324381ee4182f001a008613c8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/crowddet.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class CrowdDet(TwoStageDetector): + """Implementation of `CrowdDet <https://arxiv.org/abs/2003.09163>`_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + rpn_head (:obj:`ConfigDict` or dict): The rpn config. + roi_head (:obj:`ConfigDict` or dict): The roi config. + train_cfg (:obj:`ConfigDict` or dict): The training config + of CrowdDet. + test_cfg (:obj:`ConfigDict` or dict): The testing config + of CrowdDet. + neck (:obj:`ConfigDict` or dict, optional): The neck config. + Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None.
+ """ + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/head_extractor/build/lib/mmdet/models/detectors/d2_wrapper.py b/head_extractor/build/lib/mmdet/models/detectors/d2_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..3a2daa413e8fe0397ec37008d781ce449e7a26fd --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/d2_wrapper.py @@ -0,0 +1,291 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import BaseBoxes +from mmdet.structures.mask import BitmapMasks, PolygonMasks +from mmdet.utils import ConfigType +from .base import BaseDetector + +try: + import detectron2 + from detectron2.config import get_cfg + from detectron2.modeling import build_model + from detectron2.structures.masks import BitMasks as D2_BitMasks + from detectron2.structures.masks import PolygonMasks as D2_PolygonMasks + from detectron2.utils.events import EventStorage +except ImportError: + detectron2 = None + + +def _to_cfgnode_list(cfg: ConfigType, + config_list: list = [], + father_name: str = 'MODEL') -> tuple: + """Convert the key and value of mmengine.ConfigDict into a list. + + Args: + cfg (ConfigDict): The detectron2 model config. + config_list (list): A list contains the key and value of ConfigDict. + Defaults to []. + father_name (str): The father name add before the key. + Defaults to "MODEL". + + Returns: + tuple: + + - config_list: A list contains the key and value of ConfigDict. + - father_name (str): The father name add before the key. + Defaults to "MODEL". + """ + for key, value in cfg.items(): + name = f'{father_name}.{key.upper()}' + if isinstance(value, ConfigDict) or isinstance(value, dict): + config_list, fater_name = \ + _to_cfgnode_list(value, config_list, name) + else: + config_list.append(name) + config_list.append(value) + + return config_list, father_name + + +def convert_d2_pred_to_datasample(data_samples: SampleList, + d2_results_list: list) -> SampleList: + """Convert the Detectron2's result to DetDataSample. + + Args: + data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + d2_results_list (list): The list of the results of Detectron2's model. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + assert len(data_samples) == len(d2_results_list) + for data_sample, d2_results in zip(data_samples, d2_results_list): + d2_instance = d2_results['instances'] + + results = InstanceData() + results.bboxes = d2_instance.pred_boxes.tensor + results.scores = d2_instance.scores + results.labels = d2_instance.pred_classes + + if d2_instance.has('pred_masks'): + results.masks = d2_instance.pred_masks + data_sample.pred_instances = results + + return data_samples + + +@MODELS.register_module() +class Detectron2Wrapper(BaseDetector): + """Wrapper of a Detectron2 model. Input/output formats of this class follow + MMDetection's convention, so a Detectron2 model can be trained and + evaluated in MMDetection. + + Args: + detector (:obj:`ConfigDict` or dict): The module config of + Detectron2. + bgr_to_rgb (bool): whether to convert image from BGR to RGB. + Defaults to False. + rgb_to_bgr (bool): whether to convert image from RGB to BGR. + Defaults to False. + """ + + def __init__(self, + detector: ConfigType, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False) -> None: + if detectron2 is None: + raise ImportError('Please install Detectron2 first') + assert not (bgr_to_rgb and rgb_to_bgr), ( + '`bgr2rgb` and `rgb2bgr` cannot be set to True at the same time') + super().__init__() + self._channel_conversion = rgb_to_bgr or bgr_to_rgb + cfgnode_list, _ = _to_cfgnode_list(detector) + self.cfg = get_cfg() + self.cfg.merge_from_list(cfgnode_list) + self.d2_model = build_model(self.cfg) + self.storage = EventStorage() + + def init_weights(self) -> None: + """Initialization Backbone. + + NOTE: The initialization of other layers are in Detectron2, + if users want to change the initialization way, please + change the code in Detectron2. + """ + from detectron2.checkpoint import DetectionCheckpointer + checkpointer = DetectionCheckpointer(model=self.d2_model) + checkpointer.load(self.cfg.MODEL.WEIGHTS, checkpointables=[]) + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples. + + The inputs will first convert to the Detectron2 type and feed into + D2 models. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + d2_batched_inputs = self._convert_to_d2_inputs( + batch_inputs=batch_inputs, + batch_data_samples=batch_data_samples, + training=True) + + with self.storage as storage: # noqa + losses = self.d2_model(d2_batched_inputs) + # storage contains some training information, such as cls_accuracy. + # you can use storage.latest() to get the detail information + return losses + + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + The inputs will first convert to the Detectron2 type and feed into + D2 models. And the results will convert back to the MMDet type. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + d2_batched_inputs = self._convert_to_d2_inputs( + batch_inputs=batch_inputs, + batch_data_samples=batch_data_samples, + training=False) + # results in detectron2 has already rescale + d2_results_list = self.d2_model(d2_batched_inputs) + batch_data_samples = convert_d2_pred_to_datasample( + data_samples=batch_data_samples, d2_results_list=d2_results_list) + + return batch_data_samples + + def _forward(self, *args, **kwargs): + """Network forward process. + + Usually includes backbone, neck and head forward without any post- + processing. + """ + raise NotImplementedError( + f'`_forward` is not implemented in {self.__class__.__name__}') + + def extract_feat(self, *args, **kwargs): + """Extract features from images. + + `extract_feat` will not be used in obj:``Detectron2Wrapper``. + """ + pass + + def _convert_to_d2_inputs(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + training=True) -> list: + """Convert inputs type to support Detectron2's model. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + training (bool): Whether to enable training time processing. + + Returns: + list[dict]: A list of dict, which will be fed into Detectron2's + model. And the dict usually contains following keys. + + - image (Tensor): Image in (C, H, W) format. + - instances (Instances): GT Instance. + - height (int): the output height resolution of the model + - width (int): the output width resolution of the model + """ + from detectron2.data.detection_utils import filter_empty_instances + from detectron2.structures import Boxes, Instances + + batched_d2_inputs = [] + for image, data_samples in zip(batch_inputs, batch_data_samples): + d2_inputs = dict() + # deal with metainfo + meta_info = data_samples.metainfo + d2_inputs['file_name'] = meta_info['img_path'] + d2_inputs['height'], d2_inputs['width'] = meta_info['ori_shape'] + d2_inputs['image_id'] = meta_info['img_id'] + # deal with image + if self._channel_conversion: + image = image[[2, 1, 0], ...] 
+ d2_inputs['image'] = image + # deal with gt_instances + gt_instances = data_samples.gt_instances + d2_instances = Instances(meta_info['img_shape']) + + gt_boxes = gt_instances.bboxes + # TODO: use mmdet.structures.box.get_box_tensor after PR 8658 + # has merged + if isinstance(gt_boxes, BaseBoxes): + gt_boxes = gt_boxes.tensor + d2_instances.gt_boxes = Boxes(gt_boxes) + + d2_instances.gt_classes = gt_instances.labels + if gt_instances.get('masks', None) is not None: + gt_masks = gt_instances.masks + if isinstance(gt_masks, PolygonMasks): + d2_instances.gt_masks = D2_PolygonMasks(gt_masks.masks) + elif isinstance(gt_masks, BitmapMasks): + d2_instances.gt_masks = D2_BitMasks(gt_masks.masks) + else: + raise TypeError('The type of `gt_mask` can be ' + '`PolygonMasks` or `BitMasks`, but get ' + f'{type(gt_masks)}.') + # convert to cpu and convert back to cuda to avoid + # some potential error + if training: + device = gt_boxes.device + d2_instances = filter_empty_instances( + d2_instances.to('cpu')).to(device) + d2_inputs['instances'] = d2_instances + batched_d2_inputs.append(d2_inputs) + + return batched_d2_inputs diff --git a/head_extractor/build/lib/mmdet/models/detectors/dab_detr.py b/head_extractor/build/lib/mmdet/models/detectors/dab_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..b61301cf6660924f0832f4068841a4664797c585 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/dab_detr.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +from mmengine.model import uniform_init +from torch import Tensor, nn + +from mmdet.registry import MODELS +from ..layers import SinePositionalEncoding +from ..layers.transformer import (DABDetrTransformerDecoder, + DABDetrTransformerEncoder, inverse_sigmoid) +from .detr import DETR + + +@MODELS.register_module() +class DABDETR(DETR): + r"""Implementation of `DAB-DETR: + Dynamic Anchor Boxes are Better Queries for DETR. + + `_. + + Code is modified from the `official github repo + `_. + + Args: + with_random_refpoints (bool): Whether to randomly initialize query + embeddings and not update them during training. + Defaults to False. + num_patterns (int): Inspired by Anchor-DETR. Defaults to 0. + """ + + def __init__(self, + *args, + with_random_refpoints: bool = False, + num_patterns: int = 0, + **kwargs) -> None: + self.with_random_refpoints = with_random_refpoints + assert isinstance(num_patterns, int), \ + f'num_patterns should be int but {num_patterns}.' + self.num_patterns = num_patterns + + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DABDetrTransformerEncoder(**self.encoder) + self.decoder = DABDetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + self.query_dim = self.decoder.query_dim + self.query_embedding = nn.Embedding(self.num_queries, self.query_dim) + if self.num_patterns > 0: + self.patterns = nn.Embedding(self.num_patterns, self.embed_dims) + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' 
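`init_weights` below pushes the randomly drawn 2D reference points through `inverse_sigmoid` before freezing them, so that applying sigmoid later recovers coordinates in (0, 1). A self-contained sketch of that transform, mirroring mmdet's `inverse_sigmoid` (the clamping epsilon is the usual default):

```python
import torch


def inverse_sigmoid(x: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # Numerically stable logit: sigmoid(inverse_sigmoid(x)) ~= x.
    x = x.clamp(min=0, max=1)
    x1 = x.clamp(min=eps)
    x2 = (1 - x).clamp(min=eps)
    return torch.log(x1 / x2)


pts = torch.rand(300, 2)        # random (cx, cy) reference points
logits = inverse_sigmoid(pts)   # what gets stored in query_embedding
assert torch.allclose(logits.sigmoid(), pts, atol=1e-4)
```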
+ + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super(DABDETR, self).init_weights() + if self.with_random_refpoints: + uniform_init(self.query_embedding) + self.query_embedding.weight.data[:, :2] = \ + inverse_sigmoid(self.query_embedding.weight.data[:, :2]) + self.query_embedding.weight.data[:, :2].requires_grad = False + + def pre_decoder(self, memory: Tensor) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + tuple[dict, dict]: The first dict contains the inputs of decoder + and the second dict contains the inputs of the bbox_head function. + + - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'query', 'query_pos', + 'memory' and 'reg_branches'. + - head_inputs_dict (dict): The keyword args dictionary of the + bbox_head functions, which is usually empty, or includes + `enc_outputs_class` and `enc_outputs_class` when the detector + support 'two stage' or 'query selection' strategies. + """ + batch_size = memory.size(0) + query_pos = self.query_embedding.weight + query_pos = query_pos.unsqueeze(0).repeat(batch_size, 1, 1) + if self.num_patterns == 0: + query = query_pos.new_zeros(batch_size, self.num_queries, + self.embed_dims) + else: + query = self.patterns.weight[:, None, None, :]\ + .repeat(1, self.num_queries, batch_size, 1)\ + .view(-1, batch_size, self.embed_dims)\ + .permute(1, 0, 2) + query_pos = query_pos.repeat(1, self.num_patterns, 1) + + decoder_inputs_dict = dict( + query_pos=query_pos, query=query, memory=memory) + head_inputs_dict = dict() + return decoder_inputs_dict, head_inputs_dict + + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + memory_mask: Tensor, memory_pos: Tensor) -> Dict: + """Forward with Transformer decoder. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + memory_pos (Tensor): The positional embeddings of memory, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` and `references` of the decoder output. + """ + + hidden_states, references = self.decoder( + query=query, + key=memory, + query_pos=query_pos, + key_pos=memory_pos, + key_padding_mask=memory_mask, + reg_branches=self.bbox_head. + fc_reg # iterative refinement for anchor boxes + ) + head_inputs_dict = dict( + hidden_states=hidden_states, references=references) + return head_inputs_dict diff --git a/head_extractor/build/lib/mmdet/models/detectors/ddod.py b/head_extractor/build/lib/mmdet/models/detectors/ddod.py new file mode 100644 index 0000000000000000000000000000000000000000..3503a40c8eb6d6c0496ea0f31740acecf774113a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/ddod.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class DDOD(SingleStageDetector): + """Implementation of `DDOD <https://arxiv.org/abs/2107.02963>`_. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of DDOD. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of DDOD. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): The config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/ddq_detr.py b/head_extractor/build/lib/mmdet/models/detectors/ddq_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..57d4959d50ddd7a761d5e5c7a29d1f7f233f838a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/ddq_detr.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import torch +from mmcv.ops import MultiScaleDeformableAttention, batched_nms +from torch import Tensor, nn +from torch.nn.init import normal_ + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from mmdet.utils import OptConfigType +from ..layers import DDQTransformerDecoder +from ..utils import align_tensor +from .deformable_detr import DeformableDETR +from .dino import DINO + + +@MODELS.register_module() +class DDQDETR(DINO): + r"""Implementation of `Dense Distinct Query for + End-to-End Object Detection <https://arxiv.org/abs/2303.12776>`_ + + Code is modified from the `official github repo + <https://github.com/jshilong/DDQ>`_. + + Args: + dense_topk_ratio (float): Ratio of num_dense queries to num_queries. + Defaults to 1.5. + dqs_cfg (:obj:`ConfigDict` or dict, optional): Config of + Distinct Queries Selection. Defaults to nms with + `iou_threshold` = 0.8.
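`pre_decoder` in this class selects its distinct queries with `mmcv.ops.batched_nms`, passing a constant class index so the suppression is effectively class-agnostic. A toy sketch of that call (boxes and scores are made up; assumes mmcv with compiled ops):

```python
import torch
from mmcv.ops import batched_nms

boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [0.5, 0.5, 10.5, 10.5],   # near-duplicate of box 0
                      [50.0, 50.0, 60.0, 60.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.ones(len(scores))  # one shared "class": class-agnostic NMS

dets, keep = batched_nms(boxes, scores, idxs,
                         dict(type='nms', iou_threshold=0.8))
print(keep)  # tensor([0, 2]); the near-duplicate is suppressed
```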
+ """ + + def __init__(self, + *args, + dense_topk_ratio: float = 1.5, + dqs_cfg: OptConfigType = dict(type='nms', iou_threshold=0.8), + **kwargs): + self.dense_topk_ratio = dense_topk_ratio + self.decoder_cfg = kwargs['decoder'] + self.dqs_cfg = dqs_cfg + super().__init__(*args, **kwargs) + + # a share dict in all moduls + # pass some intermediate results and config parameters + cache_dict = dict() + for m in self.modules(): + m.cache_dict = cache_dict + # first element is the start index of matching queries + # second element is the number of matching queries + self.cache_dict['dis_query_info'] = [0, 0] + + # mask for distinct queries in each decoder layer + self.cache_dict['distinct_query_mask'] = [] + # pass to decoder do the dqs + self.cache_dict['cls_branches'] = self.bbox_head.cls_branches + # Used to construct the attention mask after dqs + self.cache_dict['num_heads'] = self.encoder.layers[ + 0].self_attn.num_heads + # pass to decoder to do the dqs + self.cache_dict['dqs_cfg'] = self.dqs_cfg + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + super(DDQDETR, self)._init_layers() + self.decoder = DDQTransformerDecoder(**self.decoder_cfg) + self.query_embedding = None + self.query_map = nn.Linear(self.embed_dims, self.embed_dims) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super(DeformableDETR, self).init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + nn.init.xavier_uniform_(self.memory_trans_fc.weight) + normal_(self.level_embed) + + def pre_decoder( + self, + memory: Tensor, + memory_mask: Tensor, + spatial_shapes: Tensor, + batch_data_samples: OptSampleList = None, + ) -> Tuple[Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `memory`, and `reference_points`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). Will only be used when + `as_two_stage` is `True`. + spatial_shapes (Tensor): Spatial shapes of features in all levels. + With shape (num_levels, 2), last dimension represents (h, w). + Will only be used when `as_two_stage` is `True`. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict]: The decoder_inputs_dict and head_inputs_dict. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'memory', + `reference_points`, and `dn_mask`. The reference points of + decoder input here are 4D boxes, although it has `points` + in its name. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which includes `topk_score`, `topk_coords`, + `dense_topk_score`, `dense_topk_coords`, + and `dn_meta`, when `self.training` is `True`, else is empty. 
+ """ + bs, _, c = memory.shape + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't exist + # in DINO. + # -1 is the aux head for the encoder + dense_enc_outputs_class = self.bbox_head.cls_branches[-1]( + output_memory) + dense_enc_outputs_coord_unact = self.bbox_head.reg_branches[-1]( + output_memory) + output_proposals + + topk = self.num_queries + dense_topk = int(topk * self.dense_topk_ratio) + + proposals = enc_outputs_coord_unact.sigmoid() + proposals = bbox_cxcywh_to_xyxy(proposals) + scores = enc_outputs_class.max(-1)[0].sigmoid() + + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't exist + # in DINO. + dense_proposals = dense_enc_outputs_coord_unact.sigmoid() + dense_proposals = bbox_cxcywh_to_xyxy(dense_proposals) + dense_scores = dense_enc_outputs_class.max(-1)[0].sigmoid() + + num_imgs = len(scores) + topk_score = [] + topk_coords_unact = [] + # Distinct query. + query = [] + + dense_topk_score = [] + dense_topk_coords_unact = [] + dense_query = [] + + for img_id in range(num_imgs): + single_proposals = proposals[img_id] + single_scores = scores[img_id] + + # `batched_nms` of class scores and bbox coordinations is used + # particularly by DDQ DETR for region proposal generation, + # instead of `topk` of class scores by DINO. + _, keep_idxs = batched_nms( + single_proposals, single_scores, + torch.ones(len(single_scores), device=single_scores.device), + self.cache_dict['dqs_cfg']) + + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't + # exist in DINO. + dense_single_proposals = dense_proposals[img_id] + dense_single_scores = dense_scores[img_id] + # sort according the score + # Only sort by classification score, neither nms nor topk is + # required. So input parameter `nms_cfg` = None. + _, dense_keep_idxs = batched_nms( + dense_single_proposals, dense_single_scores, + torch.ones( + len(dense_single_scores), + device=dense_single_scores.device), None) + + dense_topk_score.append(dense_enc_outputs_class[img_id] + [dense_keep_idxs][:dense_topk]) + dense_topk_coords_unact.append( + dense_enc_outputs_coord_unact[img_id][dense_keep_idxs] + [:dense_topk]) + + topk_score.append(enc_outputs_class[img_id][keep_idxs][:topk]) + + # Instead of initializing the content part with transformed + # coordinates in Deformable DETR, we fuse the feature map + # embedding of distinct positions as the content part, which + # makes the initial queries more distinct. + topk_coords_unact.append( + enc_outputs_coord_unact[img_id][keep_idxs][:topk]) + + map_memory = self.query_map(memory[img_id].detach()) + query.append(map_memory[keep_idxs][:topk]) + if self.training: + # aux dense branch particularly in DDQ DETR, which doesn't + # exist in DINO. 
+ dense_query.append(map_memory[dense_keep_idxs][:dense_topk]) + + topk_score = align_tensor(topk_score, topk) + topk_coords_unact = align_tensor(topk_coords_unact, topk) + query = align_tensor(query, topk) + if self.training: + dense_topk_score = align_tensor(dense_topk_score) + dense_topk_coords_unact = align_tensor(dense_topk_coords_unact) + + dense_query = align_tensor(dense_query) + num_dense_queries = dense_query.size(1) + if self.training: + query = torch.cat([query, dense_query], dim=1) + topk_coords_unact = torch.cat( + [topk_coords_unact, dense_topk_coords_unact], dim=1) + + topk_coords = topk_coords_unact.sigmoid() + if self.training: + dense_topk_coords = topk_coords[:, -num_dense_queries:] + topk_coords = topk_coords[:, :-num_dense_queries] + + topk_coords_unact = topk_coords_unact.detach() + + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ + self.dn_query_generator(batch_data_samples) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + + # Update `dn_mask` to add mask for dense queries. + ori_size = dn_mask.size(-1) + new_size = dn_mask.size(-1) + num_dense_queries + new_dn_mask = dn_mask.new_ones((new_size, new_size)).bool() + dense_mask = torch.zeros(num_dense_queries, + num_dense_queries).bool() + self.cache_dict['dis_query_info'] = [dn_label_query.size(1), topk] + + new_dn_mask[ori_size:, ori_size:] = dense_mask + new_dn_mask[:ori_size, :ori_size] = dn_mask + dn_meta['num_dense_queries'] = num_dense_queries + dn_mask = new_dn_mask + self.cache_dict['num_dense_queries'] = num_dense_queries + self.decoder.aux_reg_branches = self.bbox_head.aux_reg_branches + + else: + self.cache_dict['dis_query_info'] = [0, topk] + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + + reference_points = reference_points.sigmoid() + + decoder_inputs_dict = dict( + query=query, + memory=memory, + reference_points=reference_points, + dn_mask=dn_mask) + head_inputs_dict = dict( + enc_outputs_class=topk_score, + enc_outputs_coord=topk_coords, + aux_enc_outputs_class=dense_topk_score, + aux_enc_outputs_coord=dense_topk_coords, + dn_meta=dn_meta) if self.training else dict() + + return decoder_inputs_dict, head_inputs_dict diff --git a/head_extractor/build/lib/mmdet/models/detectors/deformable_detr.py b/head_extractor/build/lib/mmdet/models/detectors/deformable_detr.py new file mode 100644 index 0000000000000000000000000000000000000000..0eb5cd2f95204542d5a9ace1a6d92e0b858c139f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/deformable_detr.py @@ -0,0 +1,572 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Dict, Tuple + +import torch +import torch.nn.functional as F +from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention +from mmengine.model import xavier_init +from torch import Tensor, nn +from torch.nn.init import normal_ + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from mmdet.utils import OptConfigType +from ..layers import (DeformableDetrTransformerDecoder, + DeformableDetrTransformerEncoder, SinePositionalEncoding) +from .base_detr import DetectionTransformer + + +@MODELS.register_module() +class DeformableDETR(DetectionTransformer): + r"""Implementation of `Deformable DETR: Deformable Transformers for + End-to-End Object Detection `_ + + Code is modified from the `official github repo + `_. 
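The `bbox_head` patching in `__init__` below encodes a small but important rule: two-stage mode needs one extra prediction layer for the encoder proposals, and the layers share weights only when box refinement is off. Restated as plain Python for clarity (the helper name is ours, not mmdet's):

```python
def head_pred_layout(num_decoder_layers: int,
                     with_box_refine: bool,
                     as_two_stage: bool) -> dict:
    # Mirrors the keys DeformableDETR.__init__ injects into bbox_head.
    return dict(
        share_pred_layer=not with_box_refine,
        num_pred_layer=(num_decoder_layers + 1
                        if as_two_stage else num_decoder_layers),
        as_two_stage=as_two_stage)


print(head_pred_layout(6, with_box_refine=True, as_two_stage=True))
# {'share_pred_layer': False, 'num_pred_layer': 7, 'as_two_stage': True}
```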
+ + Args: + decoder (:obj:`ConfigDict` or dict, optional): Config of the + Transformer decoder. Defaults to None. + bbox_head (:obj:`ConfigDict` or dict, optional): Config for the + bounding box head module. Defaults to None. + with_box_refine (bool, optional): Whether to refine the references + in the decoder. Defaults to `False`. + as_two_stage (bool, optional): Whether to generate the proposal + from the outputs of encoder. Defaults to `False`. + num_feature_levels (int, optional): Number of feature levels. + Defaults to 4. + """ + + def __init__(self, + *args, + decoder: OptConfigType = None, + bbox_head: OptConfigType = None, + with_box_refine: bool = False, + as_two_stage: bool = False, + num_feature_levels: int = 4, + **kwargs) -> None: + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + + if bbox_head is not None: + assert 'share_pred_layer' not in bbox_head and \ + 'num_pred_layer' not in bbox_head and \ + 'as_two_stage' not in bbox_head, \ + 'The three keyword args `share_pred_layer`, `num_pred_layer`, ' \ + 'and `as_two_stage` are set in `detector.__init__()`, users ' \ + 'should not set them in `bbox_head` config.' + # The last prediction layer is used to generate proposals + # from the encoded feature map when `as_two_stage` is `True`. + # And all the prediction layers should share parameters + # when `with_box_refine` is `True`. + bbox_head['share_pred_layer'] = not with_box_refine + bbox_head['num_pred_layer'] = (decoder['num_layers'] + 1) \ + if self.as_two_stage else decoder['num_layers'] + bbox_head['as_two_stage'] = as_two_stage + + super().__init__(*args, decoder=decoder, bbox_head=bbox_head, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DeformableDetrTransformerEncoder(**self.encoder) + self.decoder = DeformableDetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + if not self.as_two_stage: + self.query_embedding = nn.Embedding(self.num_queries, + self.embed_dims * 2) + # NOTE The query_embedding will be split into query and query_pos + # in self.pre_decoder, hence, the embed_dims are doubled. + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + 'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + self.level_embed = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + + if self.as_two_stage: + self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims) + self.memory_trans_norm = nn.LayerNorm(self.embed_dims) + self.pos_trans_fc = nn.Linear(self.embed_dims * 2, + self.embed_dims * 2) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) + else: + self.reference_points_fc = nn.Linear(self.embed_dims, 2) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super().init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + if self.as_two_stage: + nn.init.xavier_uniform_(self.memory_trans_fc.weight) + nn.init.xavier_uniform_(self.pos_trans_fc.weight) + else: + xavier_init( + self.reference_points_fc, distribution='uniform', bias=0.)
+ normal_(self.level_embed) + + def pre_transformer( + self, + mlvl_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Tuple[Dict]: + """Process image features before feeding them to the transformer. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + mlvl_feats (tuple[Tensor]): Multi-level features that may have + different resolutions, output from neck. Each feature has + shape (bs, dim, h_lvl, w_lvl), where 'lvl' means 'layer'. + batch_data_samples (list[:obj:`DetDataSample`], optional): The + batch data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict]: The first dict contains the inputs of encoder and the + second dict contains the inputs of decoder. + + - encoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_encoder()`, which includes 'feat', 'feat_mask', + and 'feat_pos'. + - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'memory_mask'. + """ + batch_size = mlvl_feats[0].size(0) + + # construct binary masks for the transformer. + assert batch_data_samples is not None + batch_input_shape = batch_data_samples[0].batch_input_shape + input_img_h, input_img_w = batch_input_shape + img_shape_list = [sample.img_shape for sample in batch_data_samples] + same_shape_flag = all([ + s[0] == input_img_h and s[1] == input_img_w for s in img_shape_list + ]) + # support torch2onnx without feeding masks + if torch.onnx.is_in_onnx_export() or same_shape_flag: + mlvl_masks = [] + mlvl_pos_embeds = [] + for feat in mlvl_feats: + mlvl_masks.append(None) + mlvl_pos_embeds.append( + self.positional_encoding(None, input=feat)) + else: + masks = mlvl_feats[0].new_ones( + (batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w = img_shape_list[img_id] + masks[img_id, :img_h, :img_w] = 0 + # NOTE following the official DETR repo, non-zero + # values representing ignored positions, while + # zero values means valid positions. 
+ + mlvl_masks = [] + mlvl_pos_embeds = [] + for feat in mlvl_feats: + mlvl_masks.append( + F.interpolate(masks[None], size=feat.shape[-2:]).to( + torch.bool).squeeze(0)) + mlvl_pos_embeds.append( + self.positional_encoding(mlvl_masks[-1])) + + feat_flatten = [] + lvl_pos_embed_flatten = [] + mask_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + batch_size, c, h, w = feat.shape + spatial_shape = torch._shape_as_tensor(feat)[2:].to(feat.device) + # [bs, c, h_lvl, w_lvl] -> [bs, h_lvl*w_lvl, c] + feat = feat.view(batch_size, c, -1).permute(0, 2, 1) + pos_embed = pos_embed.view(batch_size, c, -1).permute(0, 2, 1) + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + # [bs, h_lvl, w_lvl] -> [bs, h_lvl*w_lvl] + if mask is not None: + mask = mask.flatten(1) + + feat_flatten.append(feat) + lvl_pos_embed_flatten.append(lvl_pos_embed) + mask_flatten.append(mask) + spatial_shapes.append(spatial_shape) + + # (bs, num_feat_points, dim) + feat_flatten = torch.cat(feat_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + # (bs, num_feat_points), where num_feat_points = sum_lvl(h_lvl*w_lvl) + if mask_flatten[0] is not None: + mask_flatten = torch.cat(mask_flatten, 1) + else: + mask_flatten = None + + # (num_level, 2) + spatial_shapes = torch.cat(spatial_shapes).view(-1, 2) + level_start_index = torch.cat(( + spatial_shapes.new_zeros((1, )), # (num_level) + spatial_shapes.prod(1).cumsum(0)[:-1])) + if mlvl_masks[0] is not None: + valid_ratios = torch.stack( # (bs, num_level, 2) + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + else: + valid_ratios = mlvl_feats[0].new_ones(batch_size, len(mlvl_feats), + 2) + + encoder_inputs_dict = dict( + feat=feat_flatten, + feat_mask=mask_flatten, + feat_pos=lvl_pos_embed_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios) + decoder_inputs_dict = dict( + memory_mask=mask_flatten, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios) + return encoder_inputs_dict, decoder_inputs_dict + + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor) -> Dict: + """Forward with Transformer encoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + feat (Tensor): Sequential features, has shape (bs, num_feat_points, + dim). + feat_mask (Tensor): ByteTensor, the padding mask of the features, + has shape (bs, num_feat_points). + feat_pos (Tensor): The positional embeddings of the features, has + shape (bs, num_feat_points, dim). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + + Returns: + dict: The dictionary of encoder outputs, which includes the + `memory` of the encoder output. 
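`pre_transformer` above flattens every pyramid level into one token sequence, and `level_start_index` records where each level begins so downstream deformable attention can address the levels inside the flat sequence. The computation in isolation, with assumed level sizes:

```python
import torch

# Assumed (h, w) per level for a 4-level pyramid.
spatial_shapes = torch.tensor([[64, 64], [32, 32], [16, 16], [8, 8]])
level_start_index = torch.cat((
    spatial_shapes.new_zeros((1, )),
    spatial_shapes.prod(1).cumsum(0)[:-1]))

print(level_start_index)             # tensor([   0, 4096, 5120, 5376])
print(spatial_shapes.prod(1).sum())  # tensor(5440) tokens in total
```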
+ """ + memory = self.encoder( + query=feat, + query_pos=feat_pos, + key_padding_mask=feat_mask, # for self_attn + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios) + encoder_outputs_dict = dict( + memory=memory, + memory_mask=feat_mask, + spatial_shapes=spatial_shapes) + return encoder_outputs_dict + + def pre_decoder(self, memory: Tensor, memory_mask: Tensor, + spatial_shapes: Tensor) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`, and `reference_points`. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). It will only be used when + `as_two_stage` is `True`. + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + It will only be used when `as_two_stage` is `True`. + + Returns: + tuple[dict, dict]: The decoder_inputs_dict and head_inputs_dict. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'query_pos', + 'memory', and `reference_points`. The reference_points of + decoder input here are 4D boxes when `as_two_stage` is `True`, + otherwise 2D points, although it has `points` in its name. + The reference_points in encoder is always 2D points. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which includes `enc_outputs_class` and + `enc_outputs_coord`. They are both `None` when 'as_two_stage' + is `False`. The dict is empty when `self.training` is `False`. + """ + batch_size, _, c = memory.shape + if self.as_two_stage: + output_memory, output_proposals = \ + self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + enc_outputs_coord = enc_outputs_coord_unact.sigmoid() + # We only use the first channel in enc_outputs_class as foreground, + # the other (num_classes - 1) channels are actually not used. + # Its targets are set to be 0s, which indicates the first + # class (foreground) because we use [0, num_classes - 1] to + # indicate class labels, background class is indicated by + # num_classes (similar convention in RPN). + # See https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/deformable_detr_head.py#L241 # noqa + # This follows the official implementation of Deformable DETR. 
+ topk_proposals = torch.topk( + enc_outputs_class[..., 0], self.num_queries, dim=1)[1] + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = topk_coords_unact.detach() + reference_points = topk_coords_unact.sigmoid() + pos_trans_out = self.pos_trans_fc( + self.get_proposal_pos_embed(topk_coords_unact)) + pos_trans_out = self.pos_trans_norm(pos_trans_out) + query_pos, query = torch.split(pos_trans_out, c, dim=2) + else: + enc_outputs_class, enc_outputs_coord = None, None + query_embed = self.query_embedding.weight + query_pos, query = torch.split(query_embed, c, dim=1) + query_pos = query_pos.unsqueeze(0).expand(batch_size, -1, -1) + query = query.unsqueeze(0).expand(batch_size, -1, -1) + reference_points = self.reference_points_fc(query_pos).sigmoid() + + decoder_inputs_dict = dict( + query=query, + query_pos=query_pos, + memory=memory, + reference_points=reference_points) + head_inputs_dict = dict( + enc_outputs_class=enc_outputs_class, + enc_outputs_coord=enc_outputs_coord) if self.training else dict() + return decoder_inputs_dict, head_inputs_dict + + def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor, + memory_mask: Tensor, reference_points: Tensor, + spatial_shapes: Tensor, level_start_index: Tensor, + valid_ratios: Tensor) -> Dict: + """Forward with Transformer decoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional queries of decoder inputs, + has shape (bs, num_queries, dim). + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h) when `as_two_stage` is `True`, otherwise has + shape (bs, num_queries, 2) with the last dimension arranged as + (cx, cy). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` of the decoder output and `references` including + the initial and intermediate reference_points. 
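The two-stage branch above scores every encoder token with the extra classification layer and keeps the top `num_queries` proposals via `topk` plus `gather`. The indexing pattern in isolation (shapes are illustrative):

```python
import torch

bs, num_tokens, num_queries = 2, 5440, 300
enc_cls = torch.randn(bs, num_tokens, 80)   # per-token class logits
enc_coord = torch.randn(bs, num_tokens, 4)  # per-token box logits

# Channel 0 alone acts as the objectness score (see the comment above).
topk_idx = torch.topk(enc_cls[..., 0], num_queries, dim=1)[1]
topk_coords = torch.gather(
    enc_coord, 1, topk_idx.unsqueeze(-1).repeat(1, 1, 4))
print(topk_coords.shape)  # torch.Size([2, 300, 4])
```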
+ """ + inter_states, inter_references = self.decoder( + query=query, + value=memory, + query_pos=query_pos, + key_padding_mask=memory_mask, # for cross_attn + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=self.bbox_head.reg_branches + if self.with_box_refine else None) + references = [reference_points, *inter_references] + decoder_outputs_dict = dict( + hidden_states=inter_states, references=references) + return decoder_outputs_dict + + @staticmethod + def get_valid_ratio(mask: Tensor) -> Tensor: + """Get the valid radios of feature map in a level. + + .. code:: text + + |---> valid_W <---| + ---+-----------------+-----+--- + A | | | A + | | | | | + | | | | | + valid_H | | | | + | | | | H + | | | | | + V | | | | + ---+-----------------+ | | + | | V + +-----------------------+--- + |---------> W <---------| + + The valid_ratios are defined as: + r_h = valid_H / H, r_w = valid_W / W + They are the factors to re-normalize the relative coordinates of the + image to the relative coordinates of the current level feature map. + + Args: + mask (Tensor): Binary mask of a feature map, has shape (bs, H, W). + + Returns: + Tensor: valid ratios [r_w, r_h] of a feature map, has shape (1, 2). + """ + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def gen_encoder_output_proposals( + self, memory: Tensor, memory_mask: Tensor, + spatial_shapes: Tensor) -> Tuple[Tensor, Tensor]: + """Generate proposals from encoded memory. The function will only be + used when `as_two_stage` is `True`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + + Returns: + tuple: A tuple of transformed memory and proposals. + + - output_memory (Tensor): The transformed memory for obtaining + top-k proposals, has shape (bs, num_feat_points, dim). + - output_proposals (Tensor): The inverse-normalized proposal, has + shape (batch_size, num_keys, 4) with the last dimension arranged + as (cx, cy, w, h). 
+ """ + + bs = memory.size(0) + proposals = [] + _cur = 0 # start index in the sequence of the current level + for lvl, HW in enumerate(spatial_shapes): + H, W = HW + + if memory_mask is not None: + mask_flatten_ = memory_mask[:, _cur:(_cur + H * W)].view( + bs, H, W, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], + 1).unsqueeze(-1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], + 1).unsqueeze(-1) + scale = torch.cat([valid_W, valid_H], 1).view(bs, 1, 1, 2) + else: + if not isinstance(HW, torch.Tensor): + HW = memory.new_tensor(HW) + scale = HW.unsqueeze(0).flip(dims=[0, 1]).view(1, 1, 1, 2) + grid_y, grid_x = torch.meshgrid( + torch.linspace( + 0, H - 1, H, dtype=torch.float32, device=memory.device), + torch.linspace( + 0, W - 1, W, dtype=torch.float32, device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + grid = (grid.unsqueeze(0).expand(bs, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + proposal = torch.cat((grid, wh), -1).view(bs, -1, 4) + proposals.append(proposal) + _cur += (H * W) + output_proposals = torch.cat(proposals, 1) + # do not use `all` to make it exportable to onnx + output_proposals_valid = ( + (output_proposals > 0.01) & (output_proposals < 0.99)).sum( + -1, keepdim=True) == output_proposals.shape[-1] + # inverse_sigmoid + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + if memory_mask is not None: + output_proposals = output_proposals.masked_fill( + memory_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill( + ~output_proposals_valid, float('inf')) + + output_memory = memory + if memory_mask is not None: + output_memory = output_memory.masked_fill( + memory_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, + float(0)) + output_memory = self.memory_trans_fc(output_memory) + output_memory = self.memory_trans_norm(output_memory) + # [bs, sum(hw), 2] + return output_memory, output_proposals + + @staticmethod + def get_proposal_pos_embed(proposals: Tensor, + num_pos_feats: int = 128, + temperature: int = 10000) -> Tensor: + """Get the position embedding of the proposal. + + Args: + proposals (Tensor): Not normalized proposals, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + num_pos_feats (int, optional): The feature dimension for each + position along x, y, w, and h-axis. Note the final returned + dimension for each position is 4 times of num_pos_feats. + Default to 128. + temperature (int, optional): The temperature used for scaling the + position embedding. Defaults to 10000. 
+ + Returns: + Tensor: The position embedding of proposal, has shape + (bs, num_queries, num_pos_feats * 4), with the last dimension + arranged as (cx, cy, w, h) + """ + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = proposals.sigmoid() * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), + dim=4).flatten(2) + return pos diff --git a/head_extractor/build/lib/mmdet/models/detectors/detr.py b/head_extractor/build/lib/mmdet/models/detectors/detr.py new file mode 100644 index 0000000000000000000000000000000000000000..7895e9ecb4eb66cb75d173c191c2128c3f55c197 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/detr.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Tuple + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList +from ..layers import (DetrTransformerDecoder, DetrTransformerEncoder, + SinePositionalEncoding) +from .base_detr import DetectionTransformer + + +@MODELS.register_module() +class DETR(DetectionTransformer): + r"""Implementation of `DETR: End-to-End Object Detection with Transformers. + + `_. + + Code is modified from the `official github repo + `_. + """ + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DetrTransformerEncoder(**self.encoder) + self.decoder = DetrTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + # NOTE The embed_dims is typically passed from the inside out. + # For example in DETR, The embed_dims is passed as + # self_attn -> the first encoder layer -> encoder -> detector. + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + 'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super().init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def pre_transformer( + self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None) -> Tuple[Dict, Dict]: + """Prepare the inputs of the Transformer. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + img_feats (Tuple[Tensor]): Tuple of features output from the neck, + has shape (bs, c, h, w). + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such as + `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict, dict]: The first dict contains the inputs of encoder + and the second dict contains the inputs of decoder. + + - encoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_encoder()`, which includes 'feat', 'feat_mask', + and 'feat_pos'. 
+ - decoder_inputs_dict (dict): The keyword args dictionary of + `self.forward_decoder()`, which includes 'memory_mask', + and 'memory_pos'. + """ + + feat = img_feats[-1] # NOTE img_feats contains only one feature. + batch_size, feat_dim, _, _ = feat.shape + # construct binary masks which for the transformer. + assert batch_data_samples is not None + batch_input_shape = batch_data_samples[0].batch_input_shape + input_img_h, input_img_w = batch_input_shape + img_shape_list = [sample.img_shape for sample in batch_data_samples] + same_shape_flag = all([ + s[0] == input_img_h and s[1] == input_img_w for s in img_shape_list + ]) + if torch.onnx.is_in_onnx_export() or same_shape_flag: + masks = None + # [batch_size, embed_dim, h, w] + pos_embed = self.positional_encoding(masks, input=feat) + else: + masks = feat.new_ones((batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w = img_shape_list[img_id] + masks[img_id, :img_h, :img_w] = 0 + # NOTE following the official DETR repo, non-zero values represent + # ignored positions, while zero values mean valid positions. + + masks = F.interpolate( + masks.unsqueeze(1), + size=feat.shape[-2:]).to(torch.bool).squeeze(1) + # [batch_size, embed_dim, h, w] + pos_embed = self.positional_encoding(masks) + + # use `view` instead of `flatten` for dynamically exporting to ONNX + # [bs, c, h, w] -> [bs, h*w, c] + feat = feat.view(batch_size, feat_dim, -1).permute(0, 2, 1) + pos_embed = pos_embed.view(batch_size, feat_dim, -1).permute(0, 2, 1) + # [bs, h, w] -> [bs, h*w] + if masks is not None: + masks = masks.view(batch_size, -1) + + # prepare transformer_inputs_dict + encoder_inputs_dict = dict( + feat=feat, feat_mask=masks, feat_pos=pos_embed) + decoder_inputs_dict = dict(memory_mask=masks, memory_pos=pos_embed) + return encoder_inputs_dict, decoder_inputs_dict + + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor) -> Dict: + """Forward with Transformer encoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + feat (Tensor): Sequential features, has shape (bs, num_feat_points, + dim). + feat_mask (Tensor): ByteTensor, the padding mask of the features, + has shape (bs, num_feat_points). + feat_pos (Tensor): The positional embeddings of the features, has + shape (bs, num_feat_points, dim). + + Returns: + dict: The dictionary of encoder outputs, which includes the + `memory` of the encoder output. + """ + memory = self.encoder( + query=feat, query_pos=feat_pos, + key_padding_mask=feat_mask) # for self_attn + encoder_outputs_dict = dict(memory=memory) + return encoder_outputs_dict + + def pre_decoder(self, memory: Tensor) -> Tuple[Dict, Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + + Returns: + tuple[dict, dict]: The first dict contains the inputs of decoder + and the second dict contains the inputs of the bbox_head function. 
+
+        - decoder_inputs_dict (dict): The keyword args dictionary of
+          `self.forward_decoder()`, which includes 'query', 'query_pos',
+          'memory'.
+        - head_inputs_dict (dict): The keyword args dictionary of the
+          bbox_head functions, which is usually empty, or includes
+          `enc_outputs_class` and `enc_outputs_coord` when the detector
+          supports 'two stage' or 'query selection' strategies.
+        """
+
+        batch_size = memory.size(0)  # (bs, num_feat_points, dim)
+        query_pos = self.query_embedding.weight
+        # (num_queries, dim) -> (bs, num_queries, dim)
+        query_pos = query_pos.unsqueeze(0).repeat(batch_size, 1, 1)
+        query = torch.zeros_like(query_pos)
+
+        decoder_inputs_dict = dict(
+            query_pos=query_pos, query=query, memory=memory)
+        head_inputs_dict = dict()
+        return decoder_inputs_dict, head_inputs_dict
+
+    def forward_decoder(self, query: Tensor, query_pos: Tensor, memory: Tensor,
+                        memory_mask: Tensor, memory_pos: Tensor) -> Dict:
+        """Forward with Transformer decoder.
+
+        The forward procedure of the transformer is defined as:
+        'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder'
+        More details can be found at `TransformerDetector.forward_transformer`
+        in `mmdet/detector/base_detr.py`.
+
+        Args:
+            query (Tensor): The queries of decoder inputs, has shape
+                (bs, num_queries, dim).
+            query_pos (Tensor): The positional queries of decoder inputs,
+                has shape (bs, num_queries, dim).
+            memory (Tensor): The output embeddings of the Transformer encoder,
+                has shape (bs, num_feat_points, dim).
+            memory_mask (Tensor): ByteTensor, the padding mask of the memory,
+                has shape (bs, num_feat_points).
+            memory_pos (Tensor): The positional embeddings of memory, has
+                shape (bs, num_feat_points, dim).
+
+        Returns:
+            dict: The dictionary of decoder outputs, which includes the
+            `hidden_states` of the decoder output.
+
+            - hidden_states (Tensor): Has shape
+              (num_decoder_layers, bs, num_queries, dim)
+        """
+
+        hidden_states = self.decoder(
+            query=query,
+            key=memory,
+            value=memory,
+            query_pos=query_pos,
+            key_pos=memory_pos,
+            key_padding_mask=memory_mask)  # for cross_attn
+
+        head_inputs_dict = dict(hidden_states=hidden_states)
+        return head_inputs_dict
diff --git a/head_extractor/build/lib/mmdet/models/detectors/dino.py b/head_extractor/build/lib/mmdet/models/detectors/dino.py
new file mode 100644
index 0000000000000000000000000000000000000000..ade47f531d27246511cafc2997a07d58677538a7
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/detectors/dino.py
@@ -0,0 +1,287 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple
+
+import torch
+from torch import Tensor, nn
+from torch.nn.init import normal_
+
+from mmdet.registry import MODELS
+from mmdet.structures import OptSampleList
+from mmdet.utils import OptConfigType
+from ..layers import (CdnQueryGenerator, DeformableDetrTransformerEncoder,
+                      DinoTransformerDecoder, SinePositionalEncoding)
+from .deformable_detr import DeformableDETR, MultiScaleDeformableAttention
+
+
+@MODELS.register_module()
+class DINO(DeformableDETR):
+    r"""Implementation of `DINO: DETR with Improved DeNoising Anchor Boxes
+    for End-to-End Object Detection <https://arxiv.org/abs/2203.03605>`_
+
+    Code is modified from the `official github repo
+    <https://github.com/IDEA-Research/DINO>`_.
+
+    Args:
+        dn_cfg (:obj:`ConfigDict` or dict, optional): Config of denoising
+            query generator. Defaults to `None`.
+ """ + + def __init__(self, *args, dn_cfg: OptConfigType = None, **kwargs) -> None: + super().__init__(*args, **kwargs) + assert self.as_two_stage, 'as_two_stage must be True for DINO' + assert self.with_box_refine, 'with_box_refine must be True for DINO' + + if dn_cfg is not None: + assert 'num_classes' not in dn_cfg and \ + 'num_queries' not in dn_cfg and \ + 'hidden_dim' not in dn_cfg, \ + 'The three keyword args `num_classes`, `embed_dims`, and ' \ + '`num_matching_queries` are set in `detector.__init__()`, ' \ + 'users should not set them in `dn_cfg` config.' + dn_cfg['num_classes'] = self.bbox_head.num_classes + dn_cfg['embed_dims'] = self.embed_dims + dn_cfg['num_matching_queries'] = self.num_queries + self.dn_query_generator = CdnQueryGenerator(**dn_cfg) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = DeformableDetrTransformerEncoder(**self.encoder) + self.decoder = DinoTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + # NOTE In DINO, the query_embedding only contains content + # queries, while in Deformable DETR, the query_embedding + # contains both content and spatial queries, and in DETR, + # it only contains spatial queries. + + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' + + self.level_embed = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims) + self.memory_trans_norm = nn.LayerNorm(self.embed_dims) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super(DeformableDETR, self).init_weights() + for coder in self.encoder, self.decoder: + for p in coder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + nn.init.xavier_uniform_(self.memory_trans_fc.weight) + nn.init.xavier_uniform_(self.query_embedding.weight) + normal_(self.level_embed) + + def forward_transformer( + self, + img_feats: Tuple[Tensor], + batch_data_samples: OptSampleList = None, + ) -> Dict: + """Forward process of Transformer. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + The difference is that the ground truth in `batch_data_samples` is + required for the `pre_decoder` to prepare the query of DINO. + Additionally, DINO inherits the `pre_transformer` method and the + `forward_encoder` method of DeformableDETR. More details about the + two methods can be found in `mmdet/detector/deformable_detr.py`. + + Args: + img_feats (tuple[Tensor]): Tuple of feature maps from neck. Each + feature map has shape (bs, dim, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. 
+ + Returns: + dict: The dictionary of bbox_head function inputs, which always + includes the `hidden_states` of the decoder output and may contain + `references` including the initial and intermediate references. + """ + encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( + img_feats, batch_data_samples) + + encoder_outputs_dict = self.forward_encoder(**encoder_inputs_dict) + + tmp_dec_in, head_inputs_dict = self.pre_decoder( + **encoder_outputs_dict, batch_data_samples=batch_data_samples) + decoder_inputs_dict.update(tmp_dec_in) + + decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) + head_inputs_dict.update(decoder_outputs_dict) + return head_inputs_dict + + def pre_decoder( + self, + memory: Tensor, + memory_mask: Tensor, + spatial_shapes: Tensor, + batch_data_samples: OptSampleList = None, + ) -> Tuple[Dict]: + """Prepare intermediate variables before entering Transformer decoder, + such as `query`, `query_pos`, and `reference_points`. + + Args: + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). Will only be used when + `as_two_stage` is `True`. + spatial_shapes (Tensor): Spatial shapes of features in all levels. + With shape (num_levels, 2), last dimension represents (h, w). + Will only be used when `as_two_stage` is `True`. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + Defaults to None. + + Returns: + tuple[dict]: The decoder_inputs_dict and head_inputs_dict. + + - decoder_inputs_dict (dict): The keyword dictionary args of + `self.forward_decoder()`, which includes 'query', 'memory', + `reference_points`, and `dn_mask`. The reference points of + decoder input here are 4D boxes, although it has `points` + in its name. + - head_inputs_dict (dict): The keyword dictionary args of the + bbox_head functions, which includes `topk_score`, `topk_coords`, + and `dn_meta` when `self.training` is `True`, else is empty. + """ + bs, _, c = memory.shape + cls_out_features = self.bbox_head.cls_branches[ + self.decoder.num_layers].out_features + + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + # NOTE The DINO selects top-k proposals according to scores of + # multi-class classification, while DeformDETR, where the input + # is `enc_outputs_class[..., 0]` selects according to scores of + # binary classification. 
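+        # Shape sketch for the top-k selection below (illustrative values
+        # assumed: bs=2, num_proposals=N, num_classes=80, num_queries=900):
+        #   enc_outputs_class: (2, N, 80) --max(-1)[0]--> scores (2, N)
+        #   topk_indices: (2, 900); gathering along dim=1 then yields
+        #   topk_score (2, 900, 80) and topk_coords_unact (2, 900, 4),
+        #   whose sigmoid gives normalized (cx, cy, w, h) boxes.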
+ topk_indices = torch.topk( + enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1] + topk_score = torch.gather( + enc_outputs_class, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + query = self.query_embedding.weight[:, None, :] + query = query.repeat(1, bs, 1).transpose(0, 1) + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ + self.dn_query_generator(batch_data_samples) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + else: + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + reference_points = reference_points.sigmoid() + + decoder_inputs_dict = dict( + query=query, + memory=memory, + reference_points=reference_points, + dn_mask=dn_mask) + # NOTE DINO calculates encoder losses on scores and coordinates + # of selected top-k encoder queries, while DeformDETR is of all + # encoder queries. + head_inputs_dict = dict( + enc_outputs_class=topk_score, + enc_outputs_coord=topk_coords, + dn_meta=dn_meta) if self.training else dict() + return decoder_inputs_dict, head_inputs_dict + + def forward_decoder(self, + query: Tensor, + memory: Tensor, + memory_mask: Tensor, + reference_points: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + dn_mask: Optional[Tensor] = None, + **kwargs) -> Dict: + """Forward with Transformer decoder. + + The forward procedure of the transformer is defined as: + 'pre_transformer' -> 'encoder' -> 'pre_decoder' -> 'decoder' + More details can be found at `TransformerDetector.forward_transformer` + in `mmdet/detector/base_detr.py`. + + Args: + query (Tensor): The queries of decoder inputs, has shape + (bs, num_queries_total, dim), where `num_queries_total` is the + sum of `num_denoising_queries` and `num_matching_queries` when + `self.training` is `True`, else `num_matching_queries`. + memory (Tensor): The output embeddings of the Transformer encoder, + has shape (bs, num_feat_points, dim). + memory_mask (Tensor): ByteTensor, the padding mask of the memory, + has shape (bs, num_feat_points). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries_total, 4) with the last dimension arranged as + (cx, cy, w, h). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + dn_mask (Tensor, optional): The attention mask to prevent + information leakage from different denoising groups and + matching parts, will be used as `self_attn_mask` of the + `self.decoder`, has shape (num_queries_total, + num_queries_total). + It is `None` when `self.training` is `False`. + + Returns: + dict: The dictionary of decoder outputs, which includes the + `hidden_states` of the decoder output and `references` including + the initial and intermediate reference_points. 
+        """
+        inter_states, references = self.decoder(
+            query=query,
+            value=memory,
+            key_padding_mask=memory_mask,
+            self_attn_mask=dn_mask,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            reg_branches=self.bbox_head.reg_branches,
+            **kwargs)
+
+        if len(query) == self.num_queries:
+            # NOTE: This is to make sure the label_embedding is involved in
+            # producing the loss even if there is no denoising query (no
+            # ground-truth target on this GPU); otherwise this will raise a
+            # runtime error in distributed training.
+            inter_states[0] += \
+                self.dn_query_generator.label_embedding.weight[0, 0] * 0.0
+
+        decoder_outputs_dict = dict(
+            hidden_states=inter_states, references=list(references))
+        return decoder_outputs_dict
diff --git a/head_extractor/build/lib/mmdet/models/detectors/fast_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/fast_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b39050fdc2989eb5c870704e1c1417987d53d46
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/detectors/fast_rcnn.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class FastRCNN(TwoStageDetector):
+    """Implementation of `Fast R-CNN <https://arxiv.org/abs/1504.08083>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
diff --git a/head_extractor/build/lib/mmdet/models/detectors/faster_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/faster_rcnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..36109e3200a2d8e7d8a1032f7028e47a7699fb6a
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/detectors/faster_rcnn.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .two_stage import TwoStageDetector
+
+
+@MODELS.register_module()
+class FasterRCNN(TwoStageDetector):
+    """Implementation of `Faster R-CNN <https://arxiv.org/abs/1506.01497>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 rpn_head: ConfigType,
+                 roi_head: ConfigType,
+                 train_cfg: ConfigType,
+                 test_cfg: ConfigType,
+                 neck: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            rpn_head=rpn_head,
+            roi_head=roi_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg,
+            data_preprocessor=data_preprocessor)
diff --git a/head_extractor/build/lib/mmdet/models/detectors/fcos.py b/head_extractor/build/lib/mmdet/models/detectors/fcos.py
new file mode 100644
index 0000000000000000000000000000000000000000..c628059313ac80644ec2ba2c806e7baf2e418a41
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/detectors/fcos.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
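+# NOTE FCOS ('FCOS: Fully Convolutional One-Stage Object Detection',
+# arXiv:1904.01355) is anchor-free; the class below only wires its configs
+# into SingleStageDetector, and the detection logic lives in the bbox_head.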
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class FCOS(SingleStageDetector): + """Implementation of `FCOS `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of FCOS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of FCOS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/fovea.py b/head_extractor/build/lib/mmdet/models/detectors/fovea.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4f21caa239147e3b81e66280aa1da043715b42 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/fovea.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class FOVEA(SingleStageDetector): + """Implementation of `FoveaBox `_ + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of FOVEA. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of FOVEA. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/fsaf.py b/head_extractor/build/lib/mmdet/models/detectors/fsaf.py new file mode 100644 index 0000000000000000000000000000000000000000..01b40273341f2a85cfa427f8adfc945a1b7da58a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/fsaf.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
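+# NOTE FSAF ('Feature Selective Anchor-Free Module for Single-Shot Object
+# Detection', arXiv:1903.00621) follows the same thin config-driven wrapper
+# pattern as the other single-stage detectors in this file set.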
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class FSAF(SingleStageDetector): + """Implementation of `FSAF `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/gfl.py b/head_extractor/build/lib/mmdet/models/detectors/gfl.py new file mode 100644 index 0000000000000000000000000000000000000000..c26821af68c224d4b55a1ca3d2be4c6e1d1b155d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/gfl.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class GFL(SingleStageDetector): + """Implementation of `GFL `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of GFL. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of GFL. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/glip.py b/head_extractor/build/lib/mmdet/models/detectors/glip.py new file mode 100644 index 0000000000000000000000000000000000000000..45cfe7d39fd7b8d9e9bc37c49fe369ff87bc68d9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/glip.py @@ -0,0 +1,590 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import re +import warnings +from typing import Optional, Tuple, Union + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +def find_noun_phrases(caption: str) -> list: + """Find noun phrases in a caption using nltk. + Args: + caption (str): The caption to analyze. + + Returns: + list: List of noun phrases found in the caption. 
+
+    Examples:
+        >>> caption = 'There is two cat and a remote in the picture'
+        >>> find_noun_phrases(caption)  # ['cat', 'a remote', 'the picture']
+    """
+    try:
+        import nltk
+        nltk.download('punkt', download_dir='~/nltk_data')
+        nltk.download('averaged_perceptron_tagger', download_dir='~/nltk_data')
+    except ImportError:
+        raise RuntimeError('nltk is not installed, please install it by: '
+                           'pip install nltk.')
+
+    caption = caption.lower()
+    tokens = nltk.word_tokenize(caption)
+    pos_tags = nltk.pos_tag(tokens)
+
+    grammar = 'NP: {<DT>?<JJ.*>*<NN.*>+}'
+    cp = nltk.RegexpParser(grammar)
+    result = cp.parse(pos_tags)
+
+    noun_phrases = []
+    for subtree in result.subtrees():
+        if subtree.label() == 'NP':
+            noun_phrases.append(' '.join(t[0] for t in subtree.leaves()))
+
+    return noun_phrases
+
+
+def remove_punctuation(text: str) -> str:
+    """Remove punctuation from a text.
+    Args:
+        text (str): The input text.
+
+    Returns:
+        str: The text with punctuation removed.
+    """
+    punctuation = [
+        '|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^', '\'', '\"', '’',
+        '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.'
+    ]
+    for p in punctuation:
+        text = text.replace(p, '')
+    return text.strip()
+
+
+def run_ner(caption: str) -> Tuple[list, list]:
+    """Run NER on a caption and return the tokens and noun phrases.
+    Args:
+        caption (str): The input caption.
+
+    Returns:
+        Tuple[List, List]: A tuple containing the tokens and noun phrases.
+            - tokens_positive (List): A list of token positions.
+            - noun_phrases (List): A list of noun phrases.
+    """
+    noun_phrases = find_noun_phrases(caption)
+    noun_phrases = [remove_punctuation(phrase) for phrase in noun_phrases]
+    noun_phrases = [phrase for phrase in noun_phrases if phrase != '']
+    print('noun_phrases:', noun_phrases)
+    relevant_phrases = noun_phrases
+    labels = noun_phrases
+
+    tokens_positive = []
+    for entity, label in zip(relevant_phrases, labels):
+        try:
+            # search all occurrences and mark them as different entities
+            # TODO: Not Robust
+            for m in re.finditer(entity, caption.lower()):
+                tokens_positive.append([[m.start(), m.end()]])
+        except Exception:
+            print('noun entities:', noun_phrases)
+            print('entity:', entity)
+            print('caption:', caption.lower())
+    return tokens_positive, noun_phrases
+
+
+def create_positive_map(tokenized,
+                        tokens_positive: list,
+                        max_num_entities: int = 256) -> Tensor:
+    """Construct a map such that positive_map[i, j] = True if box i is
+    associated to token j.
+
+    Args:
+        tokenized: The tokenized input.
+        tokens_positive (list): A list of token ranges
+            associated with positive boxes.
+        max_num_entities (int, optional): The maximum number of entities.
+            Defaults to 256.
+
+    Returns:
+        torch.Tensor: The positive map.
+
+    Raises:
+        Exception: If an error occurs during token-to-char mapping.
+    """
+    positive_map = torch.zeros((len(tokens_positive), max_num_entities),
+                               dtype=torch.float)
+
+    for j, tok_list in enumerate(tokens_positive):
+        for (beg, end) in tok_list:
+            try:
+                beg_pos = tokenized.char_to_token(beg)
+                end_pos = tokenized.char_to_token(end - 1)
+            except Exception as e:
+                print('beg:', beg, 'end:', end)
+                print('token_positive:', tokens_positive)
+                raise e
+            if beg_pos is None:
+                try:
+                    beg_pos = tokenized.char_to_token(beg + 1)
+                    if beg_pos is None:
+                        beg_pos = tokenized.char_to_token(beg + 2)
+                except Exception:
+                    beg_pos = None
+            if end_pos is None:
+                try:
+                    end_pos = tokenized.char_to_token(end - 2)
+                    if end_pos is None:
+                        end_pos = tokenized.char_to_token(end - 3)
+                except Exception:
+                    end_pos = None
+            if beg_pos is None or end_pos is None:
+                continue
+
+            assert beg_pos is not None and end_pos is not None
+            positive_map[j, beg_pos:end_pos + 1].fill_(1)
+    return positive_map / (positive_map.sum(-1)[:, None] + 1e-6)
+
+
+def create_positive_map_label_to_token(positive_map: Tensor,
+                                       plus: int = 0) -> dict:
+    """Create a dictionary mapping the label to the token.
+    Args:
+        positive_map (Tensor): The positive map tensor.
+        plus (int, optional): Value added to the label for indexing.
+            Defaults to 0.
+ + Returns: + dict: The dictionary mapping the label to the token. + """ + positive_map_label_to_token = {} + for i in range(len(positive_map)): + positive_map_label_to_token[i + plus] = torch.nonzero( + positive_map[i], as_tuple=True)[0].tolist() + return positive_map_label_to_token + + +def clean_label_name(name: str) -> str: + name = re.sub(r'\(.*\)', '', name) + name = re.sub(r'_', ' ', name) + name = re.sub(r' ', ' ', name) + return name + + +def chunks(lst: list, n: int) -> list: + """Yield successive n-sized chunks from lst.""" + all_ = [] + for i in range(0, len(lst), n): + data_index = lst[i:i + n] + all_.append(data_index) + counter = 0 + for i in all_: + counter += len(i) + assert (counter == len(lst)) + + return all_ + + +@MODELS.register_module() +class GLIP(SingleStageDetector): + """Implementation of `GLIP `_ + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + language_model (:obj:`ConfigDict` or dict): The language model config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of GLIP. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of GLIP. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + language_model: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + self.language_model = MODELS.build(language_model) + + self._special_tokens = '. 
' + + def to_enhance_text_prompts(self, original_caption, enhanced_text_prompts): + caption_string = '' + tokens_positive = [] + for idx, word in enumerate(original_caption): + if word in enhanced_text_prompts: + enhanced_text_dict = enhanced_text_prompts[word] + if 'prefix' in enhanced_text_dict: + caption_string += enhanced_text_dict['prefix'] + start_i = len(caption_string) + if 'name' in enhanced_text_dict: + caption_string += enhanced_text_dict['name'] + else: + caption_string += word + end_i = len(caption_string) + tokens_positive.append([[start_i, end_i]]) + + if 'suffix' in enhanced_text_dict: + caption_string += enhanced_text_dict['suffix'] + else: + tokens_positive.append( + [[len(caption_string), + len(caption_string) + len(word)]]) + caption_string += word + + if idx != len(original_caption) - 1: + caption_string += self._special_tokens + return caption_string, tokens_positive + + def to_plain_text_prompts(self, original_caption): + caption_string = '' + tokens_positive = [] + for idx, word in enumerate(original_caption): + tokens_positive.append( + [[len(caption_string), + len(caption_string) + len(word)]]) + caption_string += word + if idx != len(original_caption) - 1: + caption_string += self._special_tokens + return caption_string, tokens_positive + + def get_tokens_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False, + enhanced_text_prompts: Optional[ConfigType] = None + ) -> Tuple[dict, str, list, list]: + """Get the tokens positive and prompts for the caption.""" + if isinstance(original_caption, (list, tuple)) or custom_entities: + if custom_entities and isinstance(original_caption, str): + original_caption = original_caption.strip(self._special_tokens) + original_caption = original_caption.split(self._special_tokens) + original_caption = list( + filter(lambda x: len(x) > 0, original_caption)) + + original_caption = [clean_label_name(i) for i in original_caption] + + if custom_entities and enhanced_text_prompts is not None: + caption_string, tokens_positive = self.to_enhance_text_prompts( + original_caption, enhanced_text_prompts) + else: + caption_string, tokens_positive = self.to_plain_text_prompts( + original_caption) + + tokenized = self.language_model.tokenizer([caption_string], + return_tensors='pt') + entities = original_caption + else: + original_caption = original_caption.strip(self._special_tokens) + tokenized = self.language_model.tokenizer([original_caption], + return_tensors='pt') + tokens_positive, noun_phrases = run_ner(original_caption) + entities = noun_phrases + caption_string = original_caption + + return tokenized, caption_string, tokens_positive, entities + + def get_positive_map(self, tokenized, tokens_positive): + positive_map = create_positive_map(tokenized, tokens_positive) + positive_map_label_to_token = create_positive_map_label_to_token( + positive_map, plus=1) + return positive_map_label_to_token, positive_map + + def get_tokens_positive_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False, + enhanced_text_prompt: Optional[ConfigType] = None, + tokens_positive: Optional[list] = None, + ) -> Tuple[dict, str, Tensor, list]: + if tokens_positive is not None: + if tokens_positive == -1: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + return None, original_caption, None, original_caption + else: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + 
tokenized = self.language_model.tokenizer([original_caption], + return_tensors='pt') + positive_map_label_to_token, positive_map = \ + self.get_positive_map(tokenized, tokens_positive) + + entities = [] + for token_positive in tokens_positive: + instance_entities = [] + for t in token_positive: + instance_entities.append(original_caption[t[0]:t[1]]) + entities.append(' / '.join(instance_entities)) + return positive_map_label_to_token, original_caption, \ + positive_map, entities + + chunked_size = self.test_cfg.get('chunked_size', -1) + if not self.training and chunked_size > 0: + assert isinstance(original_caption, + (list, tuple)) or custom_entities is True + all_output = self.get_tokens_positive_and_prompts_chunked( + original_caption, enhanced_text_prompt) + positive_map_label_to_token, \ + caption_string, \ + positive_map, \ + entities = all_output + else: + tokenized, caption_string, tokens_positive, entities = \ + self.get_tokens_and_prompts( + original_caption, custom_entities, enhanced_text_prompt) + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, tokens_positive) + if tokenized.input_ids.shape[1] > self.language_model.max_tokens: + warnings.warn('Inputting a text that is too long will result ' + 'in poor prediction performance. ' + 'Please reduce the text length.') + return positive_map_label_to_token, caption_string, \ + positive_map, entities + + def get_tokens_positive_and_prompts_chunked( + self, + original_caption: Union[list, tuple], + enhanced_text_prompts: Optional[ConfigType] = None): + chunked_size = self.test_cfg.get('chunked_size', -1) + original_caption = [clean_label_name(i) for i in original_caption] + + original_caption_chunked = chunks(original_caption, chunked_size) + ids_chunked = chunks( + list(range(1, + len(original_caption) + 1)), chunked_size) + + positive_map_label_to_token_chunked = [] + caption_string_chunked = [] + positive_map_chunked = [] + entities_chunked = [] + + for i in range(len(ids_chunked)): + if enhanced_text_prompts is not None: + caption_string, tokens_positive = self.to_enhance_text_prompts( + original_caption_chunked[i], enhanced_text_prompts) + else: + caption_string, tokens_positive = self.to_plain_text_prompts( + original_caption_chunked[i]) + tokenized = self.language_model.tokenizer([caption_string], + return_tensors='pt') + if tokenized.input_ids.shape[1] > self.language_model.max_tokens: + warnings.warn('Inputting a text that is too long will result ' + 'in poor prediction performance. ' + 'Please reduce the --chunked-size.') + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, tokens_positive) + + caption_string_chunked.append(caption_string) + positive_map_label_to_token_chunked.append( + positive_map_label_to_token) + positive_map_chunked.append(positive_map) + entities_chunked.append(original_caption_chunked[i]) + + return positive_map_label_to_token_chunked, \ + caption_string_chunked, \ + positive_map_chunked, \ + entities_chunked + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + # TODO: Only open vocabulary tasks are supported for training now. + text_prompts = [ + data_samples.text for data_samples in batch_data_samples + ] + + gt_labels = [ + data_samples.gt_instances.labels + for data_samples in batch_data_samples + ] + + new_text_prompts = [] + positive_maps = [] + if len(set(text_prompts)) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. 
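+            # Illustrative example (assumed prompts): with text_prompts ==
+            # ['person. car.'] * bs, the caption is tokenized once below and
+            # each image then keeps only the token spans of its own gt
+            # labels, e.g. gt_label == [1] selects the span of 'car'.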
+ tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompts[0], True) + new_text_prompts = [caption_string] * len(batch_inputs) + for gt_label in gt_labels: + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + else: + for text_prompt, gt_label in zip(text_prompts, gt_labels): + tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompt, True) + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + new_text_prompts.append(caption_string) + + language_dict_features = self.language_model(new_text_prompts) + for i, data_samples in enumerate(batch_data_samples): + # .bool().float() is very important + positive_map = positive_maps[i].to( + batch_inputs.device).bool().float() + data_samples.gt_instances.positive_maps = positive_map + + visual_features = self.extract_feat(batch_inputs) + + losses = self.bbox_head.loss(visual_features, language_dict_features, + batch_data_samples) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - label_names (List[str]): Label names of bboxes. + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + text_prompts = [] + enhanced_text_prompts = [] + tokens_positives = [] + for data_samples in batch_data_samples: + text_prompts.append(data_samples.text) + if 'caption_prompt' in data_samples: + enhanced_text_prompts.append(data_samples.caption_prompt) + else: + enhanced_text_prompts.append(None) + tokens_positives.append(data_samples.get('tokens_positive', None)) + + if 'custom_entities' in batch_data_samples[0]: + # Assuming that the `custom_entities` flag + # inside a batch is always the same. For single image inference + custom_entities = batch_data_samples[0].custom_entities + else: + custom_entities = False + + if len(set(text_prompts)) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. 
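+            # The single caption's (positive map, prompt, entities) tuple is
+            # computed once and shared across the batch by list replication
+            # below.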
+ _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts( + text_prompts[0], custom_entities, enhanced_text_prompts[0], + tokens_positives[0]) + ] * len(batch_inputs) + else: + _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts(text_prompt, + custom_entities, + enhanced_text_prompt, + tokens_positive) + for text_prompt, enhanced_text_prompt, tokens_positive in zip( + text_prompts, enhanced_text_prompts, tokens_positives) + ] + + token_positive_maps, text_prompts, _, entities = zip( + *_positive_maps_and_prompts) + + visual_features = self.extract_feat(batch_inputs) + + if isinstance(text_prompts[0], list): + # chunked text prompts, only bs=1 is supported + assert len(batch_inputs) == 1 + count = 0 + results_list = [] + + entities = [[item for lst in entities[0] for item in lst]] + + for b in range(len(text_prompts[0])): + text_prompts_once = [text_prompts[0][b]] + token_positive_maps_once = token_positive_maps[0][b] + language_dict_features = self.language_model(text_prompts_once) + batch_data_samples[ + 0].token_positive_map = token_positive_maps_once + + pred_instances = self.bbox_head.predict( + copy.deepcopy(visual_features), + language_dict_features, + batch_data_samples, + rescale=rescale)[0] + + if len(pred_instances) > 0: + pred_instances.labels += count + count += len(token_positive_maps_once) + results_list.append(pred_instances) + results_list = [results_list[0].cat(results_list)] + else: + language_dict_features = self.language_model(list(text_prompts)) + + for i, data_samples in enumerate(batch_data_samples): + data_samples.token_positive_map = token_positive_maps[i] + + results_list = self.bbox_head.predict( + visual_features, + language_dict_features, + batch_data_samples, + rescale=rescale) + + for data_sample, pred_instances, entity in zip(batch_data_samples, + results_list, entities): + if len(pred_instances) > 0: + label_names = [] + for labels in pred_instances.labels: + if labels >= len(entity): + warnings.warn( + 'The unexpected output indicates an issue with ' + 'named entity recognition. You can try ' + 'setting custom_entities=True and running ' + 'again to see if it helps.') + label_names.append('unobject') + else: + label_names.append(entity[labels]) + # for visualization + pred_instances.label_names = label_names + data_sample.pred_instances = pred_instances + return batch_data_samples diff --git a/head_extractor/build/lib/mmdet/models/detectors/grid_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/grid_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..7bcb5b033edc620f1cf61b986c345961b719e6f1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/grid_rcnn.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class GridRCNN(TwoStageDetector): + """Grid R-CNN. 
+ + This detector is the implementation of: + - Grid R-CNN (https://arxiv.org/abs/1811.12030) + - Grid R-CNN Plus: Faster and Better (https://arxiv.org/abs/1906.05688) + """ + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/grounding_dino.py b/head_extractor/build/lib/mmdet/models/detectors/grounding_dino.py new file mode 100644 index 0000000000000000000000000000000000000000..b1ab7c2da16453e4aa43020681811a8b24767ad0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/grounding_dino.py @@ -0,0 +1,621 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import re +import warnings +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.runner.amp import autocast +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import ConfigType +from ..layers import SinePositionalEncoding +from ..layers.transformer.grounding_dino_layers import ( + GroundingDinoTransformerDecoder, GroundingDinoTransformerEncoder) +from .dino import DINO +from .glip import (create_positive_map, create_positive_map_label_to_token, + run_ner) + + +def clean_label_name(name: str) -> str: + name = re.sub(r'\(.*\)', '', name) + name = re.sub(r'_', ' ', name) + name = re.sub(r' ', ' ', name) + return name + + +def chunks(lst: list, n: int) -> list: + """Yield successive n-sized chunks from lst.""" + all_ = [] + for i in range(0, len(lst), n): + data_index = lst[i:i + n] + all_.append(data_index) + counter = 0 + for i in all_: + counter += len(i) + assert (counter == len(lst)) + + return all_ + + +@MODELS.register_module() +class GroundingDINO(DINO): + """Implementation of `Grounding DINO: Marrying DINO with Grounded Pre- + Training for Open-Set Object Detection. + + `_ + + Code is modified from the `official github repo + `_. + """ + + def __init__(self, + language_model, + *args, + use_autocast=False, + **kwargs) -> None: + + self.language_model_cfg = language_model + self._special_tokens = '. ' + self.use_autocast = use_autocast + super().__init__(*args, **kwargs) + + def _init_layers(self) -> None: + """Initialize layers except for backbone, neck and bbox_head.""" + self.positional_encoding = SinePositionalEncoding( + **self.positional_encoding) + self.encoder = GroundingDinoTransformerEncoder(**self.encoder) + self.decoder = GroundingDinoTransformerDecoder(**self.decoder) + self.embed_dims = self.encoder.embed_dims + self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) + num_feats = self.positional_encoding.num_feats + assert num_feats * 2 == self.embed_dims, \ + f'embed_dims should be exactly 2 times of num_feats. ' \ + f'Found {self.embed_dims} and {num_feats}.' 
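+        # SinePositionalEncoding produces `num_feats` channels per spatial
+        # axis (x and y) and concatenates them, e.g. num_feats=128 gives
+        # embed_dims=256.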
+ + self.level_embed = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + self.memory_trans_fc = nn.Linear(self.embed_dims, self.embed_dims) + self.memory_trans_norm = nn.LayerNorm(self.embed_dims) + + # text modules + self.language_model = MODELS.build(self.language_model_cfg) + self.text_feat_map = nn.Linear( + self.language_model.language_backbone.body.language_dim, + self.embed_dims, + bias=True) + + def init_weights(self) -> None: + """Initialize weights for Transformer and other components.""" + super().init_weights() + nn.init.constant_(self.text_feat_map.bias.data, 0) + nn.init.xavier_uniform_(self.text_feat_map.weight.data) + + def to_enhance_text_prompts(self, original_caption, enhanced_text_prompts): + caption_string = '' + tokens_positive = [] + for idx, word in enumerate(original_caption): + if word in enhanced_text_prompts: + enhanced_text_dict = enhanced_text_prompts[word] + if 'prefix' in enhanced_text_dict: + caption_string += enhanced_text_dict['prefix'] + start_i = len(caption_string) + if 'name' in enhanced_text_dict: + caption_string += enhanced_text_dict['name'] + else: + caption_string += word + end_i = len(caption_string) + tokens_positive.append([[start_i, end_i]]) + + if 'suffix' in enhanced_text_dict: + caption_string += enhanced_text_dict['suffix'] + else: + tokens_positive.append( + [[len(caption_string), + len(caption_string) + len(word)]]) + caption_string += word + caption_string += self._special_tokens + return caption_string, tokens_positive + + def to_plain_text_prompts(self, original_caption): + caption_string = '' + tokens_positive = [] + for idx, word in enumerate(original_caption): + tokens_positive.append( + [[len(caption_string), + len(caption_string) + len(word)]]) + caption_string += word + caption_string += self._special_tokens + return caption_string, tokens_positive + + def get_tokens_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False, + enhanced_text_prompts: Optional[ConfigType] = None + ) -> Tuple[dict, str, list]: + """Get the tokens positive and prompts for the caption.""" + if isinstance(original_caption, (list, tuple)) or custom_entities: + if custom_entities and isinstance(original_caption, str): + original_caption = original_caption.strip(self._special_tokens) + original_caption = original_caption.split(self._special_tokens) + original_caption = list( + filter(lambda x: len(x) > 0, original_caption)) + + original_caption = [clean_label_name(i) for i in original_caption] + + if custom_entities and enhanced_text_prompts is not None: + caption_string, tokens_positive = self.to_enhance_text_prompts( + original_caption, enhanced_text_prompts) + else: + caption_string, tokens_positive = self.to_plain_text_prompts( + original_caption) + + # NOTE: Tokenizer in Grounding DINO is different from + # that in GLIP. The tokenizer in GLIP will pad the + # caption_string to max_length, while the tokenizer + # in Grounding DINO will not. + tokenized = self.language_model.tokenizer( + [caption_string], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') + entities = original_caption + else: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + # NOTE: Tokenizer in Grounding DINO is different from + # that in GLIP. The tokenizer in GLIP will pad the + # caption_string to max_length, while the tokenizer + # in Grounding DINO will not. 
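+            # e.g. for 'bench. car.' the 'longest' strategy keeps the natural
+            # token length, whereas GLIP-style 'max_length' padding would pad
+            # input_ids up to the model's maximum (illustrative; exact ids
+            # depend on the tokenizer checkpoint).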
+ tokenized = self.language_model.tokenizer( + [original_caption], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') + tokens_positive, noun_phrases = run_ner(original_caption) + entities = noun_phrases + caption_string = original_caption + + return tokenized, caption_string, tokens_positive, entities + + def get_positive_map(self, tokenized, tokens_positive): + positive_map = create_positive_map( + tokenized, + tokens_positive, + max_num_entities=self.bbox_head.cls_branches[ + self.decoder.num_layers].max_text_len) + positive_map_label_to_token = create_positive_map_label_to_token( + positive_map, plus=1) + return positive_map_label_to_token, positive_map + + def get_tokens_positive_and_prompts( + self, + original_caption: Union[str, list, tuple], + custom_entities: bool = False, + enhanced_text_prompt: Optional[ConfigType] = None, + tokens_positive: Optional[list] = None, + ) -> Tuple[dict, str, Tensor, list]: + """Get the tokens positive and prompts for the caption. + + Args: + original_caption (str): The original caption, e.g. 'bench . car .' + custom_entities (bool, optional): Whether to use custom entities. + If ``True``, the ``original_caption`` should be a list of + strings, each of which is a word. Defaults to False. + + Returns: + Tuple[dict, str, dict, str]: The dict is a mapping from each entity + id, which is numbered from 1, to its positive token id. + The str represents the prompts. + """ + if tokens_positive is not None: + if tokens_positive == -1: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + return None, original_caption, None, original_caption + else: + if not original_caption.endswith('.'): + original_caption = original_caption + self._special_tokens + tokenized = self.language_model.tokenizer( + [original_caption], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') + positive_map_label_to_token, positive_map = \ + self.get_positive_map(tokenized, tokens_positive) + + entities = [] + for token_positive in tokens_positive: + instance_entities = [] + for t in token_positive: + instance_entities.append(original_caption[t[0]:t[1]]) + entities.append(' / '.join(instance_entities)) + return positive_map_label_to_token, original_caption, \ + positive_map, entities + + chunked_size = self.test_cfg.get('chunked_size', -1) + if not self.training and chunked_size > 0: + assert isinstance(original_caption, + (list, tuple)) or custom_entities is True + all_output = self.get_tokens_positive_and_prompts_chunked( + original_caption, enhanced_text_prompt) + positive_map_label_to_token, \ + caption_string, \ + positive_map, \ + entities = all_output + else: + tokenized, caption_string, tokens_positive, entities = \ + self.get_tokens_and_prompts( + original_caption, custom_entities, enhanced_text_prompt) + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, tokens_positive) + return positive_map_label_to_token, caption_string, \ + positive_map, entities + + def get_tokens_positive_and_prompts_chunked( + self, + original_caption: Union[list, tuple], + enhanced_text_prompts: Optional[ConfigType] = None): + chunked_size = self.test_cfg.get('chunked_size', -1) + original_caption = [clean_label_name(i) for i in original_caption] + + original_caption_chunked = chunks(original_caption, chunked_size) + ids_chunked = chunks( + list(range(1, + len(original_caption) + 1)), chunked_size) + + 
positive_map_label_to_token_chunked = [] + caption_string_chunked = [] + positive_map_chunked = [] + entities_chunked = [] + + for i in range(len(ids_chunked)): + if enhanced_text_prompts is not None: + caption_string, tokens_positive = self.to_enhance_text_prompts( + original_caption_chunked[i], enhanced_text_prompts) + else: + caption_string, tokens_positive = self.to_plain_text_prompts( + original_caption_chunked[i]) + tokenized = self.language_model.tokenizer([caption_string], + return_tensors='pt') + if tokenized.input_ids.shape[1] > self.language_model.max_tokens: + warnings.warn('Inputting a text that is too long will result ' + 'in poor prediction performance. ' + 'Please reduce the --chunked-size.') + positive_map_label_to_token, positive_map = self.get_positive_map( + tokenized, tokens_positive) + + caption_string_chunked.append(caption_string) + positive_map_label_to_token_chunked.append( + positive_map_label_to_token) + positive_map_chunked.append(positive_map) + entities_chunked.append(original_caption_chunked[i]) + + return positive_map_label_to_token_chunked, \ + caption_string_chunked, \ + positive_map_chunked, \ + entities_chunked + + def forward_transformer( + self, + img_feats: Tuple[Tensor], + text_dict: Dict, + batch_data_samples: OptSampleList = None, + ) -> Dict: + encoder_inputs_dict, decoder_inputs_dict = self.pre_transformer( + img_feats, batch_data_samples) + + encoder_outputs_dict = self.forward_encoder( + **encoder_inputs_dict, text_dict=text_dict) + + tmp_dec_in, head_inputs_dict = self.pre_decoder( + **encoder_outputs_dict, batch_data_samples=batch_data_samples) + decoder_inputs_dict.update(tmp_dec_in) + + decoder_outputs_dict = self.forward_decoder(**decoder_inputs_dict) + head_inputs_dict.update(decoder_outputs_dict) + return head_inputs_dict + + def forward_encoder(self, feat: Tensor, feat_mask: Tensor, + feat_pos: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios: Tensor, + text_dict: Dict) -> Dict: + text_token_mask = text_dict['text_token_mask'] + memory, memory_text = self.encoder( + query=feat, + query_pos=feat_pos, + key_padding_mask=feat_mask, # for self_attn + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + # for text encoder + memory_text=text_dict['embedded'], + text_attention_mask=~text_token_mask, + position_ids=text_dict['position_ids'], + text_self_attention_masks=text_dict['masks']) + encoder_outputs_dict = dict( + memory=memory, + memory_mask=feat_mask, + spatial_shapes=spatial_shapes, + memory_text=memory_text, + text_token_mask=text_token_mask) + return encoder_outputs_dict + + def pre_decoder( + self, + memory: Tensor, + memory_mask: Tensor, + spatial_shapes: Tensor, + memory_text: Tensor, + text_token_mask: Tensor, + batch_data_samples: OptSampleList = None, + ) -> Tuple[Dict]: + bs, _, c = memory.shape + + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, memory_mask, spatial_shapes) + + enc_outputs_class = self.bbox_head.cls_branches[ + self.decoder.num_layers](output_memory, memory_text, + text_token_mask) + cls_out_features = self.bbox_head.cls_branches[ + self.decoder.num_layers].max_text_len + enc_outputs_coord_unact = self.bbox_head.reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + # NOTE The DINO selects top-k proposals according to scores of + # multi-class classification, while DeformDETR, where the input + # is `enc_outputs_class[..., 0]` selects according to scores of + # binary 
classification. + topk_indices = torch.topk( + enc_outputs_class.max(-1)[0], k=self.num_queries, dim=1)[1] + + topk_score = torch.gather( + enc_outputs_class, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, cls_out_features)) + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_indices.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords = topk_coords_unact.sigmoid() + topk_coords_unact = topk_coords_unact.detach() + + query = self.query_embedding.weight[:, None, :] + query = query.repeat(1, bs, 1).transpose(0, 1) + if self.training: + dn_label_query, dn_bbox_query, dn_mask, dn_meta = \ + self.dn_query_generator(batch_data_samples) + query = torch.cat([dn_label_query, query], dim=1) + reference_points = torch.cat([dn_bbox_query, topk_coords_unact], + dim=1) + else: + reference_points = topk_coords_unact + dn_mask, dn_meta = None, None + reference_points = reference_points.sigmoid() + + decoder_inputs_dict = dict( + query=query, + memory=memory, + reference_points=reference_points, + dn_mask=dn_mask, + memory_text=memory_text, + text_attention_mask=~text_token_mask, + ) + # NOTE DINO calculates encoder losses on scores and coordinates + # of selected top-k encoder queries, while DeformDETR is of all + # encoder queries. + head_inputs_dict = dict( + enc_outputs_class=topk_score, + enc_outputs_coord=topk_coords, + dn_meta=dn_meta) if self.training else dict() + # append text_feats to head_inputs_dict + head_inputs_dict['memory_text'] = memory_text + head_inputs_dict['text_token_mask'] = text_token_mask + return decoder_inputs_dict, head_inputs_dict + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + text_prompts = [ + data_samples.text for data_samples in batch_data_samples + ] + + gt_labels = [ + data_samples.gt_instances.labels + for data_samples in batch_data_samples + ] + + if 'tokens_positive' in batch_data_samples[0]: + tokens_positive = [ + data_samples.tokens_positive + for data_samples in batch_data_samples + ] + positive_maps = [] + for token_positive, text_prompt, gt_label in zip( + tokens_positive, text_prompts, gt_labels): + tokenized = self.language_model.tokenizer( + [text_prompt], + padding='max_length' + if self.language_model.pad_to_max else 'longest', + return_tensors='pt') + new_tokens_positive = [ + token_positive[label.item()] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + new_text_prompts = text_prompts + else: + new_text_prompts = [] + positive_maps = [] + if len(set(text_prompts)) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. 
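+                # Each row of the resulting `positive_map` is a distribution
+                # over text tokens for one gt box, normalized over the span
+                # matched by `create_positive_map` (illustrative summary).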
+ tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompts[0], True) + new_text_prompts = [caption_string] * len(batch_inputs) + for gt_label in gt_labels: + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + else: + for text_prompt, gt_label in zip(text_prompts, gt_labels): + tokenized, caption_string, tokens_positive, _ = \ + self.get_tokens_and_prompts( + text_prompt, True) + new_tokens_positive = [ + tokens_positive[label] for label in gt_label + ] + _, positive_map = self.get_positive_map( + tokenized, new_tokens_positive) + positive_maps.append(positive_map) + new_text_prompts.append(caption_string) + + text_dict = self.language_model(new_text_prompts) + if self.text_feat_map is not None: + text_dict['embedded'] = self.text_feat_map(text_dict['embedded']) + + for i, data_samples in enumerate(batch_data_samples): + positive_map = positive_maps[i].to( + batch_inputs.device).bool().float() + text_token_mask = text_dict['text_token_mask'][i] + data_samples.gt_instances.positive_maps = positive_map + data_samples.gt_instances.text_token_mask = \ + text_token_mask.unsqueeze(0).repeat( + len(positive_map), 1) + if self.use_autocast: + with autocast(enabled=True): + visual_features = self.extract_feat(batch_inputs) + else: + visual_features = self.extract_feat(batch_inputs) + head_inputs_dict = self.forward_transformer(visual_features, text_dict, + batch_data_samples) + + losses = self.bbox_head.loss( + **head_inputs_dict, batch_data_samples=batch_data_samples) + return losses + + def predict(self, batch_inputs, batch_data_samples, rescale: bool = True): + text_prompts = [] + enhanced_text_prompts = [] + tokens_positives = [] + for data_samples in batch_data_samples: + text_prompts.append(data_samples.text) + if 'caption_prompt' in data_samples: + enhanced_text_prompts.append(data_samples.caption_prompt) + else: + enhanced_text_prompts.append(None) + tokens_positives.append(data_samples.get('tokens_positive', None)) + + if 'custom_entities' in batch_data_samples[0]: + # Assuming that the `custom_entities` flag + # inside a batch is always the same. For single image inference + custom_entities = batch_data_samples[0].custom_entities + else: + custom_entities = False + if len(text_prompts) == 1: + # All the text prompts are the same, + # so there is no need to calculate them multiple times. 
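The `text_feat_map` applied to `text_dict['embedded']` in both the loss and predict paths is, by assumption, a learned projection that aligns language-model features with the detector's embedding width; a sketch with illustrative dimensions:

import torch
import torch.nn as nn

text_feat_map = nn.Linear(768, 256)   # assumed: BERT width -> detector width
embedded = torch.randn(1, 12, 768)    # (batch, num_text_tokens, lm_dim)
projected = text_feat_map(embedded)
assert projected.shape == (1, 12, 256)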
+ _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts( + text_prompts[0], custom_entities, enhanced_text_prompts[0], + tokens_positives[0]) + ] * len(batch_inputs) + else: + _positive_maps_and_prompts = [ + self.get_tokens_positive_and_prompts(text_prompt, + custom_entities, + enhanced_text_prompt, + tokens_positive) + for text_prompt, enhanced_text_prompt, tokens_positive in zip( + text_prompts, enhanced_text_prompts, tokens_positives) + ] + token_positive_maps, text_prompts, _, entities = zip( + *_positive_maps_and_prompts) + + # image feature extraction + visual_feats = self.extract_feat(batch_inputs) + + if isinstance(text_prompts[0], list): + # chunked text prompts, only bs=1 is supported + assert len(batch_inputs) == 1 + count = 0 + results_list = [] + + entities = [[item for lst in entities[0] for item in lst]] + + for b in range(len(text_prompts[0])): + text_prompts_once = [text_prompts[0][b]] + token_positive_maps_once = token_positive_maps[0][b] + text_dict = self.language_model(text_prompts_once) + # text feature map layer + if self.text_feat_map is not None: + text_dict['embedded'] = self.text_feat_map( + text_dict['embedded']) + + batch_data_samples[ + 0].token_positive_map = token_positive_maps_once + + head_inputs_dict = self.forward_transformer( + copy.deepcopy(visual_feats), text_dict, batch_data_samples) + pred_instances = self.bbox_head.predict( + **head_inputs_dict, + rescale=rescale, + batch_data_samples=batch_data_samples)[0] + + if len(pred_instances) > 0: + pred_instances.labels += count + count += len(token_positive_maps_once) + results_list.append(pred_instances) + results_list = [results_list[0].cat(results_list)] + is_rec_tasks = [False] * len(results_list) + else: + # extract text feats + text_dict = self.language_model(list(text_prompts)) + # text feature map layer + if self.text_feat_map is not None: + text_dict['embedded'] = self.text_feat_map( + text_dict['embedded']) + + is_rec_tasks = [] + for i, data_samples in enumerate(batch_data_samples): + if token_positive_maps[i] is not None: + is_rec_tasks.append(False) + else: + is_rec_tasks.append(True) + data_samples.token_positive_map = token_positive_maps[i] + + head_inputs_dict = self.forward_transformer( + visual_feats, text_dict, batch_data_samples) + results_list = self.bbox_head.predict( + **head_inputs_dict, + rescale=rescale, + batch_data_samples=batch_data_samples) + + for data_sample, pred_instances, entity, is_rec_task in zip( + batch_data_samples, results_list, entities, is_rec_tasks): + if len(pred_instances) > 0: + label_names = [] + for labels in pred_instances.labels: + if is_rec_task: + label_names.append(entity) + continue + if labels >= len(entity): + warnings.warn( + 'The unexpected output indicates an issue with ' + 'named entity recognition. You can try ' + 'setting custom_entities=True and running ' + 'again to see if it helps.') + label_names.append('unobject') + else: + label_names.append(entity[labels]) + # for visualization + pred_instances.label_names = label_names + data_sample.pred_instances = pred_instances + return batch_data_samples diff --git a/head_extractor/build/lib/mmdet/models/detectors/htc.py b/head_extractor/build/lib/mmdet/models/detectors/htc.py new file mode 100644 index 0000000000000000000000000000000000000000..22a2aa889a59fd0e0afeb95a7369028def6e4fa9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/htc.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
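In the chunked predict path above, each chunk's head emits labels indexed within that chunk, so `count` accumulates an offset that maps chunk-local labels back to global class ids; a toy sketch of the bookkeeping:

chunk_sizes = [40, 40, 20]   # hypothetical number of classes per chunk
offsets, count = [], 0
for n in chunk_sizes:
    offsets.append(count)    # label j from chunk b is global class offsets[b] + j
    count += n
assert offsets == [0, 40, 80]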
+from mmdet.registry import MODELS +from .cascade_rcnn import CascadeRCNN + + +@MODELS.register_module() +class HybridTaskCascade(CascadeRCNN): + """Implementation of `HTC `_""" + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + @property + def with_semantic(self) -> bool: + """bool: whether the detector has a semantic head""" + return self.roi_head.with_semantic diff --git a/head_extractor/build/lib/mmdet/models/detectors/kd_one_stage.py b/head_extractor/build/lib/mmdet/models/detectors/kd_one_stage.py new file mode 100644 index 0000000000000000000000000000000000000000..8a4a1bb564c0f6e4cabe32a5c01cfea252ecfb7d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/kd_one_stage.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from pathlib import Path +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +from mmengine.config import Config +from mmengine.runner import load_checkpoint +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class KnowledgeDistillationSingleStageDetector(SingleStageDetector): + r"""Implementation of `Distilling the Knowledge in a Neural Network. + `_. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + teacher_config (:obj:`ConfigDict` | dict | str | Path): Config file + path or the config object of teacher model. + teacher_ckpt (str, optional): Checkpoint path of teacher model. + If left as None, the model will not load any weights. + Defaults to None. + eval_teacher (bool): Set the train mode for teacher. + Defaults to True. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of the detector. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of the detector. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + """ + + def __init__( + self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + teacher_config: Union[ConfigType, str, Path], + teacher_ckpt: Optional[str] = None, + eval_teacher: bool = True, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + ) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor) + self.eval_teacher = eval_teacher + # Build teacher model + if isinstance(teacher_config, (str, Path)): + teacher_config = Config.fromfile(teacher_config) + self.teacher_model = MODELS.build(teacher_config['model']) + if teacher_ckpt is not None: + load_checkpoint( + self.teacher_model, teacher_ckpt, map_location='cpu') + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components.
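The distillation step implemented just below runs the teacher under `torch.no_grad()` and feeds its head outputs into the student's loss; a minimal sketch with a stand-in loss (the real head loss also consumes data samples):

import torch
import torch.nn.functional as F

def distill_step(student_out: torch.Tensor,
                 teacher_out: torch.Tensor) -> torch.Tensor:
    # stand-in for the head loss computed against teacher predictions
    return F.mse_loss(student_out, teacher_out)

student_out = torch.rand(2, 4, requires_grad=True)
with torch.no_grad():
    teacher_out = torch.rand(2, 4)  # no graph is built for the teacher
distill_step(student_out, teacher_out).backward()  # grads reach the student only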
+ """ + x = self.extract_feat(batch_inputs) + with torch.no_grad(): + teacher_x = self.teacher_model.extract_feat(batch_inputs) + out_teacher = self.teacher_model.bbox_head(teacher_x) + losses = self.bbox_head.loss(x, out_teacher, batch_data_samples) + return losses + + def cuda(self, device: Optional[str] = None) -> nn.Module: + """Since teacher_model is registered as a plain object, it is necessary + to put the teacher model to cuda when calling ``cuda`` function.""" + self.teacher_model.cuda(device=device) + return super().cuda(device=device) + + def to(self, device: Optional[str] = None) -> nn.Module: + """Since teacher_model is registered as a plain object, it is necessary + to put the teacher model to other device when calling ``to`` + function.""" + self.teacher_model.to(device=device) + return super().to(device=device) + + def train(self, mode: bool = True) -> None: + """Set the same train mode for teacher and student model.""" + if self.eval_teacher: + self.teacher_model.train(False) + else: + self.teacher_model.train(mode) + super().train(mode) + + def __setattr__(self, name: str, value: Any) -> None: + """Set attribute, i.e. self.name = value + + This reloading prevent the teacher model from being registered as a + nn.Module. The teacher module is registered as a plain object, so that + the teacher parameters will not show up when calling + ``self.parameters``, ``self.modules``, ``self.children`` methods. + """ + if name == 'teacher_model': + object.__setattr__(self, name, value) + else: + super().__setattr__(name, value) diff --git a/head_extractor/build/lib/mmdet/models/detectors/lad.py b/head_extractor/build/lib/mmdet/models/detectors/lad.py new file mode 100644 index 0000000000000000000000000000000000000000..008f898772988715c67783d9218ff39c4dd95d80 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/lad.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch +import torch.nn as nn +from mmengine.runner import load_checkpoint +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType +from ..utils.misc import unpack_gt_instances +from .kd_one_stage import KnowledgeDistillationSingleStageDetector + + +@MODELS.register_module() +class LAD(KnowledgeDistillationSingleStageDetector): + """Implementation of `LAD `_.""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + teacher_backbone: ConfigType, + teacher_neck: ConfigType, + teacher_bbox_head: ConfigType, + teacher_ckpt: Optional[str] = None, + eval_teacher: bool = True, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None) -> None: + super(KnowledgeDistillationSingleStageDetector, self).__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor) + self.eval_teacher = eval_teacher + self.teacher_model = nn.Module() + self.teacher_model.backbone = MODELS.build(teacher_backbone) + if teacher_neck is not None: + self.teacher_model.neck = MODELS.build(teacher_neck) + teacher_bbox_head.update(train_cfg=train_cfg) + teacher_bbox_head.update(test_cfg=test_cfg) + self.teacher_model.bbox_head = MODELS.build(teacher_bbox_head) + if teacher_ckpt is not None: + load_checkpoint( + self.teacher_model, teacher_ckpt, map_location='cpu') + + @property + def with_teacher_neck(self) -> bool: + """bool: whether the detector has a teacher_neck""" + return hasattr(self.teacher_model, 'neck') and \ + self.teacher_model.neck is not None + + def extract_teacher_feat(self, batch_inputs: Tensor) -> Tensor: + """Directly extract teacher features from the backbone+neck.""" + x = self.teacher_model.backbone(batch_inputs) + if self.with_teacher_neck: + x = self.teacher_model.neck(x) + return x + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + # get label assignment from the teacher + with torch.no_grad(): + x_teacher = self.extract_teacher_feat(batch_inputs) + outs_teacher = self.teacher_model.bbox_head(x_teacher) + label_assignment_results = \ + self.teacher_model.bbox_head.get_label_assignment( + *outs_teacher, batch_gt_instances, batch_img_metas, + batch_gt_instances_ignore) + + # the student use the label assignment from the teacher to learn + x = self.extract_feat(batch_inputs) + losses = self.bbox_head.loss(x, label_assignment_results, + batch_data_samples) + return losses diff --git a/head_extractor/build/lib/mmdet/models/detectors/mask2former.py b/head_extractor/build/lib/mmdet/models/detectors/mask2former.py new file mode 100644 index 0000000000000000000000000000000000000000..4f38ef44e482039fdf7476d048eee5df2a96fd9b --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/mask2former.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
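The LAD training loop above reduces to: the teacher produces only a label assignment, under `torch.no_grad()`, and gradients flow through the student alone. A schematic sketch with illustrative callables standing in for the real heads:

import torch

def lad_loss(student_forward, teacher_forward, get_label_assignment,
             batch_inputs: torch.Tensor) -> torch.Tensor:
    with torch.no_grad():                       # teacher builds no graph
        teacher_outs = teacher_forward(batch_inputs)
        assignment = get_label_assignment(teacher_outs)
    student_outs = student_forward(batch_inputs)
    # stand-in loss: supervise the student using the teacher's assignment
    return (student_outs - assignment).pow(2).mean()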
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .maskformer import MaskFormer + + +@MODELS.register_module() +class Mask2Former(MaskFormer): + r"""Implementation of `Masked-attention Mask + Transformer for Universal Image Segmentation + `_.""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + panoptic_head: OptConfigType = None, + panoptic_fusion_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + panoptic_head=panoptic_head, + panoptic_fusion_head=panoptic_fusion_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/mask_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/mask_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..880ee1e8ac3926d618ef47985549d3214175ee73 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/mask_rcnn.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import ConfigDict + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class MaskRCNN(TwoStageDetector): + """Implementation of `Mask R-CNN `_""" + + def __init__(self, + backbone: ConfigDict, + rpn_head: ConfigDict, + roi_head: ConfigDict, + train_cfg: ConfigDict, + test_cfg: ConfigDict, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/head_extractor/build/lib/mmdet/models/detectors/mask_scoring_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/mask_scoring_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..e09d3a1041f929113962e42bdf8b169e52dabe25 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/mask_scoring_rcnn.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class MaskScoringRCNN(TwoStageDetector): + """Mask Scoring RCNN. + + https://arxiv.org/abs/1903.00241 + """ + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/maskformer.py b/head_extractor/build/lib/mmdet/models/detectors/maskformer.py new file mode 100644 index 0000000000000000000000000000000000000000..7493c00e1b87cf9b2fbd2c80f1e642f6eb2bea55 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/maskformer.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
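Thin wrappers like `Mask2Former` and `MaskRCNN` above exist mainly so the registry can build them from config dicts; a toy round-trip using mmengine's `Registry` (assumed available):

from mmengine.registry import Registry

TOY = Registry('toy')

@TOY.register_module()
class Thin:
    def __init__(self, depth: int = 1) -> None:
        self.depth = depth

obj = TOY.build(dict(type='Thin', depth=3))
assert isinstance(obj, Thin) and obj.depth == 3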
+from typing import Dict, List, Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class MaskFormer(SingleStageDetector): + r"""Implementation of `Per-Pixel Classification is + NOT All You Need for Semantic Segmentation + `_.""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + panoptic_head: OptConfigType = None, + panoptic_fusion_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(SingleStageDetector, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + + panoptic_head_ = panoptic_head.deepcopy() + panoptic_head_.update(train_cfg=train_cfg) + panoptic_head_.update(test_cfg=test_cfg) + self.panoptic_head = MODELS.build(panoptic_head_) + + panoptic_fusion_head_ = panoptic_fusion_head.deepcopy() + panoptic_fusion_head_.update(test_cfg=test_cfg) + self.panoptic_fusion_head = MODELS.build(panoptic_fusion_head_) + + self.num_things_classes = self.panoptic_head.num_things_classes + self.num_stuff_classes = self.panoptic_head.num_stuff_classes + self.num_classes = self.panoptic_head.num_classes + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Dict[str, Tensor]: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + x = self.extract_feat(batch_inputs) + losses = self.panoptic_head.loss(x, batch_data_samples) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances' and `pred_panoptic_seg`. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + + And the ``pred_panoptic_seg`` contains the following key + + - sem_seg (Tensor): panoptic segmentation mask, has a + shape (1, h, w). 
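The constructor above copies each head config before injecting `train_cfg`/`test_cfg`, so the caller's config is never mutated; a sketch with `copy.deepcopy` standing in for the ConfigDict `.deepcopy()` method used in the patch:

import copy

panoptic_head = dict(type='MaskFormerHead', num_things_classes=80)
head_cfg = copy.deepcopy(panoptic_head)
head_cfg.update(train_cfg=None, test_cfg=dict(max_per_image=100))
assert 'test_cfg' not in panoptic_head  # the original config is untouched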
+ """ + feats = self.extract_feat(batch_inputs) + mask_cls_results, mask_pred_results = self.panoptic_head.predict( + feats, batch_data_samples) + results_list = self.panoptic_fusion_head.predict( + mask_cls_results, + mask_pred_results, + batch_data_samples, + rescale=rescale) + results = self.add_pred_to_datasample(batch_data_samples, results_list) + + return results + + def add_pred_to_datasample(self, data_samples: SampleList, + results_list: List[dict]) -> SampleList: + """Add predictions to `DetDataSample`. + + Args: + data_samples (list[:obj:`DetDataSample`], optional): A batch of + data samples that contain annotations and predictions. + results_list (List[dict]): Instance segmentation, segmantic + segmentation and panoptic segmentation results. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances' and `pred_panoptic_seg`. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + + And the ``pred_panoptic_seg`` contains the following key + + - sem_seg (Tensor): panoptic segmentation mask, has a + shape (1, h, w). + """ + for data_sample, pred_results in zip(data_samples, results_list): + if 'pan_results' in pred_results: + data_sample.pred_panoptic_seg = pred_results['pan_results'] + + if 'ins_results' in pred_results: + data_sample.pred_instances = pred_results['ins_results'] + + assert 'sem_results' not in pred_results, 'segmantic ' \ + 'segmentation results are not supported yet.' + + return data_samples + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Tuple[List[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + tuple[List[Tensor]]: A tuple of features from ``panoptic_head`` + forward. + """ + feats = self.extract_feat(batch_inputs) + results = self.panoptic_head.forward(feats, batch_data_samples) + return results diff --git a/head_extractor/build/lib/mmdet/models/detectors/nasfcos.py b/head_extractor/build/lib/mmdet/models/detectors/nasfcos.py new file mode 100644 index 0000000000000000000000000000000000000000..da2b911bcfc6b0ba51b00d9b3948a3df7af2e74f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/nasfcos.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class NASFCOS(SingleStageDetector): + """Implementation of `NAS-FCOS: Fast Neural Architecture Search for Object + Detection. `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of NASFCOS. Defaults to None. 
+ test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of NASFCOS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/paa.py b/head_extractor/build/lib/mmdet/models/detectors/paa.py new file mode 100644 index 0000000000000000000000000000000000000000..094306b2fbd18ba45536470ec80443e4ff793e67 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/paa.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class PAA(SingleStageDetector): + """Implementation of `PAA `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of PAA. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of PAA. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/panoptic_fpn.py b/head_extractor/build/lib/mmdet/models/detectors/panoptic_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..ae63ccc38931daa60b4e62f94dcf9f44574d3669 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/panoptic_fpn.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .panoptic_two_stage_segmentor import TwoStagePanopticSegmentor + + +@MODELS.register_module() +class PanopticFPN(TwoStagePanopticSegmentor): + r"""Implementation of `Panoptic feature pyramid + networks `_""" + + def __init__( + self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + # for panoptic segmentation + semantic_head: OptConfigType = None, + panoptic_fusion_head: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg, + semantic_head=semantic_head, + panoptic_fusion_head=panoptic_fusion_head) diff --git a/head_extractor/build/lib/mmdet/models/detectors/panoptic_two_stage_segmentor.py b/head_extractor/build/lib/mmdet/models/detectors/panoptic_two_stage_segmentor.py new file mode 100644 index 0000000000000000000000000000000000000000..879edbe1ac6a0f482fdd740f4058e508e728414d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/panoptic_two_stage_segmentor.py @@ -0,0 +1,234 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List + +import torch +from mmengine.structures import PixelData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class TwoStagePanopticSegmentor(TwoStageDetector): + """Base class of Two-stage Panoptic Segmentor. + + As well as the components in TwoStageDetector, Panoptic Segmentor has extra + semantic_head and panoptic_fusion_head. 
+ """ + + def __init__( + self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + # for panoptic segmentation + semantic_head: OptConfigType = None, + panoptic_fusion_head: OptConfigType = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + if semantic_head is not None: + self.semantic_head = MODELS.build(semantic_head) + + if panoptic_fusion_head is not None: + panoptic_cfg = test_cfg.panoptic if test_cfg is not None else None + panoptic_fusion_head_ = panoptic_fusion_head.deepcopy() + panoptic_fusion_head_.update(test_cfg=panoptic_cfg) + self.panoptic_fusion_head = MODELS.build(panoptic_fusion_head_) + + self.num_things_classes = self.panoptic_fusion_head.\ + num_things_classes + self.num_stuff_classes = self.panoptic_fusion_head.\ + num_stuff_classes + self.num_classes = self.panoptic_fusion_head.num_classes + + @property + def with_semantic_head(self) -> bool: + """bool: whether the detector has semantic head""" + return hasattr(self, + 'semantic_head') and self.semantic_head is not None + + @property + def with_panoptic_fusion_head(self) -> bool: + """bool: whether the detector has panoptic fusion head""" + return hasattr(self, 'panoptic_fusion_head') and \ + self.panoptic_fusion_head is not None + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + + losses = dict() + + # RPN forward and loss + if self.with_rpn: + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + rpn_data_samples = copy.deepcopy(batch_data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in list(keys): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + else: + # TODO: Not support currently, should have a check at Fast R-CNN + assert batch_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + roi_losses = self.roi_head.loss(x, rpn_results_list, + batch_data_samples) + losses.update(roi_losses) + + semantic_loss = self.semantic_head.loss(x, batch_data_samples) + losses.update(semantic_loss) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. 
+ + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + List[:obj:`DetDataSample`]: Return the packed panoptic segmentation + results of input images. Each DetDataSample usually contains + 'pred_panoptic_seg'. And the 'pred_panoptic_seg' has a key + ``sem_seg``, which is a tensor of shape (1, h, w). + """ + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + x = self.extract_feat(batch_inputs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + rpn_results_list = self.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + results_list = self.roi_head.predict( + x, rpn_results_list, batch_data_samples, rescale=rescale) + + seg_preds = self.semantic_head.predict(x, batch_img_metas, rescale) + + results_list = self.panoptic_fusion_head.predict( + results_list, seg_preds) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + # TODO the code has not been verified and needs to be refactored later. + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + + Returns: + tuple: A tuple of features from ``rpn_head``, ``roi_head`` and + ``semantic_head`` forward. + """ + results = () + x = self.extract_feat(batch_inputs) + rpn_outs = self.rpn_head.forward(x) + results = results + (rpn_outs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + rpn_results_list = self.rpn_head.predict_by_feat( + *rpn_outs, batch_img_metas=batch_img_metas, rescale=False) + else: + # TODO: Not checked currently. + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + # roi_head + roi_outs = self.roi_head(x, rpn_results_list) + results = results + (roi_outs) + + # semantic_head + sem_outs = self.semantic_head.forward(x) + results = results + (sem_outs['seg_preds'], ) + + return results + + def add_pred_to_datasample(self, data_samples: SampleList, + results_list: List[PixelData]) -> SampleList: + """Add predictions to `DetDataSample`. + + Args: + data_samples (list[:obj:`DetDataSample`]): The + annotation data of every samples. + results_list (List[PixelData]): Panoptic segmentation results of + each image. + + Returns: + List[:obj:`DetDataSample`]: Return the packed panoptic segmentation + results of input images. Each DetDataSample usually contains + 'pred_panoptic_seg'. And the 'pred_panoptic_seg' has a key + ``sem_seg``, which is a tensor of shape (1, h, w). 
+ """ + + for data_sample, pred_panoptic_seg in zip(data_samples, results_list): + data_sample.pred_panoptic_seg = pred_panoptic_seg + return data_samples diff --git a/head_extractor/build/lib/mmdet/models/detectors/point_rend.py b/head_extractor/build/lib/mmdet/models/detectors/point_rend.py new file mode 100644 index 0000000000000000000000000000000000000000..5062ac0c945e79bd53e66e1642aec51113475cad --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/point_rend.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.config import ConfigDict + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class PointRend(TwoStageDetector): + """PointRend: Image Segmentation as Rendering + + This detector is the implementation of + `PointRend `_. + + """ + + def __init__(self, + backbone: ConfigDict, + rpn_head: ConfigDict, + roi_head: ConfigDict, + train_cfg: ConfigDict, + test_cfg: ConfigDict, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + data_preprocessor=data_preprocessor) diff --git a/head_extractor/build/lib/mmdet/models/detectors/queryinst.py b/head_extractor/build/lib/mmdet/models/detectors/queryinst.py new file mode 100644 index 0000000000000000000000000000000000000000..400ce20c01f5c3825e343f2d32accf740c5dd55c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/queryinst.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .sparse_rcnn import SparseRCNN + + +@MODELS.register_module() +class QueryInst(SparseRCNN): + r"""Implementation of + `Instances as Queries `_""" + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/reppoints_detector.py b/head_extractor/build/lib/mmdet/models/detectors/reppoints_detector.py new file mode 100644 index 0000000000000000000000000000000000000000..d86cec2ecda0671939e227c50f00379e81d3ac9c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/reppoints_detector.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RepPointsDetector(SingleStageDetector): + """RepPoints: Point Set Representation for Object Detection. 
+ + This detector is the implementation of: + - RepPoints detector (https://arxiv.org/pdf/1904.11490) + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/retinanet.py b/head_extractor/build/lib/mmdet/models/detectors/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..03e3cb20e5bda603e9384d83688a56fa590e6de8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/retinanet.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RetinaNet(SingleStageDetector): + """Implementation of `RetinaNet `_""" + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/rpn.py b/head_extractor/build/lib/mmdet/models/detectors/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..72fe8521fcc9bc796801b2dd68269bb57aaab984 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/rpn.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RPN(SingleStageDetector): + """Implementation of Region Proposal Network. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + rpn_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(SingleStageDetector, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + self.neck = MODELS.build(neck) if neck is not None else None + rpn_train_cfg = train_cfg['rpn'] if train_cfg is not None else None + rpn_head_num_classes = rpn_head.get('num_classes', 1) + if rpn_head_num_classes != 1: + warnings.warn('The `num_classes` should be 1 in RPN, but get ' + f'{rpn_head_num_classes}, please set ' + 'rpn_head.num_classes = 1 in your config file.') + rpn_head.update(num_classes=1) + rpn_head.update(train_cfg=rpn_train_cfg) + rpn_head.update(test_cfg=test_cfg['rpn']) + self.bbox_head = MODELS.build(rpn_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + + # set cat_id of gt_labels to 0 in RPN + rpn_data_samples = copy.deepcopy(batch_data_samples) + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + losses = self.bbox_head.loss(x, rpn_data_samples) + return losses diff --git a/head_extractor/build/lib/mmdet/models/detectors/rtmdet.py b/head_extractor/build/lib/mmdet/models/detectors/rtmdet.py new file mode 100644 index 0000000000000000000000000000000000000000..b43e053fc41a4b8400bbc0946fffedfa735b9451 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/rtmdet.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.dist import get_world_size +from mmengine.logging import print_log + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class RTMDet(SingleStageDetector): + """Implementation of RTMDet. + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of ATSS. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of ATSS. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + use_syncbn (bool): Whether to use SyncBatchNorm. Defaults to True. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None, + use_syncbn: bool = True) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + # TODO: Waiting for mmengine support + if use_syncbn and get_world_size() > 1: + torch.nn.SyncBatchNorm.convert_sync_batchnorm(self) + print_log('Using SyncBatchNorm()', 'current') diff --git a/head_extractor/build/lib/mmdet/models/detectors/scnet.py b/head_extractor/build/lib/mmdet/models/detectors/scnet.py new file mode 100644 index 0000000000000000000000000000000000000000..606a0203869f1731a21d811f06c4781f5cd90d8d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/scnet.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from .cascade_rcnn import CascadeRCNN + + +@MODELS.register_module() +class SCNet(CascadeRCNN): + """Implementation of `SCNet `_""" + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) diff --git a/head_extractor/build/lib/mmdet/models/detectors/semi_base.py b/head_extractor/build/lib/mmdet/models/detectors/semi_base.py new file mode 100644 index 0000000000000000000000000000000000000000..f3f0c8c030830e188bf3ad245d5b3cb471ecb04f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/semi_base.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.models.utils import (filter_gt_instances, rename_loss_dict, + reweight_loss_dict) +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_project +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + + +@MODELS.register_module() +class SemiBaseDetector(BaseDetector): + """Base class for semi-supervised detectors. + + Semi-supervised detectors typically consisting of a teacher model + updated by exponential moving average and a student model updated + by gradient descent. + + Args: + detector (:obj:`ConfigDict` or dict): The detector config. + semi_train_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised training config. + semi_test_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised testing config. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. 
+ """ + + def __init__(self, + detector: ConfigType, + semi_train_cfg: OptConfigType = None, + semi_test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.student = MODELS.build(detector) + self.teacher = MODELS.build(detector) + self.semi_train_cfg = semi_train_cfg + self.semi_test_cfg = semi_test_cfg + if self.semi_train_cfg.get('freeze_teacher', True) is True: + self.freeze(self.teacher) + + @staticmethod + def freeze(model: nn.Module): + """Freeze the model.""" + model.eval() + for param in model.parameters(): + param.requires_grad = False + + def loss(self, multi_batch_inputs: Dict[str, Tensor], + multi_batch_data_samples: Dict[str, SampleList]) -> dict: + """Calculate losses from multi-branch inputs and data samples. + + Args: + multi_batch_inputs (Dict[str, Tensor]): The dict of multi-branch + input images, each value with shape (N, C, H, W). + Each value should usually be mean centered and std scaled. + multi_batch_data_samples (Dict[str, List[:obj:`DetDataSample`]]): + The dict of multi-branch data samples. + + Returns: + dict: A dictionary of loss components + """ + losses = dict() + losses.update(**self.loss_by_gt_instances( + multi_batch_inputs['sup'], multi_batch_data_samples['sup'])) + + origin_pseudo_data_samples, batch_info = self.get_pseudo_instances( + multi_batch_inputs['unsup_teacher'], + multi_batch_data_samples['unsup_teacher']) + multi_batch_data_samples[ + 'unsup_student'] = self.project_pseudo_instances( + origin_pseudo_data_samples, + multi_batch_data_samples['unsup_student']) + losses.update(**self.loss_by_pseudo_instances( + multi_batch_inputs['unsup_student'], + multi_batch_data_samples['unsup_student'], batch_info)) + return losses + + def loss_by_gt_instances(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and ground-truth data + samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components + """ + + losses = self.student.loss(batch_inputs, batch_data_samples) + sup_weight = self.semi_train_cfg.get('sup_weight', 1.) + return rename_loss_dict('sup_', reweight_loss_dict(losses, sup_weight)) + + def loss_by_pseudo_instances(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + batch_info: Optional[dict] = None) -> dict: + """Calculate losses from a batch of inputs and pseudo data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + batch_info (dict): Batch information of teacher model + forward propagation process. Defaults to None. 
+ + Returns: + dict: A dictionary of loss components + """ + batch_data_samples = filter_gt_instances( + batch_data_samples, score_thr=self.semi_train_cfg.cls_pseudo_thr) + losses = self.student.loss(batch_inputs, batch_data_samples) + pseudo_instances_num = sum([ + len(data_samples.gt_instances) + for data_samples in batch_data_samples + ]) + unsup_weight = self.semi_train_cfg.get( + 'unsup_weight', 1.) if pseudo_instances_num > 0 else 0. + return rename_loss_dict('unsup_', + reweight_loss_dict(losses, unsup_weight)) + + @torch.no_grad() + def get_pseudo_instances( + self, batch_inputs: Tensor, batch_data_samples: SampleList + ) -> Tuple[SampleList, Optional[dict]]: + """Get pseudo instances from teacher model.""" + self.teacher.eval() + results_list = self.teacher.predict( + batch_inputs, batch_data_samples, rescale=False) + batch_info = {} + for data_samples, results in zip(batch_data_samples, results_list): + data_samples.gt_instances = results.pred_instances + data_samples.gt_instances.bboxes = bbox_project( + data_samples.gt_instances.bboxes, + torch.from_numpy(data_samples.homography_matrix).inverse().to( + self.data_preprocessor.device), data_samples.ori_shape) + return batch_data_samples, batch_info + + def project_pseudo_instances(self, batch_pseudo_instances: SampleList, + batch_data_samples: SampleList) -> SampleList: + """Project pseudo instances.""" + for pseudo_instances, data_samples in zip(batch_pseudo_instances, + batch_data_samples): + data_samples.gt_instances = copy.deepcopy( + pseudo_instances.gt_instances) + data_samples.gt_instances.bboxes = bbox_project( + data_samples.gt_instances.bboxes, + torch.tensor(data_samples.homography_matrix).to( + self.data_preprocessor.device), data_samples.img_shape) + wh_thr = self.semi_train_cfg.get('min_pseudo_bbox_wh', (1e-2, 1e-2)) + return filter_gt_instances(batch_data_samples, wh_thr=wh_thr) + + def predict(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Return the detection results of the + input images. The returns value is DetDataSample, + which usually contain 'pred_instances'. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + if self.semi_test_cfg.get('predict_on', 'teacher') == 'teacher': + return self.teacher( + batch_inputs, batch_data_samples, mode='predict') + else: + return self.student( + batch_inputs, batch_data_samples, mode='predict') + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> SampleList: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + + Returns: + tuple: A tuple of features from ``rpn_head`` and ``roi_head`` + forward. 
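`bbox_project` above maps pseudo boxes between the teacher's and student's augmented views with a 3x3 homography; a hand-rolled sketch of the geometry (assuming a well-behaved homography with positive denominators):

import torch

def project_corners(bboxes: torch.Tensor, H: torch.Tensor) -> torch.Tensor:
    x1, y1, x2, y2 = bboxes.unbind(-1)
    corners = torch.stack([
        torch.stack([x1, y1], -1), torch.stack([x2, y1], -1),
        torch.stack([x1, y2], -1), torch.stack([x2, y2], -1)], dim=1)
    ones = torch.ones(*corners.shape[:-1], 1)
    pts = torch.cat([corners, ones], -1) @ H.T       # homogeneous transform
    pts = pts[..., :2] / pts[..., 2:].clamp(min=1e-6)
    # re-fit an axis-aligned box around the projected corners
    return torch.cat([pts.min(dim=1).values, pts.max(dim=1).values], -1)

H = torch.eye(3)
H[0, 2] = 10.0  # pure x-translation
out = project_corners(torch.tensor([[0., 0., 4., 4.]]), H)
assert torch.allclose(out, torch.tensor([[10., 0., 14., 4.]]))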
+ """ + if self.semi_test_cfg.get('forward_on', 'teacher') == 'teacher': + return self.teacher( + batch_inputs, batch_data_samples, mode='tensor') + else: + return self.student( + batch_inputs, batch_data_samples, mode='tensor') + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have + different resolutions. + """ + if self.semi_test_cfg.get('extract_feat_on', 'teacher') == 'teacher': + return self.teacher.extract_feat(batch_inputs) + else: + return self.student.extract_feat(batch_inputs) + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Add teacher and student prefixes to model parameter names.""" + if not any([ + 'student' in key or 'teacher' in key + for key in state_dict.keys() + ]): + keys = list(state_dict.keys()) + state_dict.update({'teacher.' + k: state_dict[k] for k in keys}) + state_dict.update({'student.' + k: state_dict[k] for k in keys}) + for k in keys: + state_dict.pop(k) + return super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) diff --git a/head_extractor/build/lib/mmdet/models/detectors/single_stage.py b/head_extractor/build/lib/mmdet/models/detectors/single_stage.py new file mode 100644 index 0000000000000000000000000000000000000000..06c074085967bbc9040d93e5eb446b67a006087e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/single_stage.py @@ -0,0 +1,149 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + + +@MODELS.register_module() +class SingleStageDetector(BaseDetector): + """Base class for single-stage detectors. + + Single-stage detectors directly and densely predict bounding boxes on the + output features of the backbone+neck. 
+ """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = MODELS.build(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def _load_from_state_dict(self, state_dict: dict, prefix: str, + local_metadata: dict, strict: bool, + missing_keys: Union[List[str], str], + unexpected_keys: Union[List[str], str], + error_msgs: Union[List[str], str]) -> None: + """Exchange bbox_head key to rpn_head key when loading two-stage + weights into single-stage model.""" + bbox_head_prefix = prefix + '.bbox_head' if prefix else 'bbox_head' + bbox_head_keys = [ + k for k in state_dict.keys() if k.startswith(bbox_head_prefix) + ] + rpn_head_prefix = prefix + '.rpn_head' if prefix else 'rpn_head' + rpn_head_keys = [ + k for k in state_dict.keys() if k.startswith(rpn_head_prefix) + ] + if len(bbox_head_keys) == 0 and len(rpn_head_keys) != 0: + for rpn_head_key in rpn_head_keys: + bbox_head_key = bbox_head_prefix + \ + rpn_head_key[len(rpn_head_prefix):] + state_dict[bbox_head_key] = state_dict.pop(rpn_head_key) + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> Union[dict, list]: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + losses = self.bbox_head.loss(x, batch_data_samples) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + x = self.extract_feat(batch_inputs) + results_list = self.bbox_head.predict( + x, batch_data_samples, rescale=rescale) + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples + + def _forward( + self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns: + tuple[list]: A tuple of features from ``bbox_head`` forward. + """ + x = self.extract_feat(batch_inputs) + results = self.bbox_head.forward(x) + return results + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have + different resolutions. + """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x diff --git a/head_extractor/build/lib/mmdet/models/detectors/single_stage_instance_seg.py b/head_extractor/build/lib/mmdet/models/detectors/single_stage_instance_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..acb5f0d2f8e4636b86b4b66cbf5c4916d0dae16f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/single_stage_instance_seg.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptSampleList, SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .base import BaseDetector + +INF = 1e8 + + +@MODELS.register_module() +class SingleStageInstanceSegmentor(BaseDetector): + """Base class for single-stage instance segmentors.""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + mask_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + else: + self.neck = None + if bbox_head is not None: + bbox_head.update(train_cfg=copy.deepcopy(train_cfg)) + bbox_head.update(test_cfg=copy.deepcopy(test_cfg)) + self.bbox_head = MODELS.build(bbox_head) + else: + self.bbox_head = None + + assert mask_head, f'`mask_head` must ' \ + f'be implemented in {self.__class__.__name__}' + mask_head.update(train_cfg=copy.deepcopy(train_cfg)) + mask_head.update(test_cfg=copy.deepcopy(test_cfg)) + self.mask_head = MODELS.build(mask_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]: + """Extract features. + + Args: + batch_inputs (Tensor): Image tensor with shape (N, C, H ,W). + + Returns: + tuple[Tensor]: Multi-level features that may have different + resolutions. 
+ """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x + + def _forward(self, + batch_inputs: Tensor, + batch_data_samples: OptSampleList = None, + **kwargs) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + + Returns: + tuple: A tuple of features from ``bbox_head`` forward. + """ + outs = () + # backbone + x = self.extract_feat(batch_inputs) + # bbox_head + positive_infos = None + if self.with_bbox: + assert batch_data_samples is not None + bbox_outs = self.bbox_head.forward(x) + outs = outs + (bbox_outs, ) + # It is necessary to use `bbox_head.loss` to update + # `_raw_positive_infos` which will be used in `get_positive_infos` + # positive_infos will be used in the following mask head. + _ = self.bbox_head.loss(x, batch_data_samples, **kwargs) + positive_infos = self.bbox_head.get_positive_infos() + # mask_head + if positive_infos is None: + mask_outs = self.mask_head.forward(x) + else: + mask_outs = self.mask_head.forward(x, positive_infos) + outs = outs + (mask_outs, ) + return outs + + def loss(self, batch_inputs: Tensor, batch_data_samples: SampleList, + **kwargs) -> dict: + """ + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components. + """ + x = self.extract_feat(batch_inputs) + losses = dict() + + positive_infos = None + # CondInst and YOLACT have bbox_head + if self.with_bbox: + bbox_losses = self.bbox_head.loss(x, batch_data_samples, **kwargs) + losses.update(bbox_losses) + # get positive information from bbox head, which will be used + # in the following mask head. + positive_infos = self.bbox_head.get_positive_infos() + + mask_loss = self.mask_head.loss( + x, batch_data_samples, positive_infos=positive_infos, **kwargs) + # avoid loss override + assert not set(mask_loss.keys()) & set(losses.keys()) + + losses.update(mask_loss) + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True, + **kwargs) -> SampleList: + """Perform forward propagation of the mask head and predict mask + results on the features of the upstream network. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to False. + + Returns: + list[:obj:`DetDataSample`]: Detection results of the + input images. Each DetDataSample usually contain + 'pred_instances'. And the ``pred_instances`` usually + contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + x = self.extract_feat(batch_inputs) + if self.with_bbox: + # the bbox branch does not need to be scaled to the original + # image scale, because the mask branch will scale both bbox + # and mask at the same time. 
+ bbox_rescale = rescale if not self.with_mask else False + results_list = self.bbox_head.predict( + x, batch_data_samples, rescale=bbox_rescale) + else: + results_list = None + + results_list = self.mask_head.predict( + x, batch_data_samples, rescale=rescale, results_list=results_list) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples diff --git a/head_extractor/build/lib/mmdet/models/detectors/soft_teacher.py b/head_extractor/build/lib/mmdet/models/detectors/soft_teacher.py new file mode 100644 index 0000000000000000000000000000000000000000..80853f1d8399c70008923067777a2581671ede0b --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/soft_teacher.py @@ -0,0 +1,378 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import List, Optional, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.utils import (filter_gt_instances, rename_loss_dict, + reweight_loss_dict) +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi, bbox_project +from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig +from ..utils.misc import unpack_gt_instances +from .semi_base import SemiBaseDetector + + +@MODELS.register_module() +class SoftTeacher(SemiBaseDetector): + r"""Implementation of `End-to-End Semi-Supervised Object Detection + with Soft Teacher `_ + + Args: + detector (:obj:`ConfigDict` or dict): The detector config. + semi_train_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised training config. + semi_test_cfg (:obj:`ConfigDict` or dict, optional): + The semi-supervised testing config. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + detector: ConfigType, + semi_train_cfg: OptConfigType = None, + semi_test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + detector=detector, + semi_train_cfg=semi_train_cfg, + semi_test_cfg=semi_test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + + def loss_by_pseudo_instances(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + batch_info: Optional[dict] = None) -> dict: + """Calculate losses from a batch of inputs and pseudo data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + batch_info (dict): Batch information of teacher model + forward propagation process. Defaults to None. 
+ + Returns: + dict: A dictionary of loss components + """ + + x = self.student.extract_feat(batch_inputs) + + losses = {} + rpn_losses, rpn_results_list = self.rpn_loss_by_pseudo_instances( + x, batch_data_samples) + losses.update(**rpn_losses) + losses.update(**self.rcnn_cls_loss_by_pseudo_instances( + x, rpn_results_list, batch_data_samples, batch_info)) + losses.update(**self.rcnn_reg_loss_by_pseudo_instances( + x, rpn_results_list, batch_data_samples)) + unsup_weight = self.semi_train_cfg.get('unsup_weight', 1.) + return rename_loss_dict('unsup_', + reweight_loss_dict(losses, unsup_weight)) + + @torch.no_grad() + def get_pseudo_instances( + self, batch_inputs: Tensor, batch_data_samples: SampleList + ) -> Tuple[SampleList, Optional[dict]]: + """Get pseudo instances from teacher model.""" + assert self.teacher.with_bbox, 'Bbox head must be implemented.' + x = self.teacher.extract_feat(batch_inputs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + rpn_results_list = self.teacher.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + results_list = self.teacher.roi_head.predict( + x, rpn_results_list, batch_data_samples, rescale=False) + + for data_samples, results in zip(batch_data_samples, results_list): + data_samples.gt_instances = results + + batch_data_samples = filter_gt_instances( + batch_data_samples, + score_thr=self.semi_train_cfg.pseudo_label_initial_score_thr) + + reg_uncs_list = self.compute_uncertainty_with_aug( + x, batch_data_samples) + + for data_samples, reg_uncs in zip(batch_data_samples, reg_uncs_list): + data_samples.gt_instances['reg_uncs'] = reg_uncs + data_samples.gt_instances.bboxes = bbox_project( + data_samples.gt_instances.bboxes, + torch.from_numpy(data_samples.homography_matrix).inverse().to( + self.data_preprocessor.device), data_samples.ori_shape) + + batch_info = { + 'feat': x, + 'img_shape': [], + 'homography_matrix': [], + 'metainfo': [] + } + for data_samples in batch_data_samples: + batch_info['img_shape'].append(data_samples.img_shape) + batch_info['homography_matrix'].append( + torch.from_numpy(data_samples.homography_matrix).to( + self.data_preprocessor.device)) + batch_info['metainfo'].append(data_samples.metainfo) + return batch_data_samples, batch_info + + def rpn_loss_by_pseudo_instances(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + """Calculate rpn loss from a batch of inputs and pseudo data samples. + + Args: + x (tuple[Tensor]): Features from FPN. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. 
+ Returns: + dict: A dictionary of rpn loss components + """ + + rpn_data_samples = copy.deepcopy(batch_data_samples) + rpn_data_samples = filter_gt_instances( + rpn_data_samples, score_thr=self.semi_train_cfg.rpn_pseudo_thr) + proposal_cfg = self.student.train_cfg.get('rpn_proposal', + self.student.test_cfg.rpn) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.student.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + for key in rpn_losses.keys(): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + return rpn_losses, rpn_results_list + + def rcnn_cls_loss_by_pseudo_instances(self, x: Tuple[Tensor], + unsup_rpn_results_list: InstanceList, + batch_data_samples: SampleList, + batch_info: dict) -> dict: + """Calculate classification loss from a batch of inputs and pseudo data + samples. + + Args: + x (tuple[Tensor]): List of multi-level img features. + unsup_rpn_results_list (list[:obj:`InstanceData`]): + List of region proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + batch_info (dict): Batch information of teacher model + forward propagation process. + + Returns: + dict[str, Tensor]: A dictionary of rcnn + classification loss components + """ + rpn_results_list = copy.deepcopy(unsup_rpn_results_list) + cls_data_samples = copy.deepcopy(batch_data_samples) + cls_data_samples = filter_gt_instances( + cls_data_samples, score_thr=self.semi_train_cfg.cls_pseudo_thr) + + outputs = unpack_gt_instances(cls_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(cls_data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + assign_result = self.student.roi_head.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.student.roi_head.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + selected_bboxes = [res.priors for res in sampling_results] + rois = bbox2roi(selected_bboxes) + bbox_results = self.student.roi_head._bbox_forward(x, rois) + # cls_reg_targets is a tuple of labels, label_weights, + # and bbox_targets, bbox_weights + cls_reg_targets = self.student.roi_head.bbox_head.get_targets( + sampling_results, self.student.train_cfg.rcnn) + + selected_results_list = [] + for bboxes, data_samples, teacher_matrix, teacher_img_shape in zip( + selected_bboxes, batch_data_samples, + batch_info['homography_matrix'], batch_info['img_shape']): + student_matrix = torch.tensor( + data_samples.homography_matrix, device=teacher_matrix.device) + homography_matrix = teacher_matrix @ student_matrix.inverse() + projected_bboxes = bbox_project(bboxes, homography_matrix, + teacher_img_shape) + selected_results_list.append(InstanceData(bboxes=projected_bboxes)) + + with torch.no_grad(): + results_list = self.teacher.roi_head.predict_bbox( + batch_info['feat'], + batch_info['metainfo'], 
+ selected_results_list, + rcnn_test_cfg=None, + rescale=False) + bg_score = torch.cat( + [results.scores[:, -1] for results in results_list]) + # cls_reg_targets[0] is labels + neg_inds = cls_reg_targets[ + 0] == self.student.roi_head.bbox_head.num_classes + # cls_reg_targets[1] is label_weights + cls_reg_targets[1][neg_inds] = bg_score[neg_inds].detach() + + losses = self.student.roi_head.bbox_head.loss( + bbox_results['cls_score'], bbox_results['bbox_pred'], rois, + *cls_reg_targets) + # cls_reg_targets[1] is label_weights + losses['loss_cls'] = losses['loss_cls'] * len( + cls_reg_targets[1]) / max(sum(cls_reg_targets[1]), 1.0) + return losses + + def rcnn_reg_loss_by_pseudo_instances( + self, x: Tuple[Tensor], unsup_rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Calculate rcnn regression loss from a batch of inputs and pseudo + data samples. + + Args: + x (tuple[Tensor]): List of multi-level img features. + unsup_rpn_results_list (list[:obj:`InstanceData`]): + List of region proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + + Returns: + dict[str, Tensor]: A dictionary of rcnn + regression loss components + """ + rpn_results_list = copy.deepcopy(unsup_rpn_results_list) + reg_data_samples = copy.deepcopy(batch_data_samples) + for data_samples in reg_data_samples: + if data_samples.gt_instances.bboxes.shape[0] > 0: + data_samples.gt_instances = data_samples.gt_instances[ + data_samples.gt_instances.reg_uncs < + self.semi_train_cfg.reg_pseudo_thr] + roi_losses = self.student.roi_head.loss(x, rpn_results_list, + reg_data_samples) + return {'loss_bbox': roi_losses['loss_bbox']} + + def compute_uncertainty_with_aug( + self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> List[Tensor]: + """Compute uncertainty with augmented bboxes. + + Args: + x (tuple[Tensor]): List of multi-level img features. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`, + which are `pseudo_instance` or `pseudo_panoptic_seg` + or `pseudo_sem_seg` in fact. + + Returns: + list[Tensor]: A list of uncertainty for pseudo bboxes. 
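+
+        Examples:
+            >>> # Illustrative sketch of the uncertainty measure alone;
+            >>> # the real method also routes the jittered boxes through
+            >>> # the teacher's RoI head first.
+            >>> import torch
+            >>> preds = torch.rand(10, 5, 4)  # (jitter_times, boxes, xyxy)
+            >>> unc = preds.std(dim=0)
+            >>> mean_box = preds.mean(dim=0)
+            >>> wh = (mean_box[:, 2:4] - mean_box[:, :2]).clamp(min=1.0)
+            >>> unc = (unc / wh.repeat(1, 2)).mean(dim=-1)  # shape (5,)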
+ """ + auged_results_list = self.aug_box(batch_data_samples, + self.semi_train_cfg.jitter_times, + self.semi_train_cfg.jitter_scale) + # flatten + auged_results_list = [ + InstanceData(bboxes=auged.reshape(-1, auged.shape[-1])) + for auged in auged_results_list + ] + + self.teacher.roi_head.test_cfg = None + results_list = self.teacher.roi_head.predict( + x, auged_results_list, batch_data_samples, rescale=False) + self.teacher.roi_head.test_cfg = self.teacher.test_cfg.rcnn + + reg_channel = max( + [results.bboxes.shape[-1] for results in results_list]) // 4 + bboxes = [ + results.bboxes.reshape(self.semi_train_cfg.jitter_times, -1, + results.bboxes.shape[-1]) + if results.bboxes.numel() > 0 else results.bboxes.new_zeros( + self.semi_train_cfg.jitter_times, 0, 4 * reg_channel).float() + for results in results_list + ] + + box_unc = [bbox.std(dim=0) for bbox in bboxes] + bboxes = [bbox.mean(dim=0) for bbox in bboxes] + labels = [ + data_samples.gt_instances.labels + for data_samples in batch_data_samples + ] + if reg_channel != 1: + bboxes = [ + bbox.reshape(bbox.shape[0], reg_channel, + 4)[torch.arange(bbox.shape[0]), label] + for bbox, label in zip(bboxes, labels) + ] + box_unc = [ + unc.reshape(unc.shape[0], reg_channel, + 4)[torch.arange(unc.shape[0]), label] + for unc, label in zip(box_unc, labels) + ] + + box_shape = [(bbox[:, 2:4] - bbox[:, :2]).clamp(min=1.0) + for bbox in bboxes] + box_unc = [ + torch.mean( + unc / wh[:, None, :].expand(-1, 2, 2).reshape(-1, 4), dim=-1) + if wh.numel() > 0 else unc for unc, wh in zip(box_unc, box_shape) + ] + return box_unc + + @staticmethod + def aug_box(batch_data_samples, times, frac): + """Augment bboxes with jitter.""" + + def _aug_single(box): + box_scale = box[:, 2:4] - box[:, :2] + box_scale = ( + box_scale.clamp(min=1)[:, None, :].expand(-1, 2, + 2).reshape(-1, 4)) + aug_scale = box_scale * frac # [n,4] + + offset = ( + torch.randn(times, box.shape[0], 4, device=box.device) * + aug_scale[None, ...]) + new_box = box.clone()[None, ...].expand(times, box.shape[0], + -1) + offset + return new_box + + return [ + _aug_single(data_samples.gt_instances.bboxes) + for data_samples in batch_data_samples + ] diff --git a/head_extractor/build/lib/mmdet/models/detectors/solo.py b/head_extractor/build/lib/mmdet/models/detectors/solo.py new file mode 100644 index 0000000000000000000000000000000000000000..6bf47ba24941e09fd795b241a3f6aa0b67ae3380 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/solo.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class SOLO(SingleStageInstanceSegmentor): + """`SOLO: Segmenting Objects by Locations + `_ + + """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + mask_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/solov2.py b/head_extractor/build/lib/mmdet/models/detectors/solov2.py new file mode 100644 index 0000000000000000000000000000000000000000..1eefe4c532267be1480d13b8d73fc54bf694e81c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/solov2.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage_instance_seg import SingleStageInstanceSegmentor + + +@MODELS.register_module() +class SOLOv2(SingleStageInstanceSegmentor): + """`SOLOv2: Dynamic and Fast Instance Segmentation + `_ + + """ + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + bbox_head: OptConfigType = None, + mask_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + mask_head=mask_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/sparse_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/sparse_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..75442a69e472953854ded9fc8c30ac4ab30535d3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/sparse_rcnn.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .two_stage import TwoStageDetector + + +@MODELS.register_module() +class SparseRCNN(TwoStageDetector): + r"""Implementation of `Sparse R-CNN: End-to-End Object Detection with + Learnable Proposals `_""" + + def __init__(self, + backbone: ConfigType, + neck: OptConfigType = None, + rpn_head: OptConfigType = None, + roi_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + assert self.with_rpn, 'Sparse R-CNN and QueryInst ' \ + 'do not support external proposals' diff --git a/head_extractor/build/lib/mmdet/models/detectors/tood.py b/head_extractor/build/lib/mmdet/models/detectors/tood.py new file mode 100644 index 0000000000000000000000000000000000000000..38720482c5451471f5a66a6cf689dbed6100c9fa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/tood.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class TOOD(SingleStageDetector): + r"""Implementation of `TOOD: Task-aligned One-stage Object Detection. + `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of TOOD. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of TOOD. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/trident_faster_rcnn.py b/head_extractor/build/lib/mmdet/models/detectors/trident_faster_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..4244925beaebea820f836b41ab5463f5f499f4d0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/trident_faster_rcnn.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
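+# NOTE: TridentNet evaluates ``num_branch`` parallel branches by repeating
+# the per-image data samples, in sketch form:
+#
+#     num_branch = 3  # illustrative value
+#     trident_data_samples = batch_data_samples * num_branch  # list repeat
+#
+# At test time a single branch (``test_branch_idx``) may be used, in which
+# case the samples are not repeated.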
+from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .faster_rcnn import FasterRCNN + + +@MODELS.register_module() +class TridentFasterRCNN(FasterRCNN): + """Implementation of `TridentNet `_""" + + def __init__(self, + backbone: ConfigType, + rpn_head: ConfigType, + roi_head: ConfigType, + train_cfg: ConfigType, + test_cfg: ConfigType, + neck: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + + super().__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) + assert self.backbone.num_branch == self.roi_head.num_branch + assert self.backbone.test_branch_idx == self.roi_head.test_branch_idx + self.num_branch = self.backbone.num_branch + self.test_branch_idx = self.backbone.test_branch_idx + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> tuple: + """copy the ``batch_data_samples`` to fit multi-branch.""" + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + trident_data_samples = batch_data_samples * num_branch + return super()._forward( + batch_inputs=batch_inputs, batch_data_samples=trident_data_samples) + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """copy the ``batch_data_samples`` to fit multi-branch.""" + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + trident_data_samples = batch_data_samples * num_branch + return super().loss( + batch_inputs=batch_inputs, batch_data_samples=trident_data_samples) + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """copy the ``batch_data_samples`` to fit multi-branch.""" + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + trident_data_samples = batch_data_samples * num_branch + return super().predict( + batch_inputs=batch_inputs, + batch_data_samples=trident_data_samples, + rescale=rescale) + + # TODO need to refactor + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. + """ + x = self.extract_feats(imgs) + num_branch = (self.num_branch if self.test_branch_idx == -1 else 1) + trident_img_metas = [img_metas * num_branch for img_metas in img_metas] + proposal_list = self.rpn_head.aug_test_rpn(x, trident_img_metas) + return self.roi_head.aug_test( + x, proposal_list, img_metas, rescale=rescale) diff --git a/head_extractor/build/lib/mmdet/models/detectors/two_stage.py b/head_extractor/build/lib/mmdet/models/detectors/two_stage.py new file mode 100644 index 0000000000000000000000000000000000000000..4e83df9eb5ce837636e10c4592fe26a7edce1657 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/two_stage.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
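+# NOTE: in sketch form, the prediction path implemented below is:
+#
+#     x = self.extract_feat(batch_inputs)          # backbone (+ neck)
+#     rpn_results = self.rpn_head.predict(x, ...)  # class-agnostic proposals
+#     results = self.roi_head.predict(x, rpn_results, ...)  # per-RoI heads
+#
+# Pre-computed proposals attached to the data samples can replace the RPN
+# step.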
+import copy
+import warnings
+from typing import List, Tuple, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .base import BaseDetector
+
+
+@MODELS.register_module()
+class TwoStageDetector(BaseDetector):
+    """Base class for two-stage detectors.
+
+    Two-stage detectors typically consist of a region proposal network and a
+    task-specific regression head.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: OptConfigType = None,
+                 rpn_head: OptConfigType = None,
+                 roi_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
+        self.backbone = MODELS.build(backbone)
+
+        if neck is not None:
+            self.neck = MODELS.build(neck)
+
+        if rpn_head is not None:
+            rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None
+            rpn_head_ = rpn_head.copy()
+            rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn)
+            rpn_head_num_classes = rpn_head_.get('num_classes', None)
+            if rpn_head_num_classes is None:
+                rpn_head_.update(num_classes=1)
+            else:
+                if rpn_head_num_classes != 1:
+                    warnings.warn(
+                        'The `num_classes` should be 1 in RPN, but got '
+                        f'{rpn_head_num_classes}, please set '
+                        'rpn_head.num_classes = 1 in your config file.')
+                    rpn_head_.update(num_classes=1)
+            self.rpn_head = MODELS.build(rpn_head_)
+
+        if roi_head is not None:
+            # update train and test cfg here for now
+            # TODO: refactor assigner & sampler
+            rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None
+            roi_head.update(train_cfg=rcnn_train_cfg)
+            roi_head.update(test_cfg=test_cfg.rcnn)
+            self.roi_head = MODELS.build(roi_head)
+
+        self.train_cfg = train_cfg
+        self.test_cfg = test_cfg
+
+    def _load_from_state_dict(self, state_dict: dict, prefix: str,
+                              local_metadata: dict, strict: bool,
+                              missing_keys: Union[List[str], str],
+                              unexpected_keys: Union[List[str], str],
+                              error_msgs: Union[List[str], str]) -> None:
+        """Exchange bbox_head key to rpn_head key when loading single-stage
+        weights into two-stage model."""
+        bbox_head_prefix = prefix + '.bbox_head' if prefix else 'bbox_head'
+        bbox_head_keys = [
+            k for k in state_dict.keys() if k.startswith(bbox_head_prefix)
+        ]
+        rpn_head_prefix = prefix + '.rpn_head' if prefix else 'rpn_head'
+        rpn_head_keys = [
+            k for k in state_dict.keys() if k.startswith(rpn_head_prefix)
+        ]
+        if len(bbox_head_keys) != 0 and len(rpn_head_keys) == 0:
+            for bbox_head_key in bbox_head_keys:
+                rpn_head_key = rpn_head_prefix + \
+                    bbox_head_key[len(bbox_head_prefix):]
+                state_dict[rpn_head_key] = state_dict.pop(bbox_head_key)
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, missing_keys, unexpected_keys,
+                                      error_msgs)
+
+    @property
+    def with_rpn(self) -> bool:
+        """bool: whether the detector has RPN"""
+        return hasattr(self, 'rpn_head') and self.rpn_head is not None
+
+    @property
+    def with_roi_head(self) -> bool:
+        """bool: whether the detector has a RoI head"""
+        return hasattr(self, 'roi_head') and self.roi_head is not None
+
+    def extract_feat(self, batch_inputs: Tensor) -> Tuple[Tensor]:
+        """Extract features.
+
+        Args:
+            batch_inputs (Tensor): Image tensor with shape (N, C, H, W).
+
+        Returns:
+            tuple[Tensor]: Multi-level features that may have
+            different resolutions.
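+
+        Examples:
+            >>> # Illustrative sketch: ``detector`` and the input shape
+            >>> # are assumptions.
+            >>> import torch
+            >>> feats = detector.extract_feat(torch.rand(1, 3, 800, 800))
+            >>> # one tensor per pyramid level when an FPN neck is used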
+ """ + x = self.backbone(batch_inputs) + if self.with_neck: + x = self.neck(x) + return x + + def _forward(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns: + tuple: A tuple of features from ``rpn_head`` and ``roi_head`` + forward. + """ + results = () + x = self.extract_feat(batch_inputs) + + if self.with_rpn: + rpn_results_list = self.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + assert batch_data_samples[0].get('proposals', None) is not None + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + roi_outs = self.roi_head.forward(x, rpn_results_list, + batch_data_samples) + results = results + (roi_outs, ) + return results + + def loss(self, batch_inputs: Tensor, + batch_data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + batch_inputs (Tensor): Input images of shape (N, C, H, W). + These should usually be mean centered and std scaled. + batch_data_samples (List[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: A dictionary of loss components + """ + x = self.extract_feat(batch_inputs) + + losses = dict() + + # RPN forward and loss + if self.with_rpn: + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + rpn_data_samples = copy.deepcopy(batch_data_samples) + # set cat_id of gt_labels to 0 in RPN + for data_sample in rpn_data_samples: + data_sample.gt_instances.labels = \ + torch.zeros_like(data_sample.gt_instances.labels) + + rpn_losses, rpn_results_list = self.rpn_head.loss_and_predict( + x, rpn_data_samples, proposal_cfg=proposal_cfg) + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in list(keys): + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + else: + assert batch_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + roi_losses = self.roi_head.loss(x, rpn_results_list, + batch_data_samples) + losses.update(roi_losses) + + return losses + + def predict(self, + batch_inputs: Tensor, + batch_data_samples: SampleList, + rescale: bool = True) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + batch_inputs (Tensor): Inputs with shape (N, C, H, W). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results. + Defaults to True. + + Returns: + list[:obj:`DetDataSample`]: Return the detection results of the + input images. The returns value is DetDataSample, + which usually contain 'pred_instances'. And the + ``pred_instances`` usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). 
+ - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + + assert self.with_bbox, 'Bbox head must be implemented.' + x = self.extract_feat(batch_inputs) + + # If there are no pre-defined proposals, use RPN to get proposals + if batch_data_samples[0].get('proposals', None) is None: + rpn_results_list = self.rpn_head.predict( + x, batch_data_samples, rescale=False) + else: + rpn_results_list = [ + data_sample.proposals for data_sample in batch_data_samples + ] + + results_list = self.roi_head.predict( + x, rpn_results_list, batch_data_samples, rescale=rescale) + + batch_data_samples = self.add_pred_to_datasample( + batch_data_samples, results_list) + return batch_data_samples diff --git a/head_extractor/build/lib/mmdet/models/detectors/vfnet.py b/head_extractor/build/lib/mmdet/models/detectors/vfnet.py new file mode 100644 index 0000000000000000000000000000000000000000..a695513faa7d37756d7716cbca0e457060400518 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/vfnet.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class VFNet(SingleStageDetector): + """Implementation of `VarifocalNet + (VFNet).`_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone module. + neck (:obj:`ConfigDict` or dict): The neck module. + bbox_head (:obj:`ConfigDict` or dict): The bbox head module. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of VFNet. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of VFNet. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/yolact.py b/head_extractor/build/lib/mmdet/models/detectors/yolact.py new file mode 100644 index 0000000000000000000000000000000000000000..f15fb7b70263b0c4018751067771b1365af96f67 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/yolact.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage_instance_seg import SingleStageInstanceSegmentor
+
+
+@MODELS.register_module()
+class YOLACT(SingleStageInstanceSegmentor):
+    """Implementation of `YOLACT <https://arxiv.org/abs/1904.02689>`_"""
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 mask_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/head_extractor/build/lib/mmdet/models/detectors/yolo.py b/head_extractor/build/lib/mmdet/models/detectors/yolo.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cb9a9cd250a2c26af22032b1ed4bb5a7a8af605
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/detectors/yolo.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) 2019 Western Digital Corporation or its affiliates.
+
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class YOLOV3(SingleStageDetector):
+    r"""Implementation of `Yolov3: An incremental improvement
+    <https://arxiv.org/abs/1804.02767>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+        train_cfg (:obj:`ConfigDict` or dict, optional): The training config
+            of YOLOv3. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, optional): The testing config
+            of YOLOv3. Defaults to None.
+        data_preprocessor (:obj:`ConfigDict` or dict, optional):
+            Model preprocessing config for processing the input data.
+            It usually includes ``to_rgb``, ``pad_size_divisor``,
+            ``pad_value``, ``mean`` and ``std``. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict, optional): the config to control
+            the initialization. Defaults to None.
+    """
+
+    def __init__(self,
+                 backbone: ConfigType,
+                 neck: ConfigType,
+                 bbox_head: ConfigType,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 data_preprocessor: OptConfigType = None,
+                 init_cfg: OptMultiConfig = None) -> None:
+        super().__init__(
+            backbone=backbone,
+            neck=neck,
+            bbox_head=bbox_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            data_preprocessor=data_preprocessor,
+            init_cfg=init_cfg)
diff --git a/head_extractor/build/lib/mmdet/models/detectors/yolof.py b/head_extractor/build/lib/mmdet/models/detectors/yolof.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6d98b9134a7f422fa7ea1f1a1e0d548d36603e8
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/detectors/yolof.py
@@ -0,0 +1,43 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.registry import MODELS
+from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
+from .single_stage import SingleStageDetector
+
+
+@MODELS.register_module()
+class YOLOF(SingleStageDetector):
+    r"""Implementation of `You Only Look One-level Feature
+    <https://arxiv.org/abs/2103.09460>`_
+
+    Args:
+        backbone (:obj:`ConfigDict` or dict): The backbone module.
+        neck (:obj:`ConfigDict` or dict): The neck module.
+        bbox_head (:obj:`ConfigDict` or dict): The bbox head module.
+ train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLOF. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLOF. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): + Model preprocessing config for processing the input data. + it usually includes ``to_rgb``, ``pad_size_divisor``, + ``pad_value``, ``mean`` and ``std``. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/detectors/yolox.py b/head_extractor/build/lib/mmdet/models/detectors/yolox.py new file mode 100644 index 0000000000000000000000000000000000000000..df9190c93f7b043910fbce3bd5ee8dc0ef7b5f68 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/detectors/yolox.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .single_stage import SingleStageDetector + + +@MODELS.register_module() +class YOLOX(SingleStageDetector): + r"""Implementation of `YOLOX: Exceeding YOLO Series in 2021 + `_ + + Args: + backbone (:obj:`ConfigDict` or dict): The backbone config. + neck (:obj:`ConfigDict` or dict): The neck config. + bbox_head (:obj:`ConfigDict` or dict): The bbox head config. + train_cfg (:obj:`ConfigDict` or dict, optional): The training config + of YOLOX. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): The testing config + of YOLOX. Defaults to None. + data_preprocessor (:obj:`ConfigDict` or dict, optional): Config of + :class:`DetDataPreprocessor` to process the input data. + Defaults to None. + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + backbone: ConfigType, + neck: ConfigType, + bbox_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + init_cfg=init_cfg) diff --git a/head_extractor/build/lib/mmdet/models/language_models/__init__.py b/head_extractor/build/lib/mmdet/models/language_models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..70f1a22c7c01624ba3235f1737f8aea1e26a19fe --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/language_models/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
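+# NOTE: a sketch of typical usage (the forward signature comes from
+# ``BertModel`` in bert.py below; the caption and config values are
+# illustrative):
+#
+#     from mmdet.registry import MODELS
+#     lang_model = MODELS.build(
+#         dict(type='BertModel', name='bert-base-uncased'))
+#     feats = lang_model(['a photo of a cat.'])  # dict of language features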
+from .bert import BertModel
+
+__all__ = ['BertModel']
diff --git a/head_extractor/build/lib/mmdet/models/language_models/bert.py b/head_extractor/build/lib/mmdet/models/language_models/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..efb0f46bad6eb0734a324c32a7b05f2795604265
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/language_models/bert.py
@@ -0,0 +1,231 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from typing import Sequence
+
+import torch
+from mmengine.model import BaseModel
+from torch import nn
+
+try:
+    from transformers import AutoTokenizer, BertConfig
+    from transformers import BertModel as HFBertModel
+except ImportError:
+    AutoTokenizer = None
+    HFBertModel = None
+
+from mmdet.registry import MODELS
+
+
+def generate_masks_with_special_tokens_and_transfer_map(
+        tokenized, special_tokens_list):
+    """Generate attention mask between each pair of special tokens.
+
+    Only token pairs in between two special tokens are attended to
+    and thus the attention mask for these pairs is positive.
+
+    Args:
+        tokenized (dict): The tokenized inputs. Its 'input_ids' entry
+            has shape [bs, num_token].
+        special_tokens_list (list): A list of special token ids.
+
+    Returns:
+        Tuple(Tensor, Tensor):
+        - attention_mask is the attention mask between tokens. Only
+          token pairs in between two special tokens are positive.
+          Shape: [bs, num_token, num_token].
+        - position_ids is the position id of tokens within each valid
+          sentence. The id starts from 0 whenever a special token is
+          encountered. Shape: [bs, num_token]
+    """
+    input_ids = tokenized['input_ids']
+    bs, num_token = input_ids.shape
+    # special_tokens_mask:
+    # bs, num_token. 1 for special tokens. 0 for normal tokens
+    special_tokens_mask = torch.zeros((bs, num_token),
+                                      device=input_ids.device).bool()
+
+    for special_token in special_tokens_list:
+        special_tokens_mask |= input_ids == special_token
+
+    # idxs: each row is a list of indices of special tokens
+    idxs = torch.nonzero(special_tokens_mask)
+
+    # generate attention mask and positional ids
+    attention_mask = (
+        torch.eye(num_token,
+                  device=input_ids.device).bool().unsqueeze(0).repeat(
+                      bs, 1, 1))
+    position_ids = torch.zeros((bs, num_token), device=input_ids.device)
+    previous_col = 0
+    for i in range(idxs.shape[0]):
+        row, col = idxs[i]
+        if (col == 0) or (col == num_token - 1):
+            attention_mask[row, col, col] = True
+            position_ids[row, col] = 0
+        else:
+            attention_mask[row, previous_col + 1:col + 1,
+                           previous_col + 1:col + 1] = True
+            position_ids[row, previous_col + 1:col + 1] = torch.arange(
+                0, col - previous_col, device=input_ids.device)
+        previous_col = col
+
+    return attention_mask, position_ids.to(torch.long)
+
+
+@MODELS.register_module()
+class BertModel(BaseModel):
+    """BERT model for language embedding (encoder only).
+
+    Args:
+        name (str, optional): name of the pretrained BERT model from
+            HuggingFace. Defaults to bert-base-uncased.
+        max_tokens (int, optional): maximum number of tokens to be
+            used for BERT. Defaults to 256.
+        pad_to_max (bool, optional): whether to pad the tokens to max_tokens.
+            Defaults to True.
+        use_sub_sentence_represent (bool, optional): whether to use the
+            sub-sentence representation introduced in `Grounding DINO
+            <https://arxiv.org/abs/2303.05499>`_. Defaults to False.
+        special_tokens_list (list, optional): special tokens used to split
+            subsentence. It cannot be None when `use_sub_sentence_represent`
+            is True. Defaults to None.
+        add_pooling_layer (bool, optional): whether to add a pooling
+            layer in the bert encoder.
Defaults to False. + num_layers_of_embedded (int, optional): number of layers of + the embedded model. Defaults to 1. + use_checkpoint (bool, optional): whether to use gradient checkpointing. + Defaults to False. + """ + + def __init__(self, + name: str = 'bert-base-uncased', + max_tokens: int = 256, + pad_to_max: bool = True, + use_sub_sentence_represent: bool = False, + special_tokens_list: list = None, + add_pooling_layer: bool = False, + num_layers_of_embedded: int = 1, + use_checkpoint: bool = False, + **kwargs) -> None: + + super().__init__(**kwargs) + self.max_tokens = max_tokens + self.pad_to_max = pad_to_max + + if AutoTokenizer is None: + raise RuntimeError( + 'transformers is not installed, please install it by: ' + 'pip install transformers.') + + self.tokenizer = AutoTokenizer.from_pretrained(name) + self.language_backbone = nn.Sequential( + OrderedDict([('body', + BertEncoder( + name, + add_pooling_layer=add_pooling_layer, + num_layers_of_embedded=num_layers_of_embedded, + use_checkpoint=use_checkpoint))])) + + self.use_sub_sentence_represent = use_sub_sentence_represent + if self.use_sub_sentence_represent: + assert special_tokens_list is not None, \ + 'special_tokens should not be None \ + if use_sub_sentence_represent is True' + + self.special_tokens = self.tokenizer.convert_tokens_to_ids( + special_tokens_list) + + def forward(self, captions: Sequence[str], **kwargs) -> dict: + """Forward function.""" + device = next(self.language_backbone.parameters()).device + tokenized = self.tokenizer.batch_encode_plus( + captions, + max_length=self.max_tokens, + padding='max_length' if self.pad_to_max else 'longest', + return_special_tokens_mask=True, + return_tensors='pt', + truncation=True).to(device) + input_ids = tokenized.input_ids + if self.use_sub_sentence_represent: + attention_mask, position_ids = \ + generate_masks_with_special_tokens_and_transfer_map( + tokenized, self.special_tokens) + token_type_ids = tokenized['token_type_ids'] + + else: + attention_mask = tokenized.attention_mask + position_ids = None + token_type_ids = None + + tokenizer_input = { + 'input_ids': input_ids, + 'attention_mask': attention_mask, + 'position_ids': position_ids, + 'token_type_ids': token_type_ids + } + language_dict_features = self.language_backbone(tokenizer_input) + if self.use_sub_sentence_represent: + language_dict_features['position_ids'] = position_ids + language_dict_features[ + 'text_token_mask'] = tokenized.attention_mask.bool() + return language_dict_features + + +class BertEncoder(nn.Module): + """BERT encoder for language embedding. + + Args: + name (str): name of the pretrained BERT model from HuggingFace. + Defaults to bert-base-uncased. + add_pooling_layer (bool): whether to add a pooling layer. + num_layers_of_embedded (int): number of layers of the embedded model. + Defaults to 1. + use_checkpoint (bool): whether to use gradient checkpointing. + Defaults to False. 
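+
+    Examples:
+        >>> # Illustrative sketch: the input dict mirrors what
+        >>> # ``BertModel.forward`` assembles; ``encoder`` and the
+        >>> # tensors are assumed to exist.
+        >>> x = {'input_ids': input_ids, 'attention_mask': mask,
+        ...      'position_ids': None, 'token_type_ids': None}
+        >>> out = encoder(x)
+        >>> sorted(out.keys())
+        ['embedded', 'hidden', 'masks']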
+ """ + + def __init__(self, + name: str, + add_pooling_layer: bool = False, + num_layers_of_embedded: int = 1, + use_checkpoint: bool = False): + super().__init__() + if BertConfig is None: + raise RuntimeError( + 'transformers is not installed, please install it by: ' + 'pip install transformers.') + config = BertConfig.from_pretrained(name) + config.gradient_checkpointing = use_checkpoint + # only encoder + self.model = HFBertModel.from_pretrained( + name, add_pooling_layer=add_pooling_layer, config=config) + self.language_dim = config.hidden_size + self.num_layers_of_embedded = num_layers_of_embedded + + def forward(self, x) -> dict: + mask = x['attention_mask'] + + outputs = self.model( + input_ids=x['input_ids'], + attention_mask=mask, + position_ids=x['position_ids'], + token_type_ids=x['token_type_ids'], + output_hidden_states=True, + ) + + # outputs has 13 layers, 1 input layer and 12 hidden layers + encoded_layers = outputs.hidden_states[1:] + features = torch.stack(encoded_layers[-self.num_layers_of_embedded:], + 1).mean(1) + # language embedding has shape [len(phrase), seq_len, language_dim] + features = features / self.num_layers_of_embedded + if mask.dim() == 2: + embedded = features * mask.unsqueeze(-1).float() + else: + embedded = features + + results = { + 'embedded': embedded, + 'masks': mask, + 'hidden': encoded_layers[-1] + } + return results diff --git a/head_extractor/build/lib/mmdet/models/layers/__init__.py b/head_extractor/build/lib/mmdet/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3c41f64d11bbdb7f2c8e128a2e28b2845159589 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/__init__.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .activations import SiLU +from .bbox_nms import fast_nms, multiclass_nms +from .brick_wrappers import (AdaptiveAvgPool2d, FrozenBatchNorm2d, + adaptive_avg_pool2d) +from .conv_upsample import ConvUpsample +from .csp_layer import CSPLayer +from .dropblock import DropBlock +from .ema import ExpMomentumEMA +from .inverted_residual import InvertedResidual +from .matrix_nms import mask_matrix_nms +from .msdeformattn_pixel_decoder import MSDeformAttnPixelDecoder +from .normed_predictor import NormedConv2d, NormedLinear +from .pixel_decoder import PixelDecoder, TransformerEncoderPixelDecoder +from .positional_encoding import (LearnedPositionalEncoding, + SinePositionalEncoding, + SinePositionalEncoding3D) +from .res_layer import ResLayer, SimplifiedBasicBlock +from .se_layer import ChannelAttention, DyReLU, SELayer +# yapf: disable +from .transformer import (MLP, AdaptivePadding, CdnQueryGenerator, + ConditionalAttention, + ConditionalDetrTransformerDecoder, + ConditionalDetrTransformerDecoderLayer, + DABDetrTransformerDecoder, + DABDetrTransformerDecoderLayer, + DABDetrTransformerEncoder, DDQTransformerDecoder, + DeformableDetrTransformerDecoder, + DeformableDetrTransformerDecoderLayer, + DeformableDetrTransformerEncoder, + DeformableDetrTransformerEncoderLayer, + DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer, + DinoTransformerDecoder, DynamicConv, + Mask2FormerTransformerDecoder, + Mask2FormerTransformerDecoderLayer, + Mask2FormerTransformerEncoder, PatchEmbed, + PatchMerging, coordinate_to_encoding, + inverse_sigmoid, nchw_to_nlc, nlc_to_nchw) + +# yapf: enable + +__all__ = [ + 'fast_nms', 'multiclass_nms', 'mask_matrix_nms', 'DropBlock', + 'PixelDecoder', 'TransformerEncoderPixelDecoder', + 
'MSDeformAttnPixelDecoder', 'ResLayer', 'PatchMerging', + 'SinePositionalEncoding', 'LearnedPositionalEncoding', 'DynamicConv', + 'SimplifiedBasicBlock', 'NormedLinear', 'NormedConv2d', 'InvertedResidual', + 'SELayer', 'ConvUpsample', 'CSPLayer', 'adaptive_avg_pool2d', + 'AdaptiveAvgPool2d', 'PatchEmbed', 'nchw_to_nlc', 'nlc_to_nchw', 'DyReLU', + 'ExpMomentumEMA', 'inverse_sigmoid', 'ChannelAttention', 'SiLU', 'MLP', + 'DetrTransformerEncoderLayer', 'DetrTransformerDecoderLayer', + 'DetrTransformerEncoder', 'DetrTransformerDecoder', + 'DeformableDetrTransformerEncoder', 'DeformableDetrTransformerDecoder', + 'DeformableDetrTransformerEncoderLayer', + 'DeformableDetrTransformerDecoderLayer', 'AdaptivePadding', + 'coordinate_to_encoding', 'ConditionalAttention', + 'DABDetrTransformerDecoderLayer', 'DABDetrTransformerDecoder', + 'DABDetrTransformerEncoder', 'DDQTransformerDecoder', + 'ConditionalDetrTransformerDecoder', + 'ConditionalDetrTransformerDecoderLayer', 'DinoTransformerDecoder', + 'CdnQueryGenerator', 'Mask2FormerTransformerEncoder', + 'Mask2FormerTransformerDecoderLayer', 'Mask2FormerTransformerDecoder', + 'SinePositionalEncoding3D', 'FrozenBatchNorm2d' +] diff --git a/head_extractor/build/lib/mmdet/models/layers/activations.py b/head_extractor/build/lib/mmdet/models/layers/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..9e73ef42180ccd3dddb4bcca224c0b4eb5da807c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/activations.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.utils import digit_version + +from mmdet.registry import MODELS + +if digit_version(torch.__version__) >= digit_version('1.7.0'): + from torch.nn import SiLU +else: + + class SiLU(nn.Module): + """Sigmoid Weighted Linear Unit.""" + + def __init__(self, inplace=True): + super().__init__() + + def forward(self, inputs) -> torch.Tensor: + return inputs * torch.sigmoid(inputs) + + +MODELS.register_module(module=SiLU, name='SiLU') diff --git a/head_extractor/build/lib/mmdet/models/layers/bbox_nms.py b/head_extractor/build/lib/mmdet/models/layers/bbox_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..fd67a45f60ca98c354e095127ab7dbb9653deca5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/bbox_nms.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch +from mmcv.ops.nms import batched_nms +from torch import Tensor + +from mmdet.structures.bbox import bbox_overlaps +from mmdet.utils import ConfigType + + +def multiclass_nms( + multi_bboxes: Tensor, + multi_scores: Tensor, + score_thr: float, + nms_cfg: ConfigType, + max_num: int = -1, + score_factors: Optional[Tensor] = None, + return_inds: bool = False, + box_dim: int = 4 +) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class+1), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_cfg (Union[:obj:`ConfigDict`, dict]): a dict that contains + the arguments of nms operations. + max_num (int, optional): if there are more than max_num bboxes after + NMS, only top max_num will be kept. Defaults to -1.
+ score_factors (Tensor, optional): The factors multiplied to scores + before applying NMS. Default to None. + return_inds (bool, optional): Whether return the indices of kept + bboxes. Default to False. + box_dim (int): The dimension of boxes. Defaults to 4. + + Returns: + Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + (dets, labels, indices (optional)), tensors of shape (k, 5), + (k), and (k). Dets are boxes with scores. Labels are 0-based. + """ + num_classes = multi_scores.size(1) - 1 + # exclude background category + if multi_bboxes.shape[1] > box_dim: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, box_dim) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, box_dim) + + scores = multi_scores[:, :-1] + + labels = torch.arange(num_classes, dtype=torch.long, device=scores.device) + labels = labels.view(1, -1).expand_as(scores) + + bboxes = bboxes.reshape(-1, box_dim) + scores = scores.reshape(-1) + labels = labels.reshape(-1) + + if not torch.onnx.is_in_onnx_export(): + # NonZero not supported in TensorRT + # remove low scoring boxes + valid_mask = scores > score_thr + # multiply score_factor after threshold to preserve more bboxes, improve + # mAP by 1% for YOLOv3 + if score_factors is not None: + # expand the shape to match original shape of score + score_factors = score_factors.view(-1, 1).expand( + multi_scores.size(0), num_classes) + score_factors = score_factors.reshape(-1) + scores = scores * score_factors + + if not torch.onnx.is_in_onnx_export(): + # NonZero not supported in TensorRT + inds = valid_mask.nonzero(as_tuple=False).squeeze(1) + bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds] + else: + # TensorRT NMS plugin has invalid output filled with -1 + # add dummy data to make detection output correct. + bboxes = torch.cat([bboxes, bboxes.new_zeros(1, box_dim)], dim=0) + scores = torch.cat([scores, scores.new_zeros(1)], dim=0) + labels = torch.cat([labels, labels.new_zeros(1)], dim=0) + + if bboxes.numel() == 0: + if torch.onnx.is_in_onnx_export(): + raise RuntimeError('[ONNX Error] Can not record NMS ' + 'as it has not been executed this time') + dets = torch.cat([bboxes, scores[:, None]], -1) + if return_inds: + return dets, labels, inds + else: + return dets, labels + + dets, keep = batched_nms(bboxes, scores, labels, nms_cfg) + + if max_num > 0: + dets = dets[:max_num] + keep = keep[:max_num] + + if return_inds: + return dets, labels[keep], inds[keep] + else: + return dets, labels[keep] + + +def fast_nms( + multi_bboxes: Tensor, + multi_scores: Tensor, + multi_coeffs: Tensor, + score_thr: float, + iou_thr: float, + top_k: int, + max_num: int = -1 +) -> Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + """Fast NMS in `YOLACT `_. + + Fast NMS allows already-removed detections to suppress other detections so + that every instance can be decided to be kept or discarded in parallel, + which is not possible in traditional NMS. This relaxation allows us to + implement Fast NMS entirely in standard GPU-accelerated matrix operations. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class+1), where the last column + contains scores of the background class, but this will be ignored. + multi_coeffs (Tensor): shape (n, #class*coeffs_dim). + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + iou_thr (float): IoU threshold to be considered as conflicted. 
+ top_k (int): if there are more than top_k bboxes before NMS, + only top top_k will be kept. + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. If -1, keep all the bboxes. + Default: -1. + + Returns: + Union[Tuple[Tensor, Tensor, Tensor], Tuple[Tensor, Tensor]]: + (dets, labels, coefficients), tensors of shape (k, 5), (k, 1), + and (k, coeffs_dim). Dets are boxes with scores. + Labels are 0-based. + """ + + scores = multi_scores[:, :-1].t() # [#class, n] + scores, idx = scores.sort(1, descending=True) + + idx = idx[:, :top_k].contiguous() + scores = scores[:, :top_k] # [#class, topk] + num_classes, num_dets = idx.size() + boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4) + coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1) + + iou = bbox_overlaps(boxes, boxes) # [#class, topk, topk] + iou.triu_(diagonal=1) + iou_max, _ = iou.max(dim=1) + + # Now just filter out the ones higher than the threshold + keep = iou_max <= iou_thr + + # Second thresholding introduces 0.2 mAP gain at negligible time cost + keep *= scores > score_thr + + # Assign each kept detection to its corresponding class + classes = torch.arange( + num_classes, device=boxes.device)[:, None].expand_as(keep) + classes = classes[keep] + + boxes = boxes[keep] + coeffs = coeffs[keep] + scores = scores[keep] + + # Only keep the top max_num highest scores across all classes + scores, idx = scores.sort(0, descending=True) + if max_num > 0: + idx = idx[:max_num] + scores = scores[:max_num] + + classes = classes[idx] + boxes = boxes[idx] + coeffs = coeffs[idx] + + cls_dets = torch.cat([boxes, scores[:, None]], dim=1) + return cls_dets, classes, coeffs diff --git a/head_extractor/build/lib/mmdet/models/layers/brick_wrappers.py b/head_extractor/build/lib/mmdet/models/layers/brick_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..5ecb8499de329132561dfedb8f55c36080787b31 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/brick_wrappers.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn.bricks.wrappers import NewEmptyTensorOp, obsolete_torch_version + +from mmdet.registry import MODELS + +if torch.__version__ == 'parrots': + TORCH_VERSION = torch.__version__ +else: + # torch.__version__ could be 1.3.1+cu92, we only need the first two + # for comparison + TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) + + +def adaptive_avg_pool2d(input, output_size): + """Handle empty batch dimension to adaptive_avg_pool2d. + + Args: + input (tensor): 4D tensor. + output_size (int, tuple[int,int]): the target output size. 
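Example (a minimal sketch; on recent PyTorch this simply dispatches to
F.adaptive_avg_pool2d, which handles empty tensors natively, while the
NewEmptyTensorOp branch above covers older releases):

    import torch

    x = torch.randn(0, 256, 7, 7)  # empty batch, e.g. from zero RoIs
    y = adaptive_avg_pool2d(x, output_size=1)
    assert y.shape == (0, 256, 1, 1)  # shape preserved, no crash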
+ """ + if input.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + if isinstance(output_size, int): + output_size = [output_size, output_size] + output_size = [*input.shape[:2], *output_size] + empty = NewEmptyTensorOp.apply(input, output_size) + return empty + else: + return F.adaptive_avg_pool2d(input, output_size) + + +class AdaptiveAvgPool2d(nn.AdaptiveAvgPool2d): + """Handle empty batch dimension to AdaptiveAvgPool2d.""" + + def forward(self, x): + # PyTorch 1.9 does not support empty tensor inference yet + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + output_size = self.output_size + if isinstance(output_size, int): + output_size = [output_size, output_size] + else: + output_size = [ + v if v is not None else d + for v, d in zip(output_size, + x.size()[-2:]) + ] + output_size = [*x.shape[:2], *output_size] + empty = NewEmptyTensorOp.apply(x, output_size) + return empty + + return super().forward(x) + + +# Modified from +# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py#L13 # noqa +@MODELS.register_module('FrozenBN') +class FrozenBatchNorm2d(nn.Module): + """BatchNorm2d where the batch statistics and the affine parameters are + fixed. + + It contains non-trainable buffers called + "weight" and "bias", "running_mean", "running_var", + initialized to perform identity transformation. + Args: + num_features (int): :math:`C` from an expected input of size + :math:`(N, C, H, W)`. + eps (float): a value added to the denominator for numerical stability. + Default: 1e-5 + """ + + def __init__(self, num_features, eps=1e-5, **kwargs): + super().__init__() + self.num_features = num_features + self.eps = eps + self.register_buffer('weight', torch.ones(num_features)) + self.register_buffer('bias', torch.zeros(num_features)) + self.register_buffer('running_mean', torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features) - eps) + + def forward(self, x): + if x.requires_grad: + # When gradients are needed, F.batch_norm will use extra memory + # because its backward op computes gradients for weight/bias + # as well. + scale = self.weight * (self.running_var + self.eps).rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + out_dtype = x.dtype # may be half + return x * scale.to(out_dtype) + bias.to(out_dtype) + else: + # When gradients are not needed, F.batch_norm is a single fused op + # and provide more optimization opportunities. + return F.batch_norm( + x, + self.running_mean, + self.running_var, + self.weight, + self.bias, + training=False, + eps=self.eps, + ) + + def __repr__(self): + return 'FrozenBatchNorm2d(num_features={}, eps={})'.format( + self.num_features, self.eps) + + @classmethod + def convert_frozen_batchnorm(cls, module): + """Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm. + + Args: + module (torch.nn.Module): + Returns: + If module is BatchNorm/SyncBatchNorm, returns a new module. + Otherwise, in-place convert module and return it. 
+ Similar to convert_sync_batchnorm in + https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py + """ + bn_module = nn.modules.batchnorm + bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm) + res = module + if isinstance(module, bn_module): + res = cls(module.num_features) + if module.affine: + res.weight.data = module.weight.data.clone().detach() + res.bias.data = module.bias.data.clone().detach() + res.running_mean.data = module.running_mean.data + res.running_var.data = module.running_var.data + res.eps = module.eps + else: + for name, child in module.named_children(): + new_child = cls.convert_frozen_batchnorm(child) + if new_child is not child: + res.add_module(name, new_child) + return res diff --git a/head_extractor/build/lib/mmdet/models/layers/conv_upsample.py b/head_extractor/build/lib/mmdet/models/layers/conv_upsample.py new file mode 100644 index 0000000000000000000000000000000000000000..32505875a2162330ed7d00455f088d08d94f679e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/conv_upsample.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList + + +class ConvUpsample(BaseModule): + """ConvUpsample performs 2x upsampling after Conv. + + There are several `ConvModule` layers. In the first few layers, upsampling + will be applied after each layer of convolution. The number of upsampling + must be no more than the number of ConvModule layers. + + Args: + in_channels (int): Number of channels in the input feature map. + inner_channels (int): Number of channels produced by the convolution. + num_layers (int): Number of convolution layers. + num_upsample (int | optional): Number of upsampling layer. Must be no + more than num_layers. Upsampling will be applied after the first + ``num_upsample`` layers of convolution. Default: ``num_layers``. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + init_cfg (dict): Config dict for initialization. Default: None. + kwargs (key word augments): Other augments used in ConvModule. 
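Example (a minimal sketch): three convolution layers with upsampling
after the first two, so the spatial size grows 4x overall:

    import torch

    up = ConvUpsample(in_channels=256, inner_channels=128,
                      num_layers=3, num_upsample=2)
    x = torch.randn(1, 256, 16, 16)
    # 16 -> 32 -> 64; the third conv runs without further upsampling.
    assert up(x).shape == (1, 128, 64, 64)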
+ """ + + def __init__(self, + in_channels, + inner_channels, + num_layers=1, + num_upsample=None, + conv_cfg=None, + norm_cfg=None, + init_cfg=None, + **kwargs): + super(ConvUpsample, self).__init__(init_cfg) + if num_upsample is None: + num_upsample = num_layers + assert num_upsample <= num_layers, \ + f'num_upsample({num_upsample})must be no more than ' \ + f'num_layers({num_layers})' + self.num_layers = num_layers + self.num_upsample = num_upsample + self.conv = ModuleList() + for i in range(num_layers): + self.conv.append( + ConvModule( + in_channels, + inner_channels, + 3, + padding=1, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + in_channels = inner_channels + + def forward(self, x): + num_upsample = self.num_upsample + for i in range(self.num_layers): + x = self.conv[i](x) + if num_upsample > 0: + num_upsample -= 1 + x = F.interpolate( + x, scale_factor=2, mode='bilinear', align_corners=False) + return x diff --git a/head_extractor/build/lib/mmdet/models/layers/csp_layer.py b/head_extractor/build/lib/mmdet/models/layers/csp_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..c8b547b8994862bfe14739033bb6b254ef886f29 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/csp_layer.py @@ -0,0 +1,246 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from .se_layer import ChannelAttention + + +class DarknetBottleneck(BaseModule): + """The basic bottleneck block used in Darknet. + + Each ResBlock consists of two ConvModules and the input is added to the + final output. Each ConvModule is composed of Conv, BN, and LeakyReLU. + The first convLayer has filter size of 1x1 and the second one has the + filter size of 3x3. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): The kernel size of the convolution. + Defaults to 0.5. + add_identity (bool): Whether to add identity to the out. + Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + conv_cfg (dict): Config dict for convolution layer. Defaults to None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish'). 
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + add_identity: bool = True, + use_depthwise: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.conv1 = ConvModule( + in_channels, + hidden_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = conv( + hidden_channels, + out_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPNeXtBlock(BaseModule): + """The basic bottleneck block used in CSPNeXt. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + expansion (float): Expand ratio of the hidden channel. Defaults to 0.5. + add_identity (bool): Whether to add identity to the out. Only works + when in_channels == out_channels. Defaults to True. + use_depthwise (bool): Whether to use depthwise separable convolution. + Defaults to False. + kernel_size (int): The kernel size of the second convolution layer. + Defaults to 5. + conv_cfg (dict): Config dict for convolution layer. Defaults to None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN', momentum=0.03, eps=0.001). + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='SiLU'). + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + expansion: float = 0.5, + add_identity: bool = True, + use_depthwise: bool = False, + kernel_size: int = 5, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='SiLU'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + hidden_channels = int(out_channels * expansion) + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + self.conv1 = conv( + in_channels, + hidden_channels, + 3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = DepthwiseSeparableConvModule( + hidden_channels, + out_channels, + kernel_size, + stride=1, + padding=kernel_size // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.add_identity = \ + add_identity and in_channels == out_channels + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + out = self.conv1(x) + out = self.conv2(out) + + if self.add_identity: + return out + identity + else: + return out + + +class CSPLayer(BaseModule): + """Cross Stage Partial Layer. + + Args: + in_channels (int): The input channels of the CSP layer. + out_channels (int): The output channels of the CSP layer. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Defaults to 0.5. + num_blocks (int): Number of blocks. 
Defaults to 1. + add_identity (bool): Whether to add identity in blocks. + Defaults to True. + use_cspnext_block (bool): Whether to use CSPNeXt block. + Defaults to False. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + channel_attention (bool): Whether to add channel attention in each + stage. Defaults to True. + conv_cfg (dict, optional): Config dict for convolution layer. + Defaults to None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Defaults to dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Defaults to dict(type='Swish') + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + expand_ratio: float = 0.5, + num_blocks: int = 1, + add_identity: bool = True, + use_depthwise: bool = False, + use_cspnext_block: bool = False, + channel_attention: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + block = CSPNeXtBlock if use_cspnext_block else DarknetBottleneck + mid_channels = int(out_channels * expand_ratio) + self.channel_attention = channel_attention + self.main_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.short_conv = ConvModule( + in_channels, + mid_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.final_conv = ConvModule( + 2 * mid_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.blocks = nn.Sequential(*[ + block( + mid_channels, + mid_channels, + 1.0, + add_identity, + use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) for _ in range(num_blocks) + ]) + if channel_attention: + self.attention = ChannelAttention(2 * mid_channels) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + x_short = self.short_conv(x) + + x_main = self.main_conv(x) + x_main = self.blocks(x_main) + + x_final = torch.cat((x_main, x_short), dim=1) + + if self.channel_attention: + x_final = self.attention(x_final) + return self.final_conv(x_final) diff --git a/head_extractor/build/lib/mmdet/models/layers/dropblock.py b/head_extractor/build/lib/mmdet/models/layers/dropblock.py new file mode 100644 index 0000000000000000000000000000000000000000..7938199b761d637afdb1b2c62dbca01d1bf629eb --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/dropblock.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.registry import MODELS + +eps = 1e-6 + + +@MODELS.register_module() +class DropBlock(nn.Module): + """Randomly drop some regions of feature maps. + + Please refer to the method proposed in `DropBlock + `_ for details. + + Args: + drop_prob (float): The probability of dropping each block. + block_size (int): The size of dropped blocks. + warmup_iters (int): The drop probability will linearly increase + from `0` to `drop_prob` during the first `warmup_iters` iterations. + Default: 2000. 
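Example (a minimal sketch): DropBlock only acts in training mode, and
the effective drop probability warms up linearly over `warmup_iters`
forward passes:

    import torch

    drop = DropBlock(drop_prob=0.1, block_size=3, warmup_iters=2)
    x = torch.ones(1, 8, 16, 16)
    drop.train()
    y = drop(x)  # square regions zeroed, the rest rescaled
    drop.eval()
    assert torch.equal(drop(x), x)  # identity at inference time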
+ """ + + def __init__(self, drop_prob, block_size, warmup_iters=2000, **kwargs): + super(DropBlock, self).__init__() + assert block_size % 2 == 1 + assert 0 < drop_prob <= 1 + assert warmup_iters >= 0 + self.drop_prob = drop_prob + self.block_size = block_size + self.warmup_iters = warmup_iters + self.iter_cnt = 0 + + def forward(self, x): + """ + Args: + x (Tensor): Input feature map on which some areas will be randomly + dropped. + + Returns: + Tensor: The tensor after DropBlock layer. + """ + if not self.training: + return x + self.iter_cnt += 1 + N, C, H, W = list(x.shape) + gamma = self._compute_gamma((H, W)) + mask_shape = (N, C, H - self.block_size + 1, W - self.block_size + 1) + mask = torch.bernoulli(torch.full(mask_shape, gamma, device=x.device)) + + mask = F.pad(mask, [self.block_size // 2] * 4, value=0) + mask = F.max_pool2d( + input=mask, + stride=(1, 1), + kernel_size=(self.block_size, self.block_size), + padding=self.block_size // 2) + mask = 1 - mask + x = x * mask * mask.numel() / (eps + mask.sum()) + return x + + def _compute_gamma(self, feat_size): + """Compute the value of gamma according to paper. gamma is the + parameter of bernoulli distribution, which controls the number of + features to drop. + + gamma = (drop_prob * fm_area) / (drop_area * keep_area) + + Args: + feat_size (tuple[int, int]): The height and width of feature map. + + Returns: + float: The value of gamma. + """ + gamma = (self.drop_prob * feat_size[0] * feat_size[1]) + gamma /= ((feat_size[0] - self.block_size + 1) * + (feat_size[1] - self.block_size + 1)) + gamma /= (self.block_size**2) + factor = (1.0 if self.iter_cnt > self.warmup_iters else self.iter_cnt / + self.warmup_iters) + return gamma * factor + + def extra_repr(self): + return (f'drop_prob={self.drop_prob}, block_size={self.block_size}, ' + f'warmup_iters={self.warmup_iters}') diff --git a/head_extractor/build/lib/mmdet/models/layers/ema.py b/head_extractor/build/lib/mmdet/models/layers/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..73a0ca67c2888a0b17476e60b60eaf0b7eba4a6a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/ema.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import torch +import torch.nn as nn +from mmengine.model import ExponentialMovingAverage +from torch import Tensor + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class ExpMomentumEMA(ExponentialMovingAverage): + """Exponential moving average (EMA) with exponential momentum strategy, + which is used in YOLOX. + + Args: + model (nn.Module): The model to be averaged. + momentum (float): The momentum used for updating ema parameter. + Ema's parameter are updated with the formula: + `averaged_param = (1-momentum) * averaged_param + momentum * + source_param`. Defaults to 0.0002. + gamma (int): Use a larger momentum early in training and gradually + annealing to a smaller value to update the ema model smoothly. The + momentum is calculated as + `(1 - momentum) * exp(-(1 + steps) / gamma) + momentum`. + Defaults to 2000. + interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. 
+ """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 2000, + interval=1, + device: Optional[torch.device] = None, + update_buffers: bool = False) -> None: + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int) -> None: + """Compute the moving average of the parameters using the exponential + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = (1 - self.momentum) * math.exp( + -float(1 + steps) / self.gamma) + self.momentum + averaged_param.lerp_(source_param, momentum) diff --git a/head_extractor/build/lib/mmdet/models/layers/inverted_residual.py b/head_extractor/build/lib/mmdet/models/layers/inverted_residual.py new file mode 100644 index 0000000000000000000000000000000000000000..a174ccc8835a1ee720f9cdaa7c5be210f5be8113 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/inverted_residual.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule + +from .se_layer import SELayer + + +class InvertedResidual(BaseModule): + """Inverted Residual Block. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + stride (int): The stride of the depthwise convolution. Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. + Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + drop_path_rate (float): stochastic depth rate. Defaults to 0. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + drop_path_rate=0., + with_cp=False, + init_cfg=None): + super(InvertedResidual, self).__init__(init_cfg) + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' 
+ self.with_cp = with_cp + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=mid_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + self.drop_path(out) + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/head_extractor/build/lib/mmdet/models/layers/matrix_nms.py b/head_extractor/build/lib/mmdet/models/layers/matrix_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..9dc8c4f74e28127fb69ccc684f0bdb2bd3943b20 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/matrix_nms.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def mask_matrix_nms(masks, + labels, + scores, + filter_thr=-1, + nms_pre=-1, + max_num=-1, + kernel='gaussian', + sigma=2.0, + mask_area=None): + """Matrix NMS for multi-class masks. + + Args: + masks (Tensor): Has shape (num_instances, h, w) + labels (Tensor): Labels of corresponding masks, + has shape (num_instances,). + scores (Tensor): Mask scores of corresponding masks, + has shape (num_instances). + filter_thr (float): Score threshold to filter the masks + after matrix nms. Default: -1, which means do not + use filter_thr. + nms_pre (int): The max number of instances to do the matrix nms. + Default: -1, which means do not use nms_pre. + max_num (int, optional): If there are more than max_num masks after + matrix, only top max_num will be kept. Default: -1, which means + do not use max_num. + kernel (str): 'linear' or 'gaussian'. + sigma (float): std in gaussian method. + mask_area (Tensor): The sum of seg_masks. + + Returns: + tuple(Tensor): Processed mask results. + + - scores (Tensor): Updated scores, has shape (n,). + - labels (Tensor): Remained labels, has shape (n,). + - masks (Tensor): Remained masks, has shape (n, w, h). + - keep_inds (Tensor): The indices number of + the remaining mask in the input mask, has shape (n,). 
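Example (a minimal sketch): two same-class masks with IoU 0.75; with
the gaussian kernel the lower-scoring one keeps roughly
exp(-sigma * iou**2) of its score instead of being hard-suppressed:

    import torch

    masks = torch.zeros(2, 8, 8, dtype=torch.bool)
    masks[0, :4, :4] = True  # area 16
    masks[1, :4, :3] = True  # area 12, fully inside mask 0
    labels = torch.tensor([0, 0])
    scores = torch.tensor([0.9, 0.8])
    scores, labels, masks, keep_inds = mask_matrix_nms(
        masks, labels, scores, kernel='gaussian', sigma=2.0)
    # scores[1] decays to about 0.8 * exp(-2 * 0.75**2) ~= 0.26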
+ """ + assert len(labels) == len(masks) == len(scores) + if len(labels) == 0: + return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros( + 0, *masks.shape[-2:]), labels.new_zeros(0) + if mask_area is None: + mask_area = masks.sum((1, 2)).float() + else: + assert len(masks) == len(mask_area) + + # sort and keep top nms_pre + scores, sort_inds = torch.sort(scores, descending=True) + + keep_inds = sort_inds + if nms_pre > 0 and len(sort_inds) > nms_pre: + sort_inds = sort_inds[:nms_pre] + keep_inds = keep_inds[:nms_pre] + scores = scores[:nms_pre] + masks = masks[sort_inds] + mask_area = mask_area[sort_inds] + labels = labels[sort_inds] + + num_masks = len(labels) + flatten_masks = masks.reshape(num_masks, -1).float() + # inter. + inter_matrix = torch.mm(flatten_masks, flatten_masks.transpose(1, 0)) + expanded_mask_area = mask_area.expand(num_masks, num_masks) + # Upper triangle iou matrix. + iou_matrix = (inter_matrix / + (expanded_mask_area + expanded_mask_area.transpose(1, 0) - + inter_matrix)).triu(diagonal=1) + # label_specific matrix. + expanded_labels = labels.expand(num_masks, num_masks) + # Upper triangle label matrix. + label_matrix = (expanded_labels == expanded_labels.transpose( + 1, 0)).triu(diagonal=1) + + # IoU compensation + compensate_iou, _ = (iou_matrix * label_matrix).max(0) + compensate_iou = compensate_iou.expand(num_masks, + num_masks).transpose(1, 0) + + # IoU decay + decay_iou = iou_matrix * label_matrix + + # Calculate the decay_coefficient + if kernel == 'gaussian': + decay_matrix = torch.exp(-1 * sigma * (decay_iou**2)) + compensate_matrix = torch.exp(-1 * sigma * (compensate_iou**2)) + decay_coefficient, _ = (decay_matrix / compensate_matrix).min(0) + elif kernel == 'linear': + decay_matrix = (1 - decay_iou) / (1 - compensate_iou) + decay_coefficient, _ = decay_matrix.min(0) + else: + raise NotImplementedError( + f'{kernel} kernel is not supported in matrix nms!') + # update the score. + scores = scores * decay_coefficient + + if filter_thr > 0: + keep = scores >= filter_thr + keep_inds = keep_inds[keep] + if not keep.any(): + return scores.new_zeros(0), labels.new_zeros(0), masks.new_zeros( + 0, *masks.shape[-2:]), labels.new_zeros(0) + masks = masks[keep] + scores = scores[keep] + labels = labels[keep] + + # sort and keep top max_num + scores, sort_inds = torch.sort(scores, descending=True) + keep_inds = keep_inds[sort_inds] + if max_num > 0 and len(sort_inds) > max_num: + sort_inds = sort_inds[:max_num] + keep_inds = keep_inds[:max_num] + scores = scores[:max_num] + masks = masks[sort_inds] + labels = labels[sort_inds] + + return scores, labels, masks, keep_inds diff --git a/head_extractor/build/lib/mmdet/models/layers/msdeformattn_pixel_decoder.py b/head_extractor/build/lib/mmdet/models/layers/msdeformattn_pixel_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a67dc3c4437f83ebe1c82d12b3ed91f429030ce7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/msdeformattn_pixel_decoder.py @@ -0,0 +1,246 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, ConvModule +from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention +from mmengine.model import (BaseModule, ModuleList, caffe2_xavier_init, + normal_init, xavier_init) +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from ..task_modules.prior_generators import MlvlPointGenerator +from .positional_encoding import SinePositionalEncoding +from .transformer import Mask2FormerTransformerEncoder + + +@MODELS.register_module() +class MSDeformAttnPixelDecoder(BaseModule): + """Pixel decoder with multi-scale deformable attention. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + strides (list[int] | tuple[int]): Output strides of feature from + backbone. + feat_channels (int): Number of channels for feature. + out_channels (int): Number of channels for output. + num_outs (int): Number of output scales. + norm_cfg (:obj:`ConfigDict` or dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`ConfigDict` or dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`ConfigDict` or dict): Config for transformer + encoder. Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer encoder position encoding. Defaults to + dict(num_feats=128, normalize=True). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. + """ + + def __init__(self, + in_channels: Union[List[int], + Tuple[int]] = [256, 512, 1024, 2048], + strides: Union[List[int], Tuple[int]] = [4, 8, 16, 32], + feat_channels: int = 256, + out_channels: int = 256, + num_outs: int = 3, + norm_cfg: ConfigType = dict(type='GN', num_groups=32), + act_cfg: ConfigType = dict(type='ReLU'), + encoder: ConfigType = None, + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.strides = strides + self.num_input_levels = len(in_channels) + self.num_encoder_levels = \ + encoder.layer_cfg.self_attn_cfg.num_levels + assert self.num_encoder_levels >= 1, \ + 'num_levels in attn_cfgs must be at least one' + input_conv_list = [] + # from top to down (low to high resolution) + for i in range(self.num_input_levels - 1, + self.num_input_levels - self.num_encoder_levels - 1, + -1): + input_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=None, + bias=True) + input_conv_list.append(input_conv) + self.input_convs = ModuleList(input_conv_list) + + self.encoder = Mask2FormerTransformerEncoder(**encoder) + self.postional_encoding = SinePositionalEncoding(**positional_encoding) + # high resolution to low resolution + self.level_encoding = nn.Embedding(self.num_encoder_levels, + feat_channels) + + # fpn-like structure + self.lateral_convs = ModuleList() + self.output_convs = ModuleList() + self.use_bias = norm_cfg is None + # from top to down (low to high resolution) + # fpn for the rest features that didn't pass in encoder + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, + -1): + lateral_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=None) + output_conv = ConvModule( + 
feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.lateral_convs.append(lateral_conv) + self.output_convs.append(output_conv) + + self.mask_feature = Conv2d( + feat_channels, out_channels, kernel_size=1, stride=1, padding=0) + + self.num_outs = num_outs + self.point_generator = MlvlPointGenerator(strides) + + def init_weights(self) -> None: + """Initialize weights.""" + for i in range(0, self.num_encoder_levels): + xavier_init( + self.input_convs[i].conv, + gain=1, + bias=0, + distribution='uniform') + + for i in range(0, self.num_input_levels - self.num_encoder_levels): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + + normal_init(self.level_encoding, mean=0, std=1) + for p in self.encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + # init_weights defined in MultiScaleDeformableAttention + for m in self.encoder.layers.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + + def forward(self, feats: List[Tensor]) -> Tuple[Tensor, Tensor]: + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + + Returns: + tuple: A tuple containing the following: + + - mask_feature (Tensor): shape (batch_size, c, h, w). + - multi_scale_features (list[Tensor]): Multi scale \ + features, each in shape (batch_size, c, h, w). + """ + # generate padding mask for each level, for each image + batch_size = feats[0].shape[0] + encoder_input_list = [] + padding_mask_list = [] + level_positional_encoding_list = [] + spatial_shapes = [] + reference_points_list = [] + for i in range(self.num_encoder_levels): + level_idx = self.num_input_levels - i - 1 + feat = feats[level_idx] + feat_projected = self.input_convs[i](feat) + feat_hw = torch._shape_as_tensor(feat)[2:].to(feat.device) + + # no padding + padding_mask_resized = feat.new_zeros( + (batch_size, ) + feat.shape[-2:], dtype=torch.bool) + pos_embed = self.postional_encoding(padding_mask_resized) + level_embed = self.level_encoding.weight[i] + level_pos_embed = level_embed.view(1, -1, 1, 1) + pos_embed + # (h_i * w_i, 2) + reference_points = self.point_generator.single_level_grid_priors( + feat.shape[-2:], level_idx, device=feat.device) + # normalize + feat_wh = feat_hw.unsqueeze(0).flip(dims=[0, 1]) + factor = feat_wh * self.strides[level_idx] + reference_points = reference_points / factor + + # shape (batch_size, c, h_i, w_i) -> (h_i * w_i, batch_size, c) + feat_projected = feat_projected.flatten(2).permute(0, 2, 1) + level_pos_embed = level_pos_embed.flatten(2).permute(0, 2, 1) + padding_mask_resized = padding_mask_resized.flatten(1) + + encoder_input_list.append(feat_projected) + padding_mask_list.append(padding_mask_resized) + level_positional_encoding_list.append(level_pos_embed) + spatial_shapes.append(feat_hw) + reference_points_list.append(reference_points) + # shape (batch_size, total_num_queries), + # total_num_queries=sum([., h_i * w_i,.]) + padding_masks = torch.cat(padding_mask_list, dim=1) + # shape (total_num_queries, batch_size, c) + encoder_inputs = torch.cat(encoder_input_list, dim=1) + level_positional_encodings = torch.cat( + level_positional_encoding_list, dim=1) + # shape (num_encoder_levels, 2), from low + # resolution to high resolution + num_queries_per_level = [e[0] * e[1] for e in spatial_shapes] + spatial_shapes = 
torch.cat(spatial_shapes).view(-1, 2) + # shape (0, h_0*w_0, h_0*w_0+h_1*w_1, ...) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + reference_points = torch.cat(reference_points_list, dim=0) + reference_points = reference_points[None, :, None].repeat( + batch_size, 1, self.num_encoder_levels, 1) + valid_radios = reference_points.new_ones( + (batch_size, self.num_encoder_levels, 2)) + # shape (num_total_queries, batch_size, c) + memory = self.encoder( + query=encoder_inputs, + query_pos=level_positional_encodings, + key_padding_mask=padding_masks, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_radios) + # (batch_size, c, num_total_queries) + memory = memory.permute(0, 2, 1) + + # from low resolution to high resolution + outs = torch.split(memory, num_queries_per_level, dim=-1) + outs = [ + x.reshape(batch_size, -1, spatial_shapes[i][0], + spatial_shapes[i][1]) for i, x in enumerate(outs) + ] + + for i in range(self.num_input_levels - self.num_encoder_levels - 1, -1, + -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + F.interpolate( + outs[-1], + size=cur_feat.shape[-2:], + mode='bilinear', + align_corners=False) + y = self.output_convs[i](y) + outs.append(y) + multi_scale_features = outs[:self.num_outs] + + mask_feature = self.mask_feature(outs[-1]) + return mask_feature, multi_scale_features diff --git a/head_extractor/build/lib/mmdet/models/layers/normed_predictor.py b/head_extractor/build/lib/mmdet/models/layers/normed_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..592194b1dbbb8582f4c642bf29135573e1f8c3c8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/normed_predictor.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.utils import digit_version +from torch import Tensor + +from mmdet.registry import MODELS + +MODELS.register_module('Linear', module=nn.Linear) + + +@MODELS.register_module(name='NormedLinear') +class NormedLinear(nn.Linear): + """Normalized Linear Layer. + + Args: + tempeature (float, optional): Tempeature term. Defaults to 20. + power (int, optional): Power term. Defaults to 1.0. + eps (float, optional): The minimal value of divisor to + keep numerical stability. Defaults to 1e-6. + """ + + def __init__(self, + *args, + tempearture: float = 20, + power: int = 1.0, + eps: float = 1e-6, + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.tempearture = tempearture + self.power = power + self.eps = eps + self.init_weights() + + def init_weights(self) -> None: + """Initialize the weights.""" + nn.init.normal_(self.weight, mean=0, std=0.01) + if self.bias is not None: + nn.init.constant_(self.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + """Forward function for `NormedLinear`.""" + weight_ = self.weight / ( + self.weight.norm(dim=1, keepdim=True).pow(self.power) + self.eps) + x_ = x / (x.norm(dim=1, keepdim=True).pow(self.power) + self.eps) + x_ = x_ * self.tempearture + + return F.linear(x_, weight_, self.bias) + + +@MODELS.register_module(name='NormedConv2d') +class NormedConv2d(nn.Conv2d): + """Normalized Conv2d Layer. + + Args: + tempeature (float, optional): Tempeature term. Defaults to 20. + power (int, optional): Power term. Defaults to 1.0. + eps (float, optional): The minimal value of divisor to + keep numerical stability. 
Defaults to 1e-6. + norm_over_kernel (bool, optional): Normalize over kernel. + Defaults to False. + """ + + def __init__(self, + *args, + tempearture: float = 20, + power: int = 1.0, + eps: float = 1e-6, + norm_over_kernel: bool = False, + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.tempearture = tempearture + self.power = power + self.norm_over_kernel = norm_over_kernel + self.eps = eps + + def forward(self, x: Tensor) -> Tensor: + """Forward function for `NormedConv2d`.""" + if not self.norm_over_kernel: + weight_ = self.weight / ( + self.weight.norm(dim=1, keepdim=True).pow(self.power) + + self.eps) + else: + weight_ = self.weight / ( + self.weight.view(self.weight.size(0), -1).norm( + dim=1, keepdim=True).pow(self.power)[..., None, None] + + self.eps) + x_ = x / (x.norm(dim=1, keepdim=True).pow(self.power) + self.eps) + x_ = x_ * self.tempearture + + if hasattr(self, 'conv2d_forward'): + x_ = self.conv2d_forward(x_, weight_) + else: + if digit_version(torch.__version__) >= digit_version('1.8'): + x_ = self._conv_forward(x_, weight_, self.bias) + else: + x_ = self._conv_forward(x_, weight_) + return x_ diff --git a/head_extractor/build/lib/mmdet/models/layers/pixel_decoder.py b/head_extractor/build/lib/mmdet/models/layers/pixel_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..fb61434045eb9996276518577800132e4a25eb3e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/pixel_decoder.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, ConvModule +from mmengine.model import BaseModule, ModuleList, caffe2_xavier_init +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from .positional_encoding import SinePositionalEncoding +from .transformer import DetrTransformerEncoder + + +@MODELS.register_module() +class PixelDecoder(BaseModule): + """Pixel decoder with a structure like FPN. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + norm_cfg (:obj:`ConfigDict` or dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`ConfigDict` or dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`ConfigDict` or dict): Config for transformer + encoder. Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer encoder position encoding. Defaults to + dict(type='SinePositionalEncoding', num_feats=128, + normalize=True). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None.
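Example (a minimal sketch): decoding a ResNet-style four-level pyramid;
mask_feature comes out at the resolution of the highest-resolution
input, while memory is simply the last backbone feature map:

    import torch

    decoder = PixelDecoder(
        in_channels=[256, 512, 1024, 2048],
        feat_channels=256,
        out_channels=256)
    feats = [torch.randn(1, c, s, s)
             for c, s in zip([256, 512, 1024, 2048], [64, 32, 16, 8])]
    mask_feature, memory = decoder(feats, batch_img_metas=[{}])
    assert mask_feature.shape == (1, 256, 64, 64)
    assert memory.shape == (1, 2048, 8, 8)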
+ """ + + def __init__(self, + in_channels: Union[List[int], Tuple[int]], + feat_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict(type='GN', num_groups=32), + act_cfg: ConfigType = dict(type='ReLU'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_inputs = len(in_channels) + self.lateral_convs = ModuleList() + self.output_convs = ModuleList() + self.use_bias = norm_cfg is None + for i in range(0, self.num_inputs - 1): + lateral_conv = ConvModule( + in_channels[i], + feat_channels, + kernel_size=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=None) + output_conv = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.lateral_convs.append(lateral_conv) + self.output_convs.append(output_conv) + + self.last_feat_conv = ConvModule( + in_channels[-1], + feat_channels, + kernel_size=3, + padding=1, + stride=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.mask_feature = Conv2d( + feat_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def init_weights(self) -> None: + """Initialize weights.""" + for i in range(0, self.num_inputs - 2): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + caffe2_xavier_init(self.last_feat_conv, bias=0) + + def forward(self, feats: List[Tensor], + batch_img_metas: List[dict]) -> Tuple[Tensor, Tensor]: + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + batch_img_metas (list[dict]): List of image information. + Pass in for creating more accurate padding mask. Not + used here. + + Returns: + tuple[Tensor, Tensor]: a tuple containing the following: + + - mask_feature (Tensor): Shape (batch_size, c, h, w). + - memory (Tensor): Output of last stage of backbone.\ + Shape (batch_size, c, h, w). + """ + y = self.last_feat_conv(feats[-1]) + for i in range(self.num_inputs - 2, -1, -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + \ + F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest') + y = self.output_convs[i](y) + + mask_feature = self.mask_feature(y) + memory = feats[-1] + return mask_feature, memory + + +@MODELS.register_module() +class TransformerEncoderPixelDecoder(PixelDecoder): + """Pixel decoder with transormer encoder inside. + + Args: + in_channels (list[int] | tuple[int]): Number of channels in the + input feature maps. + feat_channels (int): Number channels for feature. + out_channels (int): Number channels for output. + norm_cfg (:obj:`ConfigDict` or dict): Config for normalization. + Defaults to dict(type='GN', num_groups=32). + act_cfg (:obj:`ConfigDict` or dict): Config for activation. + Defaults to dict(type='ReLU'). + encoder (:obj:`ConfigDict` or dict): Config for transformer encoder. + Defaults to None. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer encoder position encoding. Defaults to + dict(num_feats=128, normalize=True). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
+ """ + + def __init__(self, + in_channels: Union[List[int], Tuple[int]], + feat_channels: int, + out_channels: int, + norm_cfg: ConfigType = dict(type='GN', num_groups=32), + act_cfg: ConfigType = dict(type='ReLU'), + encoder: ConfigType = None, + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + init_cfg=init_cfg) + self.last_feat_conv = None + + self.encoder = DetrTransformerEncoder(**encoder) + self.encoder_embed_dims = self.encoder.embed_dims + assert self.encoder_embed_dims == feat_channels, 'embed_dims({}) of ' \ + 'tranformer encoder must equal to feat_channels({})'.format( + feat_channels, self.encoder_embed_dims) + self.positional_encoding = SinePositionalEncoding( + **positional_encoding) + self.encoder_in_proj = Conv2d( + in_channels[-1], feat_channels, kernel_size=1) + self.encoder_out_proj = ConvModule( + feat_channels, + feat_channels, + kernel_size=3, + stride=1, + padding=1, + bias=self.use_bias, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def init_weights(self) -> None: + """Initialize weights.""" + for i in range(0, self.num_inputs - 2): + caffe2_xavier_init(self.lateral_convs[i].conv, bias=0) + caffe2_xavier_init(self.output_convs[i].conv, bias=0) + + caffe2_xavier_init(self.mask_feature, bias=0) + caffe2_xavier_init(self.encoder_in_proj, bias=0) + caffe2_xavier_init(self.encoder_out_proj.conv, bias=0) + + for p in self.encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, feats: List[Tensor], + batch_img_metas: List[dict]) -> Tuple[Tensor, Tensor]: + """ + Args: + feats (list[Tensor]): Feature maps of each level. Each has + shape of (batch_size, c, h, w). + batch_img_metas (list[dict]): List of image information. Pass in + for creating more accurate padding mask. + + Returns: + tuple: a tuple containing the following: + + - mask_feature (Tensor): shape (batch_size, c, h, w). + - memory (Tensor): shape (batch_size, c, h, w). 
+ """ + feat_last = feats[-1] + bs, c, h, w = feat_last.shape + input_img_h, input_img_w = batch_img_metas[0]['batch_input_shape'] + padding_mask = feat_last.new_ones((bs, input_img_h, input_img_w), + dtype=torch.float32) + for i in range(bs): + img_h, img_w = batch_img_metas[i]['img_shape'] + padding_mask[i, :img_h, :img_w] = 0 + padding_mask = F.interpolate( + padding_mask.unsqueeze(1), + size=feat_last.shape[-2:], + mode='nearest').to(torch.bool).squeeze(1) + + pos_embed = self.positional_encoding(padding_mask) + feat_last = self.encoder_in_proj(feat_last) + # (batch_size, c, h, w) -> (batch_size, num_queries, c) + feat_last = feat_last.flatten(2).permute(0, 2, 1) + pos_embed = pos_embed.flatten(2).permute(0, 2, 1) + # (batch_size, h, w) -> (batch_size, h*w) + padding_mask = padding_mask.flatten(1) + memory = self.encoder( + query=feat_last, + query_pos=pos_embed, + key_padding_mask=padding_mask) + # (batch_size, num_queries, c) -> (batch_size, c, h, w) + memory = memory.permute(0, 2, 1).view(bs, self.encoder_embed_dims, h, + w) + y = self.encoder_out_proj(memory) + for i in range(self.num_inputs - 2, -1, -1): + x = feats[i] + cur_feat = self.lateral_convs[i](x) + y = cur_feat + \ + F.interpolate(y, size=cur_feat.shape[-2:], mode='nearest') + y = self.output_convs[i](y) + + mask_feature = self.mask_feature(y) + return mask_feature, memory diff --git a/head_extractor/build/lib/mmdet/models/layers/positional_encoding.py b/head_extractor/build/lib/mmdet/models/layers/positional_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..87080d81a9f155839d453b8671103e5d51fbf88a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/positional_encoding.py @@ -0,0 +1,269 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Optional + +import torch +import torch.nn as nn +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptMultiConfig + + +@MODELS.register_module() +class SinePositionalEncoding(BaseModule): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Defaults to None + """ + + def __init__(self, + num_feats: int, + temperature: int = 10000, + normalize: bool = False, + scale: float = 2 * math.pi, + eps: float = 1e-6, + offset: float = 0., + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask: Tensor, input: Optional[Tensor] = None) -> Tensor: + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + input (Tensor, optional): Input image/feature Tensor. + Shape [bs, c, h, w] + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + assert not (mask is None and input is None) + + if mask is not None: + B, H, W = mask.size() + device = mask.device + # For convenience of exporting to ONNX, + # it's required to convert + # `masks` from bool to int. + mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + else: + # single image or batch image with no padding + B, _, H, W = input.shape + device = input.device + x_embed = torch.arange( + 1, W + 1, dtype=torch.float32, device=device) + x_embed = x_embed.view(1, 1, -1).repeat(B, H, 1) + y_embed = torch.arange( + 1, H + 1, dtype=torch.float32, device=device) + y_embed = y_embed.view(1, -1, 1).repeat(B, 1, W) + if self.normalize: + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self) -> str: + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + +@MODELS.register_module() +class LearnedPositionalEncoding(BaseModule): + """Position embedding with learnable embedding weights. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Defaults to 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Defaults to 50. + init_cfg (dict or list[dict], optional): Initialization config dict. 
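+
+    Example:
+        >>> # Editor's usage sketch (hypothetical shapes, not from this
+        >>> # diff):
+        >>> import torch
+        >>> pos_enc = LearnedPositionalEncoding(num_feats=128)
+        >>> mask = torch.zeros(1, 32, 32)  # all positions valid
+        >>> pos = pos_enc(mask)
+        >>> assert pos.shape == (1, 256, 32, 32)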
+ """ + + def __init__( + self, + num_feats: int, + row_num_embed: int = 50, + col_num_embed: int = 50, + init_cfg: MultiConfig = dict(type='Uniform', layer='Embedding') + ) -> None: + super().__init__(init_cfg=init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask: Tensor) -> Tensor: + """Forward function for `LearnedPositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = torch.cat( + (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( + 1, w, 1)), + dim=-1).permute(2, 0, + 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1) + return pos + + def __repr__(self) -> str: + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'row_num_embed={self.row_num_embed}, ' + repr_str += f'col_num_embed={self.col_num_embed})' + return repr_str + + +@MODELS.register_module() +class SinePositionalEncoding3D(SinePositionalEncoding): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def forward(self, mask: Tensor) -> Tensor: + """Forward function for `SinePositionalEncoding3D`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, t, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + assert mask.dim() == 4,\ + f'{mask.shape} should be a 4-dimensional Tensor,' \ + f' got {mask.dim()}-dimensional Tensor instead ' + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. 
+ mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + z_embed = not_mask.cumsum(1, dtype=torch.float32) + y_embed = not_mask.cumsum(2, dtype=torch.float32) + x_embed = not_mask.cumsum(3, dtype=torch.float32) + if self.normalize: + z_embed = (z_embed + self.offset) / \ + (z_embed[:, -1:, :, :] + self.eps) * self.scale + y_embed = (y_embed + self.offset) / \ + (y_embed[:, :, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + + dim_t_z = torch.arange((self.num_feats * 2), + dtype=torch.float32, + device=mask.device) + dim_t_z = self.temperature**(2 * (dim_t_z // 2) / (self.num_feats * 2)) + + pos_x = x_embed[:, :, :, :, None] / dim_t + pos_y = y_embed[:, :, :, :, None] / dim_t + pos_z = z_embed[:, :, :, :, None] / dim_t_z + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, T, H, W = mask.size() + pos_x = torch.stack( + (pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos_y = torch.stack( + (pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos_z = torch.stack( + (pos_z[:, :, :, :, 0::2].sin(), pos_z[:, :, :, :, 1::2].cos()), + dim=5).view(B, T, H, W, -1) + pos = (torch.cat((pos_y, pos_x), dim=4) + pos_z).permute(0, 1, 4, 2, 3) + return pos diff --git a/head_extractor/build/lib/mmdet/models/layers/res_layer.py b/head_extractor/build/lib/mmdet/models/layers/res_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..ff24d3e8562d1c3c724b35f7dc10cafe48e47650 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/res_layer.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule, Sequential +from torch import Tensor +from torch import nn as nn + +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Defaults to 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Defaults to False + conv_cfg (dict): dictionary to construct and config conv layer. + Defaults to None + norm_cfg (dict): dictionary to construct and config norm layer. + Defaults to dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. 
Defaults to True + """ + + def __init__(self, + block: BaseModule, + inplanes: int, + planes: int, + num_blocks: int, + stride: int = 1, + avg_down: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + downsample_first: bool = True, + **kwargs) -> None: + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + else: # downsample_first=False is for HourglassModule + for _ in range(num_blocks - 1): + layers.append( + block( + inplanes=inplanes, + planes=inplanes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + super().__init__(*layers) + + +class SimplifiedBasicBlock(BaseModule): + """Simplified version of original basic residual block. This is used in + `SCNet `_. + + - Norm layer is now optional + - Last ReLU in forward function is removed + """ + expansion = 1 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[Sequential] = None, + style: ConfigType = 'pytorch', + with_cp: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + dcn: OptConfigType = None, + plugins: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + assert not with_cp, 'Not implemented yet.' 
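+        # Editor's note: when `norm_cfg` is None the block has no norm
+        # layers, so the convs below are built with a bias term instead
+        # (`with_bias` mirrors `norm_cfg is None`).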
+ self.with_norm = norm_cfg is not None + with_bias = True if norm_cfg is None else False + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=with_bias) + if self.with_norm: + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, planes, postfix=1) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=with_bias) + if self.with_norm: + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, planes, postfix=2) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self) -> Optional[BaseModule]: + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) if self.with_norm else None + + @property + def norm2(self) -> Optional[BaseModule]: + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) if self.with_norm else None + + def forward(self, x: Tensor) -> Tensor: + """Forward function for SimplifiedBasicBlock.""" + + identity = x + + out = self.conv1(x) + if self.with_norm: + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + if self.with_norm: + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out diff --git a/head_extractor/build/lib/mmdet/models/layers/se_layer.py b/head_extractor/build/lib/mmdet/models/layers/se_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..5598dabaf6f3b3a09f4348fcd65ff39897b7068f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/se_layer.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from mmengine.utils import digit_version, is_tuple_of +from torch import Tensor + +from mmdet.utils import MultiConfig, OptConfigType, OptMultiConfig + + +class SELayer(BaseModule): + """Squeeze-and-Excitation Module. + + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Defaults to 16. + conv_cfg (None or dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Defaults to (dict(type='ReLU'), dict(type='Sigmoid')) + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Defaults to None + """ + + def __init__(self, + channels: int, + ratio: int = 16, + conv_cfg: OptConfigType = None, + act_cfg: MultiConfig = (dict(type='ReLU'), + dict(type='Sigmoid')), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x: Tensor) -> Tensor: + """Forward function for SELayer.""" + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out + + +class DyReLU(BaseModule): + """Dynamic ReLU (DyReLU) module. + + See `Dynamic ReLU `_ for details. + Current implementation is specialized for task-aware attention in DyHead. + HSigmoid arguments in default act_cfg follow DyHead official code. + https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py + + Args: + channels (int): The input (and output) channels of DyReLU module. + ratio (int): Squeeze ratio in Squeeze-and-Excitation-like module, + the intermediate channel will be ``int(channels/ratio)``. + Defaults to 4. + conv_cfg (None or dict): Config dict for convolution layer. + Defaults to None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configurated + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configurated by the first dict and the + second activation layer will be configurated by the second dict. + Defaults to (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, + divisor=6.0)) + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None + """ + + def __init__(self, + channels: int, + ratio: int = 4, + conv_cfg: OptConfigType = None, + act_cfg: MultiConfig = (dict(type='ReLU'), + dict( + type='HSigmoid', + bias=3.0, + divisor=6.0)), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert is_tuple_of(act_cfg, dict) + self.channels = channels + self.expansion = 4 # for a1, b1, a2, b2 + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=int(channels / ratio), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=int(channels / ratio), + out_channels=channels * self.expansion, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + coeffs = self.global_avgpool(x) + coeffs = self.conv1(coeffs) + coeffs = self.conv2(coeffs) - 0.5 # value range: [-0.5, 0.5] + a1, b1, a2, b2 = torch.split(coeffs, self.channels, dim=1) + a1 = a1 * 2.0 + 1.0 # [-1.0, 1.0] + 1.0 + a2 = a2 * 2.0 # [-1.0, 1.0] + out = torch.max(x * a1 + b1, x * a2 + b2) + return out + + +class ChannelAttention(BaseModule): + """Channel attention Module. + + Args: + channels (int): The input (and output) channels of the attention layer. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None + """ + + def __init__(self, channels: int, init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True) + if digit_version(torch.__version__) < (1, 7, 0): + self.act = nn.Hardsigmoid() + else: + self.act = nn.Hardsigmoid(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function for ChannelAttention.""" + with torch.cuda.amp.autocast(enabled=False): + out = self.global_avgpool(x) + out = self.fc(out) + out = self.act(out) + return x * out diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/__init__.py b/head_extractor/build/lib/mmdet/models/layers/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..839d936412673d765cd9f89a44a366a64976bb9c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .conditional_detr_layers import (ConditionalDetrTransformerDecoder, + ConditionalDetrTransformerDecoderLayer) +from .dab_detr_layers import (DABDetrTransformerDecoder, + DABDetrTransformerDecoderLayer, + DABDetrTransformerEncoder) +from .ddq_detr_layers import DDQTransformerDecoder +from .deformable_detr_layers import (DeformableDetrTransformerDecoder, + DeformableDetrTransformerDecoderLayer, + DeformableDetrTransformerEncoder, + DeformableDetrTransformerEncoderLayer) +from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer) +from .dino_layers import CdnQueryGenerator, DinoTransformerDecoder +from .grounding_dino_layers import (GroundingDinoTransformerDecoder, + GroundingDinoTransformerDecoderLayer, + GroundingDinoTransformerEncoder) +from .mask2former_layers import (Mask2FormerTransformerDecoder, + Mask2FormerTransformerDecoderLayer, + Mask2FormerTransformerEncoder) +from .utils import (MLP, AdaptivePadding, ConditionalAttention, DynamicConv, + PatchEmbed, PatchMerging, coordinate_to_encoding, + inverse_sigmoid, nchw_to_nlc, nlc_to_nchw) + +__all__ = [ + 'nlc_to_nchw', 'nchw_to_nlc', 'AdaptivePadding', 'PatchEmbed', + 'PatchMerging', 'inverse_sigmoid', 'DynamicConv', 'MLP', + 'DetrTransformerEncoder', 'DetrTransformerDecoder', + 'DetrTransformerEncoderLayer', 'DetrTransformerDecoderLayer', + 'DeformableDetrTransformerEncoder', 'DeformableDetrTransformerDecoder', + 'DeformableDetrTransformerEncoderLayer', + 'DeformableDetrTransformerDecoderLayer', 'coordinate_to_encoding', + 'ConditionalAttention', 'DABDetrTransformerDecoderLayer', + 'DABDetrTransformerDecoder', 'DABDetrTransformerEncoder', + 'DDQTransformerDecoder', 'ConditionalDetrTransformerDecoder', + 'ConditionalDetrTransformerDecoderLayer', 'DinoTransformerDecoder', + 'CdnQueryGenerator', 'Mask2FormerTransformerEncoder', + 'Mask2FormerTransformerDecoderLayer', 'Mask2FormerTransformerDecoder', + 'GroundingDinoTransformerDecoderLayer', 'GroundingDinoTransformerEncoder', + 'GroundingDinoTransformerDecoder' +] diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/conditional_detr_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/conditional_detr_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..6db12a1340c758996e8c0e96f0b21cbc6fa928c9 --- /dev/null +++ 
b/head_extractor/build/lib/mmdet/models/layers/transformer/conditional_detr_layers.py
@@ -0,0 +1,170 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN
+from torch import Tensor
+from torch.nn import ModuleList
+
+from .detr_layers import DetrTransformerDecoder, DetrTransformerDecoderLayer
+from .utils import MLP, ConditionalAttention, coordinate_to_encoding
+
+
+class ConditionalDetrTransformerDecoder(DetrTransformerDecoder):
+    """Decoder of Conditional DETR."""
+
+    def _init_layers(self) -> None:
+        """Initialize decoder layers and other layers."""
+        self.layers = ModuleList([
+            ConditionalDetrTransformerDecoderLayer(**self.layer_cfg)
+            for _ in range(self.num_layers)
+        ])
+        self.embed_dims = self.layers[0].embed_dims
+        self.post_norm = build_norm_layer(self.post_norm_cfg,
+                                          self.embed_dims)[1]
+        # conditional detr affine
+        self.query_scale = MLP(self.embed_dims, self.embed_dims,
+                               self.embed_dims, 2)
+        self.ref_point_head = MLP(self.embed_dims, self.embed_dims, 2, 2)
+        # 'qpos_proj' has been substituted with 'qpos_sine_proj' in every
+        # decoder layer except the first one, so 'qpos_proj' should be
+        # deleted in the other layers.
+        for layer_id in range(self.num_layers - 1):
+            self.layers[layer_id + 1].cross_attn.qpos_proj = None
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor = None,
+                query_pos: Tensor = None,
+                key_pos: Tensor = None,
+                key_padding_mask: Tensor = None):
+        """Forward function of decoder.
+
+        Args:
+            query (Tensor): The input query with shape
+                (bs, num_queries, dim).
+            key (Tensor): The input key with shape (bs, num_keys, dim). If
+                `None`, the `query` will be used. Defaults to `None`.
+            query_pos (Tensor): The positional encoding for `query`, with the
+                same shape as `query`. If not `None`, it will be added to
+                `query` before forward function. Defaults to `None`.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`. If not `None`, it will be added to
+                `key` before forward function. If `None`, and `query_pos`
+                has the same shape as `key`, then `query_pos` will be used
+                as `key_pos`. Defaults to `None`.
+            key_padding_mask (Tensor): ByteTensor with shape (bs, num_keys).
+                Defaults to `None`.
+
+        Returns:
+            List[Tensor]: Forwarded results with shape (num_decoder_layers,
+            bs, num_queries, dim) if `return_intermediate` is True, otherwise
+            with shape (1, bs, num_queries, dim). References with shape
+            (bs, num_queries, 2).
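+
+        Example:
+            >>> # Editor's shape sketch (hypothetical sizes; assumes a
+            >>> # 6-layer decoder with embed_dims=256 and
+            >>> # return_intermediate=True):
+            >>> q = torch.rand(2, 100, 256)
+            >>> kv = torch.rand(2, 1350, 256)
+            >>> q_pos = torch.rand(2, 100, 256)
+            >>> k_pos = torch.rand(2, 1350, 256)
+            >>> out, ref = self(q, kv, q_pos, k_pos)
+            >>> # out: (6, 2, 100, 256), ref: (2, 100, 2)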
+ """ + reference_unsigmoid = self.ref_point_head( + query_pos) # [bs, num_queries, 2] + reference = reference_unsigmoid.sigmoid() + reference_xy = reference[..., :2] + intermediate = [] + for layer_id, layer in enumerate(self.layers): + if layer_id == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(query) + # get sine embedding for the query reference + ref_sine_embed = coordinate_to_encoding(coord_tensor=reference_xy) + # apply transformation + ref_sine_embed = ref_sine_embed * pos_transformation + query = layer( + query, + key=key, + query_pos=query_pos, + key_pos=key_pos, + key_padding_mask=key_padding_mask, + ref_sine_embed=ref_sine_embed, + is_first=(layer_id == 0)) + if self.return_intermediate: + intermediate.append(self.post_norm(query)) + + if self.return_intermediate: + return torch.stack(intermediate), reference + + query = self.post_norm(query) + return query.unsqueeze(0), reference + + +class ConditionalDetrTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Implements decoder layer in Conditional DETR transformer.""" + + def _init_layers(self): + """Initialize self-attention, cross-attention, FFN, and + normalization.""" + self.self_attn = ConditionalAttention(**self.self_attn_cfg) + self.cross_attn = ConditionalAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) + + def forward(self, + query: Tensor, + key: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_masks: Tensor = None, + cross_attn_masks: Tensor = None, + key_padding_mask: Tensor = None, + ref_sine_embed: Tensor = None, + is_first: bool = False): + """ + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim) + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be + added to `query` before forward function. Defaults to `None`. + ref_sine_embed (Tensor): The positional encoding for query in + cross attention, with the same shape as `x`. Defaults to None. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not None, it will be added to + `key` before forward function. If None, and `query_pos` has + the same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. + self_attn_masks (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), Same in `nn.MultiheadAttention. + forward`. Defaults to None. + cross_attn_masks (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), Same in `nn.MultiheadAttention. + forward`. Defaults to None. + key_padding_mask (Tensor, optional): ByteTensor, has shape + (bs, num_keys). Defaults to None. + is_first (bool): A indicator to tell whether the current layer + is the first layer of the decoder. Defaults to False. + + Returns: + Tensor: Forwarded results, has shape (bs, num_queries, dim). 
+ """ + query = self.self_attn( + query=query, + key=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_masks) + query = self.norms[0](query) + query = self.cross_attn( + query=query, + key=key, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_masks, + key_padding_mask=key_padding_mask, + ref_sine_embed=ref_sine_embed, + is_first=is_first) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/dab_detr_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/dab_detr_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..b8a6e7724a1b1ca18f26dd10455f3e3a4d696460 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/dab_detr_layers.py @@ -0,0 +1,298 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN +from mmengine.model import ModuleList +from torch import Tensor + +from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer) +from .utils import (MLP, ConditionalAttention, coordinate_to_encoding, + inverse_sigmoid) + + +class DABDetrTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Implements decoder layer in DAB-DETR transformer.""" + + def _init_layers(self): + """Initialize self-attention, cross-attention, FFN, normalization and + others.""" + self.self_attn = ConditionalAttention(**self.self_attn_cfg) + self.cross_attn = ConditionalAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) + self.keep_query_pos = self.cross_attn.keep_query_pos + + def forward(self, + query: Tensor, + key: Tensor, + query_pos: Tensor, + key_pos: Tensor, + ref_sine_embed: Tensor = None, + self_attn_masks: Tensor = None, + cross_attn_masks: Tensor = None, + key_padding_mask: Tensor = None, + is_first: bool = False, + **kwargs) -> Tensor: + """ + Args: + query (Tensor): The input query with shape [bs, num_queries, + dim]. + key (Tensor): The key tensor with shape [bs, num_keys, + dim]. + query_pos (Tensor): The positional encoding for query in self + attention, with the same shape as `x`. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. + ref_sine_embed (Tensor): The positional encoding for query in + cross attention, with the same shape as `x`. + Defaults to None. + self_attn_masks (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + cross_attn_masks (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + is_first (bool): A indicator to tell whether the current layer + is the first layer of the decoder. + Defaults to False. + + Returns: + Tensor: forwarded results with shape + [bs, num_queries, dim]. 
+ """ + + query = self.self_attn( + query=query, + key=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_masks, + **kwargs) + query = self.norms[0](query) + query = self.cross_attn( + query=query, + key=key, + query_pos=query_pos, + key_pos=key_pos, + ref_sine_embed=ref_sine_embed, + attn_mask=cross_attn_masks, + key_padding_mask=key_padding_mask, + is_first=is_first, + **kwargs) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query + + +class DABDetrTransformerDecoder(DetrTransformerDecoder): + """Decoder of DAB-DETR. + + Args: + query_dim (int): The last dimension of query pos, + 4 for anchor format, 2 for point format. + Defaults to 4. + query_scale_type (str): Type of transformation applied + to content query. Defaults to `cond_elewise`. + with_modulated_hw_attn (bool): Whether to inject h&w info + during cross conditional attention. Defaults to True. + """ + + def __init__(self, + *args, + query_dim: int = 4, + query_scale_type: str = 'cond_elewise', + with_modulated_hw_attn: bool = True, + **kwargs): + + self.query_dim = query_dim + self.query_scale_type = query_scale_type + self.with_modulated_hw_attn = with_modulated_hw_attn + + super().__init__(*args, **kwargs) + + def _init_layers(self): + """Initialize decoder layers and other layers.""" + assert self.query_dim in [2, 4], \ + f'{"dab-detr only supports anchor prior or reference point prior"}' + assert self.query_scale_type in [ + 'cond_elewise', 'cond_scalar', 'fix_elewise' + ] + + self.layers = ModuleList([ + DABDetrTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + + embed_dims = self.layers[0].embed_dims + self.embed_dims = embed_dims + + self.post_norm = build_norm_layer(self.post_norm_cfg, embed_dims)[1] + if self.query_scale_type == 'cond_elewise': + self.query_scale = MLP(embed_dims, embed_dims, embed_dims, 2) + elif self.query_scale_type == 'cond_scalar': + self.query_scale = MLP(embed_dims, embed_dims, 1, 2) + elif self.query_scale_type == 'fix_elewise': + self.query_scale = nn.Embedding(self.num_layers, embed_dims) + else: + raise NotImplementedError('Unknown query_scale_type: {}'.format( + self.query_scale_type)) + + self.ref_point_head = MLP(self.query_dim // 2 * embed_dims, embed_dims, + embed_dims, 2) + + if self.with_modulated_hw_attn and self.query_dim == 4: + self.ref_anchor_head = MLP(embed_dims, embed_dims, 2, 2) + + self.keep_query_pos = self.layers[0].keep_query_pos + if not self.keep_query_pos: + for layer_id in range(self.num_layers - 1): + self.layers[layer_id + 1].cross_attn.qpos_proj = None + + def forward(self, + query: Tensor, + key: Tensor, + query_pos: Tensor, + key_pos: Tensor, + reg_branches: nn.Module, + key_padding_mask: Tensor = None, + **kwargs) -> List[Tensor]: + """Forward function of decoder. + + Args: + query (Tensor): The input query with shape (bs, num_queries, dim). + key (Tensor): The input key with shape (bs, num_keys, dim). + query_pos (Tensor): The positional encoding for `query`, with the + same shape as `query`. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. + reg_branches (nn.Module): The regression branch for dynamically + updating references in each layer. + key_padding_mask (Tensor): ByteTensor with shape (bs, num_keys). + Defaults to `None`. + + Returns: + List[Tensor]: forwarded results with shape (num_decoder_layers, + bs, num_queries, dim) if `return_intermediate` is True, otherwise + with shape (1, bs, num_queries, dim). 
references with shape + (num_decoder_layers, bs, num_queries, 2/4). + """ + output = query + unsigmoid_references = query_pos + + reference_points = unsigmoid_references.sigmoid() + intermediate_reference_points = [reference_points] + + intermediate = [] + for layer_id, layer in enumerate(self.layers): + obj_center = reference_points[..., :self.query_dim] + ref_sine_embed = coordinate_to_encoding( + coord_tensor=obj_center, num_feats=self.embed_dims // 2) + query_pos = self.ref_point_head( + ref_sine_embed) # [bs, nq, 2c] -> [bs, nq, c] + # For the first decoder layer, do not apply transformation + if self.query_scale_type != 'fix_elewise': + if layer_id == 0: + pos_transformation = 1 + else: + pos_transformation = self.query_scale(output) + else: + pos_transformation = self.query_scale.weight[layer_id] + # apply transformation + ref_sine_embed = ref_sine_embed[ + ..., :self.embed_dims] * pos_transformation + # modulated height and weight attention + if self.with_modulated_hw_attn: + assert obj_center.size(-1) == 4 + ref_hw = self.ref_anchor_head(output).sigmoid() + ref_sine_embed[..., self.embed_dims // 2:] *= \ + (ref_hw[..., 0] / obj_center[..., 2]).unsqueeze(-1) + ref_sine_embed[..., : self.embed_dims // 2] *= \ + (ref_hw[..., 1] / obj_center[..., 3]).unsqueeze(-1) + + output = layer( + output, + key, + query_pos=query_pos, + ref_sine_embed=ref_sine_embed, + key_pos=key_pos, + key_padding_mask=key_padding_mask, + is_first=(layer_id == 0), + **kwargs) + # iter update + tmp_reg_preds = reg_branches(output) + tmp_reg_preds[..., :self.query_dim] += inverse_sigmoid( + reference_points) + new_reference_points = tmp_reg_preds[ + ..., :self.query_dim].sigmoid() + if layer_id != self.num_layers - 1: + intermediate_reference_points.append(new_reference_points) + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(self.post_norm(output)) + + output = self.post_norm(output) + + if self.return_intermediate: + return [ + torch.stack(intermediate), + torch.stack(intermediate_reference_points), + ] + else: + return [ + output.unsqueeze(0), + torch.stack(intermediate_reference_points) + ] + + +class DABDetrTransformerEncoder(DetrTransformerEncoder): + """Encoder of DAB-DETR.""" + + def _init_layers(self): + """Initialize encoder layers.""" + self.layers = ModuleList([ + DetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + embed_dims = self.layers[0].embed_dims + self.embed_dims = embed_dims + self.query_scale = MLP(embed_dims, embed_dims, embed_dims, 2) + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, **kwargs): + """Forward function of encoder. + + Args: + query (Tensor): Input queries of encoder, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional embeddings of the queries, has + shape (bs, num_feat_points, dim). + key_padding_mask (Tensor): ByteTensor, the key padding mask + of the queries, has shape (bs, num_feat_points). + + Returns: + Tensor: With shape (num_queries, bs, dim). 
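+
+        Example:
+            >>> # Editor's sketch (hypothetical sizes; the batch-first
+            >>> # layout of the input is preserved):
+            >>> q = torch.rand(2, 1350, 256)
+            >>> q_pos = torch.rand(2, 1350, 256)
+            >>> mask = torch.zeros(2, 1350, dtype=torch.bool)
+            >>> out = self(q, q_pos, key_padding_mask=mask)
+            >>> assert out.shape == q.shape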
+ """ + + for layer in self.layers: + pos_scales = self.query_scale(query) + query = layer( + query, + query_pos=query_pos * pos_scales, + key_padding_mask=key_padding_mask, + **kwargs) + + return query diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/ddq_detr_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/ddq_detr_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..57664c7ea2bdd17681ccdabe9140eb043a99e155 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/ddq_detr_layers.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +from mmcv.ops import batched_nms +from torch import Tensor, nn + +from mmdet.structures.bbox import bbox_cxcywh_to_xyxy +from .deformable_detr_layers import DeformableDetrTransformerDecoder +from .utils import MLP, coordinate_to_encoding, inverse_sigmoid + + +class DDQTransformerDecoder(DeformableDetrTransformerDecoder): + """Transformer decoder of DDQ.""" + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + super()._init_layers() + self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, + self.embed_dims, 2) + self.norm = nn.LayerNorm(self.embed_dims) + + def select_distinct_queries(self, reference_points: Tensor, query: Tensor, + self_attn_mask: Tensor, layer_index): + """Get updated `self_attn_mask` for distinct queries selection, it is + used in self attention layers of decoder. + + Args: + reference_points (Tensor): The input reference of decoder, + has shape (bs, num_queries, 4) with the last dimension + arranged as (cx, cy, w, h). + query (Tensor): The input query of decoder, has shape + (bs, num_queries, dims). + self_attn_mask (Tensor): The input self attention mask of + last decoder layer, has shape (bs, num_queries_total, + num_queries_total). + layer_index (int): Last decoder layer index, used to get + classification score of last layer output, for + distinct queries selection. + + Returns: + Tensor: `self_attn_mask` used in self attention layers + of decoder, has shape (bs, num_queries_total, + num_queries_total). + """ + num_imgs = len(reference_points) + dis_start, num_dis = self.cache_dict['dis_query_info'] + # shape of self_attn_mask + # (batch⋅num_heads, num_queries, embed_dims) + dis_mask = self_attn_mask[:, dis_start:dis_start + num_dis, + dis_start:dis_start + num_dis] + # cls_branches from DDQDETRHead + scores = self.cache_dict['cls_branches'][layer_index]( + query[:, dis_start:dis_start + num_dis]).sigmoid().max(-1).values + proposals = reference_points[:, dis_start:dis_start + num_dis] + proposals = bbox_cxcywh_to_xyxy(proposals) + + attn_mask_list = [] + for img_id in range(num_imgs): + single_proposals = proposals[img_id] + single_scores = scores[img_id] + attn_mask = ~dis_mask[img_id * self.cache_dict['num_heads']][0] + # distinct query inds in this layer + ori_index = attn_mask.nonzero().view(-1) + _, keep_idxs = batched_nms(single_proposals[ori_index], + single_scores[ori_index], + torch.ones(len(ori_index)), + self.cache_dict['dqs_cfg']) + + real_keep_index = ori_index[keep_idxs] + + attn_mask = torch.ones_like(dis_mask[0]).bool() + # such a attn_mask give best result + # If it requires to keep index i, then all cells in row or column + # i should be kept in `attn_mask` . For example, if + # `real_keep_index` = [1, 4], and `attn_mask` size = [8, 8], + # then all cells at rows or columns [1, 4] should be kept, and + # all the other cells should be masked out. 
So the value of + # `attn_mask` should be: + # + # target\source 0 1 2 3 4 5 6 7 + # 0 [ 0 1 0 0 1 0 0 0 ] + # 1 [ 1 1 1 1 1 1 1 1 ] + # 2 [ 0 1 0 0 1 0 0 0 ] + # 3 [ 0 1 0 0 1 0 0 0 ] + # 4 [ 1 1 1 1 1 1 1 1 ] + # 5 [ 0 1 0 0 1 0 0 0 ] + # 6 [ 0 1 0 0 1 0 0 0 ] + # 7 [ 0 1 0 0 1 0 0 0 ] + attn_mask[real_keep_index] = False + attn_mask[:, real_keep_index] = False + + attn_mask = attn_mask[None].repeat(self.cache_dict['num_heads'], 1, + 1) + attn_mask_list.append(attn_mask) + attn_mask = torch.cat(attn_mask_list) + self_attn_mask = copy.deepcopy(self_attn_mask) + self_attn_mask[:, dis_start:dis_start + num_dis, + dis_start:dis_start + num_dis] = attn_mask + # will be used in loss and inference + self.cache_dict['distinct_query_mask'].append(~attn_mask) + return self_attn_mask + + def forward(self, query: Tensor, value: Tensor, key_padding_mask: Tensor, + self_attn_mask: Tensor, reference_points: Tensor, + spatial_shapes: Tensor, level_start_index: Tensor, + valid_ratios: Tensor, reg_branches: nn.ModuleList, + **kwargs) -> Tensor: + """Forward function of Transformer decoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, + dims). + value (Tensor): The input values, has shape (bs, num_value, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn` + input. ByteTensor, has shape (bs, num_value). + self_attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups, distinct queries and + dense queries, has shape (num_queries_total, + num_queries_total). It will be updated for distinct queries + selection in this forward function. It is `None` when + `self.training` is `False`. + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reg_branches: (obj:`nn.ModuleList`): Used for refining the + regression results. + + Returns: + tuple[Tensor]: Output queries and references of Transformer + decoder + + - query (Tensor): Output embeddings of the last decoder, has + shape (bs, num_queries, embed_dims) when `return_intermediate` + is `False`. Otherwise, Intermediate output embeddings of all + decoder layers, has shape (num_decoder_layers, bs, num_queries, + embed_dims). + - reference_points (Tensor): The reference of the last decoder + layer, has shape (bs, num_queries, 4) when `return_intermediate` + is `False`. Otherwise, Intermediate references of all decoder + layers, has shape (1 + num_decoder_layers, bs, num_queries, 4). + The coordinates are arranged as (cx, cy, w, h). 
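+
+        Note:
+            This decoder relies on ``self.cache_dict`` entries
+            (``'num_heads'``, ``'dis_query_info'``, ``'cls_branches'``,
+            ``'dqs_cfg'``, ``'num_dense_queries'``) that the DDQ head is
+            expected to populate before ``forward`` is called.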
+ """ + intermediate = [] + intermediate_reference_points = [reference_points] + self.cache_dict['distinct_query_mask'] = [] + if self_attn_mask is None: + self_attn_mask = torch.zeros((query.size(1), query.size(1)), + device=query.device).bool() + # shape is (batch*number_heads, num_queries, num_queries) + self_attn_mask = self_attn_mask[None].repeat( + len(query) * self.cache_dict['num_heads'], 1, 1) + for layer_index, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = \ + reference_points[:, :, None] * torch.cat( + [valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = \ + reference_points[:, :, None] * valid_ratios[:, None] + + query_sine_embed = coordinate_to_encoding( + reference_points_input[:, :, 0, :], + num_feats=self.embed_dims // 2) + query_pos = self.ref_point_head(query_sine_embed) + + query = layer( + query, + query_pos=query_pos, + value=value, + key_padding_mask=key_padding_mask, + self_attn_mask=self_attn_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points_input, + **kwargs) + + if not self.training: + tmp = reg_branches[layer_index](query) + assert reference_points.shape[-1] == 4 + new_reference_points = tmp + inverse_sigmoid( + reference_points, eps=1e-3) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + if layer_index < (len(self.layers) - 1): + self_attn_mask = self.select_distinct_queries( + reference_points, query, self_attn_mask, layer_index) + + else: + num_dense = self.cache_dict['num_dense_queries'] + tmp = reg_branches[layer_index](query[:, :-num_dense]) + tmp_dense = self.aux_reg_branches[layer_index]( + query[:, -num_dense:]) + + tmp = torch.cat([tmp, tmp_dense], dim=1) + assert reference_points.shape[-1] == 4 + new_reference_points = tmp + inverse_sigmoid( + reference_points, eps=1e-3) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + if layer_index < (len(self.layers) - 1): + self_attn_mask = self.select_distinct_queries( + reference_points, query, self_attn_mask, layer_index) + + if self.return_intermediate: + intermediate.append(self.norm(query)) + intermediate_reference_points.append(new_reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return query, reference_points diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/deformable_detr_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/deformable_detr_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..da6325d61270eb3546a39d5487587bc0610434d6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/deformable_detr_layers.py @@ -0,0 +1,265 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple, Union + +import torch +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmcv.ops import MultiScaleDeformableAttention +from mmengine.model import ModuleList +from torch import Tensor, nn + +from .detr_layers import (DetrTransformerDecoder, DetrTransformerDecoderLayer, + DetrTransformerEncoder, DetrTransformerEncoderLayer) +from .utils import inverse_sigmoid + +try: + from fairscale.nn.checkpoint import checkpoint_wrapper +except Exception: + checkpoint_wrapper = None + + +class DeformableDetrTransformerEncoder(DetrTransformerEncoder): + """Transformer encoder of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + self.layers = ModuleList([ + DeformableDetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + + if self.num_cp > 0: + if checkpoint_wrapper is None: + raise NotImplementedError( + 'If you want to reduce GPU memory usage, \ + please install fairscale by executing the \ + following command: pip install fairscale.') + for i in range(self.num_cp): + self.layers[i] = checkpoint_wrapper(self.layers[i]) + + self.embed_dims = self.layers[0].embed_dims + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios: Tensor, + **kwargs) -> Tensor: + """Forward function of Transformer encoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, has shape + (bs, num_queries, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + + Returns: + Tensor: Output queries of Transformer encoder, which is also + called 'encoder output embeddings' or 'memory', has shape + (bs, num_queries, dim) + """ + reference_points = self.get_encoder_reference_points( + spatial_shapes, valid_ratios, device=query.device) + for layer in self.layers: + query = layer( + query=query, + query_pos=query_pos, + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points, + **kwargs) + return query + + @staticmethod + def get_encoder_reference_points( + spatial_shapes: Tensor, valid_ratios: Tensor, + device: Union[torch.device, str]) -> Tensor: + """Get the reference points used in encoder. + + Args: + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + device (obj:`device` or str): The device acquired by the + `reference_points`. + + Returns: + Tensor: Reference points used in decoder, has shape (bs, length, + num_levels, 2). 
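+
+        Example:
+            >>> # Editor's sketch: two tiny levels, no padding
+            >>> # (valid ratios of 1):
+            >>> spatial_shapes = torch.tensor([[2, 2], [1, 1]])
+            >>> valid_ratios = torch.ones(1, 2, 2)
+            >>> enc_cls = DeformableDetrTransformerEncoder
+            >>> pts = enc_cls.get_encoder_reference_points(
+            ...     spatial_shapes, valid_ratios, device='cpu')
+            >>> # pts: (1, 5, 2, 2) -- normalized (x, y) centers,
+            >>> # one copy per level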
+ """ + + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=torch.float32, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 1] * H) + ref_x = ref_x.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 0] * W) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + # [bs, sum(hw), num_level, 2] + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + +class DeformableDetrTransformerDecoder(DetrTransformerDecoder): + """Transformer Decoder of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + DeformableDetrTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + if self.post_norm_cfg is not None: + raise ValueError('There is not post_norm in ' + f'{self._get_name()}') + + def forward(self, + query: Tensor, + query_pos: Tensor, + value: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + reg_branches: Optional[nn.Module] = None, + **kwargs) -> Tuple[Tensor]: + """Forward function of Transformer decoder. + + Args: + query (Tensor): The input queries, has shape (bs, num_queries, + dim). + query_pos (Tensor): The input positional query, has shape + (bs, num_queries, dim). It will be added to `query` before + forward function. + value (Tensor): The input values, has shape (bs, num_value, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn` + input. ByteTensor, has shape (bs, num_value). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h) when `as_two_stage` is `True`, otherwise has + shape (bs, num_queries, 2) with the last dimension arranged + as (cx, cy). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reg_branches: (obj:`nn.ModuleList`, optional): Used for refining + the regression results. Only would be passed when + `with_box_refine` is `True`, otherwise would be `None`. + + Returns: + tuple[Tensor]: Outputs of Deformable Transformer Decoder. + + - output (Tensor): Output embeddings of the last decoder, has + shape (num_queries, bs, embed_dims) when `return_intermediate` + is `False`. Otherwise, Intermediate output embeddings of all + decoder layers, has shape (num_decoder_layers, num_queries, bs, + embed_dims). + - reference_points (Tensor): The reference of the last decoder + layer, has shape (bs, num_queries, 4) when `return_intermediate` + is `False`. Otherwise, Intermediate references of all decoder + layers, has shape (num_decoder_layers, bs, num_queries, 4). 
The + coordinates are arranged as (cx, cy, w, h) + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for layer_id, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = \ + reference_points[:, :, None] * \ + torch.cat([valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = \ + reference_points[:, :, None] * \ + valid_ratios[:, None] + output = layer( + output, + query_pos=query_pos, + value=value, + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points_input, + **kwargs) + + if reg_branches is not None: + tmp_reg_preds = reg_branches[layer_id](output) + if reference_points.shape[-1] == 4: + new_reference_points = tmp_reg_preds + inverse_sigmoid( + reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + assert reference_points.shape[-1] == 2 + new_reference_points = tmp_reg_preds + new_reference_points[..., :2] = tmp_reg_preds[ + ..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +class DeformableDetrTransformerEncoderLayer(DetrTransformerEncoderLayer): + """Encoder layer of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize self_attn, ffn, and norms.""" + self.self_attn = MultiScaleDeformableAttention(**self.self_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(2) + ] + self.norms = ModuleList(norms_list) + + +class DeformableDetrTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Decoder layer of Deformable DETR.""" + + def _init_layers(self) -> None: + """Initialize self_attn, cross-attn, ffn, and norms.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.cross_attn = MultiScaleDeformableAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/detr_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/detr_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..6a83dd2faa660ed8f54bdd08271db1fcf6b53886 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/detr_layers.py @@ -0,0 +1,374 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmengine import ConfigDict +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.utils import ConfigType, OptConfigType + +try: + from fairscale.nn.checkpoint import checkpoint_wrapper +except Exception: + checkpoint_wrapper = None + + +class DetrTransformerEncoder(BaseModule): + """Encoder of DETR. 
+ + Args: + num_layers (int): Number of encoder layers. + layer_cfg (:obj:`ConfigDict` or dict): the config of each encoder + layer. All the layers will share the same config. + num_cp (int): Number of checkpointing blocks in encoder layer. + Default to -1. + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + num_layers: int, + layer_cfg: ConfigType, + num_cp: int = -1, + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + self.num_layers = num_layers + self.layer_cfg = layer_cfg + self.num_cp = num_cp + assert self.num_cp <= self.num_layers + self._init_layers() + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + self.layers = ModuleList([ + DetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + + if self.num_cp > 0: + if checkpoint_wrapper is None: + raise NotImplementedError( + 'If you want to reduce GPU memory usage, \ + please install fairscale by executing the \ + following command: pip install fairscale.') + for i in range(self.num_cp): + self.layers[i] = checkpoint_wrapper(self.layers[i]) + + self.embed_dims = self.layers[0].embed_dims + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, **kwargs) -> Tensor: + """Forward function of encoder. + + Args: + query (Tensor): Input queries of encoder, has shape + (bs, num_queries, dim). + query_pos (Tensor): The positional embeddings of the queries, has + shape (bs, num_queries, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). + + Returns: + Tensor: Has shape (bs, num_queries, dim) if `batch_first` is + `True`, otherwise (num_queries, bs, dim). + """ + for layer in self.layers: + query = layer(query, query_pos, key_padding_mask, **kwargs) + return query + + +class DetrTransformerDecoder(BaseModule): + """Decoder of DETR. + + Args: + num_layers (int): Number of decoder layers. + layer_cfg (:obj:`ConfigDict` or dict): the config of each encoder + layer. All the layers will share the same config. + post_norm_cfg (:obj:`ConfigDict` or dict, optional): Config of the + post normalization layer. Defaults to `LN`. + return_intermediate (bool, optional): Whether to return outputs of + intermediate layers. Defaults to `True`, + init_cfg (:obj:`ConfigDict` or dict, optional): the config to control + the initialization. Defaults to None. + """ + + def __init__(self, + num_layers: int, + layer_cfg: ConfigType, + post_norm_cfg: OptConfigType = dict(type='LN'), + return_intermediate: bool = True, + init_cfg: Union[dict, ConfigDict] = None) -> None: + super().__init__(init_cfg=init_cfg) + self.layer_cfg = layer_cfg + self.num_layers = num_layers + self.post_norm_cfg = post_norm_cfg + self.return_intermediate = return_intermediate + self._init_layers() + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + DetrTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + self.post_norm = build_norm_layer(self.post_norm_cfg, + self.embed_dims)[1] + + def forward(self, query: Tensor, key: Tensor, value: Tensor, + query_pos: Tensor, key_pos: Tensor, key_padding_mask: Tensor, + **kwargs) -> Tensor: + """Forward function of decoder + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor): The input key, has shape (bs, num_keys, dim). 
+ value (Tensor): The input value with the same shape as `key`. + query_pos (Tensor): The positional encoding for `query`, with the + same shape as `query`. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. + key_padding_mask (Tensor): The `key_padding_mask` of `cross_attn` + input. ByteTensor, has shape (bs, num_value). + + Returns: + Tensor: The forwarded results will have shape + (num_decoder_layers, bs, num_queries, dim) if + `return_intermediate` is `True` else (1, bs, num_queries, dim). + """ + intermediate = [] + for layer in self.layers: + query = layer( + query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + key_padding_mask=key_padding_mask, + **kwargs) + if self.return_intermediate: + intermediate.append(self.post_norm(query)) + query = self.post_norm(query) + + if self.return_intermediate: + return torch.stack(intermediate) + + return query.unsqueeze(0) + + +class DetrTransformerEncoderLayer(BaseModule): + """Implements encoder layer in DETR transformer. + + Args: + self_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for self + attention. + ffn_cfg (:obj:`ConfigDict` or dict, optional): Config for FFN. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config for + normalization layers. All the layers will share the same + config. Defaults to `LN`. + init_cfg (:obj:`ConfigDict` or dict, optional): Config to control + the initialization. Defaults to None. + """ + + def __init__(self, + self_attn_cfg: OptConfigType = dict( + embed_dims=256, num_heads=8, dropout=0.0), + ffn_cfg: OptConfigType = dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True)), + norm_cfg: OptConfigType = dict(type='LN'), + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.self_attn_cfg = self_attn_cfg + if 'batch_first' not in self.self_attn_cfg: + self.self_attn_cfg['batch_first'] = True + else: + assert self.self_attn_cfg['batch_first'] is True, 'First \ + dimension of all DETRs in mmdet is `batch`, \ + please set `batch_first` flag.' + + self.ffn_cfg = ffn_cfg + self.norm_cfg = norm_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize self-attention, FFN, and normalization.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(2) + ] + self.norms = ModuleList(norms_list) + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, **kwargs) -> Tensor: + """Forward function of an encoder layer. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, with + the same shape as `query`. + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor. has shape (bs, num_queries). + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). + """ + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[0](query) + query = self.ffn(query) + query = self.norms[1](query) + + return query + + +class DetrTransformerDecoderLayer(BaseModule): + """Implements decoder layer in DETR transformer. + + Args: + self_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for self + attention. 
+ cross_attn_cfg (:obj:`ConfigDict` or dict, optional): Config for cross + attention. + ffn_cfg (:obj:`ConfigDict` or dict, optional): Config for FFN. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config for + normalization layers. All the layers will share the same + config. Defaults to `LN`. + init_cfg (:obj:`ConfigDict` or dict, optional): Config to control + the initialization. Defaults to None. + """ + + def __init__(self, + self_attn_cfg: OptConfigType = dict( + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + cross_attn_cfg: OptConfigType = dict( + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + ffn_cfg: OptConfigType = dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + norm_cfg: OptConfigType = dict(type='LN'), + init_cfg: OptConfigType = None) -> None: + + super().__init__(init_cfg=init_cfg) + + self.self_attn_cfg = self_attn_cfg + self.cross_attn_cfg = cross_attn_cfg + if 'batch_first' not in self.self_attn_cfg: + self.self_attn_cfg['batch_first'] = True + else: + assert self.self_attn_cfg['batch_first'] is True, 'First \ + dimension of all DETRs in mmdet is `batch`, \ + please set `batch_first` flag.' + + if 'batch_first' not in self.cross_attn_cfg: + self.cross_attn_cfg['batch_first'] = True + else: + assert self.cross_attn_cfg['batch_first'] is True, 'First \ + dimension of all DETRs in mmdet is `batch`, \ + please set `batch_first` flag.' + + self.ffn_cfg = ffn_cfg + self.norm_cfg = norm_cfg + self._init_layers() + + def _init_layers(self) -> None: + """Initialize self-attention, FFN, and normalization.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.cross_attn = MultiheadAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(3) + ] + self.norms = ModuleList(norms_list) + + def forward(self, + query: Tensor, + key: Tensor = None, + value: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_mask: Tensor = None, + cross_attn_mask: Tensor = None, + key_padding_mask: Tensor = None, + **kwargs) -> Tensor: + """ + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + value (Tensor, optional): The input value, has the same shape as + `key`, as in `nn.MultiheadAttention.forward`. If `None`, the + `key` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be added + to `query` before forward function. Defaults to `None`. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not `None`, it will be added to + `key` before forward function. If None, and `query_pos` has the + same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. + self_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + cross_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor, optional): The `key_padding_mask` of + `self_attn` input. 
ByteTensor, has shape (bs, num_value). + Defaults to None. + + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). + """ + + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_mask, + **kwargs) + query = self.norms[0](query) + query = self.cross_attn( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_mask, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/dino_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/dino_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..64610d0a7c0121a88f5e4279b6f854924230237e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/dino_layers.py @@ -0,0 +1,562 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Tuple, Union + +import torch +from mmengine.model import BaseModule +from torch import Tensor, nn + +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import OptConfigType +from .deformable_detr_layers import DeformableDetrTransformerDecoder +from .utils import MLP, coordinate_to_encoding, inverse_sigmoid + + +class DinoTransformerDecoder(DeformableDetrTransformerDecoder): + """Transformer decoder of DINO.""" + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + super()._init_layers() + self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, + self.embed_dims, 2) + self.norm = nn.LayerNorm(self.embed_dims) + + def forward(self, query: Tensor, value: Tensor, key_padding_mask: Tensor, + self_attn_mask: Tensor, reference_points: Tensor, + spatial_shapes: Tensor, level_start_index: Tensor, + valid_ratios: Tensor, reg_branches: nn.ModuleList, + **kwargs) -> Tuple[Tensor]: + """Forward function of Transformer decoder. + + Args: + query (Tensor): The input query, has shape (num_queries, bs, dim). + value (Tensor): The input values, has shape (num_value, bs, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (num_queries, bs). + self_attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups and matching parts, has + shape (num_queries_total, num_queries_total). It is `None` when + `self.training` is `False`. + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 4) with the last dimension arranged as + (cx, cy, w, h). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reg_branches: (obj:`nn.ModuleList`): Used for refining the + regression results. + + Returns: + tuple[Tensor]: Output queries and references of Transformer + decoder + + - query (Tensor): Output embeddings of the last decoder, has + shape (num_queries, bs, embed_dims) when `return_intermediate` + is `False`. 
Otherwise, Intermediate output embeddings of all + decoder layers, has shape (num_decoder_layers, num_queries, bs, + embed_dims). + - reference_points (Tensor): The reference of the last decoder + layer, has shape (bs, num_queries, 4) when `return_intermediate` + is `False`. Otherwise, Intermediate references of all decoder + layers, has shape (num_decoder_layers, bs, num_queries, 4). The + coordinates are arranged as (cx, cy, w, h) + """ + intermediate = [] + intermediate_reference_points = [reference_points] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = \ + reference_points[:, :, None] * torch.cat( + [valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = \ + reference_points[:, :, None] * valid_ratios[:, None] + + query_sine_embed = coordinate_to_encoding( + reference_points_input[:, :, 0, :]) + query_pos = self.ref_point_head(query_sine_embed) + + query = layer( + query, + query_pos=query_pos, + value=value, + key_padding_mask=key_padding_mask, + self_attn_mask=self_attn_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points_input, + **kwargs) + + if reg_branches is not None: + tmp = reg_branches[lid](query) + assert reference_points.shape[-1] == 4 + new_reference_points = tmp + inverse_sigmoid( + reference_points, eps=1e-3) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + if self.return_intermediate: + intermediate.append(self.norm(query)) + intermediate_reference_points.append(new_reference_points) + # NOTE this is for the "Look Forward Twice" module, + # in the DeformDETR, reference_points was appended. + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return query, reference_points + + +class CdnQueryGenerator(BaseModule): + """Implement query generator of the Contrastive denoising (CDN) proposed in + `DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object + Detection `_ + + Code is modified from the `official github repo + `_. + + Args: + num_classes (int): Number of object classes. + embed_dims (int): The embedding dimensions of the generated queries. + num_matching_queries (int): The queries number of the matching part. + Used for generating dn_mask. + label_noise_scale (float): The scale of label noise, defaults to 0.5. + box_noise_scale (float): The scale of box noise, defaults to 1.0. + group_cfg (:obj:`ConfigDict` or dict, optional): The config of the + denoising queries grouping, includes `dynamic`, `num_dn_queries`, + and `num_groups`. Two grouping strategies, 'static dn groups' and + 'dynamic dn groups', are supported. When `dynamic` is `False`, + the `num_groups` should be set, and the number of denoising query + groups will always be `num_groups`. When `dynamic` is `True`, the + `num_dn_queries` should be set, and the group number will be + dynamic to ensure that the denoising queries number will not exceed + `num_dn_queries` to prevent large fluctuations of memory. Defaults + to `None`. 
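+ + Example: + A minimal construction sketch; the values below are illustrative + assumptions rather than recommended settings: + + >>> dn_query_generator = CdnQueryGenerator( + ... num_classes=80, + ... embed_dims=256, + ... num_matching_queries=900, + ... label_noise_scale=0.5, + ... box_noise_scale=1.0, + ... group_cfg=dict(dynamic=True, num_dn_queries=100))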
+ """ + + def __init__(self, + num_classes: int, + embed_dims: int, + num_matching_queries: int, + label_noise_scale: float = 0.5, + box_noise_scale: float = 1.0, + group_cfg: OptConfigType = None) -> None: + super().__init__() + self.num_classes = num_classes + self.embed_dims = embed_dims + self.num_matching_queries = num_matching_queries + self.label_noise_scale = label_noise_scale + self.box_noise_scale = box_noise_scale + + # prepare grouping strategy + group_cfg = {} if group_cfg is None else group_cfg + self.dynamic_dn_groups = group_cfg.get('dynamic', True) + if self.dynamic_dn_groups: + if 'num_dn_queries' not in group_cfg: + warnings.warn("'num_dn_queries' should be set when using " + 'dynamic dn groups, use 100 as default.') + self.num_dn_queries = group_cfg.get('num_dn_queries', 100) + assert isinstance(self.num_dn_queries, int), \ + f'Expected the num_dn_queries to have type int, but got ' \ + f'{self.num_dn_queries}({type(self.num_dn_queries)}). ' + else: + assert 'num_groups' in group_cfg, \ + 'num_groups should be set when using static dn groups' + self.num_groups = group_cfg['num_groups'] + assert isinstance(self.num_groups, int), \ + f'Expected the num_groups to have type int, but got ' \ + f'{self.num_groups}({type(self.num_groups)}). ' + + # NOTE The original repo of DINO set the num_embeddings 92 for coco, + # 91 (0~90) of which represents target classes and the 92 (91) + # indicates `Unknown` class. However, the embedding of `unknown` class + # is not used in the original DINO. + # TODO: num_classes + 1 or num_classes ? + self.label_embedding = nn.Embedding(self.num_classes, self.embed_dims) + + def __call__(self, batch_data_samples: SampleList) -> tuple: + """Generate contrastive denoising (cdn) queries with ground truth. + + Descriptions of the Number Values in code and comments: + - num_target_total: the total target number of the input batch + samples. + - max_num_target: the max target number of the input batch samples. + - num_noisy_targets: the total targets number after adding noise, + i.e., num_target_total * num_groups * 2. + - num_denoising_queries: the length of the output batched queries, + i.e., max_num_target * num_groups * 2. + + NOTE The format of input bboxes in batch_data_samples is unnormalized + (x, y, x, y), and the output bbox queries are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid. + + Args: + batch_data_samples (list[:obj:`DetDataSample`]): List of the batch + data samples, each includes `gt_instance` which has attributes + `bboxes` and `labels`. The `bboxes` has unnormalized coordinate + format (x, y, x, y). + + Returns: + tuple: The outputs of the dn query generator. + + - dn_label_query (Tensor): The output content queries for denoising + part, has shape (bs, num_denoising_queries, dim), where + `num_denoising_queries = max_num_target * num_groups * 2`. + - dn_bbox_query (Tensor): The output reference bboxes as positions + of queries for denoising part, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (bs, num_denoising_queries, 4) with the last dimension + arranged as (cx, cy, w, h). + - attn_mask (Tensor): The attention mask to prevent information + leakage from different denoising groups and matching parts, + will be used as `self_attn_mask` of the `decoder`, has shape + (num_queries_total, num_queries_total), where `num_queries_total` + is the sum of `num_denoising_queries` and `num_matching_queries`. 
+ - dn_meta (Dict[str, int]): The dictionary saves information about + group collation, including 'num_denoising_queries' and + 'num_denoising_groups'. It will be used to split the outputs of + the denoising and matching parts and for loss calculation. + """ + # normalize bbox and collate ground truth (gt) + gt_labels_list = [] + gt_bboxes_list = [] + for sample in batch_data_samples: + img_h, img_w = sample.img_shape + bboxes = sample.gt_instances.bboxes + factor = bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + bboxes_normalized = bboxes / factor + gt_bboxes_list.append(bboxes_normalized) + gt_labels_list.append(sample.gt_instances.labels) + gt_labels = torch.cat(gt_labels_list) # (num_target_total, ) + gt_bboxes = torch.cat(gt_bboxes_list) # (num_target_total, 4) + + num_target_list = [len(bboxes) for bboxes in gt_bboxes_list] + max_num_target = max(num_target_list) + num_groups = self.get_num_groups(max_num_target) + + dn_label_query = self.generate_dn_label_query(gt_labels, num_groups) + dn_bbox_query = self.generate_dn_bbox_query(gt_bboxes, num_groups) + + # The `batch_idx` saves the batch index of the corresponding sample + # for each target, has shape (num_target_total). + batch_idx = torch.cat([ + torch.full_like(t.long(), i) for i, t in enumerate(gt_labels_list) + ]) + dn_label_query, dn_bbox_query = self.collate_dn_queries( + dn_label_query, dn_bbox_query, batch_idx, len(batch_data_samples), + num_groups) + + attn_mask = self.generate_dn_mask( + max_num_target, num_groups, device=dn_label_query.device) + + dn_meta = dict( + num_denoising_queries=int(max_num_target * 2 * num_groups), + num_denoising_groups=num_groups) + + return dn_label_query, dn_bbox_query, attn_mask, dn_meta + + def get_num_groups(self, max_num_target: int = None) -> int: + """Calculate denoising query groups number. + + Two grouping strategies, 'static dn groups' and 'dynamic dn groups', + are supported. When `self.dynamic_dn_groups` is `False`, the number + of denoising query groups will always be `self.num_groups`. When + `self.dynamic_dn_groups` is `True`, the group number will be dynamic, + ensuring the denoising queries number will not exceed + `self.num_dn_queries` to prevent large fluctuations of memory. + + NOTE The `num_group` is shared for different samples in a batch. When + the target numbers in the samples vary, the denoising queries of the + samples containing fewer targets are padded to the max length. + + Args: + max_num_target (int, optional): The max target number of the batch + samples. It will only be used when `self.dynamic_dn_groups` is + `True`. Defaults to `None`. + + Returns: + int: The denoising group number of the current batch. + """ + if self.dynamic_dn_groups: + assert max_num_target is not None, \ + 'max_num_target should be provided when using ' \ + 'dynamic dn groups' + if max_num_target == 0: + num_groups = 1 + else: + num_groups = self.num_dn_queries // max_num_target + else: + num_groups = self.num_groups + if num_groups < 1: + num_groups = 1 + return int(num_groups) + + def generate_dn_label_query(self, gt_labels: Tensor, + num_groups: int) -> Tensor: + """Generate noisy labels and their query embeddings. + + The strategy for generating noisy labels is: Randomly choose labels of + `self.label_noise_scale * 0.5` proportion and override each of them + with a random object category label. + + NOTE Noise is not added to all labels.
Besides, the `self.label_noise_scale + * 0.5` arg is the ratio of the chosen positions, which is higher than + the actual proportion of noisy labels, because the labels to override + may already be correct. The gap becomes larger as the number of target + categories decreases. Users should be aware of this and modify the scale + arg or the corresponding logic according to the specific dataset. + + Args: + gt_labels (Tensor): The concatenated gt labels of all samples + in the batch, has shape (num_target_total, ) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The query embeddings of noisy labels, has shape + (num_noisy_targets, embed_dims), where `num_noisy_targets = + num_target_total * num_groups * 2`. + """ + assert self.label_noise_scale > 0 + gt_labels_expand = gt_labels.repeat(2 * num_groups, + 1).view(-1) # Note `* 2` # noqa + p = torch.rand_like(gt_labels_expand.float()) + chosen_indice = torch.nonzero(p < (self.label_noise_scale * 0.5)).view( + -1) # Note `* 0.5` + new_labels = torch.randint_like(chosen_indice, 0, self.num_classes) + noisy_labels_expand = gt_labels_expand.scatter(0, chosen_indice, + new_labels) + dn_label_query = self.label_embedding(noisy_labels_expand) + return dn_label_query + + def generate_dn_bbox_query(self, gt_bboxes: Tensor, + num_groups: int) -> Tensor: + """Generate noisy bboxes and their query embeddings. + + The strategy for generating noisy bboxes is as follows: + + .. code:: text + + +--------------------+ + | negative | + | +----------+ | + | | positive | | + | | +-----|----+------------+ + | | | | | | + | +----+-----+ | | + | | | | + +---------+----------+ | + | | + | gt bbox | + | | + | +---------+----------+ + | | | | + | | +----+-----+ | + | | | | | | + +-------------|--- +----+ | | + | | positive | | + | +----------+ | + | negative | + +--------------------+ + + The random noise is added to the top-left and bottom-right point + positions; hence, the normalized (x, y, x, y) format of bboxes is + required. The noisy bboxes of positive queries have both points + within the inner square, while those of negative queries have both + points between the inner and outer squares. + + Besides, the side length of the outer square is twice that of the + inner square, whose half-side length is + self.box_noise_scale * w_or_h / 2. + NOTE The noise is added to all the bboxes. Moreover, there is still an + unconsidered case where one point is within the inner square and the + other is between the inner and outer squares. + + Args: + gt_bboxes (Tensor): The concatenated gt bboxes of all samples + in the batch, has shape (num_target_total, 4) with the last + dimension arranged as (x, y, x, y) where + `num_target_total = sum(num_target_list)`. + num_groups (int): The number of denoising query groups. + + Returns: + Tensor: The output noisy bboxes, which are embedded by normalized + (cx, cy, w, h) format bboxes going through inverse_sigmoid, has + shape (num_noisy_targets, 4) with the last dimension arranged as + (cx, cy, w, h), where + `num_noisy_targets = num_target_total * num_groups * 2`.
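+ + For example, with `box_noise_scale=1.0` and a gt bbox of width `w`, each + x-coordinate of a positive query is shifted by less than `w / 2` + (`|rand_part| < 1`), while each x-coordinate of a negative query is + shifted by at least `w / 2` and less than `w` (`1 <= |rand_part| < 2`).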
+ """ + assert self.box_noise_scale > 0 + device = gt_bboxes.device + + # expand gt_bboxes as groups + gt_bboxes_expand = gt_bboxes.repeat(2 * num_groups, 1) # xyxy + + # obtain index of negative queries in gt_bboxes_expand + positive_idx = torch.arange( + len(gt_bboxes), dtype=torch.long, device=device) + positive_idx = positive_idx.unsqueeze(0).repeat(num_groups, 1) + positive_idx += 2 * len(gt_bboxes) * torch.arange( + num_groups, dtype=torch.long, device=device)[:, None] + positive_idx = positive_idx.flatten() + negative_idx = positive_idx + len(gt_bboxes) + + # determine the sign of each element in the random part of the added + # noise to be positive or negative randomly. + rand_sign = torch.randint_like( + gt_bboxes_expand, low=0, high=2, + dtype=torch.float32) * 2.0 - 1.0 # [low, high), 1 or -1, randomly + + # calculate the random part of the added noise + rand_part = torch.rand_like(gt_bboxes_expand) # [0, 1) + rand_part[negative_idx] += 1.0 # pos: [0, 1); neg: [1, 2) + rand_part *= rand_sign # pos: (-1, 1); neg: (-2, -1] U [1, 2) + + # add noise to the bboxes + bboxes_whwh = bbox_xyxy_to_cxcywh(gt_bboxes_expand)[:, 2:].repeat(1, 2) + noisy_bboxes_expand = gt_bboxes_expand + torch.mul( + rand_part, bboxes_whwh) * self.box_noise_scale / 2 # xyxy + noisy_bboxes_expand = noisy_bboxes_expand.clamp(min=0.0, max=1.0) + noisy_bboxes_expand = bbox_xyxy_to_cxcywh(noisy_bboxes_expand) + + dn_bbox_query = inverse_sigmoid(noisy_bboxes_expand, eps=1e-3) + return dn_bbox_query + + def collate_dn_queries(self, input_label_query: Tensor, + input_bbox_query: Tensor, batch_idx: Tensor, + batch_size: int, num_groups: int) -> Tuple[Tensor]: + """Collate generated queries to obtain batched dn queries. + + The strategy for query collation is as follow: + + .. code:: text + + input_queries (num_target_total, query_dim) + P_A1 P_B1 P_B2 N_A1 N_B1 N_B2 P'A1 P'B1 P'B2 N'A1 N'B1 N'B2 + |________ group1 ________| |________ group2 ________| + | + V + P_A1 Pad0 N_A1 Pad0 P'A1 Pad0 N'A1 Pad0 + P_B1 P_B2 N_B1 N_B2 P'B1 P'B2 N'B1 N'B2 + |____ group1 ____| |____ group2 ____| + batched_queries (batch_size, max_num_target, query_dim) + + where query_dim is 4 for bbox and self.embed_dims for label. + Notation: _-group 1; '-group 2; + A-Sample1(has 1 target); B-sample2(has 2 targets) + + Args: + input_label_query (Tensor): The generated label queries of all + targets, has shape (num_target_total, embed_dims) where + `num_target_total = sum(num_target_list)`. + input_bbox_query (Tensor): The generated bbox queries of all + targets, has shape (num_target_total, 4) with the last + dimension arranged as (cx, cy, w, h). + batch_idx (Tensor): The batch index of the corresponding sample + for each target, has shape (num_target_total). + batch_size (int): The size of the input batch. + num_groups (int): The number of denoising query groups. + + Returns: + tuple[Tensor]: Output batched label and bbox queries. + - batched_label_query (Tensor): The output batched label queries, + has shape (batch_size, max_num_target, embed_dims). + - batched_bbox_query (Tensor): The output batched bbox queries, + has shape (batch_size, max_num_target, 4) with the last dimension + arranged as (cx, cy, w, h). 
+ """ + device = input_label_query.device + num_target_list = [ + torch.sum(batch_idx == idx) for idx in range(batch_size) + ] + max_num_target = max(num_target_list) + num_denoising_queries = int(max_num_target * 2 * num_groups) + + map_query_index = torch.cat([ + torch.arange(num_target, device=device) + for num_target in num_target_list + ]) + map_query_index = torch.cat([ + map_query_index + max_num_target * i for i in range(2 * num_groups) + ]).long() + batch_idx_expand = batch_idx.repeat(2 * num_groups, 1).view(-1) + mapper = (batch_idx_expand, map_query_index) + + batched_label_query = torch.zeros( + batch_size, num_denoising_queries, self.embed_dims, device=device) + batched_bbox_query = torch.zeros( + batch_size, num_denoising_queries, 4, device=device) + + batched_label_query[mapper] = input_label_query + batched_bbox_query[mapper] = input_bbox_query + return batched_label_query, batched_bbox_query + + def generate_dn_mask(self, max_num_target: int, num_groups: int, + device: Union[torch.device, str]) -> Tensor: + """Generate attention mask to prevent information leakage from + different denoising groups and matching parts. + + .. code:: text + + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 0 0 0 0 1 1 1 1 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 0 0 0 0 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + 1 1 1 1 1 1 1 1 0 0 0 0 0 + max_num_target |_| |_________| num_matching_queries + |_____________| num_denoising_queries + + 1 -> True (Masked), means 'can not see'. + 0 -> False (UnMasked), means 'can see'. + + Args: + max_num_target (int): The max target number of the input batch + samples. + num_groups (int): The number of denoising query groups. + device (obj:`device` or str): The device of generated mask. + + Returns: + Tensor: The attention mask to prevent information leakage from + different denoising groups and matching parts, will be used as + `self_attn_mask` of the `decoder`, has shape (num_queries_total, + num_queries_total), where `num_queries_total` is the sum of + `num_denoising_queries` and `num_matching_queries`. + """ + num_denoising_queries = int(max_num_target * 2 * num_groups) + num_queries_total = num_denoising_queries + self.num_matching_queries + attn_mask = torch.zeros( + num_queries_total, + num_queries_total, + device=device, + dtype=torch.bool) + # Make the matching part cannot see the denoising groups + attn_mask[num_denoising_queries:, :num_denoising_queries] = True + # Make the denoising groups cannot see each other + for i in range(num_groups): + # Mask rows of one group per step. + row_scope = slice(max_num_target * 2 * i, + max_num_target * 2 * (i + 1)) + left_scope = slice(max_num_target * 2 * i) + right_scope = slice(max_num_target * 2 * (i + 1), + num_denoising_queries) + attn_mask[row_scope, right_scope] = True + attn_mask[row_scope, left_scope] = True + return attn_mask diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/grounding_dino_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/grounding_dino_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..3c285768f36af98075607b43e48e6f1018125ad1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/grounding_dino_layers.py @@ -0,0 +1,270 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmcv.ops import MultiScaleDeformableAttention +from mmengine.model import ModuleList +from torch import Tensor + +from mmdet.models.utils.vlfuse_helper import SingleScaleBiAttentionBlock +from mmdet.utils import ConfigType, OptConfigType +from .deformable_detr_layers import (DeformableDetrTransformerDecoderLayer, + DeformableDetrTransformerEncoder, + DeformableDetrTransformerEncoderLayer) +from .detr_layers import DetrTransformerEncoderLayer +from .dino_layers import DinoTransformerDecoder +from .utils import MLP, get_text_sine_pos_embed + +try: + from fairscale.nn.checkpoint import checkpoint_wrapper +except Exception: + checkpoint_wrapper = None + + +class GroundingDinoTransformerDecoderLayer( + DeformableDetrTransformerDecoderLayer): + + def __init__(self, + cross_attn_text_cfg: OptConfigType = dict( + embed_dims=256, + num_heads=8, + dropout=0.0, + batch_first=True), + **kwargs) -> None: + """Decoder layer of Grounding DINO.""" + self.cross_attn_text_cfg = cross_attn_text_cfg + if 'batch_first' not in self.cross_attn_text_cfg: + self.cross_attn_text_cfg['batch_first'] = True + super().__init__(**kwargs) + + def _init_layers(self) -> None: + """Initialize self_attn, cross-attn, ffn, and norms.""" + self.self_attn = MultiheadAttention(**self.self_attn_cfg) + self.cross_attn_text = MultiheadAttention(**self.cross_attn_text_cfg) + self.cross_attn = MultiScaleDeformableAttention(**self.cross_attn_cfg) + self.embed_dims = self.self_attn.embed_dims + self.ffn = FFN(**self.ffn_cfg) + norms_list = [ + build_norm_layer(self.norm_cfg, self.embed_dims)[1] + for _ in range(4) + ] + self.norms = ModuleList(norms_list) + + def forward(self, + query: Tensor, + key: Tensor = None, + value: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_mask: Tensor = None, + cross_attn_mask: Tensor = None, + key_padding_mask: Tensor = None, + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + **kwargs) -> Tensor: + """Implements decoder layer in Grounding DINO transformer. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + value (Tensor, optional): The input value, has the same shape as + `key`, as in `nn.MultiheadAttention.forward`. If `None`, the + `key` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be added + to `query` before forward function. Defaults to `None`. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not `None`, it will be added to + `key` before forward function. If None, and `query_pos` has the + same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. + self_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + cross_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor, optional): The `key_padding_mask` of + `self_attn` input. ByteTensor, has shape (bs, num_value). + Defaults to None. + memory_text (Tensor): Memory text.
It has shape (bs, len_text, + text_embed_dims). + text_attention_mask (Tensor): Text token mask. It has shape (bs, + len_text). + + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). + """ + # self attention + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_mask, + **kwargs) + query = self.norms[0](query) + # cross attention between query and text + query = self.cross_attn_text( + query=query, + query_pos=query_pos, + key=memory_text, + value=memory_text, + key_padding_mask=text_attention_mask) + query = self.norms[1](query) + # cross attention between query and image + query = self.cross_attn( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_mask, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[2](query) + query = self.ffn(query) + query = self.norms[3](query) + + return query + + +class GroundingDinoTransformerEncoder(DeformableDetrTransformerEncoder): + + def __init__(self, text_layer_cfg: ConfigType, + fusion_layer_cfg: ConfigType, **kwargs) -> None: + self.text_layer_cfg = text_layer_cfg + self.fusion_layer_cfg = fusion_layer_cfg + super().__init__(**kwargs) + + def _init_layers(self) -> None: + """Initialize encoder layers.""" + self.layers = ModuleList([ + DeformableDetrTransformerEncoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.text_layers = ModuleList([ + DetrTransformerEncoderLayer(**self.text_layer_cfg) + for _ in range(self.num_layers) + ]) + self.fusion_layers = ModuleList([ + SingleScaleBiAttentionBlock(**self.fusion_layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + if self.num_cp > 0: + if checkpoint_wrapper is None: + raise NotImplementedError( + 'If you want to reduce GPU memory usage, \ + please install fairscale by executing the \ + following command: pip install fairscale.') + for i in range(self.num_cp): + self.layers[i] = checkpoint_wrapper(self.layers[i]) + self.fusion_layers[i] = checkpoint_wrapper( + self.fusion_layers[i]) + + def forward(self, + query: Tensor, + query_pos: Tensor, + key_padding_mask: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + pos_text: Tensor = None, + text_self_attention_masks: Tensor = None, + position_ids: Tensor = None): + """Forward function of Transformer encoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, has shape + (bs, num_queries, dim). + key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + memory_text (Tensor, optional): Memory text. It has shape (bs, + len_text, text_embed_dims). + text_attention_mask (Tensor, optional): Text token mask. It has + shape (bs,len_text). + pos_text (Tensor, optional): The positional encoding for text. + Defaults to None. 
+ text_self_attention_masks (Tensor, optional): Text self attention + mask. Defaults to None. + position_ids (Tensor, optional): Text position ids. + Defaults to None. + """ + output = query + reference_points = self.get_encoder_reference_points( + spatial_shapes, valid_ratios, device=query.device) + if self.text_layers: + # generate pos_text + bs, n_text, _ = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + torch.arange(n_text, + device=memory_text.device).float().unsqueeze( + 0).unsqueeze(-1).repeat(bs, 1, 1)) + pos_text = get_text_sine_pos_embed( + pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_text_sine_pos_embed( + position_ids[..., None], + num_pos_feats=256, + exchange_xy=False) + + # main process + for layer_id, layer in enumerate(self.layers): + if self.fusion_layers: + output, memory_text = self.fusion_layers[layer_id]( + visual_feature=output, + lang_feature=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + if self.text_layers: + text_num_heads = self.text_layers[ + layer_id].self_attn_cfg.num_heads + memory_text = self.text_layers[layer_id]( + query=memory_text, + query_pos=(pos_text if pos_text is not None else None), + attn_mask=~text_self_attention_masks.repeat( + text_num_heads, 1, 1), # note we use ~ for mask here + key_padding_mask=None, + ) + output = layer( + query=output, + query_pos=query_pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask) + return output, memory_text + + +class GroundingDinoTransformerDecoder(DinoTransformerDecoder): + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + GroundingDinoTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + if self.post_norm_cfg is not None: + raise ValueError('There is not post_norm in ' + f'{self._get_name()}') + self.ref_point_head = MLP(self.embed_dims * 2, self.embed_dims, + self.embed_dims, 2) + self.norm = nn.LayerNorm(self.embed_dims) diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/mask2former_layers.py b/head_extractor/build/lib/mmdet/models/layers/transformer/mask2former_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..dcc604e277d91151334ed520d78e6a5a8f388036 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/mask2former_layers.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_norm_layer +from mmengine.model import ModuleList +from torch import Tensor + +from .deformable_detr_layers import DeformableDetrTransformerEncoder +from .detr_layers import DetrTransformerDecoder, DetrTransformerDecoderLayer + + +class Mask2FormerTransformerEncoder(DeformableDetrTransformerEncoder): + """Encoder in PixelDecoder of Mask2Former.""" + + def forward(self, query: Tensor, query_pos: Tensor, + key_padding_mask: Tensor, spatial_shapes: Tensor, + level_start_index: Tensor, valid_ratios: Tensor, + reference_points: Tensor, **kwargs) -> Tensor: + """Forward function of Transformer encoder. + + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + query_pos (Tensor): The positional encoding for query, has shape + (bs, num_queries, dim). If not None, it will be added to the + `query` before forward function. Defaults to None. 
+ key_padding_mask (Tensor): The `key_padding_mask` of `self_attn` + input. ByteTensor, has shape (bs, num_queries). + spatial_shapes (Tensor): Spatial shapes of features in all levels, + has shape (num_levels, 2), last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels, ) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + valid_ratios (Tensor): The ratios of the valid width and the valid + height relative to the width and the height of features in all + levels, has shape (bs, num_levels, 2). + reference_points (Tensor): The initial reference, has shape + (bs, num_queries, 2) with the last dimension arranged + as (cx, cy). + + Returns: + Tensor: Output queries of Transformer encoder, which is also + called 'encoder output embeddings' or 'memory', has shape + (bs, num_queries, dim) + """ + for layer in self.layers: + query = layer( + query=query, + query_pos=query_pos, + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reference_points=reference_points, + **kwargs) + return query + + +class Mask2FormerTransformerDecoder(DetrTransformerDecoder): + """Decoder of Mask2Former.""" + + def _init_layers(self) -> None: + """Initialize decoder layers.""" + self.layers = ModuleList([ + Mask2FormerTransformerDecoderLayer(**self.layer_cfg) + for _ in range(self.num_layers) + ]) + self.embed_dims = self.layers[0].embed_dims + self.post_norm = build_norm_layer(self.post_norm_cfg, + self.embed_dims)[1] + + +class Mask2FormerTransformerDecoderLayer(DetrTransformerDecoderLayer): + """Implements decoder layer in Mask2Former transformer.""" + + def forward(self, + query: Tensor, + key: Tensor = None, + value: Tensor = None, + query_pos: Tensor = None, + key_pos: Tensor = None, + self_attn_mask: Tensor = None, + cross_attn_mask: Tensor = None, + key_padding_mask: Tensor = None, + **kwargs) -> Tensor: + """ + Args: + query (Tensor): The input query, has shape (bs, num_queries, dim). + key (Tensor, optional): The input key, has shape (bs, num_keys, + dim). If `None`, the `query` will be used. Defaults to `None`. + value (Tensor, optional): The input value, has the same shape as + `key`, as in `nn.MultiheadAttention.forward`. If `None`, the + `key` will be used. Defaults to `None`. + query_pos (Tensor, optional): The positional encoding for `query`, + has the same shape as `query`. If not `None`, it will be added + to `query` before forward function. Defaults to `None`. + key_pos (Tensor, optional): The positional encoding for `key`, has + the same shape as `key`. If not `None`, it will be added to + `key` before forward function. If None, and `query_pos` has the + same shape as `key`, then `query_pos` will be used for + `key_pos`. Defaults to None. + self_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + cross_attn_mask (Tensor, optional): ByteTensor mask, has shape + (num_queries, num_keys), as in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor, optional): The `key_padding_mask` of + `self_attn` input. ByteTensor, has shape (bs, num_value). + Defaults to None. + + Returns: + Tensor: forwarded results, has shape (bs, num_queries, dim). 
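+ + Note that, different from `DetrTransformerDecoderLayer`, this layer + performs cross attention before self attention, following the + masked-attention design of Mask2Former: each query first attends to + the image features and only then exchanges information with the + other queries.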
+ """ + + query = self.cross_attn( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=cross_attn_mask, + key_padding_mask=key_padding_mask, + **kwargs) + query = self.norms[0](query) + query = self.self_attn( + query=query, + key=query, + value=query, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=self_attn_mask, + **kwargs) + query = self.norms[1](query) + query = self.ffn(query) + query = self.norms[2](query) + + return query diff --git a/head_extractor/build/lib/mmdet/models/layers/transformer/utils.py b/head_extractor/build/lib/mmdet/models/layers/transformer/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6e43a172ca7175b23c82f60894faf38ec6c437e3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/layers/transformer/utils.py @@ -0,0 +1,915 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings +from typing import Optional, Sequence, Tuple, Union + +import torch +import torch.nn.functional as F +from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, + build_norm_layer) +from mmcv.cnn.bricks.drop import Dropout +from mmengine.model import BaseModule, ModuleList +from mmengine.utils import to_2tuple +from torch import Tensor, nn + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +def nlc_to_nchw(x: Tensor, hw_shape: Sequence[int]) -> Tensor: + """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, L, C] before conversion. + hw_shape (Sequence[int]): The height and width of output feature map. + + Returns: + Tensor: The output tensor of shape [N, C, H, W] after conversion. + """ + H, W = hw_shape + assert len(x.shape) == 3 + B, L, C = x.shape + assert L == H * W, 'The seq_len does not match H, W' + return x.transpose(1, 2).reshape(B, C, H, W).contiguous() + + +def nchw_to_nlc(x): + """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor. + + Args: + x (Tensor): The input tensor of shape [N, C, H, W] before conversion. + + Returns: + Tensor: The output tensor of shape [N, L, C] after conversion. + """ + assert len(x.shape) == 4 + return x.flatten(2).transpose(1, 2).contiguous() + + +def coordinate_to_encoding(coord_tensor: Tensor, + num_feats: int = 128, + temperature: int = 10000, + scale: float = 2 * math.pi): + """Convert coordinate tensor to positional encoding. + + Args: + coord_tensor (Tensor): Coordinate tensor to be converted to + positional encoding. With the last dimension as 2 or 4. + num_feats (int, optional): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. Defaults to 128. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + Returns: + Tensor: Returned encoded positional tensor. 
+ """ + dim_t = torch.arange( + num_feats, dtype=torch.float32, device=coord_tensor.device) + dim_t = temperature**(2 * (dim_t // 2) / num_feats) + x_embed = coord_tensor[..., 0] * scale + y_embed = coord_tensor[..., 1] * scale + pos_x = x_embed[..., None] / dim_t + pos_y = y_embed[..., None] / dim_t + pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), + dim=-1).flatten(2) + pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), + dim=-1).flatten(2) + if coord_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=-1) + elif coord_tensor.size(-1) == 4: + w_embed = coord_tensor[..., 2] * scale + pos_w = w_embed[..., None] / dim_t + pos_w = torch.stack((pos_w[..., 0::2].sin(), pos_w[..., 1::2].cos()), + dim=-1).flatten(2) + + h_embed = coord_tensor[..., 3] * scale + pos_h = h_embed[..., None] / dim_t + pos_h = torch.stack((pos_h[..., 0::2].sin(), pos_h[..., 1::2].cos()), + dim=-1).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=-1) + else: + raise ValueError('Unknown pos_tensor shape(-1):{}'.format( + coord_tensor.size(-1))) + return pos + + +def inverse_sigmoid(x: Tensor, eps: float = 1e-5) -> Tensor: + """Inverse function of sigmoid. + + Args: + x (Tensor): The tensor to do the inverse. + eps (float): EPS avoid numerical overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse function of sigmoid, has the same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +class AdaptivePadding(nn.Module): + """Applies padding to input (if needed) so that input can get fully covered + by filter you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around + input. The "corner" mode would pad zero to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel: + stride (int | tuple): Stride of the filter. Default: 1: + dilation (int | tuple): Spacing between kernel elements. + Default: 1 + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". 
+ Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + ... kernel_size=kernel_size, + ... stride=stride, + ... dilation=dilation, + ... padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + + super(AdaptivePadding, self).__init__() + + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + padding = to_2tuple(padding) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3. + embed_dims (int): The dimensions of embedding. Default: 768. + conv_type (str): The type selection for the embedding + conv layer. Default: "Conv2d". + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The stride of embedding conv. Default: 16. + If set to None, it will be set as `kernel_size`. + padding (int | tuple | str): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, supports "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmengine.ConfigDict`, optional): The Config for + initialization. Default: None.
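+ + Example: + A shape-only sketch with an assumed 224 x 224 input: + + >>> patch_embed = PatchEmbed( + ... in_channels=3, embed_dims=768, kernel_size=16, stride=16) + >>> x = torch.rand(1, 3, 224, 224) + >>> tokens, out_size = patch_embed(x) + >>> tokens.shape, out_size + (torch.Size([1, 196, 768]), (14, 14))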
+ """ + + def __init__(self, + in_channels: int = 3, + embed_dims: int = 768, + conv_type: str = 'Conv2d', + kernel_size: int = 16, + stride: int = 16, + padding: Union[int, tuple, str] = 'corner', + dilation: int = 1, + bias: bool = True, + norm_cfg: OptConfigType = None, + input_size: Union[int, tuple] = None, + init_cfg: OptConfigType = None) -> None: + super(PatchEmbed, self).__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adap_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adap_padding: + pad_h, pad_w = self.adap_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x: Tensor) -> Tuple[Tensor, Tuple[int]]: + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adap_padding: + x = self.adap_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map. Our implementation uses `nn.Unfold` to + merge patch, which is about 25% faster than original implementation. + Instead, we need to modify pretrained models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified.. + Default: True. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". 
+        dilation (int | tuple, optional): dilation parameter in the unfold
+            layer. Default: 1.
+        bias (bool, optional): Whether to add bias in linear layer or not.
+            Defaults: False.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='LN').
+        init_cfg (dict, optional): The extra config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Optional[Union[int, tuple]] = 2,
+                 stride: Optional[Union[int, tuple]] = None,
+                 padding: Union[int, tuple, str] = 'corner',
+                 dilation: Optional[Union[int, tuple]] = 1,
+                 bias: Optional[bool] = False,
+                 norm_cfg: OptConfigType = dict(type='LN'),
+                 init_cfg: OptConfigType = None) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        if stride is None:
+            stride = kernel_size
+
+        kernel_size = to_2tuple(kernel_size)
+        stride = to_2tuple(stride)
+        dilation = to_2tuple(dilation)
+
+        if isinstance(padding, str):
+            self.adap_padding = AdaptivePadding(
+                kernel_size=kernel_size,
+                stride=stride,
+                dilation=dilation,
+                padding=padding)
+            # disable the padding of unfold
+            padding = 0
+        else:
+            self.adap_padding = None
+
+        padding = to_2tuple(padding)
+        self.sampler = nn.Unfold(
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=padding,
+            stride=stride)
+
+        sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+        if norm_cfg is not None:
+            self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+        else:
+            self.norm = None
+
+        self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+    def forward(self, x: Tensor,
+                input_size: Tuple[int]) -> Tuple[Tensor, Tuple[int]]:
+        """
+        Args:
+            x (Tensor): Has shape (B, H*W, C_in).
+            input_size (tuple[int]): The spatial shape of x, arranged as
+                (H, W). Default: None.
+
+        Returns:
+            tuple: Contains merged results and its spatial shape.
+
+            - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+            - out_size (tuple[int]): Spatial shape of x, arranged as
+              (Merged_H, Merged_W).
+        """
+        B, L, C = x.shape
+        assert isinstance(input_size, Sequence), \
+            f'Expected input_size to be a Sequence, but got {input_size}'
+
+        H, W = input_size
+        assert L == H * W, 'input feature has wrong size'
+
+        x = x.view(B, H, W, C).permute([0, 3, 1, 2])  # B, C, H, W
+        # Use nn.Unfold to merge patches. About 25% faster than the original
+        # method, but pretrained models need to be modified for compatibility.
+
+        if self.adap_padding:
+            x = self.adap_padding(x)
+            H, W = x.shape[-2:]
+
+        x = self.sampler(x)
+        # if kernel_size=2 and stride=2, x should have shape (B, 4*C, H/2*W/2)
+
+        out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+                 (self.sampler.kernel_size[0] - 1) -
+                 1) // self.sampler.stride[0] + 1
+        out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+                 (self.sampler.kernel_size[1] - 1) -
+                 1) // self.sampler.stride[1] + 1
+
+        output_size = (out_h, out_w)
+        x = x.transpose(1, 2)  # B, H/2*W/2, 4*C
+        x = self.norm(x) if self.norm else x
+        x = self.reduction(x)
+        return x, output_size
+
+
+class ConditionalAttention(BaseModule):
+    """A wrapper of conditional attention, dropout and residual connection.
+
+    Args:
+        embed_dims (int): The embedding dimension.
+        num_heads (int): Parallel attention heads.
+        attn_drop (float): A Dropout layer on attn_output_weights.
+            Default: 0.0.
+        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+            Default: 0.0.
+        cross_attn (bool): Whether the attention module is for cross
+            attention. Default: False.
+        keep_query_pos (bool): Whether to transform query_pos before cross
+            attention.
+            Default: False.
+        batch_first (bool): When it is True, Key, Query and Value are shape of
+            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
+            Default: True.
+        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims: int,
+                 num_heads: int,
+                 attn_drop: float = 0.,
+                 proj_drop: float = 0.,
+                 cross_attn: bool = False,
+                 keep_query_pos: bool = False,
+                 batch_first: bool = True,
+                 init_cfg: OptMultiConfig = None):
+        super().__init__(init_cfg=init_cfg)
+
+        assert batch_first is True, 'Setting `batch_first` ' \
+            'to False is NOT supported in ConditionalAttention. ' \
+            'First dimension of all DETRs in mmdet is `batch`, ' \
+            'please set `batch_first` to True.'
+
+        self.cross_attn = cross_attn
+        self.keep_query_pos = keep_query_pos
+        self.embed_dims = embed_dims
+        self.num_heads = num_heads
+        self.attn_drop = Dropout(attn_drop)
+        self.proj_drop = Dropout(proj_drop)
+
+        self._init_layers()
+
+    def _init_layers(self):
+        """Initialize layers for qkv projection."""
+        embed_dims = self.embed_dims
+        self.qcontent_proj = Linear(embed_dims, embed_dims)
+        self.qpos_proj = Linear(embed_dims, embed_dims)
+        self.kcontent_proj = Linear(embed_dims, embed_dims)
+        self.kpos_proj = Linear(embed_dims, embed_dims)
+        self.v_proj = Linear(embed_dims, embed_dims)
+        if self.cross_attn:
+            self.qpos_sine_proj = Linear(embed_dims, embed_dims)
+        self.out_proj = Linear(embed_dims, embed_dims)
+
+        nn.init.constant_(self.out_proj.bias, 0.)
+
+    def forward_attn(self,
+                     query: Tensor,
+                     key: Tensor,
+                     value: Tensor,
+                     attn_mask: Tensor = None,
+                     key_padding_mask: Tensor = None) -> Tuple[Tensor]:
+        """Forward process for `ConditionalAttention`.
+
+        Args:
+            query (Tensor): The input query with shape [bs, num_queries,
+                embed_dims].
+            key (Tensor): The key tensor with shape [bs, num_keys,
+                embed_dims].
+                If None, the `query` will be used. Defaults to None.
+            value (Tensor): The value tensor with same shape as `key`.
+                Same in `nn.MultiheadAttention.forward`. Defaults to None.
+                If None, the `key` will be used.
+            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+                Defaults to None.
+        Returns:
+            Tuple[Tensor]: Attention outputs of shape :math:`(N, L, E)`,
+                where :math:`N` is the batch size, :math:`L` is the target
+                sequence length, and :math:`E` is the embedding dimension
+                ``embed_dim``, together with the attention weights averaged
+                over heads, of shape :math:`(N, L, S)`, where :math:`S` is
+                the source sequence length.
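+
+        Example:
+            >>> # Illustrative sketch added in this editing pass (not part
+            >>> # of the upstream file); shapes follow the docstring above.
+            >>> import torch
+            >>> attn = ConditionalAttention(embed_dims=256, num_heads=8)
+            >>> q = torch.rand(2, 100, 256)
+            >>> k = torch.rand(2, 50, 256)
+            >>> v = torch.rand(2, 50, 256)
+            >>> out, weights = attn.forward_attn(q, k, v)
+            >>> assert out.shape == (2, 100, 256)
+            >>> assert weights.shape == (2, 100, 50)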
+ """ + assert key.size(1) == value.size(1), \ + f'{"key, value must have the same sequence length"}' + assert query.size(0) == key.size(0) == value.size(0), \ + f'{"batch size must be equal for query, key, value"}' + assert query.size(2) == key.size(2), \ + f'{"q_dims, k_dims must be equal"}' + assert value.size(2) == self.embed_dims, \ + f'{"v_dims must be equal to embed_dims"}' + + bs, tgt_len, hidden_dims = query.size() + _, src_len, _ = key.size() + head_dims = hidden_dims // self.num_heads + v_head_dims = self.embed_dims // self.num_heads + assert head_dims * self.num_heads == hidden_dims, \ + f'{"hidden_dims must be divisible by num_heads"}' + scaling = float(head_dims)**-0.5 + + q = query * scaling + k = key + v = value + + if attn_mask is not None: + assert attn_mask.dtype == torch.float32 or \ + attn_mask.dtype == torch.float64 or \ + attn_mask.dtype == torch.float16 or \ + attn_mask.dtype == torch.uint8 or \ + attn_mask.dtype == torch.bool, \ + 'Only float, byte, and bool types are supported for \ + attn_mask' + + if attn_mask.dtype == torch.uint8: + warnings.warn('Byte tensor for attn_mask is deprecated.\ + Use bool tensor instead.') + attn_mask = attn_mask.to(torch.bool) + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(1), key.size(1)]: + raise RuntimeError( + 'The size of the 2D attn_mask is not correct.') + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bs * self.num_heads, + query.size(1), + key.size(1) + ]: + raise RuntimeError( + 'The size of the 3D attn_mask is not correct.') + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format( + attn_mask.dim())) + # attn_mask's dim is 3 now. + + if key_padding_mask is not None and key_padding_mask.dtype == int: + key_padding_mask = key_padding_mask.to(torch.bool) + + q = q.contiguous().view(bs, tgt_len, self.num_heads, + head_dims).permute(0, 2, 1, 3).flatten(0, 1) + if k is not None: + k = k.contiguous().view(bs, src_len, self.num_heads, + head_dims).permute(0, 2, 1, + 3).flatten(0, 1) + if v is not None: + v = v.contiguous().view(bs, src_len, self.num_heads, + v_head_dims).permute(0, 2, 1, + 3).flatten(0, 1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bs + assert key_padding_mask.size(1) == src_len + + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_output_weights.size()) == [ + bs * self.num_heads, tgt_len, src_len + ] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float('-inf')) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bs, self.num_heads, tgt_len, src_len) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) + attn_output_weights = attn_output_weights.view( + bs * self.num_heads, tgt_len, src_len) + + attn_output_weights = F.softmax( + attn_output_weights - + attn_output_weights.max(dim=-1, keepdim=True)[0], + dim=-1) + attn_output_weights = self.attn_drop(attn_output_weights) + + attn_output = torch.bmm(attn_output_weights, v) + assert list( + attn_output.size()) == [bs * self.num_heads, tgt_len, v_head_dims] + attn_output = attn_output.view(bs, self.num_heads, tgt_len, + v_head_dims).permute(0, 2, 1, + 3).flatten(2) + attn_output = self.out_proj(attn_output) + + # average attention weights over heads + attn_output_weights = 
+        return attn_output, attn_output_weights.sum(dim=1) / self.num_heads
+
+    def forward(self,
+                query: Tensor,
+                key: Tensor,
+                query_pos: Tensor = None,
+                ref_sine_embed: Tensor = None,
+                key_pos: Tensor = None,
+                attn_mask: Tensor = None,
+                key_padding_mask: Tensor = None,
+                is_first: bool = False) -> Tensor:
+        """Forward function for `ConditionalAttention`.
+        Args:
+            query (Tensor): The input query with shape [bs, num_queries,
+                embed_dims].
+            key (Tensor): The key tensor with shape [bs, num_keys,
+                embed_dims].
+                If None, the `query` will be used. Defaults to None.
+            query_pos (Tensor): The positional encoding for query in self
+                attention, with the same shape as `x`. If not None, it will
+                be added to `x` before forward function.
+                Defaults to None.
+            ref_sine_embed (Tensor): The positional encoding for query in
+                cross attention, with the same shape as `x`. If not None, it
+                will be added to `x` before forward function.
+                Defaults to None.
+            key_pos (Tensor): The positional encoding for `key`, with the
+                same shape as `key`. If not None, it will be added to `key`
+                before forward function. If None, and `query_pos` has the
+                same shape as `key`, then `query_pos` will be used for
+                `key_pos`. Defaults to None.
+            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+                num_keys]. Same in `nn.MultiheadAttention.forward`.
+                Defaults to None.
+            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+                Defaults to None.
+            is_first (bool): An indicator of whether the current layer
+                is the first layer of the decoder.
+                Defaults to False.
+        Returns:
+            Tensor: forwarded results with shape
+            [bs, num_queries, embed_dims].
+        """
+
+        if self.cross_attn:
+            q_content = self.qcontent_proj(query)
+            k_content = self.kcontent_proj(key)
+            v = self.v_proj(key)
+
+            bs, nq, c = q_content.size()
+            _, hw, _ = k_content.size()
+
+            k_pos = self.kpos_proj(key_pos)
+            if is_first or self.keep_query_pos:
+                q_pos = self.qpos_proj(query_pos)
+                q = q_content + q_pos
+                k = k_content + k_pos
+            else:
+                q = q_content
+                k = k_content
+            q = q.view(bs, nq, self.num_heads, c // self.num_heads)
+            query_sine_embed = self.qpos_sine_proj(ref_sine_embed)
+            query_sine_embed = query_sine_embed.view(bs, nq, self.num_heads,
+                                                     c // self.num_heads)
+            q = torch.cat([q, query_sine_embed], dim=3).view(bs, nq, 2 * c)
+            k = k.view(bs, hw, self.num_heads, c // self.num_heads)
+            k_pos = k_pos.view(bs, hw, self.num_heads, c // self.num_heads)
+            k = torch.cat([k, k_pos], dim=3).view(bs, hw, 2 * c)
+            ca_output = self.forward_attn(
+                query=q,
+                key=k,
+                value=v,
+                attn_mask=attn_mask,
+                key_padding_mask=key_padding_mask)[0]
+            query = query + self.proj_drop(ca_output)
+        else:
+            q_content = self.qcontent_proj(query)
+            q_pos = self.qpos_proj(query_pos)
+            k_content = self.kcontent_proj(query)
+            k_pos = self.kpos_proj(query_pos)
+            v = self.v_proj(query)
+            q = q_content if q_pos is None else q_content + q_pos
+            k = k_content if k_pos is None else k_content + k_pos
+            sa_output = self.forward_attn(
+                query=q,
+                key=k,
+                value=v,
+                attn_mask=attn_mask,
+                key_padding_mask=key_padding_mask)[0]
+            query = query + self.proj_drop(sa_output)
+
+        return query
+
+
+class MLP(BaseModule):
+    """Very simple multi-layer perceptron (also called FFN) with ReLU. Mostly
+    used in DETR series detectors.
+
+    Args:
+        input_dim (int): Feature dim of the input tensor.
+        hidden_dim (int): Feature dim of the hidden layer.
+        output_dim (int): Feature dim of the output tensor.
+        num_layers (int): Number of FFN layers. Note that the last
+            layer of the MLP contains only a Linear layer (no activation).
+    """
+
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int,
+                 num_layers: int) -> None:
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = ModuleList(
+            Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x: Tensor) -> Tensor:
+        """Forward function of MLP.
+
+        Args:
+            x (Tensor): The input feature, has shape
+                (num_queries, bs, input_dim).
+        Returns:
+            Tensor: The output feature, has shape
+            (num_queries, bs, output_dim).
+        """
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+@MODELS.register_module()
+class DynamicConv(BaseModule):
+    """Implements Dynamic Convolution.
+
+    This module generates parameters for each sample and
+    uses bmm to implement 1x1 convolution. Code is modified
+    from the official GitHub repo of Sparse R-CNN.
+
+    Args:
+        in_channels (int): The input feature channel.
+            Defaults to 256.
+        feat_channels (int): The inner feature channel.
+            Defaults to 64.
+        out_channels (int, optional): The output feature channel.
+            When not specified, it will be set to `in_channels`
+            by default.
+        input_feat_shape (int): The shape of input feature.
+            Defaults to 7.
+        with_proj (bool): Whether to project the two-dimensional feature to
+            a one-dimensional feature. Defaults to True.
+        act_cfg (dict): The activation config for DynamicConv.
+        norm_cfg (dict): Config dict for normalization layer. Defaults to
+            layer normalization.
+        init_cfg (obj:`mmengine.ConfigDict`): The Config for initialization.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int = 256,
+                 feat_channels: int = 64,
+                 out_channels: Optional[int] = None,
+                 input_feat_shape: int = 7,
+                 with_proj: bool = True,
+                 act_cfg: OptConfigType = dict(type='ReLU', inplace=True),
+                 norm_cfg: OptConfigType = dict(type='LN'),
+                 init_cfg: OptConfigType = None) -> None:
+        super(DynamicConv, self).__init__(init_cfg)
+        self.in_channels = in_channels
+        self.feat_channels = feat_channels
+        self.out_channels_raw = out_channels
+        self.input_feat_shape = input_feat_shape
+        self.with_proj = with_proj
+        self.act_cfg = act_cfg
+        self.norm_cfg = norm_cfg
+        self.out_channels = out_channels if out_channels else in_channels
+
+        self.num_params_in = self.in_channels * self.feat_channels
+        self.num_params_out = self.out_channels * self.feat_channels
+        self.dynamic_layer = nn.Linear(
+            self.in_channels, self.num_params_in + self.num_params_out)
+
+        self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1]
+        self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+        self.activation = build_activation_layer(act_cfg)
+
+        num_output = self.out_channels * input_feat_shape**2
+        if self.with_proj:
+            self.fc_layer = nn.Linear(num_output, self.out_channels)
+            self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1]
+
+    def forward(self, param_feature: Tensor, input_feature: Tensor) -> Tensor:
+        """Forward function for `DynamicConv`.
+
+        Args:
+            param_feature (Tensor): The feature can be used
+                to generate the parameter, has shape
+                (num_all_proposals, in_channels).
+            input_feature (Tensor): Feature that
+                interact with parameters, has shape
+                (num_all_proposals, in_channels, H, W).
+
+        Returns:
+            Tensor: The output feature has shape
+            (num_all_proposals, out_channels).
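+
+        Example:
+            >>> # Illustrative sketch added in this editing pass (not part
+            >>> # of the upstream file); uses the default channel sizes.
+            >>> import torch
+            >>> dynamic_conv = DynamicConv()
+            >>> param_feature = torch.rand(10, 256)
+            >>> input_feature = torch.rand(10, 256, 7, 7)
+            >>> out = dynamic_conv(param_feature, input_feature)
+            >>> assert out.shape == (10, 256)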
+ """ + input_feature = input_feature.flatten(2).permute(2, 0, 1) + + input_feature = input_feature.permute(1, 0, 2) + parameters = self.dynamic_layer(param_feature) + + param_in = parameters[:, :self.num_params_in].view( + -1, self.in_channels, self.feat_channels) + param_out = parameters[:, -self.num_params_out:].view( + -1, self.feat_channels, self.out_channels) + + # input_feature has shape (num_all_proposals, H*W, in_channels) + # param_in has shape (num_all_proposals, in_channels, feat_channels) + # feature has shape (num_all_proposals, H*W, feat_channels) + features = torch.bmm(input_feature, param_in) + features = self.norm_in(features) + features = self.activation(features) + + # param_out has shape (batch_size, feat_channels, out_channels) + features = torch.bmm(features, param_out) + features = self.norm_out(features) + features = self.activation(features) + + if self.with_proj: + features = features.flatten(1) + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features + + +def get_text_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. For example, + input tensor is [x,y], the results will be [pos(y), pos(x)]. + Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. + """ + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature**(2 * torch.div(dim_t, 2, rounding_mode='floor') / + num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), + dim=3).flatten(2) + return sin_x + + pos_res = [ + sine_func(x) + for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1) + ] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res diff --git a/head_extractor/build/lib/mmdet/models/losses/__init__.py b/head_extractor/build/lib/mmdet/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7c57a3a96879c6bd5eb61c300d316e2b4579b287 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/__init__.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .accuracy import Accuracy, accuracy
+from .ae_loss import AssociativeEmbeddingLoss
+from .balanced_l1_loss import BalancedL1Loss, balanced_l1_loss
+from .cross_entropy_loss import (CrossEntropyCustomLoss, CrossEntropyLoss,
+                                 binary_cross_entropy, cross_entropy,
+                                 mask_cross_entropy)
+from .ddq_detr_aux_loss import DDQAuxLoss
+from .dice_loss import DiceLoss
+from .eqlv2_loss import EQLV2Loss
+from .focal_loss import FocalCustomLoss, FocalLoss, sigmoid_focal_loss
+from .gaussian_focal_loss import GaussianFocalLoss
+from .gfocal_loss import DistributionFocalLoss, QualityFocalLoss
+from .ghm_loss import GHMC, GHMR
+from .iou_loss import (BoundedIoULoss, CIoULoss, DIoULoss, EIoULoss, GIoULoss,
+                       IoULoss, SIoULoss, bounded_iou_loss, iou_loss)
+from .kd_loss import KnowledgeDistillationKLDivLoss
+from .l2_loss import L2Loss
+from .margin_loss import MarginL2Loss
+from .mse_loss import MSELoss, mse_loss
+from .multipos_cross_entropy_loss import MultiPosCrossEntropyLoss
+from .pisa_loss import carl_loss, isr_p
+from .seesaw_loss import SeesawLoss
+from .smooth_l1_loss import L1Loss, SmoothL1Loss, l1_loss, smooth_l1_loss
+from .triplet_loss import TripletLoss
+from .utils import reduce_loss, weight_reduce_loss, weighted_loss
+from .varifocal_loss import VarifocalLoss
+
+__all__ = [
+    'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy',
+    'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss',
+    'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss',
+    'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss',
+    'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'DIoULoss', 'CIoULoss',
+    'EIoULoss', 'SIoULoss', 'GHMC', 'GHMR', 'reduce_loss',
+    'weight_reduce_loss', 'weighted_loss', 'L1Loss', 'l1_loss', 'isr_p',
+    'carl_loss', 'AssociativeEmbeddingLoss', 'GaussianFocalLoss',
+    'QualityFocalLoss', 'DistributionFocalLoss', 'VarifocalLoss',
+    'KnowledgeDistillationKLDivLoss', 'SeesawLoss', 'DiceLoss', 'EQLV2Loss',
+    'MarginL2Loss', 'MultiPosCrossEntropyLoss', 'L2Loss', 'TripletLoss',
+    'DDQAuxLoss', 'CrossEntropyCustomLoss', 'FocalCustomLoss'
+]
diff --git a/head_extractor/build/lib/mmdet/models/losses/accuracy.py b/head_extractor/build/lib/mmdet/models/losses/accuracy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d68484e13965ced3bd6b104071d22657a9b3fde6
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/losses/accuracy.py
@@ -0,0 +1,77 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+
+
+def accuracy(pred, target, topk=1, thresh=None):
+    """Calculate accuracy according to the prediction and target.
+
+    Args:
+        pred (torch.Tensor): The model prediction, shape (N, num_class)
+        target (torch.Tensor): The target of each prediction, shape (N, )
+        topk (int | tuple[int], optional): If the predictions in ``topk``
+            match the target, the predictions will be regarded as
+            correct ones. Defaults to 1.
+        thresh (float, optional): If not None, predictions with scores under
+            this threshold are considered incorrect. Defaults to None.
+
+    Returns:
+        float | tuple[float]: If the input ``topk`` is a single integer,
+            the function will return a single float as accuracy. If
+            ``topk`` is a tuple containing multiple integers, the
+            function will return a tuple containing accuracies of
+            each ``topk`` number.
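+
+    Example:
+        >>> # Illustrative sketch added in this editing pass (not part of
+        >>> # the upstream file).
+        >>> import torch
+        >>> pred = torch.tensor([[0.1, 0.2, 0.7],
+        >>>                      [0.8, 0.1, 0.1]])
+        >>> target = torch.tensor([2, 0])
+        >>> accuracy(pred, target, topk=1)
+        tensor([100.])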
+ """ + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + return_single = True + else: + return_single = False + + maxk = max(topk) + if pred.size(0) == 0: + accu = [pred.new_tensor(0.) for i in range(len(topk))] + return accu[0] if return_single else accu + assert pred.ndim == 2 and target.ndim == 1 + assert pred.size(0) == target.size(0) + assert maxk <= pred.size(1), \ + f'maxk {maxk} exceeds pred dimension {pred.size(1)}' + pred_value, pred_label = pred.topk(maxk, dim=1) + pred_label = pred_label.t() # transpose to shape (maxk, N) + correct = pred_label.eq(target.view(1, -1).expand_as(pred_label)) + if thresh is not None: + # Only prediction values larger than thresh are counted as correct + correct = correct & (pred_value > thresh).t() + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / pred.size(0))) + return res[0] if return_single else res + + +class Accuracy(nn.Module): + + def __init__(self, topk=(1, ), thresh=None): + """Module to calculate the accuracy. + + Args: + topk (tuple, optional): The criterion used to calculate the + accuracy. Defaults to (1,). + thresh (float, optional): If not None, predictions with scores + under this threshold are considered incorrect. Default to None. + """ + super().__init__() + self.topk = topk + self.thresh = thresh + + def forward(self, pred, target): + """Forward function to calculate accuracy. + + Args: + pred (torch.Tensor): Prediction of models. + target (torch.Tensor): Target for each prediction. + + Returns: + tuple[float]: The accuracies under different topk criterions. + """ + return accuracy(pred, target, self.topk, self.thresh) diff --git a/head_extractor/build/lib/mmdet/models/losses/ae_loss.py b/head_extractor/build/lib/mmdet/models/losses/ae_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..2aa7d696be4b937a2d45545a8309aaa936fe5f22 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/ae_loss.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.registry import MODELS + + +def ae_loss_per_image(tl_preds, br_preds, match): + """Associative Embedding Loss in one image. + + Associative Embedding Loss including two parts: pull loss and push loss. + Pull loss makes embedding vectors from same object closer to each other. + Push loss distinguish embedding vector from different objects, and makes + the gap between them is large enough. + + During computing, usually there are 3 cases: + - no object in image: both pull loss and push loss will be 0. + - one object in image: push loss will be 0 and pull loss is computed + by the two corner of the only object. + - more than one objects in image: pull loss is computed by corner pairs + from each object, push loss is computed by each object with all + other objects. We use confusion matrix with 0 in diagonal to + compute the push loss. + + Args: + tl_preds (tensor): Embedding feature map of left-top corner. + br_preds (tensor): Embedding feature map of bottim-right corner. + match (list): Downsampled coordinates pair of each ground truth box. + """ + + tl_list, br_list, me_list = [], [], [] + if len(match) == 0: # no object in image + pull_loss = tl_preds.sum() * 0. + push_loss = tl_preds.sum() * 0. 
+    else:
+        for m in match:
+            [tl_y, tl_x], [br_y, br_x] = m
+            tl_e = tl_preds[:, tl_y, tl_x].view(-1, 1)
+            br_e = br_preds[:, br_y, br_x].view(-1, 1)
+            tl_list.append(tl_e)
+            br_list.append(br_e)
+            me_list.append((tl_e + br_e) / 2.0)
+
+        tl_list = torch.cat(tl_list)
+        br_list = torch.cat(br_list)
+        me_list = torch.cat(me_list)
+
+        assert tl_list.size() == br_list.size()
+
+        # N is object number in image, M is dimension of embedding vector
+        N, M = tl_list.size()
+
+        pull_loss = (tl_list - me_list).pow(2) + (br_list - me_list).pow(2)
+        pull_loss = pull_loss.sum() / N
+
+        margin = 1  # exp setting of CornerNet, details in section 3.3 of paper
+
+        # confusion matrix of push loss
+        conf_mat = me_list.expand((N, N, M)).permute(1, 0, 2) - me_list
+        conf_weight = 1 - torch.eye(N).type_as(me_list)
+        conf_mat = conf_weight * (margin - conf_mat.sum(-1).abs())
+
+        if N > 1:  # more than one object in current image
+            push_loss = F.relu(conf_mat).sum() / (N * (N - 1))
+        else:
+            push_loss = tl_preds.sum() * 0.
+
+    return pull_loss, push_loss
+
+
+@MODELS.register_module()
+class AssociativeEmbeddingLoss(nn.Module):
+    """Associative Embedding Loss.
+
+    More details can be found in
+    `Associative Embedding <https://arxiv.org/abs/1611.05424>`_ and
+    `CornerNet <https://arxiv.org/abs/1808.01244>`_.
+    Code is modified from ``kp_utils.py`` in the official CornerNet repo.
+
+    Args:
+        pull_weight (float): Loss weight for corners from the same object.
+        push_weight (float): Loss weight for corners from different objects.
+    """
+
+    def __init__(self, pull_weight=0.25, push_weight=0.25):
+        super(AssociativeEmbeddingLoss, self).__init__()
+        self.pull_weight = pull_weight
+        self.push_weight = push_weight
+
+    def forward(self, pred, target, match):
+        """Forward function."""
+        batch = pred.size(0)
+        pull_all, push_all = 0.0, 0.0
+        for i in range(batch):
+            pull, push = ae_loss_per_image(pred[i], target[i], match[i])
+
+            pull_all += self.pull_weight * pull
+            push_all += self.push_weight * push
+
+        return pull_all, push_all
diff --git a/head_extractor/build/lib/mmdet/models/losses/balanced_l1_loss.py b/head_extractor/build/lib/mmdet/models/losses/balanced_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..25adaab2239e871476d9d4e3cbb1a238c3043041
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/losses/balanced_l1_loss.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.nn as nn
+
+from mmdet.registry import MODELS
+from .utils import weighted_loss
+
+
+@weighted_loss
+def balanced_l1_loss(pred,
+                     target,
+                     beta=1.0,
+                     alpha=0.5,
+                     gamma=1.5,
+                     reduction='mean'):
+    """Calculate balanced L1 loss.
+
+    Please see the `Libra R-CNN <https://arxiv.org/abs/1904.02701>`_ paper.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, 4).
+        target (torch.Tensor): The learning target of the prediction with
+            shape (N, 4).
+        beta (float): The loss is a piecewise function of prediction and
+            target and ``beta`` serves as a threshold for the difference
+            between the prediction and target. Defaults to 1.0.
+        alpha (float): The denominator ``alpha`` in the balanced L1 loss.
+            Defaults to 0.5.
+        gamma (float): The ``gamma`` in the balanced L1 loss.
+            Defaults to 1.5.
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+ + Returns: + torch.Tensor: The calculated loss + """ + assert beta > 0 + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + + diff = torch.abs(pred - target) + b = np.e**(gamma / alpha) - 1 + loss = torch.where( + diff < beta, alpha / b * + (b * diff + 1) * torch.log(b * diff / beta + 1) - alpha * diff, + gamma * diff + gamma / b - alpha * beta) + + return loss + + +@MODELS.register_module() +class BalancedL1Loss(nn.Module): + """Balanced L1 Loss. + + arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019) + + Args: + alpha (float): The denominator ``alpha`` in the balanced L1 loss. + Defaults to 0.5. + gamma (float): The ``gamma`` in the balanced L1 loss. Defaults to 1.5. + beta (float, optional): The loss is a piecewise function of prediction + and target. ``beta`` serves as a threshold for the difference + between the prediction and target. Defaults to 1.0. + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of the loss. Defaults to 1.0 + """ + + def __init__(self, + alpha=0.5, + gamma=1.5, + beta=1.0, + reduction='mean', + loss_weight=1.0): + super(BalancedL1Loss, self).__init__() + self.alpha = alpha + self.gamma = gamma + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function of loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 4). + target (torch.Tensor): The learning target of the prediction with + shape (N, 4). + weight (torch.Tensor, optional): Sample-wise loss weight with + shape (N, ). + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * balanced_l1_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox diff --git a/head_extractor/build/lib/mmdet/models/losses/cross_entropy_loss.py b/head_extractor/build/lib/mmdet/models/losses/cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..49fac7743ceddd2454f44b76c63d514de43b5aef --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/cross_entropy_loss.py @@ -0,0 +1,401 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.registry import MODELS +from .accuracy import accuracy +from .utils import weight_reduce_loss + + +def cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=-100, + avg_non_ignore=False): + """Calculate the CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. 
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (list[float], optional): The weight for each class.
+        ignore_index (int | None): The label index to be ignored.
+            If None, it will be set to default value. Default: -100.
+        avg_non_ignore (bool): Whether the loss is only averaged over
+            non-ignored targets. Default: False.
+
+    Returns:
+        torch.Tensor: The calculated loss
+    """
+    # The default value of ignore_index is the same as F.cross_entropy
+    ignore_index = -100 if ignore_index is None else ignore_index
+    # element-wise losses
+    loss = F.cross_entropy(
+        pred,
+        label,
+        weight=class_weight,
+        reduction='none',
+        ignore_index=ignore_index)
+
+    # average loss over non-ignored elements
+    # pytorch's official cross_entropy averages loss over non-ignored elements
+    # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660  # noqa
+    if (avg_factor is None) and avg_non_ignore and reduction == 'mean':
+        avg_factor = label.numel() - (label == ignore_index).sum().item()
+
+    # apply weights and do the reduction
+    if weight is not None:
+        weight = weight.float()
+    loss = weight_reduce_loss(
+        loss, weight=weight, reduction=reduction, avg_factor=avg_factor)
+
+    return loss
+
+
+def _expand_onehot_labels(labels, label_weights, label_channels, ignore_index):
+    """Expand onehot labels to match the size of prediction."""
+    bin_labels = labels.new_full((labels.size(0), label_channels), 0)
+    valid_mask = (labels >= 0) & (labels != ignore_index)
+    inds = torch.nonzero(
+        valid_mask & (labels < label_channels), as_tuple=False)
+
+    if inds.numel() > 0:
+        bin_labels[inds, labels[inds]] = 1
+
+    valid_mask = valid_mask.view(-1, 1).expand(labels.size(0),
+                                               label_channels).float()
+    if label_weights is None:
+        bin_label_weights = valid_mask
+    else:
+        bin_label_weights = label_weights.view(-1, 1).repeat(1, label_channels)
+        bin_label_weights *= valid_mask
+
+    return bin_labels, bin_label_weights, valid_mask
+
+
+def binary_cross_entropy(pred,
+                         label,
+                         weight=None,
+                         reduction='mean',
+                         avg_factor=None,
+                         class_weight=None,
+                         ignore_index=-100,
+                         avg_non_ignore=False):
+    """Calculate the binary CrossEntropy loss.
+
+    Args:
+        pred (torch.Tensor): The prediction with shape (N, 1) or (N, ).
+            When the shape of pred is (N, 1), label will be expanded to
+            one-hot format, and when the shape of pred is (N, ), label
+            will not be expanded to one-hot format.
+        label (torch.Tensor): The learning label of the prediction,
+            with shape (N, ).
+        weight (torch.Tensor, optional): Sample-wise loss weight.
+        reduction (str, optional): The method used to reduce the loss.
+            Options are "none", "mean" and "sum".
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+        class_weight (list[float], optional): The weight for each class.
+        ignore_index (int | None): The label index to be ignored.
+            If None, it will be set to default value. Default: -100.
+        avg_non_ignore (bool): Whether the loss is only averaged over
+            non-ignored targets. Default: False.
+
+    Returns:
+        torch.Tensor: The calculated loss.
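+
+    Example:
+        >>> # Illustrative sketch added in this editing pass (not part of
+        >>> # the upstream file): logits of shape (N,) with binary labels.
+        >>> import torch
+        >>> pred = torch.tensor([1.2, -0.5, 0.3])
+        >>> label = torch.tensor([1, 0, 1])
+        >>> loss = binary_cross_entropy(pred, label)
+        >>> assert loss.dim() == 0  # reduced to a scalar by 'mean'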
+ """ + # The default value of ignore_index is the same as F.cross_entropy + ignore_index = -100 if ignore_index is None else ignore_index + + if pred.dim() != label.dim(): + label, weight, valid_mask = _expand_onehot_labels( + label, weight, pred.size(-1), ignore_index) + else: + # should mask out the ignored elements + valid_mask = ((label >= 0) & (label != ignore_index)).float() + if weight is not None: + # The inplace writing method will have a mismatched broadcast + # shape error if the weight and valid_mask dimensions + # are inconsistent such as (B,N,1) and (B,N,C). + weight = weight * valid_mask + else: + weight = valid_mask + + # average loss over non-ignored elements + if (avg_factor is None) and avg_non_ignore and reduction == 'mean': + avg_factor = valid_mask.sum().item() + + # weighted element-wise losses + weight = weight.float() + loss = F.binary_cross_entropy_with_logits( + pred, label.float(), pos_weight=class_weight, reduction='none') + # do the reduction for the weighted loss + loss = weight_reduce_loss( + loss, weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def mask_cross_entropy(pred, + target, + label, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=None, + **kwargs): + """Calculate the CrossEntropy loss for masks. + + Args: + pred (torch.Tensor): The prediction with shape (N, C, *), C is the + number of classes. The trailing * indicates arbitrary shape. + target (torch.Tensor): The learning label of the prediction. + label (torch.Tensor): ``label`` indicates the class label of the mask + corresponding object. This will be used to select the mask in the + of the class which the object belongs to when the mask prediction + if not class-agnostic. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (None): Placeholder, to be consistent with other loss. + Default: None. + + Returns: + torch.Tensor: The calculated loss + + Example: + >>> N, C = 3, 11 + >>> H, W = 2, 2 + >>> pred = torch.randn(N, C, H, W) * 1000 + >>> target = torch.rand(N, H, W) + >>> label = torch.randint(0, C, size=(N,)) + >>> reduction = 'mean' + >>> avg_factor = None + >>> class_weights = None + >>> loss = mask_cross_entropy(pred, target, label, reduction, + >>> avg_factor, class_weights) + >>> assert loss.shape == (1,) + """ + assert ignore_index is None, 'BCE loss does not support ignore_index' + # TODO: handle these two reserved arguments + assert reduction == 'mean' and avg_factor is None + num_rois = pred.size()[0] + inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) + pred_slice = pred[inds, label].squeeze(1) + return F.binary_cross_entropy_with_logits( + pred_slice, target, weight=class_weight, reduction='mean')[None] + + +@MODELS.register_module() +class CrossEntropyLoss(nn.Module): + + def __init__(self, + use_sigmoid=False, + use_mask=False, + reduction='mean', + class_weight=None, + ignore_index=None, + loss_weight=1.0, + avg_non_ignore=False): + """CrossEntropyLoss. + + Args: + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to False. + use_mask (bool, optional): Whether to use mask cross entropy loss. + Defaults to False. + reduction (str, optional): . Defaults to 'mean'. + Options are "none", "mean" and "sum". 
+            class_weight (list[float], optional): Weight of each class.
+                Defaults to None.
+            ignore_index (int | None): The label index to be ignored.
+                Defaults to None.
+            loss_weight (float, optional): Weight of the loss. Defaults
+                to 1.0.
+            avg_non_ignore (bool): Whether the loss is only averaged over
+                non-ignored targets. Default: False.
+        """
+        super(CrossEntropyLoss, self).__init__()
+        assert (use_sigmoid is False) or (use_mask is False)
+        self.use_sigmoid = use_sigmoid
+        self.use_mask = use_mask
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = class_weight
+        self.ignore_index = ignore_index
+        self.avg_non_ignore = avg_non_ignore
+        if ((ignore_index is not None) and not self.avg_non_ignore
+                and self.reduction == 'mean'):
+            warnings.warn(
+                'Default ``avg_non_ignore`` is False. If you would like to '
+                'ignore a certain label and average the loss over '
+                'non-ignored labels, which is the same as PyTorch official '
+                'cross_entropy, set ``avg_non_ignore=True``.')
+
+        if self.use_sigmoid:
+            self.cls_criterion = binary_cross_entropy
+        elif self.use_mask:
+            self.cls_criterion = mask_cross_entropy
+        else:
+            self.cls_criterion = cross_entropy
+
+    def extra_repr(self):
+        """Extra repr."""
+        s = f'avg_non_ignore={self.avg_non_ignore}'
+        return s
+
+    def forward(self,
+                cls_score,
+                label,
+                weight=None,
+                avg_factor=None,
+                reduction_override=None,
+                ignore_index=None,
+                **kwargs):
+        """Forward function.
+
+        Args:
+            cls_score (torch.Tensor): The prediction.
+            label (torch.Tensor): The learning label of the prediction.
+            weight (torch.Tensor, optional): Sample-wise loss weight.
+            avg_factor (int, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): The method used to reduce
+                the loss. Options are "none", "mean" and "sum".
+            ignore_index (int | None): The label index to be ignored.
+                If not None, it will override the default value.
+                Default: None.
+        Returns:
+            torch.Tensor: The calculated loss.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if ignore_index is None:
+            ignore_index = self.ignore_index
+
+        if self.class_weight is not None:
+            class_weight = cls_score.new_tensor(
+                self.class_weight, device=cls_score.device)
+        else:
+            class_weight = None
+        loss_cls = self.loss_weight * self.cls_criterion(
+            cls_score,
+            label,
+            weight,
+            class_weight=class_weight,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            ignore_index=ignore_index,
+            avg_non_ignore=self.avg_non_ignore,
+            **kwargs)
+        return loss_cls
+
+
+@MODELS.register_module()
+class CrossEntropyCustomLoss(CrossEntropyLoss):
+
+    def __init__(self,
+                 use_sigmoid=False,
+                 use_mask=False,
+                 reduction='mean',
+                 num_classes=-1,
+                 class_weight=None,
+                 ignore_index=None,
+                 loss_weight=1.0,
+                 avg_non_ignore=False):
+        """CrossEntropyCustomLoss.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                instead of softmax. Defaults to False.
+            use_mask (bool, optional): Whether to use mask cross entropy
+                loss. Defaults to False.
+            reduction (str, optional): The method used to reduce the loss.
+                Options are "none", "mean" and "sum". Defaults to 'mean'.
+            num_classes (int): Number of classes to classify.
+            class_weight (list[float], optional): Weight of each class.
+                Defaults to None.
+            ignore_index (int | None): The label index to be ignored.
+                Defaults to None.
+            loss_weight (float, optional): Weight of the loss. Defaults
+                to 1.0.
+            avg_non_ignore (bool): Whether the loss is only averaged over
+                non-ignored targets. Default: False.
+        """
+        super(CrossEntropyCustomLoss, self).__init__()
+        assert (use_sigmoid is False) or (use_mask is False)
+        self.use_sigmoid = use_sigmoid
+        self.use_mask = use_mask
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = class_weight
+        self.ignore_index = ignore_index
+        self.avg_non_ignore = avg_non_ignore
+        if ((ignore_index is not None) and not self.avg_non_ignore
+                and self.reduction == 'mean'):
+            warnings.warn(
+                'Default ``avg_non_ignore`` is False. If you would like to '
+                'ignore a certain label and average the loss over '
+                'non-ignored labels, which is the same as PyTorch official '
+                'cross_entropy, set ``avg_non_ignore=True``.')
+
+        if self.use_sigmoid:
+            self.cls_criterion = binary_cross_entropy
+        elif self.use_mask:
+            self.cls_criterion = mask_cross_entropy
+        else:
+            self.cls_criterion = cross_entropy
+
+        self.num_classes = num_classes
+
+        assert self.num_classes != -1
+
+        # custom output channels of the classifier
+        self.custom_cls_channels = True
+        # custom activation of cls_score
+        self.custom_activation = True
+        # custom accuracy of the classifier
+        self.custom_accuracy = True
+
+    def get_cls_channels(self, num_classes):
+        assert num_classes == self.num_classes
+        if not self.use_sigmoid:
+            return num_classes + 1
+        else:
+            return num_classes
+
+    def get_activation(self, cls_score):
+
+        fine_cls_score = cls_score[:, :self.num_classes]
+
+        if not self.use_sigmoid:
+            bg_score = cls_score[:, [-1]]
+            new_score = torch.cat([fine_cls_score, bg_score], dim=-1)
+            scores = F.softmax(new_score, dim=-1)
+        else:
+            score_classes = fine_cls_score.sigmoid()
+            score_neg = 1 - score_classes.sum(dim=1, keepdim=True)
+            score_neg = score_neg.clamp(min=0, max=1)
+            scores = torch.cat([score_classes, score_neg], dim=1)
+
+        return scores
+
+    def get_accuracy(self, cls_score, labels):
+
+        fine_cls_score = cls_score[:, :self.num_classes]
+
+        pos_inds = labels < self.num_classes
+        acc_classes = accuracy(fine_cls_score[pos_inds], labels[pos_inds])
+        acc = dict()
+        acc['acc_classes'] = acc_classes
+        return acc
diff --git a/head_extractor/build/lib/mmdet/models/losses/ddq_detr_aux_loss.py b/head_extractor/build/lib/mmdet/models/losses/ddq_detr_aux_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..41f1c7166e6c7d05c5414cd04ad3eb3cd467f1b6
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/losses/ddq_detr_aux_loss.py
@@ -0,0 +1,303 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmengine.structures import BaseDataElement
+
+from mmdet.models.utils import multi_apply
+from mmdet.registry import MODELS, TASK_UTILS
+from mmdet.utils import reduce_mean
+
+
+class DDQAuxLoss(nn.Module):
+    """DDQ auxiliary branches loss for dense queries.
+
+    Args:
+        loss_cls (dict):
+            Configuration of classification loss function.
+        loss_bbox (dict):
+            Configuration of bbox regression loss function.
+        train_cfg (dict):
+            Configuration of gt targets assigner for each predicted bbox.
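+
+    Example:
+        >>> # Illustrative sketch added in this editing pass (not part of
+        >>> # the upstream file); it only restates the default config below
+        >>> # and requires mmdet's MODELS/TASK_UTILS registries to be
+        >>> # populated.
+        >>> aux_loss = DDQAuxLoss(
+        >>>     train_cfg=dict(
+        >>>         assigner=dict(type='TopkHungarianAssigner', topk=8),
+        >>>         alpha=1,
+        >>>         beta=6))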
+ """ + + def __init__( + self, + loss_cls=dict( + type='QualityFocalLoss', + use_sigmoid=True, + activated=True, # use probability instead of logit as input + beta=2.0, + loss_weight=1.0), + loss_bbox=dict(type='GIoULoss', loss_weight=2.0), + train_cfg=dict( + assigner=dict(type='TopkHungarianAssigner', topk=8), + alpha=1, + beta=6), + ): + super(DDQAuxLoss, self).__init__() + self.train_cfg = train_cfg + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + self.assigner = TASK_UTILS.build(self.train_cfg['assigner']) + + sampler_cfg = dict(type='PseudoSampler') + self.sampler = TASK_UTILS.build(sampler_cfg) + + def loss_single(self, cls_score, bbox_pred, labels, label_weights, + bbox_targets, alignment_metrics): + """Calculate auxiliary branches loss for dense queries for one image. + + Args: + cls_score (Tensor): Predicted normalized classification + scores for one image, has shape (num_dense_queries, + cls_out_channels). + bbox_pred (Tensor): Predicted unnormalized bbox coordinates + for one image, has shape (num_dense_queries, 4) with the + last dimension arranged as (x1, y1, x2, y2). + labels (Tensor): Labels for one image. + label_weights (Tensor): Label weights for one image. + bbox_targets (Tensor): Bbox targets for one image. + alignment_metrics (Tensor): Normalized alignment metrics for one + image. + + Returns: + tuple: A tuple of loss components and loss weights. + """ + bbox_targets = bbox_targets.reshape(-1, 4) + labels = labels.reshape(-1) + alignment_metrics = alignment_metrics.reshape(-1) + label_weights = label_weights.reshape(-1) + targets = (labels, alignment_metrics) + cls_loss_func = self.loss_cls + + loss_cls = cls_loss_func( + cls_score, targets, label_weights, avg_factor=1.0) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = cls_score.size(-1) + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero().squeeze(1) + + if len(pos_inds) > 0: + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_pred = bbox_pred[pos_inds] + + pos_decode_bbox_pred = pos_bbox_pred + pos_decode_bbox_targets = pos_bbox_targets + + # regression loss + pos_bbox_weight = alignment_metrics[pos_inds] + + loss_bbox = self.loss_bbox( + pos_decode_bbox_pred, + pos_decode_bbox_targets, + weight=pos_bbox_weight, + avg_factor=1.0) + else: + loss_bbox = bbox_pred.sum() * 0 + pos_bbox_weight = bbox_targets.new_tensor(0.) + + return loss_cls, loss_bbox, alignment_metrics.sum( + ), pos_bbox_weight.sum() + + def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, + **kwargs): + """Calculate auxiliary branches loss for dense queries. + + Args: + cls_scores (Tensor): Predicted normalized classification + scores, has shape (bs, num_dense_queries, + cls_out_channels). + bbox_preds (Tensor): Predicted unnormalized bbox coordinates, + has shape (bs, num_dense_queries, 4) with the last + dimension arranged as (x1, y1, x2, y2). + gt_bboxes (list[Tensor]): List of unnormalized ground truth + bboxes for each image, each has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + gt_labels (list[Tensor]): List of ground truth classification + index for each image, each has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + img_metas (list[dict]): Meta information for one image, + e.g., image size, scaling factor, etc. + + Returns: + dict: A dictionary of loss components. 
+ """ + flatten_cls_scores = cls_scores + flatten_bbox_preds = bbox_preds + + cls_reg_targets = self.get_targets( + flatten_cls_scores, + flatten_bbox_preds, + gt_bboxes, + img_metas, + gt_labels_list=gt_labels, + ) + (labels_list, label_weights_list, bbox_targets_list, + alignment_metrics_list) = cls_reg_targets + + losses_cls, losses_bbox, \ + cls_avg_factors, bbox_avg_factors = multi_apply( + self.loss_single, + flatten_cls_scores, + flatten_bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + alignment_metrics_list, + ) + + cls_avg_factor = reduce_mean(sum(cls_avg_factors)).clamp_(min=1).item() + losses_cls = list(map(lambda x: x / cls_avg_factor, losses_cls)) + + bbox_avg_factor = reduce_mean( + sum(bbox_avg_factors)).clamp_(min=1).item() + losses_bbox = list(map(lambda x: x / bbox_avg_factor, losses_bbox)) + return dict(aux_loss_cls=losses_cls, aux_loss_bbox=losses_bbox) + + def get_targets(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + img_metas, + gt_labels_list=None, + **kwargs): + """Compute regression and classification targets for a batch images. + + Args: + cls_scores (Tensor): Predicted normalized classification + scores, has shape (bs, num_dense_queries, + cls_out_channels). + bbox_preds (Tensor): Predicted unnormalized bbox coordinates, + has shape (bs, num_dense_queries, 4) with the last + dimension arranged as (x1, y1, x2, y2). + gt_bboxes_list (List[Tensor]): List of unnormalized ground truth + bboxes for each image, each has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + img_metas (list[dict]): Meta information for one image, + e.g., image size, scaling factor, etc. + gt_labels_list (list[Tensor]): List of ground truth classification + index for each image, each has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + Default: None. + + Returns: + tuple: a tuple containing the following targets. + + - all_labels (list[Tensor]): Labels for all images. + - all_label_weights (list[Tensor]): Label weights for all images. + - all_bbox_targets (list[Tensor]): Bbox targets for all images. + - all_assign_metrics (list[Tensor]): Normalized alignment metrics + for all images. + """ + (all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics) = multi_apply(self._get_target_single, cls_scores, + bbox_preds, gt_bboxes_list, + gt_labels_list, img_metas) + + return (all_labels, all_label_weights, all_bbox_targets, + all_assign_metrics) + + def _get_target_single(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, + img_meta, **kwargs): + """Compute regression and classification targets for one image. + + Args: + cls_scores (Tensor): Predicted normalized classification + scores for one image, has shape (num_dense_queries, + cls_out_channels). + bbox_preds (Tensor): Predicted unnormalized bbox coordinates + for one image, has shape (num_dense_queries, 4) with the + last dimension arranged as (x1, y1, x2, y2). + gt_bboxes (Tensor): Unnormalized ground truth + bboxes for one image, has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + gt_labels (Tensor): Ground truth classification + index for the image, has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + img_meta (dict): Meta information for one image. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels for one image. + - label_weights (Tensor): Label weights for one image. 
+            - bbox_targets (Tensor): Bbox targets for one image.
+            - norm_alignment_metrics (Tensor): Normalized alignment
+              metrics for one image.
+        """
+        if len(gt_labels) == 0:
+            num_valid_anchors = len(cls_scores)
+            bbox_targets = torch.zeros_like(bbox_preds)
+            labels = bbox_preds.new_full((num_valid_anchors, ),
+                                         cls_scores.size(-1),
+                                         dtype=torch.long)
+            label_weights = bbox_preds.new_zeros(
+                num_valid_anchors, dtype=torch.float)
+            norm_alignment_metrics = bbox_preds.new_zeros(
+                num_valid_anchors, dtype=torch.float)
+            return (labels, label_weights, bbox_targets,
+                    norm_alignment_metrics)
+
+        assign_result = self.assigner.assign(cls_scores, bbox_preds, gt_bboxes,
+                                             gt_labels, img_meta)
+        assign_ious = assign_result.max_overlaps
+        assign_metrics = assign_result.assign_metrics
+
+        pred_instances = BaseDataElement()
+        gt_instances = BaseDataElement()
+
+        pred_instances.bboxes = bbox_preds
+        gt_instances.bboxes = gt_bboxes
+
+        pred_instances.priors = cls_scores
+        gt_instances.labels = gt_labels
+
+        sampling_result = self.sampler.sample(assign_result, pred_instances,
+                                              gt_instances)
+
+        num_valid_anchors = len(cls_scores)
+        bbox_targets = torch.zeros_like(bbox_preds)
+        labels = bbox_preds.new_full((num_valid_anchors, ),
+                                     cls_scores.size(-1),
+                                     dtype=torch.long)
+        label_weights = bbox_preds.new_zeros(
+            num_valid_anchors, dtype=torch.float)
+        norm_alignment_metrics = bbox_preds.new_zeros(
+            num_valid_anchors, dtype=torch.float)
+
+        pos_inds = sampling_result.pos_inds
+        neg_inds = sampling_result.neg_inds
+        if len(pos_inds) > 0:
+            # point-based
+            pos_bbox_targets = sampling_result.pos_gt_bboxes
+            bbox_targets[pos_inds, :] = pos_bbox_targets
+
+            if gt_labels is None:
+                # Only dense_heads gives gt_labels as None
+                # Foreground is the first class since v2.5.0
+                labels[pos_inds] = 0
+            else:
+                labels[pos_inds] = gt_labels[
+                    sampling_result.pos_assigned_gt_inds]
+
+            label_weights[pos_inds] = 1.0
+
+        if len(neg_inds) > 0:
+            label_weights[neg_inds] = 1.0
+
+        class_assigned_gt_inds = torch.unique(
+            sampling_result.pos_assigned_gt_inds)
+        for gt_inds in class_assigned_gt_inds:
+            gt_class_inds = sampling_result.pos_assigned_gt_inds == gt_inds
+            pos_alignment_metrics = assign_metrics[gt_class_inds]
+            pos_ious = assign_ious[gt_class_inds]
+            pos_norm_alignment_metrics = pos_alignment_metrics / (
+                pos_alignment_metrics.max() + 10e-8) * pos_ious.max()
+            norm_alignment_metrics[
+                pos_inds[gt_class_inds]] = pos_norm_alignment_metrics
+
+        return (labels, label_weights, bbox_targets, norm_alignment_metrics)
diff --git a/head_extractor/build/lib/mmdet/models/losses/dice_loss.py b/head_extractor/build/lib/mmdet/models/losses/dice_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d5cac1e9710a6a72fe0401db22b8b72cfe058f9
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/losses/dice_loss.py
@@ -0,0 +1,146 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+from mmdet.registry import MODELS
+from .utils import weight_reduce_loss
+
+
+def dice_loss(pred,
+              target,
+              weight=None,
+              eps=1e-3,
+              reduction='mean',
+              naive_dice=False,
+              avg_factor=None):
+    """Calculate dice loss. Two forms of dice loss are supported:
+
+    - the one proposed in `V-Net: Fully Convolutional Neural
+      Networks for Volumetric Medical Image Segmentation
+      <https://arxiv.org/abs/1606.04797>`_.
+    - the dice loss in which the power of the number in the
+      denominator is the first power instead of the second
+      power.
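+
+    Concretely (a summary added for clarity, matching the implementation
+    below): with flattened prediction ``p`` and target ``t``, the V-Net form
+    computes ``1 - 2 * sum(p * t) / (sum(p^2) + sum(t^2) + 2 * eps)``, while
+    the naive form computes
+    ``1 - (2 * sum(p * t) + eps) / (sum(p) + sum(t) + eps)``.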
+
+    Args:
+        pred (torch.Tensor): The prediction, has a shape (n, *)
+        target (torch.Tensor): The learning label of the prediction,
+            shape (n, *), same shape as pred.
+        weight (torch.Tensor, optional): The weight of loss for each
+            prediction, has a shape (n,). Defaults to None.
+        eps (float): Avoid dividing by zero. Defaults to 1e-3.
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+            Options are "none", "mean" and "sum".
+        naive_dice (bool, optional): If false, use the dice
+            loss defined in the V-Net paper, otherwise, use the
+            naive dice loss in which the power of the number in the
+            denominator is the first power instead of the second
+            power. Defaults to False.
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+    """
+
+    input = pred.flatten(1)
+    target = target.flatten(1).float()
+
+    a = torch.sum(input * target, 1)
+    if naive_dice:
+        b = torch.sum(input, 1)
+        c = torch.sum(target, 1)
+        d = (2 * a + eps) / (b + c + eps)
+    else:
+        b = torch.sum(input * input, 1) + eps
+        c = torch.sum(target * target, 1) + eps
+        d = (2 * a) / (b + c)
+
+    loss = 1 - d
+    if weight is not None:
+        assert weight.ndim == loss.ndim
+        assert len(weight) == len(pred)
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss
+
+
+@MODELS.register_module()
+class DiceLoss(nn.Module):
+
+    def __init__(self,
+                 use_sigmoid=True,
+                 activate=True,
+                 reduction='mean',
+                 naive_dice=False,
+                 loss_weight=1.0,
+                 eps=1e-3):
+        """Compute dice loss.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction uses
+                sigmoid instead of softmax. Defaults to True.
+            activate (bool): Whether to activate the predictions inside
+                the loss; if False, the predictions are assumed to be
+                activated already and the internal sigmoid is skipped.
+                Defaults to True.
+            reduction (str, optional): The method used
+                to reduce the loss. Options are "none",
+                "mean" and "sum". Defaults to 'mean'.
+            naive_dice (bool, optional): If false, use the dice
+                loss defined in the V-Net paper, otherwise, use the
+                naive dice loss in which the power of the number in the
+                denominator is the first power instead of the second
+                power. Defaults to False.
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+            eps (float): Avoid dividing by zero. Defaults to 1e-3.
+        """
+
+        super(DiceLoss, self).__init__()
+        self.use_sigmoid = use_sigmoid
+        self.reduction = reduction
+        self.naive_dice = naive_dice
+        self.loss_weight = loss_weight
+        self.eps = eps
+        self.activate = activate
+
+    def forward(self,
+                pred,
+                target,
+                weight=None,
+                reduction_override=None,
+                avg_factor=None):
+        """Forward function.
+
+        Args:
+            pred (torch.Tensor): The prediction, has a shape (n, *).
+            target (torch.Tensor): The label of the prediction,
+                shape (n, *), same shape as pred.
+            weight (torch.Tensor, optional): The weight of loss for each
+                prediction, has a shape (n,). Defaults to None.
+            avg_factor (int, optional): Average factor that is used to average
+                the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used to
+                override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+ + Returns: + torch.Tensor: The calculated loss + """ + + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if self.activate: + if self.use_sigmoid: + pred = pred.sigmoid() + else: + raise NotImplementedError + + loss = self.loss_weight * dice_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + naive_dice=self.naive_dice, + avg_factor=avg_factor) + + return loss diff --git a/head_extractor/build/lib/mmdet/models/losses/eqlv2_loss.py b/head_extractor/build/lib/mmdet/models/losses/eqlv2_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..ea1f4a9a8f7c71119c2bed743d714a34ab4db82c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/eqlv2_loss.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from functools import partial +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from mmengine.logging import print_log +from torch import Tensor + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class EQLV2Loss(nn.Module): + + def __init__(self, + use_sigmoid: bool = True, + reduction: str = 'mean', + class_weight: Optional[Tensor] = None, + loss_weight: float = 1.0, + num_classes: int = 1203, + use_distributed: bool = False, + mu: float = 0.8, + alpha: float = 4.0, + gamma: int = 12, + vis_grad: bool = False, + test_with_obj: bool = True) -> None: + """`Equalization Loss v2 `_ + + Args: + use_sigmoid (bool): EQLv2 uses the sigmoid function to transform + the predicted logits to an estimated probability distribution. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + class_weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + loss_weight (float, optional): The weight of the total EQLv2 loss. + Defaults to 1.0. + num_classes (int): 1203 for lvis v1.0, 1230 for lvis v0.5. + use_distributed (bool, float): EQLv2 will calculate the gradients + on all GPUs if there is any. Change to True if you are using + distributed training. Default to False. + mu (float, optional): Defaults to 0.8 + alpha (float, optional): A balance factor for the negative part of + EQLV2 Loss. Defaults to 4.0. + gamma (int, optional): The gamma for calculating the modulating + factor. Defaults to 12. + vis_grad (bool, optional): Default to False. + test_with_obj (bool, optional): Default to True. + + Returns: + None. + """ + super().__init__() + self.use_sigmoid = True + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = class_weight + self.num_classes = num_classes + self.group = True + + # cfg for eqlv2 + self.vis_grad = vis_grad + self.mu = mu + self.alpha = alpha + self.gamma = gamma + self.use_distributed = use_distributed + + # initial variables + self.register_buffer('pos_grad', torch.zeros(self.num_classes)) + self.register_buffer('neg_grad', torch.zeros(self.num_classes)) + # At the beginning of training, we set a high value (eg. 100) + # for the initial gradient ratio so that the weight for pos + # gradients and neg gradients are 1. 
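+        # With the initial ratio of 100 and the default gamma/mu, the
+        # sigmoid-shaped map_func below saturates to ~1, so neg_w ~= 1 and
+        # pos_w = 1 + alpha * (1 - neg_w) ~= 1 until real gradient
+        # statistics accumulate.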
+ self.register_buffer('pos_neg', torch.ones(self.num_classes) * 100) + + self.test_with_obj = test_with_obj + + def _func(x, gamma, mu): + return 1 / (1 + torch.exp(-gamma * (x - mu))) + + self.map_func = partial(_func, gamma=self.gamma, mu=self.mu) + + print_log( + f'build EQL v2, gamma: {gamma}, mu: {mu}, alpha: {alpha}', + logger='current', + level=logging.DEBUG) + + def forward(self, + cls_score: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[Tensor] = None) -> Tensor: + """`Equalization Loss v2 `_ + + Args: + cls_score (Tensor): The prediction with shape (N, C), C is the + number of classes. + label (Tensor): The ground truth label of the predicted target with + shape (N, C), C is the number of classes. + weight (Tensor, optional): The weight of loss for each prediction. + Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + Tensor: The calculated loss + """ + self.n_i, self.n_c = cls_score.size() + self.gt_classes = label + self.pred_class_logits = cls_score + + def expand_label(pred, gt_classes): + target = pred.new_zeros(self.n_i, self.n_c) + target[torch.arange(self.n_i), gt_classes] = 1 + return target + + target = expand_label(cls_score, label) + + pos_w, neg_w = self.get_weight(cls_score) + + weight = pos_w * target + neg_w * (1 - target) + + cls_loss = F.binary_cross_entropy_with_logits( + cls_score, target, reduction='none') + cls_loss = torch.sum(cls_loss * weight) / self.n_i + + self.collect_grad(cls_score.detach(), target.detach(), weight.detach()) + + return self.loss_weight * cls_loss + + def get_channel_num(self, num_classes): + num_channel = num_classes + 1 + return num_channel + + def get_activation(self, pred): + pred = torch.sigmoid(pred) + n_i, n_c = pred.size() + bg_score = pred[:, -1].view(n_i, 1) + if self.test_with_obj: + pred[:, :-1] *= (1 - bg_score) + return pred + + def collect_grad(self, pred, target, weight): + prob = torch.sigmoid(pred) + grad = target * (prob - 1) + (1 - target) * prob + grad = torch.abs(grad) + + # do not collect grad for objectiveness branch [:-1] + pos_grad = torch.sum(grad * target * weight, dim=0)[:-1] + neg_grad = torch.sum(grad * (1 - target) * weight, dim=0)[:-1] + + if self.use_distributed: + dist.all_reduce(pos_grad) + dist.all_reduce(neg_grad) + + self.pos_grad += pos_grad + self.neg_grad += neg_grad + self.pos_neg = self.pos_grad / (self.neg_grad + 1e-10) + + def get_weight(self, pred): + neg_w = torch.cat([self.map_func(self.pos_neg), pred.new_ones(1)]) + pos_w = 1 + self.alpha * (1 - neg_w) + neg_w = neg_w.view(1, -1).expand(self.n_i, self.n_c) + pos_w = pos_w.view(1, -1).expand(self.n_i, self.n_c) + return pos_w, neg_w diff --git a/head_extractor/build/lib/mmdet/models/losses/focal_loss.py b/head_extractor/build/lib/mmdet/models/losses/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..15bef293a591a7f4c099febdaa82abaf7fb4928a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/focal_loss.py @@ -0,0 +1,371 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
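+# The sigmoid focal losses below all implement
+# FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t),
+# where p_t is the predicted probability of the ground-truth class:
+# a pure-PyTorch version on logits (py_sigmoid_focal_loss), a version on
+# probabilities (py_focal_loss_with_prob) and a CUDA wrapper from mmcv
+# (sigmoid_focal_loss).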
+import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss + +from mmdet.registry import MODELS +from .accuracy import accuracy +from .utils import weight_reduce_loss + + +# This method is only for debugging +def py_sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + """PyTorch version of `Focal Loss `_. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the + number of classes + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + # Actually, pt here denotes (1 - pt) in the Focal Loss paper + pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) + # Thus it's pt.pow(gamma) rather than (1 - pt).pow(gamma) + focal_weight = (alpha * target + (1 - alpha) * + (1 - target)) * pt.pow(gamma) + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +def py_focal_loss_with_prob(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + """PyTorch version of `Focal Loss `_. + Different from `py_sigmoid_focal_loss`, this function accepts probability + as input. + + Args: + pred (torch.Tensor): The prediction probability with shape (N, C), + C is the number of classes. + target (torch.Tensor): The learning label of the prediction. + The target shape support (N,C) or (N,), (N,C) means one-hot form. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
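+
+    Returns:
+        torch.Tensor: The calculated loss, reduced according to
+            ``reduction`` (and ``avg_factor`` if given).
+
+    A minimal sketch of the expected inputs (illustrative only):
+
+    Example:
+        >>> import torch
+        >>> prob = torch.rand(4, 80)  # already-activated scores
+        >>> label = torch.randint(0, 80, (4, ))  # category indices
+        >>> loss = py_focal_loss_with_prob(prob, label)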
+ """ + if pred.dim() != target.dim(): + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + + target = target.type_as(pred) + pt = (1 - pred) * target + pred * (1 - target) + focal_weight = (alpha * target + (1 - alpha) * + (1 - target)) * pt.pow(gamma) + loss = F.binary_cross_entropy( + pred, target, reduction='none') * focal_weight + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +def sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + r"""A wrapper of cuda version `Focal Loss + `_. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + # Function.apply does not accept keyword arguments, so the decorator + # "weighted_loss" is not applicable + loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), gamma, + alpha, None, 'none') + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class FocalLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0, + activated=False): + """`Focal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + activated (bool, optional): Whether the input is activated. 
+ If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + super(FocalLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' + self.use_sigmoid = use_sigmoid + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning label of the prediction. + The target shape support (N,C) or (N,), (N,C) means + one-hot form. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + if self.activated: + calculate_loss_func = py_focal_loss_with_prob + else: + if pred.dim() == target.dim(): + # this means that target is already in One-Hot form. + calculate_loss_func = py_sigmoid_focal_loss + elif torch.cuda.is_available() and pred.is_cuda: + calculate_loss_func = sigmoid_focal_loss + else: + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + calculate_loss_func = py_sigmoid_focal_loss + + loss_cls = self.loss_weight * calculate_loss_func( + pred, + target, + weight, + gamma=self.gamma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + + else: + raise NotImplementedError + return loss_cls + + +@MODELS.register_module() +class FocalCustomLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + num_classes=-1, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0, + activated=False): + """`Focal Loss for V3Det `_ + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + num_classes (int): Number of classes to classify. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + activated (bool, optional): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + super(FocalCustomLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' 
+ self.use_sigmoid = use_sigmoid + self.num_classes = num_classes + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + assert self.num_classes != -1 + + # custom output channels of the classifier + self.custom_cls_channels = True + # custom activation of cls_score + self.custom_activation = True + # custom accuracy of the classsifier + self.custom_accuracy = True + + def get_cls_channels(self, num_classes): + assert num_classes == self.num_classes + return num_classes + + def get_activation(self, cls_score): + + fine_cls_score = cls_score[:, :self.num_classes] + + score_classes = fine_cls_score.sigmoid() + + return score_classes + + def get_accuracy(self, cls_score, labels): + + fine_cls_score = cls_score[:, :self.num_classes] + + pos_inds = labels < self.num_classes + acc_classes = accuracy(fine_cls_score[pos_inds], labels[pos_inds]) + acc = dict() + acc['acc_classes'] = acc_classes + return acc + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + calculate_loss_func = py_sigmoid_focal_loss + + loss_cls = self.loss_weight * calculate_loss_func( + pred, + target, + weight, + gamma=self.gamma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + + else: + raise NotImplementedError + return loss_cls diff --git a/head_extractor/build/lib/mmdet/models/losses/gaussian_focal_loss.py b/head_extractor/build/lib/mmdet/models/losses/gaussian_focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..14fa8da462a5e7cabde2166878a1b9f2ccc16d62 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/gaussian_focal_loss.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss, weighted_loss + + +@weighted_loss +def gaussian_focal_loss(pred: Tensor, + gaussian_target: Tensor, + alpha: float = 2.0, + gamma: float = 4.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0) -> Tensor: + """`Focal Loss `_ for targets in gaussian + distribution. + + Args: + pred (torch.Tensor): The prediction. + gaussian_target (torch.Tensor): The learning target of the prediction + in gaussian distribution. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 2.0. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 4.0. + pos_weight(float): Positive sample loss weight. Defaults to 1.0. + neg_weight(float): Negative sample loss weight. Defaults to 1.0. 
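+
+    Returns:
+        torch.Tensor: The elementwise loss
+            ``pos_weight * pos_loss + neg_weight * neg_loss``, where
+            positions with ``gaussian_target == 1`` contribute the positive
+            term and all other positions are down-weighted by
+            ``(1 - gaussian_target)**gamma``; reduction is handled by the
+            ``weighted_loss`` wrapper.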
+ """ + eps = 1e-12 + pos_weights = gaussian_target.eq(1) + neg_weights = (1 - gaussian_target).pow(gamma) + pos_loss = -(pred + eps).log() * (1 - pred).pow(alpha) * pos_weights + neg_loss = -(1 - pred + eps).log() * pred.pow(alpha) * neg_weights + return pos_weight * pos_loss + neg_weight * neg_loss + + +def gaussian_focal_loss_with_pos_inds( + pred: Tensor, + gaussian_target: Tensor, + pos_inds: Tensor, + pos_labels: Tensor, + alpha: float = 2.0, + gamma: float = 4.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0, + reduction: str = 'mean', + avg_factor: Optional[Union[int, float]] = None) -> Tensor: + """`Focal Loss `_ for targets in gaussian + distribution. + + Note: The index with a value of 1 in ``gaussian_target`` in the + ``gaussian_focal_loss`` function is a positive sample, but in + ``gaussian_focal_loss_with_pos_inds`` the positive sample is passed + in through the ``pos_inds`` parameter. + + Args: + pred (torch.Tensor): The prediction. The shape is (N, num_classes). + gaussian_target (torch.Tensor): The learning target of the prediction + in gaussian distribution. The shape is (N, num_classes). + pos_inds (torch.Tensor): The positive sample index. + The shape is (M, ). + pos_labels (torch.Tensor): The label corresponding to the positive + sample index. The shape is (M, ). + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 2.0. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 4.0. + pos_weight(float): Positive sample loss weight. Defaults to 1.0. + neg_weight(float): Negative sample loss weight. Defaults to 1.0. + reduction (str): Options are "none", "mean" and "sum". + Defaults to 'mean`. + avg_factor (int, float, optional): Average factor that is used to + average the loss. Defaults to None. + """ + eps = 1e-12 + neg_weights = (1 - gaussian_target).pow(gamma) + + pos_pred_pix = pred[pos_inds] + pos_pred = pos_pred_pix.gather(1, pos_labels.unsqueeze(1)) + pos_loss = -(pos_pred + eps).log() * (1 - pos_pred).pow(alpha) + pos_loss = weight_reduce_loss(pos_loss, None, reduction, avg_factor) + + neg_loss = -(1 - pred + eps).log() * pred.pow(alpha) * neg_weights + neg_loss = weight_reduce_loss(neg_loss, None, reduction, avg_factor) + + return pos_weight * pos_loss + neg_weight * neg_loss + + +@MODELS.register_module() +class GaussianFocalLoss(nn.Module): + """GaussianFocalLoss is a variant of focal loss. + + More details can be found in the `paper + `_ + Code is modified from `kp_utils.py + `_ # noqa: E501 + Please notice that the target in GaussianFocalLoss is a gaussian heatmap, + not 0/1 binary target. + + Args: + alpha (float): Power of prediction. + gamma (float): Power of target for negative samples. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Loss weight of current loss. + pos_weight(float): Positive sample loss weight. Defaults to 1.0. + neg_weight(float): Negative sample loss weight. Defaults to 1.0. 
+ """ + + def __init__(self, + alpha: float = 2.0, + gamma: float = 4.0, + reduction: str = 'mean', + loss_weight: float = 1.0, + pos_weight: float = 1.0, + neg_weight: float = 1.0) -> None: + super().__init__() + self.alpha = alpha + self.gamma = gamma + self.reduction = reduction + self.loss_weight = loss_weight + self.pos_weight = pos_weight + self.neg_weight = neg_weight + + def forward(self, + pred: Tensor, + target: Tensor, + pos_inds: Optional[Tensor] = None, + pos_labels: Optional[Tensor] = None, + weight: Optional[Tensor] = None, + avg_factor: Optional[Union[int, float]] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + If you want to manually determine which positions are + positive samples, you can set the pos_index and pos_label + parameter. Currently, only the CenterNet update version uses + the parameter. + + Args: + pred (torch.Tensor): The prediction. The shape is (N, num_classes). + target (torch.Tensor): The learning target of the prediction + in gaussian distribution. The shape is (N, num_classes). + pos_inds (torch.Tensor): The positive sample index. + Defaults to None. + pos_labels (torch.Tensor): The label corresponding to the positive + sample index. Defaults to None. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, float, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if pos_inds is not None: + assert pos_labels is not None + # Only used by centernet update version + loss_reg = self.loss_weight * gaussian_focal_loss_with_pos_inds( + pred, + target, + pos_inds, + pos_labels, + alpha=self.alpha, + gamma=self.gamma, + pos_weight=self.pos_weight, + neg_weight=self.neg_weight, + reduction=reduction, + avg_factor=avg_factor) + else: + loss_reg = self.loss_weight * gaussian_focal_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + pos_weight=self.pos_weight, + neg_weight=self.neg_weight, + reduction=reduction, + avg_factor=avg_factor) + return loss_reg diff --git a/head_extractor/build/lib/mmdet/models/losses/gfocal_loss.py b/head_extractor/build/lib/mmdet/models/losses/gfocal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b3a1172207e859039ca5ed7e0604d8b787131c29 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/gfocal_loss.py @@ -0,0 +1,295 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.models.losses.utils import weighted_loss +from mmdet.registry import MODELS + + +@weighted_loss +def quality_focal_loss(pred, target, beta=2.0): + r"""Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + + Args: + pred (torch.Tensor): Predicted joint representation of classification + and quality (IoU) estimation with shape (N, C), C is the number of + classes. + target (tuple([torch.Tensor])): Target category label with shape (N,) + and target quality label with shape (N,). + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. 
+ + Returns: + torch.Tensor: Loss tensor with shape (N,). + """ + assert len(target) == 2, """target for QFL must be a tuple of two elements, + including category label and quality label, respectively""" + # label denotes the category id, score denotes the quality score + label, score = target + + # negatives are supervised by 0 quality score + pred_sigmoid = pred.sigmoid() + scale_factor = pred_sigmoid + zerolabel = scale_factor.new_zeros(pred.shape) + loss = F.binary_cross_entropy_with_logits( + pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = pred.size(1) + pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1) + pos_label = label[pos].long() + # positives are supervised by bbox quality (IoU) score + scale_factor = score[pos] - pred_sigmoid[pos, pos_label] + loss[pos, pos_label] = F.binary_cross_entropy_with_logits( + pred[pos, pos_label], score[pos], + reduction='none') * scale_factor.abs().pow(beta) + + loss = loss.sum(dim=1, keepdim=False) + return loss + + +@weighted_loss +def quality_focal_loss_tensor_target(pred, target, beta=2.0, activated=False): + """`QualityFocal Loss `_ + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the + number of classes + target (torch.Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is the number of classes. + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + activated (bool): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + # pred and target should be of the same size + assert pred.size() == target.size() + if activated: + pred_sigmoid = pred + loss_function = F.binary_cross_entropy + else: + pred_sigmoid = pred.sigmoid() + loss_function = F.binary_cross_entropy_with_logits + + scale_factor = pred_sigmoid + target = target.type_as(pred) + + zerolabel = scale_factor.new_zeros(pred.shape) + loss = loss_function( + pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + pos = (target != 0) + scale_factor = target[pos] - pred_sigmoid[pos] + loss[pos] = loss_function( + pred[pos], target[pos], + reduction='none') * scale_factor.abs().pow(beta) + + loss = loss.sum(dim=1, keepdim=False) + return loss + + +@weighted_loss +def quality_focal_loss_with_prob(pred, target, beta=2.0): + r"""Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + Different from `quality_focal_loss`, this function accepts probability + as input. + + Args: + pred (torch.Tensor): Predicted joint representation of classification + and quality (IoU) estimation with shape (N, C), C is the number of + classes. + target (tuple([torch.Tensor])): Target category label with shape (N,) + and target quality label with shape (N,). + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + + Returns: + torch.Tensor: Loss tensor with shape (N,). 
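+
+    A minimal sketch of the expected inputs (illustrative only):
+
+    Example:
+        >>> import torch
+        >>> prob = torch.rand(4, 80)  # already-activated scores
+        >>> label = torch.tensor([3, 80, 15, 80])  # 80 = background
+        >>> score = torch.tensor([0.7, 0.0, 0.9, 0.0])  # IoU quality
+        >>> loss = quality_focal_loss_with_prob(prob, (label, score))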
+ """ + assert len(target) == 2, """target for QFL must be a tuple of two elements, + including category label and quality label, respectively""" + # label denotes the category id, score denotes the quality score + label, score = target + + # negatives are supervised by 0 quality score + pred_sigmoid = pred + scale_factor = pred_sigmoid + zerolabel = scale_factor.new_zeros(pred.shape) + loss = F.binary_cross_entropy( + pred, zerolabel, reduction='none') * scale_factor.pow(beta) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = pred.size(1) + pos = ((label >= 0) & (label < bg_class_ind)).nonzero().squeeze(1) + pos_label = label[pos].long() + # positives are supervised by bbox quality (IoU) score + scale_factor = score[pos] - pred_sigmoid[pos, pos_label] + loss[pos, pos_label] = F.binary_cross_entropy( + pred[pos, pos_label], score[pos], + reduction='none') * scale_factor.abs().pow(beta) + + loss = loss.sum(dim=1, keepdim=False) + return loss + + +@weighted_loss +def distribution_focal_loss(pred, label): + r"""Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning + Qualified and Distributed Bounding Boxes for Dense Object Detection + `_. + + Args: + pred (torch.Tensor): Predicted general distribution of bounding boxes + (before softmax) with shape (N, n+1), n is the max value of the + integral set `{0, ..., n}` in paper. + label (torch.Tensor): Target distance label for bounding boxes with + shape (N,). + + Returns: + torch.Tensor: Loss tensor with shape (N,). + """ + dis_left = label.long() + dis_right = dis_left + 1 + weight_left = dis_right.float() - label + weight_right = label - dis_left.float() + loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \ + + F.cross_entropy(pred, dis_right, reduction='none') * weight_right + return loss + + +@MODELS.register_module() +class QualityFocalLoss(nn.Module): + r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + + Args: + use_sigmoid (bool): Whether sigmoid operation is conducted in QFL. + Defaults to True. + beta (float): The beta parameter for calculating the modulating factor. + Defaults to 2.0. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Loss weight of current loss. + activated (bool, optional): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + + def __init__(self, + use_sigmoid=True, + beta=2.0, + reduction='mean', + loss_weight=1.0, + activated=False): + super(QualityFocalLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid in QFL supported now.' + self.use_sigmoid = use_sigmoid + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): Predicted joint representation of + classification and quality (IoU) estimation with shape (N, C), + C is the number of classes. + target (Union(tuple([torch.Tensor]),Torch.Tensor)): The type is + tuple, it should be included Target category label with + shape (N,) and target quality label with shape (N,).The type + is torch.Tensor, the target should be one-hot form with + soft weights. 
+ weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + if self.activated: + calculate_loss_func = quality_focal_loss_with_prob + else: + calculate_loss_func = quality_focal_loss + if isinstance(target, torch.Tensor): + # the target shape with (N,C) or (N,C,...), which means + # the target is one-hot form with soft weights. + calculate_loss_func = partial( + quality_focal_loss_tensor_target, activated=self.activated) + + loss_cls = self.loss_weight * calculate_loss_func( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor) + else: + raise NotImplementedError + return loss_cls + + +@MODELS.register_module() +class DistributionFocalLoss(nn.Module): + r"""Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss: + Learning Qualified and Distributed Bounding Boxes for Dense Object + Detection `_. + + Args: + reduction (str): Options are `'none'`, `'mean'` and `'sum'`. + loss_weight (float): Loss weight of current loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(DistributionFocalLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): Predicted general distribution of bounding + boxes (before softmax) with shape (N, n+1), n is the max value + of the integral set `{0, ..., n}` in paper. + target (torch.Tensor): Target distance label for bounding boxes + with shape (N,). + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_cls = self.loss_weight * distribution_focal_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_cls diff --git a/head_extractor/build/lib/mmdet/models/losses/ghm_loss.py b/head_extractor/build/lib/mmdet/models/losses/ghm_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a874c0038cc4a77769705a3a06a95a56d3e8dd2d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/ghm_loss.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
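+# Both GHM losses below share one idea: bin samples by the norm of their
+# gradient (g = |sigmoid(pred) - target| for GHM-C, |d| / sqrt(mu^2 + d^2)
+# for GHM-R) and weight each sample by tot / num_in_bin, so that regions of
+# gradient space crowded with easy examples or extreme outliers are
+# down-weighted relative to the rest.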
+import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss + + +def _expand_onehot_labels(labels, label_weights, label_channels): + bin_labels = labels.new_full((labels.size(0), label_channels), 0) + inds = torch.nonzero( + (labels >= 0) & (labels < label_channels), as_tuple=False).squeeze() + if inds.numel() > 0: + bin_labels[inds, labels[inds]] = 1 + bin_label_weights = label_weights.view(-1, 1).expand( + label_weights.size(0), label_channels) + return bin_labels, bin_label_weights + + +# TODO: code refactoring to make it consistent with other losses +@MODELS.register_module() +class GHMC(nn.Module): + """GHM Classification Loss. + + Details of the theorem can be viewed in the paper + `Gradient Harmonized Single-stage Detector + `_. + + Args: + bins (int): Number of the unit regions for distribution calculation. + momentum (float): The parameter for moving average. + use_sigmoid (bool): Can only be true for BCE based loss now. + loss_weight (float): The weight of the total GHM-C loss. + reduction (str): Options are "none", "mean" and "sum". + Defaults to "mean" + """ + + def __init__(self, + bins=10, + momentum=0, + use_sigmoid=True, + loss_weight=1.0, + reduction='mean'): + super(GHMC, self).__init__() + self.bins = bins + self.momentum = momentum + edges = torch.arange(bins + 1).float() / bins + self.register_buffer('edges', edges) + self.edges[-1] += 1e-6 + if momentum > 0: + acc_sum = torch.zeros(bins) + self.register_buffer('acc_sum', acc_sum) + self.use_sigmoid = use_sigmoid + if not self.use_sigmoid: + raise NotImplementedError + self.loss_weight = loss_weight + self.reduction = reduction + + def forward(self, + pred, + target, + label_weight, + reduction_override=None, + **kwargs): + """Calculate the GHM-C loss. + + Args: + pred (float tensor of size [batch_num, class_num]): + The direct prediction of classification fc layer. + target (float tensor of size [batch_num, class_num]): + Binary class target for each sample. + label_weight (float tensor of size [batch_num, class_num]): + the value is 1 if the sample is valid and 0 if ignored. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + Returns: + The gradient harmonized loss. 
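+
+        A minimal sketch with binary targets (illustrative only):
+
+        Example:
+            >>> import torch
+            >>> ghmc = GHMC(bins=10)
+            >>> pred = torch.randn(8, 20)  # raw logits
+            >>> target = torch.randint(0, 2, (8, 20)).float()
+            >>> label_weight = torch.ones(8, 20)
+            >>> loss = ghmc(pred, target, label_weight)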
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # the target should be binary class label + if pred.dim() != target.dim(): + target, label_weight = _expand_onehot_labels( + target, label_weight, pred.size(-1)) + target, label_weight = target.float(), label_weight.float() + edges = self.edges + mmt = self.momentum + weights = torch.zeros_like(pred) + + # gradient length + g = torch.abs(pred.sigmoid().detach() - target) + + valid = label_weight > 0 + tot = max(valid.float().sum().item(), 1.0) + n = 0 # n valid bins + for i in range(self.bins): + inds = (g >= edges[i]) & (g < edges[i + 1]) & valid + num_in_bin = inds.sum().item() + if num_in_bin > 0: + if mmt > 0: + self.acc_sum[i] = mmt * self.acc_sum[i] \ + + (1 - mmt) * num_in_bin + weights[inds] = tot / self.acc_sum[i] + else: + weights[inds] = tot / num_in_bin + n += 1 + if n > 0: + weights = weights / n + + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') + loss = weight_reduce_loss( + loss, weights, reduction=reduction, avg_factor=tot) + return loss * self.loss_weight + + +# TODO: code refactoring to make it consistent with other losses +@MODELS.register_module() +class GHMR(nn.Module): + """GHM Regression Loss. + + Details of the theorem can be viewed in the paper + `Gradient Harmonized Single-stage Detector + `_. + + Args: + mu (float): The parameter for the Authentic Smooth L1 loss. + bins (int): Number of the unit regions for distribution calculation. + momentum (float): The parameter for moving average. + loss_weight (float): The weight of the total GHM-R loss. + reduction (str): Options are "none", "mean" and "sum". + Defaults to "mean" + """ + + def __init__(self, + mu=0.02, + bins=10, + momentum=0, + loss_weight=1.0, + reduction='mean'): + super(GHMR, self).__init__() + self.mu = mu + self.bins = bins + edges = torch.arange(bins + 1).float() / bins + self.register_buffer('edges', edges) + self.edges[-1] = 1e3 + self.momentum = momentum + if momentum > 0: + acc_sum = torch.zeros(bins) + self.register_buffer('acc_sum', acc_sum) + self.loss_weight = loss_weight + self.reduction = reduction + + # TODO: support reduction parameter + def forward(self, + pred, + target, + label_weight, + avg_factor=None, + reduction_override=None): + """Calculate the GHM-R loss. + + Args: + pred (float tensor of size [batch_num, 4 (* class_num)]): + The prediction of box regression layer. Channel number can be 4 + or 4 * class_num depending on whether it is class-agnostic. + target (float tensor of size [batch_num, 4 (* class_num)]): + The target regression values with the same size of pred. + label_weight (float tensor of size [batch_num, 4 (* class_num)]): + The weight of each sample, 0 if ignored. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + Returns: + The gradient harmonized loss. 
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + mu = self.mu + edges = self.edges + mmt = self.momentum + + # ASL1 loss + diff = pred - target + loss = torch.sqrt(diff * diff + mu * mu) - mu + + # gradient length + g = torch.abs(diff / torch.sqrt(mu * mu + diff * diff)).detach() + weights = torch.zeros_like(g) + + valid = label_weight > 0 + tot = max(label_weight.float().sum().item(), 1.0) + n = 0 # n: valid bins + for i in range(self.bins): + inds = (g >= edges[i]) & (g < edges[i + 1]) & valid + num_in_bin = inds.sum().item() + if num_in_bin > 0: + n += 1 + if mmt > 0: + self.acc_sum[i] = mmt * self.acc_sum[i] \ + + (1 - mmt) * num_in_bin + weights[inds] = tot / self.acc_sum[i] + else: + weights[inds] = tot / num_in_bin + if n > 0: + weights /= n + loss = weight_reduce_loss( + loss, weights, reduction=reduction, avg_factor=tot) + return loss * self.loss_weight diff --git a/head_extractor/build/lib/mmdet/models/losses/iou_loss.py b/head_extractor/build/lib/mmdet/models/losses/iou_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c8a2b977868cef6f4039b49277bfc853ffc720bd --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/iou_loss.py @@ -0,0 +1,926 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings +from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps +from .utils import weighted_loss + + +@weighted_loss +def iou_loss(pred: Tensor, + target: Tensor, + linear: bool = False, + mode: str = 'log', + eps: float = 1e-6) -> Tensor: + """IoU loss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + The loss is calculated as negative log of IoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + linear (bool, optional): If True, use linear scale of loss instead of + log scale. Default: False. + mode (str): Loss scaling mode, including "linear", "square", and "log". + Default: 'log' + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + assert mode in ['linear', 'square', 'log'] + if linear: + mode = 'linear' + warnings.warn('DeprecationWarning: Setting "linear=True" in ' + 'iou_loss is deprecated, please use "mode=`linear`" ' + 'instead.') + # avoid fp16 overflow + if pred.dtype == torch.float16: + fp16 = True + pred = pred.to(torch.float32) + else: + fp16 = False + + ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps) + + if fp16: + ious = ious.to(torch.float16) + + if mode == 'linear': + loss = 1 - ious + elif mode == 'square': + loss = 1 - ious**2 + elif mode == 'log': + loss = -ious.log() + else: + raise NotImplementedError + return loss + + +@weighted_loss +def bounded_iou_loss(pred: Tensor, + target: Tensor, + beta: float = 0.2, + eps: float = 1e-3) -> Tensor: + """BIoULoss. + + This is an implementation of paper + `Improving Object Localization with Fitness NMS and Bounded IoU Loss. + `_. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + beta (float, optional): Beta parameter in smoothl1. + eps (float, optional): Epsilon to avoid NaN values. + + Return: + Tensor: Loss tensor. 
+ """ + pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5 + pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5 + pred_w = pred[:, 2] - pred[:, 0] + pred_h = pred[:, 3] - pred[:, 1] + with torch.no_grad(): + target_ctrx = (target[:, 0] + target[:, 2]) * 0.5 + target_ctry = (target[:, 1] + target[:, 3]) * 0.5 + target_w = target[:, 2] - target[:, 0] + target_h = target[:, 3] - target[:, 1] + + dx = target_ctrx - pred_ctrx + dy = target_ctry - pred_ctry + + loss_dx = 1 - torch.max( + (target_w - 2 * dx.abs()) / + (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx)) + loss_dy = 1 - torch.max( + (target_h - 2 * dy.abs()) / + (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy)) + loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w / + (target_w + eps)) + loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h / + (target_h + eps)) + # view(..., -1) does not work for empty tensor + loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh], + dim=-1).flatten(1) + + loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta, + loss_comb - 0.5 * beta) + return loss + + +@weighted_loss +def giou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor: + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + # avoid fp16 overflow + if pred.dtype == torch.float16: + fp16 = True + pred = pred.to(torch.float32) + else: + fp16 = False + + gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps) + + if fp16: + gious = gious.to(torch.float16) + + loss = 1 - gious + return loss + + +@weighted_loss +def diou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor: + r"""Implementation of `Distance-IoU Loss: Faster and Better + Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_. + + Code is modified from https://github.com/Zzh-tju/DIoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. 
+ """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + c2 = cw**2 + ch**2 + eps + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 + right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 + rho2 = left + right + + # DIoU + dious = ious - rho2 / c2 + loss = 1 - dious + return loss + + +@weighted_loss +def ciou_loss(pred: Tensor, target: Tensor, eps: float = 1e-7) -> Tensor: + r"""`Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + Code is modified from https://github.com/Zzh-tju/CIoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + c2 = cw**2 + ch**2 + eps + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + + left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 + right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 + rho2 = left + right + + factor = 4 / math.pi**2 + v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + + with torch.no_grad(): + alpha = (ious > 0.5).float() * v / (1 - ious + v) + + # CIoU + cious = ious - (rho2 / c2 + alpha * v) + loss = 1 - cious.clamp(min=-1.0, max=1.0) + return loss + + +@weighted_loss +def eiou_loss(pred: Tensor, + target: Tensor, + smooth_point: float = 0.1, + eps: float = 1e-7) -> Tensor: + r"""Implementation of paper `Extended-IoU Loss: A Systematic + IoU-Related Method: Beyond Simplified Regression for Better + Localization `_ + + Code is modified from https://github.com//ShiqiYu/libfacedetection.train. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + smooth_point (float): hyperparameter, default is 0.1. + eps (float): Epsilon to avoid log(0). + + Return: + Tensor: Loss tensor. 
+ """ + px1, py1, px2, py2 = pred[:, 0], pred[:, 1], pred[:, 2], pred[:, 3] + tx1, ty1, tx2, ty2 = target[:, 0], target[:, 1], target[:, 2], target[:, 3] + + # extent top left + ex1 = torch.min(px1, tx1) + ey1 = torch.min(py1, ty1) + + # intersection coordinates + ix1 = torch.max(px1, tx1) + iy1 = torch.max(py1, ty1) + ix2 = torch.min(px2, tx2) + iy2 = torch.min(py2, ty2) + + # extra + xmin = torch.min(ix1, ix2) + ymin = torch.min(iy1, iy2) + xmax = torch.max(ix1, ix2) + ymax = torch.max(iy1, iy2) + + # Intersection + intersection = (ix2 - ex1) * (iy2 - ey1) + (xmin - ex1) * (ymin - ey1) - ( + ix1 - ex1) * (ymax - ey1) - (xmax - ex1) * ( + iy1 - ey1) + # Union + union = (px2 - px1) * (py2 - py1) + (tx2 - tx1) * ( + ty2 - ty1) - intersection + eps + # IoU + ious = 1 - (intersection / union) + + # Smooth-EIoU + smooth_sign = (ious < smooth_point).detach().float() + loss = 0.5 * smooth_sign * (ious**2) / smooth_point + (1 - smooth_sign) * ( + ious - 0.5 * smooth_point) + return loss + + +@weighted_loss +def siou_loss(pred, target, eps=1e-7, neg_gamma=False): + r"""`Implementation of paper `SIoU Loss: More Powerful Learning + for Bounding Box Regression `_. + + Code is modified from https://github.com/meituan/YOLOv6. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + neg_gamma (bool): `True` follows original implementation in paper. + + Return: + Tensor: Loss tensor. + """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + # modified clamp threshold zero to eps to avoid NaN + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=eps) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + + # angle cost + s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + eps + s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + eps + + sigma = torch.pow(s_cw**2 + s_ch**2, 0.5) + + sin_alpha_1 = torch.abs(s_cw) / sigma + sin_alpha_2 = torch.abs(s_ch) / sigma + threshold = pow(2, 0.5) / 2 + sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1) + angle_cost = torch.cos(torch.asin(sin_alpha) * 2 - math.pi / 2) + + # distance cost + rho_x = (s_cw / cw)**2 + rho_y = (s_ch / ch)**2 + + # `neg_gamma=True` follows original implementation in paper + # but setting `neg_gamma=False` makes training more stable. 
+ gamma = angle_cost - 2 if neg_gamma else 2 - angle_cost + distance_cost = 2 - torch.exp(gamma * rho_x) - torch.exp(gamma * rho_y) + + # shape cost + omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2) + omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2) + shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow( + 1 - torch.exp(-1 * omiga_h), 4) + + # SIoU + sious = ious - 0.5 * (distance_cost + shape_cost) + loss = 1 - sious.clamp(min=-1.0, max=1.0) + return loss + + +@MODELS.register_module() +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + + Args: + linear (bool): If True, use linear scale of loss else determined + by mode. Default: False. + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + mode (str): Loss scaling mode, including "linear", "square", and "log". + Default: 'log' + """ + + def __init__(self, + linear: bool = False, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0, + mode: str = 'log') -> None: + super().__init__() + assert mode in ['linear', 'square', 'log'] + if linear: + mode = 'linear' + warnings.warn('DeprecationWarning: Setting "linear=True" in ' + 'IOULoss is deprecated, please use "mode=`linear`" ' + 'instead.') + self.mode = mode + self.linear = linear + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Return: + Tensor: Loss tensor. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if (weight is not None) and (not torch.any(weight > 0)) and ( + reduction != 'none'): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # iou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * iou_loss( + pred, + target, + weight, + mode=self.mode, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class BoundedIoULoss(nn.Module): + """BIoULoss. + + This is an implementation of paper + `Improving Object Localization with Fitness NMS and Bounded IoU Loss. + `_. + + Args: + beta (float, optional): Beta parameter in smoothl1. + eps (float, optional): Epsilon to avoid NaN values. + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. 
+ """ + + def __init__(self, + beta: float = 0.2, + eps: float = 1e-3, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.beta = beta + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * bounded_iou_loss( + pred, + target, + weight, + beta=self.beta, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class GIoULoss(nn.Module): + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. 
+ """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * giou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class DIoULoss(nn.Module): + r"""Implementation of `Distance-IoU Loss: Faster and Better + Learning for Bounding Box Regression https://arxiv.org/abs/1911.08287`_. + + Code is modified from https://github.com/Zzh-tju/DIoU. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * diou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class CIoULoss(nn.Module): + r"""`Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + Code is modified from https://github.com/Zzh-tju/CIoU. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. 
+ """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * ciou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class EIoULoss(nn.Module): + r"""Implementation of paper `Extended-IoU Loss: A Systematic + IoU-Related Method: Beyond Simplified Regression for Better + Localization `_ + + Code is modified from https://github.com//ShiqiYu/libfacedetection.train. + + Args: + eps (float): Epsilon to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + smooth_point (float): hyperparameter, default is 0.1. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0, + smooth_point: float = 0.1) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.smooth_point = smooth_point + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. 
+ """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * eiou_loss( + pred, + target, + weight, + smooth_point=self.smooth_point, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@MODELS.register_module() +class SIoULoss(nn.Module): + r"""`Implementation of paper `SIoU Loss: More Powerful Learning + for Bounding Box Regression `_. + + Code is modified from https://github.com/meituan/YOLOv6. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + neg_gamma (bool): `True` follows original implementation in paper. + + Return: + Tensor: Loss tensor. + """ + + def __init__(self, + eps: float = 1e-6, + reduction: str = 'mean', + loss_weight: float = 1.0, + neg_gamma: bool = False) -> None: + super().__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.neg_gamma = neg_gamma + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): The learning target of the prediction, + shape (n, 4). + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + reduction_override (Optional[str], optional): The reduction method + used to override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + + Returns: + Tensor: Loss tensor. + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * siou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + neg_gamma=self.neg_gamma, + **kwargs) + return loss diff --git a/head_extractor/build/lib/mmdet/models/losses/kd_loss.py b/head_extractor/build/lib/mmdet/models/losses/kd_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..0a7d5ef24a0b0d7d7390a27c7cd9cbfdbe61d823 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/kd_loss.py @@ -0,0 +1,95 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def knowledge_distillation_kl_div_loss(pred: Tensor, + soft_label: Tensor, + T: int, + detach_target: bool = True) -> Tensor: + r"""Loss function for knowledge distilling using KL divergence. + + Args: + pred (Tensor): Predicted logits with shape (N, n + 1). + soft_label (Tensor): Target logits with shape (N, N + 1). + T (int): Temperature for distillation. + detach_target (bool): Remove soft_label from automatic differentiation + + Returns: + Tensor: Loss tensor with shape (N,). + """ + assert pred.size() == soft_label.size() + target = F.softmax(soft_label / T, dim=1) + if detach_target: + target = target.detach() + + kd_loss = F.kl_div( + F.log_softmax(pred / T, dim=1), target, reduction='none').mean(1) * ( + T * T) + + return kd_loss + + +@MODELS.register_module() +class KnowledgeDistillationKLDivLoss(nn.Module): + """Loss function for knowledge distilling using KL divergence. + + Args: + reduction (str): Options are `'none'`, `'mean'` and `'sum'`. + loss_weight (float): Loss weight of current loss. + T (int): Temperature for distillation. + """ + + def __init__(self, + reduction: str = 'mean', + loss_weight: float = 1.0, + T: int = 10) -> None: + super().__init__() + assert T >= 1 + self.reduction = reduction + self.loss_weight = loss_weight + self.T = T + + def forward(self, + pred: Tensor, + soft_label: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predicted logits with shape (N, n + 1). + soft_label (Tensor): Target logits with shape (N, N + 1). + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: Loss tensor. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + + reduction = ( + reduction_override if reduction_override else self.reduction) + + loss_kd = self.loss_weight * knowledge_distillation_kl_div_loss( + pred, + soft_label, + weight, + reduction=reduction, + avg_factor=avg_factor, + T=self.T) + + return loss_kd diff --git a/head_extractor/build/lib/mmdet/models/losses/l2_loss.py b/head_extractor/build/lib/mmdet/models/losses/l2_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..6210a3007b2c39540f022925cc93181c7328e42d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/l2_loss.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def l2_loss(pred: Tensor, target: Tensor) -> Tensor: + """L2 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + + Returns: + torch.Tensor: Calculated loss + """ + assert pred.size() == target.size() + loss = torch.abs(pred - target)**2 + return loss + + +@MODELS.register_module() +class L2Loss(BaseModule): + """L2 loss. 
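+
+    Computes the element-wise squared error ``(pred - target) ** 2`` and
+    supports margin-based hard negative mining; the mining-related
+    constructor arguments match those documented on ``MarginL2Loss``.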
+ + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, + neg_pos_ub: int = -1, + pos_margin: float = -1, + neg_margin: float = -1, + hard_mining: bool = False, + reduction: str = 'mean', + loss_weight: float = 1.0): + super(L2Loss, self).__init__() + self.neg_pos_ub = neg_pos_ub + self.pos_margin = pos_margin + self.neg_margin = neg_margin + self.hard_mining = hard_mining + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (float, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + pred, weight, avg_factor = self.update_weight(pred, target, weight, + avg_factor) + loss_bbox = self.loss_weight * l2_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + def update_weight(self, pred: Tensor, target: Tensor, weight: Tensor, + avg_factor: float) -> Tuple[Tensor, Tensor, float]: + """Update the weight according to targets.""" + if weight is None: + weight = target.new_ones(target.size()) + + invalid_inds = weight <= 0 + target[invalid_inds] = -1 + pos_inds = target == 1 + neg_inds = target == 0 + + if self.pos_margin > 0: + pred[pos_inds] -= self.pos_margin + if self.neg_margin > 0: + pred[neg_inds] -= self.neg_margin + pred = torch.clamp(pred, min=0, max=1) + + num_pos = int((target == 1).sum()) + num_neg = int((target == 0).sum()) + if self.neg_pos_ub > 0 and num_neg / (num_pos + + 1e-6) > self.neg_pos_ub: + num_neg = num_pos * self.neg_pos_ub + neg_idx = torch.nonzero(target == 0, as_tuple=False) + + if self.hard_mining: + costs = l2_loss( + pred, target, reduction='none')[neg_idx[:, 0], + neg_idx[:, 1]].detach() + neg_idx = neg_idx[costs.topk(num_neg)[1], :] + else: + neg_idx = self.random_choice(neg_idx, num_neg) + + new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool() + new_neg_inds[neg_idx[:, 0], neg_idx[:, 1]] = True + + invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds) + weight[invalid_neg_inds] = 0 + + avg_factor = (weight > 0).sum() + return pred, weight, avg_factor + + @staticmethod + def random_choice(gallery: Union[list, np.ndarray, Tensor], + num: int) -> np.ndarray: + """Random select some elements from the gallery. + + It seems that Pytorch's implementation is slower than numpy so we use + numpy to randperm the indices. 
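+
+        Args:
+            gallery (list | np.ndarray | Tensor): The gallery from which to
+                sample.
+            num (int): The number of elements to sample.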
+ """ + assert len(gallery) >= num + if isinstance(gallery, list): + gallery = np.array(gallery) + cands = np.arange(len(gallery)) + np.random.shuffle(cands) + rand_inds = cands[:num] + if not isinstance(gallery, np.ndarray): + rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device) + return gallery[rand_inds] diff --git a/head_extractor/build/lib/mmdet/models/losses/margin_loss.py b/head_extractor/build/lib/mmdet/models/losses/margin_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..0609e1db50edf89c8ae8b65709e8ab786f580366 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/margin_loss.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import numpy as np +import torch +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from .mse_loss import mse_loss + + +@MODELS.register_module() +class MarginL2Loss(BaseModule): + """L2 loss with margin. + + Args: + neg_pos_ub (int, optional): The upper bound of negative to positive + samples in hard mining. Defaults to -1. + pos_margin (float, optional): The similarity margin for positive + samples in hard mining. Defaults to -1. + neg_margin (float, optional): The similarity margin for negative + samples in hard mining. Defaults to -1. + hard_mining (bool, optional): Whether to use hard mining. Defaults to + False. + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. Defaults to 1.0. + """ + + def __init__(self, + neg_pos_ub: int = -1, + pos_margin: float = -1, + neg_margin: float = -1, + hard_mining: bool = False, + reduction: str = 'mean', + loss_weight: float = 1.0): + super(MarginL2Loss, self).__init__() + self.neg_pos_ub = neg_pos_ub + self.pos_margin = pos_margin + self.neg_margin = neg_margin + self.hard_mining = hard_mining + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (float, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + pred, weight, avg_factor = self.update_weight(pred, target, weight, + avg_factor) + loss_bbox = self.loss_weight * mse_loss( + pred, + target.float(), + weight.float(), + reduction=reduction, + avg_factor=avg_factor) + return loss_bbox + + def update_weight(self, pred: Tensor, target: Tensor, weight: Tensor, + avg_factor: float) -> Tuple[Tensor, Tensor, float]: + """Update the weight according to targets. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor): The weight of loss for each prediction. + avg_factor (float): Average factor that is used to average the + loss. 
+ + Returns: + tuple[torch.Tensor]: The updated prediction, weight and average + factor. + """ + if weight is None: + weight = target.new_ones(target.size()) + + invalid_inds = weight <= 0 + target[invalid_inds] = -1 + pos_inds = target == 1 + neg_inds = target == 0 + + if self.pos_margin > 0: + pred[pos_inds] -= self.pos_margin + if self.neg_margin > 0: + pred[neg_inds] -= self.neg_margin + pred = torch.clamp(pred, min=0, max=1) + + num_pos = int((target == 1).sum()) + num_neg = int((target == 0).sum()) + if self.neg_pos_ub > 0 and num_neg / (num_pos + + 1e-6) > self.neg_pos_ub: + num_neg = num_pos * self.neg_pos_ub + neg_idx = torch.nonzero(target == 0, as_tuple=False) + + if self.hard_mining: + costs = mse_loss( + pred, target.float(), + reduction='none')[neg_idx[:, 0], neg_idx[:, 1]].detach() + neg_idx = neg_idx[costs.topk(num_neg)[1], :] + else: + neg_idx = self.random_choice(neg_idx, num_neg) + + new_neg_inds = neg_inds.new_zeros(neg_inds.size()).bool() + new_neg_inds[neg_idx[:, 0], neg_idx[:, 1]] = True + + invalid_neg_inds = torch.logical_xor(neg_inds, new_neg_inds) + weight[invalid_neg_inds] = 0 + + avg_factor = (weight > 0).sum() + return pred, weight, avg_factor + + @staticmethod + def random_choice(gallery: Union[list, np.ndarray, Tensor], + num: int) -> np.ndarray: + """Random select some elements from the gallery. + + It seems that Pytorch's implementation is slower than numpy so we use + numpy to randperm the indices. + + Args: + gallery (list | np.ndarray | torch.Tensor): The gallery from + which to sample. + num (int): The number of elements to sample. + """ + assert len(gallery) >= num + if isinstance(gallery, list): + gallery = np.array(gallery) + cands = np.arange(len(gallery)) + np.random.shuffle(cands) + rand_inds = cands[:num] + if not isinstance(gallery, np.ndarray): + rand_inds = torch.from_numpy(rand_inds).long().to(gallery.device) + return gallery[rand_inds] diff --git a/head_extractor/build/lib/mmdet/models/losses/mse_loss.py b/head_extractor/build/lib/mmdet/models/losses/mse_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..6048218ad36a8105e7fa182f40fae93ef7c9268f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/mse_loss.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def mse_loss(pred: Tensor, target: Tensor) -> Tensor: + """A Wrapper of MSE loss. + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + + Returns: + Tensor: loss Tensor + """ + return F.mse_loss(pred, target, reduction='none') + + +@MODELS.register_module() +class MSELoss(nn.Module): + """MSELoss. + + Args: + reduction (str, optional): The method that reduces the loss to a + scalar. Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of the loss. Defaults to 1.0 + """ + + def __init__(self, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function of loss. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. 
+ weight (Tensor, optional): Weight of the loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: The calculated loss. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * mse_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss diff --git a/head_extractor/build/lib/mmdet/models/losses/multipos_cross_entropy_loss.py b/head_extractor/build/lib/mmdet/models/losses/multipos_cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a7d1561ed414b7c15412b5e746dff39ca0c53ba1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/multipos_cross_entropy_loss.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss + + +@MODELS.register_module() +class MultiPosCrossEntropyLoss(BaseModule): + """multi-positive targets cross entropy loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. Defaults to 1.0. + """ + + def __init__(self, reduction: str = 'mean', loss_weight: float = 1.0): + super(MultiPosCrossEntropyLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def multi_pos_cross_entropy(self, + pred: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + reduction: str = 'mean', + avg_factor: Optional[float] = None) -> Tensor: + """Multi-positive targets cross entropy loss. + + Args: + pred (torch.Tensor): The prediction. + label (torch.Tensor): The assigned label of the prediction. + weight (torch.Tensor): The element-wise weight. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing + the mean of losses. + + Returns: + torch.Tensor: Calculated loss + """ + + pos_inds = (label >= 1) + neg_inds = (label == 0) + pred_pos = pred * pos_inds.float() + pred_neg = pred * neg_inds.float() + # use -inf to mask out unwanted elements. + pred_pos[neg_inds] = pred_pos[neg_inds] + float('inf') + pred_neg[pos_inds] = pred_neg[pos_inds] + float('-inf') + + _pos_expand = torch.repeat_interleave(pred_pos, pred.shape[1], dim=1) + _neg_expand = pred_neg.repeat(1, pred.shape[1]) + + x = torch.nn.functional.pad((_neg_expand - _pos_expand), (0, 1), + 'constant', 0) + loss = torch.logsumexp(x, dim=1) + + # apply weights and do the reduction + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + def forward(self, + cls_score: Tensor, + label: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[float] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + cls_score (torch.Tensor): The classification score. + label (torch.Tensor): The assigned label of the prediction. + weight (torch.Tensor): The element-wise weight. 
+ avg_factor (float): Average factor when computing + the mean of losses. + reduction_override (str): Same as built-in losses of PyTorch. + + Returns: + torch.Tensor: Calculated loss + """ + assert cls_score.size() == label.size() + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_cls = self.loss_weight * self.multi_pos_cross_entropy( + cls_score, + label, + weight, + reduction=reduction, + avg_factor=avg_factor) + return loss_cls diff --git a/head_extractor/build/lib/mmdet/models/losses/pisa_loss.py b/head_extractor/build/lib/mmdet/models/losses/pisa_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b192aa0dbc7eb554755eb2f242eab0ea7f1fc650 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/pisa_loss.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.structures.bbox import bbox_overlaps +from ..task_modules.coders import BaseBBoxCoder +from ..task_modules.samplers import SamplingResult + + +def isr_p(cls_score: Tensor, + bbox_pred: Tensor, + bbox_targets: Tuple[Tensor], + rois: Tensor, + sampling_results: List[SamplingResult], + loss_cls: nn.Module, + bbox_coder: BaseBBoxCoder, + k: float = 2, + bias: float = 0, + num_class: int = 80) -> tuple: + """Importance-based Sample Reweighting (ISR_P), positive part. + + Args: + cls_score (Tensor): Predicted classification scores. + bbox_pred (Tensor): Predicted bbox deltas. + bbox_targets (tuple[Tensor]): A tuple of bbox targets, the are + labels, label_weights, bbox_targets, bbox_weights, respectively. + rois (Tensor): Anchors (single_stage) in shape (n, 4) or RoIs + (two_stage) in shape (n, 5). + sampling_results (:obj:`SamplingResult`): Sampling results. + loss_cls (:obj:`nn.Module`): Classification loss func of the head. + bbox_coder (:obj:`BaseBBoxCoder`): BBox coder of the head. + k (float): Power of the non-linear mapping. Defaults to 2. + bias (float): Shift of the non-linear mapping. Defaults to 0. + num_class (int): Number of classes, defaults to 80. 
+ + Return: + tuple([Tensor]): labels, imp_based_label_weights, bbox_targets, + bbox_target_weights + """ + + labels, label_weights, bbox_targets, bbox_weights = bbox_targets + pos_label_inds = ((labels >= 0) & + (labels < num_class)).nonzero().reshape(-1) + pos_labels = labels[pos_label_inds] + + # if no positive samples, return the original targets + num_pos = float(pos_label_inds.size(0)) + if num_pos == 0: + return labels, label_weights, bbox_targets, bbox_weights + + # merge pos_assigned_gt_inds of per image to a single tensor + gts = list() + last_max_gt = 0 + for i in range(len(sampling_results)): + gt_i = sampling_results[i].pos_assigned_gt_inds + gts.append(gt_i + last_max_gt) + if len(gt_i) != 0: + last_max_gt = gt_i.max() + 1 + gts = torch.cat(gts) + assert len(gts) == num_pos + + cls_score = cls_score.detach() + bbox_pred = bbox_pred.detach() + + # For single stage detectors, rois here indicate anchors, in shape (N, 4) + # For two stage detectors, rois are in shape (N, 5) + if rois.size(-1) == 5: + pos_rois = rois[pos_label_inds][:, 1:] + else: + pos_rois = rois[pos_label_inds] + + if bbox_pred.size(-1) > 4: + bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4) + pos_delta_pred = bbox_pred[pos_label_inds, pos_labels].view(-1, 4) + else: + pos_delta_pred = bbox_pred[pos_label_inds].view(-1, 4) + + # compute iou of the predicted bbox and the corresponding GT + pos_delta_target = bbox_targets[pos_label_inds].view(-1, 4) + pos_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_pred) + target_bbox_pred = bbox_coder.decode(pos_rois, pos_delta_target) + ious = bbox_overlaps(pos_bbox_pred, target_bbox_pred, is_aligned=True) + + pos_imp_weights = label_weights[pos_label_inds] + # Two steps to compute IoU-HLR. Samples are first sorted by IoU locally, + # then sorted again within the same-rank group + max_l_num = pos_labels.bincount().max() + for label in pos_labels.unique(): + l_inds = (pos_labels == label).nonzero().view(-1) + l_gts = gts[l_inds] + for t in l_gts.unique(): + t_inds = l_inds[l_gts == t] + t_ious = ious[t_inds] + _, t_iou_rank_idx = t_ious.sort(descending=True) + _, t_iou_rank = t_iou_rank_idx.sort() + ious[t_inds] += max_l_num - t_iou_rank.float() + l_ious = ious[l_inds] + _, l_iou_rank_idx = l_ious.sort(descending=True) + _, l_iou_rank = l_iou_rank_idx.sort() # IoU-HLR + # linearly map HLR to label weights + pos_imp_weights[l_inds] *= (max_l_num - l_iou_rank.float()) / max_l_num + + pos_imp_weights = (bias + pos_imp_weights * (1 - bias)).pow(k) + + # normalize to make the new weighted loss value equal to the original loss + pos_loss_cls = loss_cls( + cls_score[pos_label_inds], pos_labels, reduction_override='none') + if pos_loss_cls.dim() > 1: + ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds][:, + None] + new_pos_loss_cls = pos_loss_cls * pos_imp_weights[:, None] + else: + ori_pos_loss_cls = pos_loss_cls * label_weights[pos_label_inds] + new_pos_loss_cls = pos_loss_cls * pos_imp_weights + pos_loss_cls_ratio = ori_pos_loss_cls.sum() / new_pos_loss_cls.sum() + pos_imp_weights = pos_imp_weights * pos_loss_cls_ratio + label_weights[pos_label_inds] = pos_imp_weights + + bbox_targets = labels, label_weights, bbox_targets, bbox_weights + return bbox_targets + + +def carl_loss(cls_score: Tensor, + labels: Tensor, + bbox_pred: Tensor, + bbox_targets: Tensor, + loss_bbox: nn.Module, + k: float = 1, + bias: float = 0.2, + avg_factor: Optional[int] = None, + sigmoid: bool = False, + num_class: int = 80) -> dict: + """Classification-Aware Regression Loss (CARL). 
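+
+    CARL scales the regression loss of each positive sample by a power of
+    its classification score, ``(bias + (1 - bias) * score) ** k``, so
+    that well-classified samples drive localization learning harder.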
+ + Args: + cls_score (Tensor): Predicted classification scores. + labels (Tensor): Targets of classification. + bbox_pred (Tensor): Predicted bbox deltas. + bbox_targets (Tensor): Target of bbox regression. + loss_bbox (func): Regression loss func of the head. + bbox_coder (obj): BBox coder of the head. + k (float): Power of the non-linear mapping. Defaults to 1. + bias (float): Shift of the non-linear mapping. Defaults to 0.2. + avg_factor (int, optional): Average factor used in regression loss. + sigmoid (bool): Activation of the classification score. + num_class (int): Number of classes, defaults to 80. + + Return: + dict: CARL loss dict. + """ + pos_label_inds = ((labels >= 0) & + (labels < num_class)).nonzero().reshape(-1) + if pos_label_inds.numel() == 0: + return dict(loss_carl=cls_score.sum()[None] * 0.) + pos_labels = labels[pos_label_inds] + + # multiply pos_cls_score with the corresponding bbox weight + # and remain gradient + if sigmoid: + pos_cls_score = cls_score.sigmoid()[pos_label_inds, pos_labels] + else: + pos_cls_score = cls_score.softmax(-1)[pos_label_inds, pos_labels] + carl_loss_weights = (bias + (1 - bias) * pos_cls_score).pow(k) + + # normalize carl_loss_weight to make its sum equal to num positive + num_pos = float(pos_cls_score.size(0)) + weight_ratio = num_pos / carl_loss_weights.sum() + carl_loss_weights *= weight_ratio + + if avg_factor is None: + avg_factor = bbox_targets.size(0) + # if is class agnostic, bbox pred is in shape (N, 4) + # otherwise, bbox pred is in shape (N, #classes, 4) + if bbox_pred.size(-1) > 4: + bbox_pred = bbox_pred.view(bbox_pred.size(0), -1, 4) + pos_bbox_preds = bbox_pred[pos_label_inds, pos_labels] + else: + pos_bbox_preds = bbox_pred[pos_label_inds] + ori_loss_reg = loss_bbox( + pos_bbox_preds, + bbox_targets[pos_label_inds], + reduction_override='none') / avg_factor + loss_carl = (ori_loss_reg * carl_loss_weights[:, None]).sum() + return dict(loss_carl=loss_carl[None]) diff --git a/head_extractor/build/lib/mmdet/models/losses/seesaw_loss.py b/head_extractor/build/lib/mmdet/models/losses/seesaw_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4dec62b0afdc01e848e0c7f53ba0b6b10b899ea4 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/seesaw_loss.py @@ -0,0 +1,278 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .accuracy import accuracy +from .cross_entropy_loss import cross_entropy +from .utils import weight_reduce_loss + + +def seesaw_ce_loss(cls_score: Tensor, + labels: Tensor, + label_weights: Tensor, + cum_samples: Tensor, + num_classes: int, + p: float, + q: float, + eps: float, + reduction: str = 'mean', + avg_factor: Optional[int] = None) -> Tensor: + """Calculate the Seesaw CrossEntropy loss. + + Args: + cls_score (Tensor): The prediction with shape (N, C), + C is the number of classes. + labels (Tensor): The learning label of the prediction. + label_weights (Tensor): Sample-wise loss weight. + cum_samples (Tensor): Cumulative samples for each category. + num_classes (int): The number of classes. + p (float): The ``p`` in the mitigation factor. + q (float): The ``q`` in the compenstation factor. + eps (float): The minimal value of divisor to smooth + the computation of compensation factor + reduction (str, optional): The method used to reduce the loss. 
+        avg_factor (int, optional): Average factor that is used to average
+            the loss. Defaults to None.
+
+    Returns:
+        Tensor: The calculated loss.
+    """
+    assert cls_score.size(-1) == num_classes
+    assert len(cum_samples) == num_classes
+
+    onehot_labels = F.one_hot(labels, num_classes)
+    seesaw_weights = cls_score.new_ones(onehot_labels.size())
+
+    # mitigation factor
+    if p > 0:
+        sample_ratio_matrix = cum_samples[None, :].clamp(
+            min=1) / cum_samples[:, None].clamp(min=1)
+        index = (sample_ratio_matrix < 1.0).float()
+        sample_weights = sample_ratio_matrix.pow(p) * index + (1 - index)
+        mitigation_factor = sample_weights[labels.long(), :]
+        seesaw_weights = seesaw_weights * mitigation_factor
+
+    # compensation factor
+    if q > 0:
+        scores = F.softmax(cls_score.detach(), dim=1)
+        self_scores = scores[
+            torch.arange(0, len(scores)).to(scores.device).long(),
+            labels.long()]
+        score_matrix = scores / self_scores[:, None].clamp(min=eps)
+        index = (score_matrix > 1.0).float()
+        compensation_factor = score_matrix.pow(q) * index + (1 - index)
+        seesaw_weights = seesaw_weights * compensation_factor
+
+    cls_score = cls_score + (seesaw_weights.log() * (1 - onehot_labels))
+
+    loss = F.cross_entropy(cls_score, labels, weight=None, reduction='none')
+
+    if label_weights is not None:
+        label_weights = label_weights.float()
+    loss = weight_reduce_loss(
+        loss, weight=label_weights, reduction=reduction, avg_factor=avg_factor)
+    return loss
+
+
+@MODELS.register_module()
+class SeesawLoss(nn.Module):
+    """Seesaw Loss for Long-Tailed Instance Segmentation (CVPR 2021).
+
+    arXiv: https://arxiv.org/abs/2008.10032
+
+    Args:
+        use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+            or softmax. Only False is supported.
+        p (float, optional): The ``p`` in the mitigation factor.
+            Defaults to 0.8.
+        q (float, optional): The ``q`` in the compensation factor.
+            Defaults to 2.0.
+        num_classes (int, optional): The number of classes.
+            Defaults to 1203 for the LVIS v1 dataset.
+        eps (float, optional): The minimal value of divisor to smooth
+            the computation of the compensation factor. Defaults to 1e-2.
+        reduction (str, optional): The method that reduces the loss to a
+            scalar. Options are "none", "mean" and "sum".
+        loss_weight (float, optional): The weight of the loss.
+            Defaults to 1.0.
+        return_dict (bool, optional): Whether to return the losses as a dict.
+            Defaults to True.
+    """
+
+    def __init__(self,
+                 use_sigmoid: bool = False,
+                 p: float = 0.8,
+                 q: float = 2.0,
+                 num_classes: int = 1203,
+                 eps: float = 1e-2,
+                 reduction: str = 'mean',
+                 loss_weight: float = 1.0,
+                 return_dict: bool = True) -> None:
+        super().__init__()
+        assert not use_sigmoid
+        self.use_sigmoid = False
+        self.p = p
+        self.q = q
+        self.num_classes = num_classes
+        self.eps = eps
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.return_dict = return_dict
+
+        # 0 for pos, 1 for neg
+        self.cls_criterion = seesaw_ce_loss
+
+        # cumulative samples for each category
+        self.register_buffer(
+            'cum_samples',
+            torch.zeros(self.num_classes + 1, dtype=torch.float))
+
+        # custom output channels of the classifier
+        self.custom_cls_channels = True
+        # custom activation of cls_score
+        self.custom_activation = True
+        # custom accuracy of the classifier
+        self.custom_accuracy = True
+
+    def _split_cls_score(self, cls_score: Tensor) -> Tuple[Tensor, Tensor]:
+        """Split cls_score.
+
+        Args:
+            cls_score (Tensor): The prediction with shape (N, C + 2).
+ + Returns: + Tuple[Tensor, Tensor]: The score for classes and objectness, + respectively + """ + # split cls_score to cls_score_classes and cls_score_objectness + assert cls_score.size(-1) == self.num_classes + 2 + cls_score_classes = cls_score[..., :-2] + cls_score_objectness = cls_score[..., -2:] + return cls_score_classes, cls_score_objectness + + def get_cls_channels(self, num_classes: int) -> int: + """Get custom classification channels. + + Args: + num_classes (int): The number of classes. + + Returns: + int: The custom classification channels. + """ + assert num_classes == self.num_classes + return num_classes + 2 + + def get_activation(self, cls_score: Tensor) -> Tensor: + """Get custom activation of cls_score. + + Args: + cls_score (Tensor): The prediction with shape (N, C + 2). + + Returns: + Tensor: The custom activation of cls_score with shape + (N, C + 1). + """ + cls_score_classes, cls_score_objectness = self._split_cls_score( + cls_score) + score_classes = F.softmax(cls_score_classes, dim=-1) + score_objectness = F.softmax(cls_score_objectness, dim=-1) + score_pos = score_objectness[..., [0]] + score_neg = score_objectness[..., [1]] + score_classes = score_classes * score_pos + scores = torch.cat([score_classes, score_neg], dim=-1) + return scores + + def get_accuracy(self, cls_score: Tensor, + labels: Tensor) -> Dict[str, Tensor]: + """Get custom accuracy w.r.t. cls_score and labels. + + Args: + cls_score (Tensor): The prediction with shape (N, C + 2). + labels (Tensor): The learning label of the prediction. + + Returns: + Dict [str, Tensor]: The accuracy for objectness and classes, + respectively. + """ + pos_inds = labels < self.num_classes + obj_labels = (labels == self.num_classes).long() + cls_score_classes, cls_score_objectness = self._split_cls_score( + cls_score) + acc_objectness = accuracy(cls_score_objectness, obj_labels) + acc_classes = accuracy(cls_score_classes[pos_inds], labels[pos_inds]) + acc = dict() + acc['acc_objectness'] = acc_objectness + acc['acc_classes'] = acc_classes + return acc + + def forward( + self, + cls_score: Tensor, + labels: Tensor, + label_weights: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None + ) -> Union[Tensor, Dict[str, Tensor]]: + """Forward function. + + Args: + cls_score (Tensor): The prediction with shape (N, C + 2). + labels (Tensor): The learning label of the prediction. + label_weights (Tensor, optional): Sample-wise loss weight. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + + Returns: + Tensor | Dict [str, Tensor]: + if return_dict == False: The calculated loss | + if return_dict == True: The dict of calculated losses + for objectness and classes, respectively. 
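+
+        Example (illustrative sketch with a tiny ``num_classes``):
+            >>> import torch
+            >>> self = SeesawLoss(num_classes=3)
+            >>> cls_score = torch.rand(4, 5)  # num_classes + 2 channels
+            >>> labels = torch.tensor([0, 1, 2, 3])  # 3 marks background
+            >>> losses = self(cls_score, labels)
+            >>> sorted(losses)  # return_dict=True yields two loss terms
+            ['loss_cls_classes', 'loss_cls_objectness']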
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + assert cls_score.size(-1) == self.num_classes + 2 + pos_inds = labels < self.num_classes + # 0 for pos, 1 for neg + obj_labels = (labels == self.num_classes).long() + + # accumulate the samples for each category + unique_labels = labels.unique() + for u_l in unique_labels: + inds_ = labels == u_l.item() + self.cum_samples[u_l] += inds_.sum() + + if label_weights is not None: + label_weights = label_weights.float() + else: + label_weights = labels.new_ones(labels.size(), dtype=torch.float) + + cls_score_classes, cls_score_objectness = self._split_cls_score( + cls_score) + # calculate loss_cls_classes (only need pos samples) + if pos_inds.sum() > 0: + loss_cls_classes = self.loss_weight * self.cls_criterion( + cls_score_classes[pos_inds], labels[pos_inds], + label_weights[pos_inds], self.cum_samples[:self.num_classes], + self.num_classes, self.p, self.q, self.eps, reduction, + avg_factor) + else: + loss_cls_classes = cls_score_classes[pos_inds].sum() + # calculate loss_cls_objectness + loss_cls_objectness = self.loss_weight * cross_entropy( + cls_score_objectness, obj_labels, label_weights, reduction, + avg_factor) + + if self.return_dict: + loss_cls = dict() + loss_cls['loss_cls_objectness'] = loss_cls_objectness + loss_cls['loss_cls_classes'] = loss_cls_classes + else: + loss_cls = loss_cls_classes + loss_cls_objectness + return loss_cls diff --git a/head_extractor/build/lib/mmdet/models/losses/smooth_l1_loss.py b/head_extractor/build/lib/mmdet/models/losses/smooth_l1_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..102f9780706172a44ade2ebe1709c7a1e847db7c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/smooth_l1_loss.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weighted_loss + + +@weighted_loss +def smooth_l1_loss(pred: Tensor, target: Tensor, beta: float = 1.0) -> Tensor: + """Smooth L1 loss. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + + Returns: + Tensor: Calculated loss + """ + assert beta > 0 + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + return loss + + +@weighted_loss +def l1_loss(pred: Tensor, target: Tensor) -> Tensor: + """L1 loss. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + + Returns: + Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + + +@MODELS.register_module() +class SmoothL1Loss(nn.Module): + """Smooth L1 loss. + + Args: + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. 
+ """ + + def __init__(self, + beta: float = 1.0, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: Calculated loss + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * smooth_l1_loss( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox + + +@MODELS.register_module() +class L1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + super().__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. + + Args: + pred (Tensor): The prediction. + target (Tensor): The learning target of the prediction. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + + Returns: + Tensor: Calculated loss + """ + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox diff --git a/head_extractor/build/lib/mmdet/models/losses/triplet_loss.py b/head_extractor/build/lib/mmdet/models/losses/triplet_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..4528239beb4bf122fa1a05ee2ce21cb1cb144bde --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/triplet_loss.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class TripletLoss(BaseModule): + """Triplet loss with hard positive/negative mining. + + Reference: + Hermans et al. 
In Defense of the Triplet Loss for
+        Person Re-Identification. arXiv:1703.07737.
+    Imported from ``_.
+    Args:
+        margin (float, optional): Margin for triplet loss. Defaults to 0.3.
+        loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+        hard_mining (bool, optional): Whether to perform hard mining.
+            Defaults to True.
+    """
+
+    def __init__(self,
+                 margin: float = 0.3,
+                 loss_weight: float = 1.0,
+                 hard_mining=True):
+        super(TripletLoss, self).__init__()
+        self.margin = margin
+        self.ranking_loss = nn.MarginRankingLoss(margin=margin)
+        self.loss_weight = loss_weight
+        self.hard_mining = hard_mining
+
+    def hard_mining_triplet_loss_forward(
+            self, inputs: torch.Tensor,
+            targets: torch.LongTensor) -> torch.Tensor:
+        """
+        Args:
+            inputs (torch.Tensor): feature matrix with shape
+                (batch_size, feat_dim).
+            targets (torch.LongTensor): ground truth labels with shape
+                (batch_size).
+
+        Returns:
+            torch.Tensor: triplet loss with hard mining.
+        """
+
+        batch_size = inputs.size(0)
+
+        # Compute Euclidean distance
+        dist = torch.pow(inputs, 2).sum(
+            dim=1, keepdim=True).expand(batch_size, batch_size)
+        dist = dist + dist.t()
+        dist.addmm_(inputs, inputs.t(), beta=1, alpha=-2)
+        dist = dist.clamp(min=1e-12).sqrt()  # for numerical stability
+
+        # For each anchor, find the furthest positive sample
+        # and nearest negative sample in the embedding space
+        mask = targets.expand(batch_size, batch_size).eq(
+            targets.expand(batch_size, batch_size).t())
+        dist_ap, dist_an = [], []
+        for i in range(batch_size):
+            dist_ap.append(dist[i][mask[i]].max().unsqueeze(0))
+            dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0))
+        dist_ap = torch.cat(dist_ap)
+        dist_an = torch.cat(dist_an)
+
+        # Compute ranking hinge loss
+        y = torch.ones_like(dist_an)
+        return self.loss_weight * self.ranking_loss(dist_an, dist_ap, y)
+
+    def forward(self, inputs: torch.Tensor,
+                targets: torch.LongTensor) -> torch.Tensor:
+        """
+        Args:
+            inputs (torch.Tensor): feature matrix with shape
+                (batch_size, feat_dim).
+            targets (torch.LongTensor): ground truth labels with shape
+                (batch_size,).
+
+        Returns:
+            torch.Tensor: triplet loss.
+        """
+        if self.hard_mining:
+            return self.hard_mining_triplet_loss_forward(inputs, targets)
+        else:
+            raise NotImplementedError()
diff --git a/head_extractor/build/lib/mmdet/models/losses/utils.py b/head_extractor/build/lib/mmdet/models/losses/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e6e7859f353f3e5456f0cfc1f66b4b0ad535427
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/losses/utils.py
@@ -0,0 +1,125 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import functools
+from typing import Callable, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def reduce_loss(loss: Tensor, reduction: str) -> Tensor:
+    """Reduce loss as specified.
+
+    Args:
+        loss (Tensor): Elementwise loss tensor.
+        reduction (str): Options are "none", "mean" and "sum".
+
+    Return:
+        Tensor: Reduced loss tensor.
+    """
+    reduction_enum = F._Reduction.get_enum(reduction)
+    # none: 0, elementwise_mean:1, sum: 2
+    if reduction_enum == 0:
+        return loss
+    elif reduction_enum == 1:
+        return loss.mean()
+    elif reduction_enum == 2:
+        return loss.sum()
+
+
+def weight_reduce_loss(loss: Tensor,
+                       weight: Optional[Tensor] = None,
+                       reduction: str = 'mean',
+                       avg_factor: Optional[float] = None) -> Tensor:
+    """Apply element-wise weight and reduce loss.
+
+    Args:
+        loss (Tensor): Element-wise loss.
+ weight (Optional[Tensor], optional): Element-wise weights. + Defaults to None. + reduction (str, optional): Same as built-in losses of PyTorch. + Defaults to 'mean'. + avg_factor (Optional[float], optional): Average factor when + computing the mean of losses. Defaults to None. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func: Callable) -> Callable: + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + reduction: str = 'mean', + avg_factor: Optional[int] = None, + **kwargs) -> Tensor: + """ + Args: + pred (Tensor): The prediction. + target (Tensor): Target bboxes. + weight (Optional[Tensor], optional): The weight of loss for each + prediction. Defaults to None. + reduction (str, optional): Options are "none", "mean" and "sum". + Defaults to 'mean'. + avg_factor (Optional[int], optional): Average factor that is used + to average the loss. Defaults to None. + + Returns: + Tensor: Loss tensor. + """ + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/head_extractor/build/lib/mmdet/models/losses/varifocal_loss.py b/head_extractor/build/lib/mmdet/models/losses/varifocal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..58ab167352e1ae32566f5e731339966d5fd10759 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/losses/varifocal_loss.py @@ -0,0 +1,141 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
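+# The focal weighting implemented below is asymmetric: positives
+# (target > 0) keep their IoU target as the weight (when
+# ``iou_weighted=True``), while negatives are down-weighted by
+# ``alpha * |pred.sigmoid() - target| ** gamma``, as described in the
+# VarifocalNet paper.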
+from typing import Optional + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .utils import weight_reduce_loss + + +def varifocal_loss(pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + alpha: float = 0.75, + gamma: float = 2.0, + iou_weighted: bool = True, + reduction: str = 'mean', + avg_factor: Optional[int] = None) -> Tensor: + """`Varifocal Loss `_ + + Args: + pred (Tensor): The prediction with shape (N, C), C is the + number of classes. + target (Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is the number of classes. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal Loss. + Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive example with the iou target. Defaults to True. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + + Returns: + Tensor: Loss tensor. + """ + # pred and target should be of the same size + assert pred.size() == target.size() + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + if iou_weighted: + focal_weight = target * (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + else: + focal_weight = (target > 0.0).float() + \ + alpha * (pred_sigmoid - target).abs().pow(gamma) * \ + (target <= 0.0).float() + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class VarifocalLoss(nn.Module): + + def __init__(self, + use_sigmoid: bool = True, + alpha: float = 0.75, + gamma: float = 2.0, + iou_weighted: bool = True, + reduction: str = 'mean', + loss_weight: float = 1.0) -> None: + """`Varifocal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether the prediction is + used for sigmoid or softmax. Defaults to True. + alpha (float, optional): A balance factor for the negative part of + Varifocal Loss, which is different from the alpha of Focal + Loss. Defaults to 0.75. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + iou_weighted (bool, optional): Whether to weight the loss of the + positive examples with the iou target. Defaults to True. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + """ + super().__init__() + assert use_sigmoid is True, \ + 'Only sigmoid varifocal loss supported now.' + assert alpha >= 0.0 + self.use_sigmoid = use_sigmoid + self.alpha = alpha + self.gamma = gamma + self.iou_weighted = iou_weighted + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + avg_factor: Optional[int] = None, + reduction_override: Optional[str] = None) -> Tensor: + """Forward function. 
+ + Args: + pred (Tensor): The prediction with shape (N, C), C is the + number of classes. + target (Tensor): The learning target of the iou-aware + classification score with shape (N, C), C is + the number of classes. + weight (Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + loss_cls = self.loss_weight * varifocal_loss( + pred, + target, + weight, + alpha=self.alpha, + gamma=self.gamma, + iou_weighted=self.iou_weighted, + reduction=reduction, + avg_factor=avg_factor) + else: + raise NotImplementedError + return loss_cls diff --git a/head_extractor/build/lib/mmdet/models/mot/__init__.py b/head_extractor/build/lib/mmdet/models/mot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1bd3c8d3ba53daad736e05b5d29a6abb377fd595 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/mot/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseMOTModel +from .bytetrack import ByteTrack +from .deep_sort import DeepSORT +from .ocsort import OCSORT +from .qdtrack import QDTrack +from .strongsort import StrongSORT + +__all__ = [ + 'BaseMOTModel', 'ByteTrack', 'QDTrack', 'DeepSORT', 'StrongSORT', 'OCSORT' +] diff --git a/head_extractor/build/lib/mmdet/models/mot/base.py b/head_extractor/build/lib/mmdet/models/mot/base.py new file mode 100644 index 0000000000000000000000000000000000000000..9981417924af3970319b0cbe6a9cc8d8a1095451 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/mot/base.py @@ -0,0 +1,147 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +from mmengine.model import BaseModel +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import OptTrackSampleList, TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class BaseMOTModel(BaseModel, metaclass=ABCMeta): + """Base class for multiple object tracking. + + Args: + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Initialization config dict. 
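To make the asymmetric weighting in varifocal_loss above concrete, here is the focal weight evaluated by hand for one positive and one negative (standalone sketch, only torch assumed):

import torch

pred = torch.tensor([2.0, -1.0])    # logits
target = torch.tensor([0.9, 0.0])   # IoU-aware targets: one positive, one negative
p = pred.sigmoid()
alpha, gamma = 0.75, 2.0
focal_weight = target * (target > 0.0).float() + \
    alpha * (p - target).abs().pow(gamma) * (target <= 0.0).float()
print(focal_weight)  # ~tensor([0.9000, 0.0542]): the positive keeps its IoU
                     # weight while the easy negative is strongly down-weighted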
+ """ + + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + def freeze_module(self, module: Union[List[str], Tuple[str], str]) -> None: + """Freeze module during training.""" + if isinstance(module, str): + modules = [module] + else: + if not (isinstance(module, list) or isinstance(module, tuple)): + raise TypeError('module must be a str or a list.') + else: + modules = module + for module in modules: + m = getattr(self, module) + m.eval() + for param in m.parameters(): + param.requires_grad = False + + @property + def with_detector(self) -> bool: + """bool: whether the framework has a detector.""" + return hasattr(self, 'detector') and self.detector is not None + + @property + def with_reid(self) -> bool: + """bool: whether the framework has a reid model.""" + return hasattr(self, 'reid') and self.reid is not None + + @property + def with_motion(self) -> bool: + """bool: whether the framework has a motion model.""" + return hasattr(self, 'motion') and self.motion is not None + + @property + def with_track_head(self) -> bool: + """bool: whether the framework has a track_head.""" + return hasattr(self, 'track_head') and self.track_head is not None + + @property + def with_tracker(self) -> bool: + """bool: whether the framework has a tracker.""" + return hasattr(self, 'tracker') and self.tracker is not None + + def forward(self, + inputs: Dict[str, Tensor], + data_samples: OptTrackSampleList = None, + mode: str = 'predict', + **kwargs): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`TrackDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle neither back propagation nor + optimizer updating, which are done in the :meth:`train_step`. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) + encoding input images. Typically these should be mean centered + and std scaled. The N denotes batch size. The T denotes the + number of key/reference frames. + - img (Tensor) : The key images. + - ref_img (Tensor): The reference images. + data_samples (list[:obj:`TrackDataSample`], optional): The + annotation data of every samples. Defaults to None. + mode (str): Return what kind of value. Defaults to 'predict'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`TrackDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'loss': + return self.loss(inputs, data_samples, **kwargs) + elif mode == 'predict': + return self.predict(inputs, data_samples, **kwargs) + elif mode == 'tensor': + return self._forward(inputs, data_samples, **kwargs) + else: + raise RuntimeError(f'Invalid mode "{mode}". 
' + 'Only supports loss, predict and tensor mode') + + @abstractmethod + def loss(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples.""" + pass + + @abstractmethod + def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> TrackSampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass + + def _forward(self, + inputs: Dict[str, Tensor], + data_samples: OptTrackSampleList = None, + **kwargs): + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W). + data_samples (List[:obj:`TrackDataSample`], optional): The + Data Samples. It usually includes information such as + `gt_instance`. + + Returns: + tuple[list]: A tuple of features from ``head`` forward. + """ + raise NotImplementedError( + "_forward function (namely 'tensor' mode) is not supported now") diff --git a/head_extractor/build/lib/mmdet/models/mot/bytetrack.py b/head_extractor/build/lib/mmdet/models/mot/bytetrack.py new file mode 100644 index 0000000000000000000000000000000000000000..8a3bb867cb284aad9854de44b2942341a4a33be8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/mot/bytetrack.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList, TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig +from .base import BaseMOTModel + + +@MODELS.register_module() +class ByteTrack(BaseMOTModel): + """ByteTrack: Multi-Object Tracking by Associating Every Detection Box. + + This multi object tracker is the implementation of `ByteTrack + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + def loss(self, inputs: Tensor, data_samples: SampleList, **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): of shape (N, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size + data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + return self.detector.loss(inputs, data_samples, **kwargs) + + def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post-processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. 
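The freeze_module helper defined above boils down to two standard PyTorch steps, shown here in isolation (sketch, plain torch only; the detector module is a stand-in):

import torch.nn as nn

detector = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8))
detector.eval()                      # fix BatchNorm running statistics
for param in detector.parameters():
    param.requires_grad = False      # exclude the weights from optimization
assert not any(p.requires_grad for p in detector.parameters())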
+ The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'Bytetrack inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'Bytetrack inference only support 1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + data_sample=det_results[0], **kwargs) + img_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/head_extractor/build/lib/mmdet/models/mot/deep_sort.py b/head_extractor/build/lib/mmdet/models/mot/deep_sort.py new file mode 100644 index 0000000000000000000000000000000000000000..70b30c7b07b2211fd0ad70767f479e57b6cd33f6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/mot/deep_sort.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType +from .base import BaseMOTModel + + +@MODELS.register_module() +class DeepSORT(BaseMOTModel): + """Simple online and realtime tracking with a deep association metric. + + Details can be found at `DeepSORT`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + reid: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if reid is not None: + self.reid = MODELS.build(reid) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + self.preprocess_cfg = data_preprocessor + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + raise NotImplementedError( + 'Please train `detector` and `reid` models firstly, then \ + inference with SORT/DeepSORT.') + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post- processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of key frames + and reference frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. 
It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: List[TrackDataSample] + Tracking results of the input videos. + Each DetDataSample usually contains ``pred_track_instances``. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + if track_data_sample[0].frame_id == 0: + self.tracker.reset() + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + model=self, + img=single_img, + feats=None, + data_sample=det_results[0], + data_preprocessor=self.preprocess_cfg, + rescale=rescale, + **kwargs) + img_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/head_extractor/build/lib/mmdet/models/mot/ocsort.py b/head_extractor/build/lib/mmdet/models/mot/ocsort.py new file mode 100644 index 0000000000000000000000000000000000000000..abf4eb3b06e2b1b223fe948f30dac877248377e3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/mot/ocsort.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import Dict, Optional + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig +from .base import BaseMOTModel + + +@MODELS.register_module() +class OCSORT(BaseMOTModel): + """OCOSRT: Observation-Centric SORT: Rethinking SORT for Robust + Multi-Object Tracking + + This multi object tracker is the implementation of `OC-SORT + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + motion (dict): Configuration of motion. Defaults to None. + init_cfg (dict): Configuration of initialization. Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + return self.detector.loss(inputs, data_samples, **kwargs) + + def predict(self, inputs: Dict[str, Tensor], data_samples: TrackSampleList, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post-processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. 
It usually includes information such + as `video_data_samples`. + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'OCSORT inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'OCSORT inference only support 1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + data_sample=det_results[0], **kwargs) + img_data_sample.pred_track_instances = pred_track_instances + + return [track_data_sample] diff --git a/head_extractor/build/lib/mmdet/models/mot/qdtrack.py b/head_extractor/build/lib/mmdet/models/mot/qdtrack.py new file mode 100644 index 0000000000000000000000000000000000000000..43d5dd60b8af8a6200e21a196c47d00dd2812a46 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/mot/qdtrack.py @@ -0,0 +1,186 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig +from .base import BaseMOTModel + + +@MODELS.register_module() +class QDTrack(BaseMOTModel): + """Quasi-Dense Similarity Learning for Multiple Object Tracking. + + This multi object tracker is the implementation of `QDTrack + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + freeze_detector (bool): If True, freeze the detector weights. + Defaults to False. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + track_head: Optional[dict] = None, + tracker: Optional[dict] = None, + freeze_detector: bool = False, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + if detector is not None: + self.detector = MODELS.build(detector) + + if track_head is not None: + self.track_head = MODELS.build(track_head) + + if tracker is not None: + self.tracker = MODELS.build(tracker) + + self.freeze_detector = freeze_detector + if self.freeze_detector: + self.freeze_module('detector') + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post- processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. 
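ByteTrack and OC-SORT above share the same per-frame slicing of the (N, T, C, H, W) clip; in isolation (toy sketch, only torch assumed):

import torch

inputs = torch.rand(1, 3, 3, 64, 64)   # N=1 video, T=3 frames
frames = [inputs[:, t].contiguous() for t in range(inputs.size(1))]
assert frames[0].shape == (1, 3, 64, 64)   # one (N, C, H, W) batch per frame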
+ rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'QDTrack inference only support 1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'QDTrack only support 1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + if track_data_sample[0].frame_id == 0: + self.tracker.reset() + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + x = self.detector.extract_feat(single_img) + rpn_results_list = self.detector.rpn_head.predict( + x, [img_data_sample]) + # det_results List[InstanceData] + det_results = self.detector.roi_head.predict( + x, rpn_results_list, [img_data_sample], rescale=rescale) + assert len(det_results) == 1, 'Batch inference is not supported.' + img_data_sample.pred_instances = det_results[0] + frame_pred_track_instances = self.tracker.track( + model=self, + img=single_img, + feats=x, + data_sample=img_data_sample, + **kwargs) + img_data_sample.pred_track_instances = frame_pred_track_instances + + return [track_data_sample] + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> Union[dict, tuple]: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + + Returns: + dict: A dictionary of loss components. + """ + # modify the inputs shape to fit mmdet + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(1) == 2, \ + 'QDTrack can only have 1 key frame and 1 reference frame.' + + # split the data_samples into two aspects: key frames and reference + # frames + ref_data_samples, key_data_samples = [], [] + key_frame_inds, ref_frame_inds = [], [] + # set cat_id of gt_labels to 0 in RPN + for track_data_sample in data_samples: + key_frame_inds.append(track_data_sample.key_frames_inds[0]) + ref_frame_inds.append(track_data_sample.ref_frames_inds[0]) + key_data_sample = track_data_sample.get_key_frames()[0] + key_data_sample.gt_instances.labels = \ + torch.zeros_like(key_data_sample.gt_instances.labels) + key_data_samples.append(key_data_sample) + ref_data_sample = track_data_sample.get_ref_frames()[0] + ref_data_samples.append(ref_data_sample) + + key_frame_inds = torch.tensor(key_frame_inds, dtype=torch.int64) + ref_frame_inds = torch.tensor(ref_frame_inds, dtype=torch.int64) + batch_inds = torch.arange(len(inputs)) + key_imgs = inputs[batch_inds, key_frame_inds].contiguous() + ref_imgs = inputs[batch_inds, ref_frame_inds].contiguous() + + x = self.detector.extract_feat(key_imgs) + ref_x = self.detector.extract_feat(ref_imgs) + + losses = dict() + # RPN head forward and loss + assert self.detector.with_rpn, \ + 'QDTrack only support detector with RPN.' + + proposal_cfg = self.detector.train_cfg.get('rpn_proposal', + self.detector.test_cfg.rpn) + rpn_losses, rpn_results_list = self.detector.rpn_head. 
\ + loss_and_predict(x, + key_data_samples, + proposal_cfg=proposal_cfg, + **kwargs) + ref_rpn_results_list = self.detector.rpn_head.predict( + ref_x, ref_data_samples, **kwargs) + + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + + # roi_head loss + losses_detect = self.detector.roi_head.loss(x, rpn_results_list, + key_data_samples, **kwargs) + losses.update(losses_detect) + + # tracking head loss + losses_track = self.track_head.loss(x, ref_x, rpn_results_list, + ref_rpn_results_list, data_samples, + **kwargs) + losses.update(losses_track) + + return losses diff --git a/head_extractor/build/lib/mmdet/models/mot/strongsort.py b/head_extractor/build/lib/mmdet/models/mot/strongsort.py new file mode 100644 index 0000000000000000000000000000000000000000..6129bf49972233206b3c05daa2174f99723d1b9d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/mot/strongsort.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import numpy as np +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType +from .deep_sort import DeepSORT + + +@MODELS.register_module() +class StrongSORT(DeepSORT): + """StrongSORT: Make DeepSORT Great Again. + + Details can be found at `StrongSORT`_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + reid (dict): Configuration of reid. Defaults to None + tracker (dict): Configuration of tracker. Defaults to None. + kalman (dict): Configuration of Kalman filter. Defaults to None. + cmc (dict): Configuration of camera model compensation. + Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + reid: Optional[dict] = None, + cmc: Optional[dict] = None, + tracker: Optional[dict] = None, + postprocess_model: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(detector, reid, tracker, data_preprocessor, init_cfg) + + if cmc is not None: + self.cmc = TASK_UTILS.build(cmc) + + if postprocess_model is not None: + self.postprocess_model = TASK_UTILS.build(postprocess_model) + + @property + def with_cmc(self): + """bool: whether the framework has a camera model compensation + model. + """ + return hasattr(self, 'cmc') and self.cmc is not None + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Predict results from a video and data samples with post- processing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of key frames + and reference frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. 
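The loss-key renaming in QDTrack.loss above, reduced to plain dictionaries. Note that popping from a dict while iterating its live keys() view raises RuntimeError in Python 3, so this sketch snapshots the keys with list() first:

rpn_losses = {'loss_cls': 1.0, 'loss_bbox': 2.0, 'loss_rpn_extra': 3.0}
for key in list(rpn_losses.keys()):        # snapshot before mutating
    if 'loss' in key and 'rpn' not in key:
        rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key)
print(rpn_losses)
# {'loss_rpn_extra': 3.0, 'rpn_loss_cls': 1.0, 'rpn_loss_bbox': 2.0}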
+ + Returns: + TrackSampleList: List[TrackDataSample] + Tracking results of the input videos. + Each DetDataSample usually contains ``pred_track_instances``. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(0) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + assert len(data_samples) == 1, \ + 'SORT/DeepSORT inference only support ' \ + '1 batch size per gpu for now.' + + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + + video_track_instances = [] + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + # det_results List[DetDataSample] + det_results = self.detector.predict(single_img, [img_data_sample]) + assert len(det_results) == 1, 'Batch inference is not supported.' + + pred_track_instances = self.tracker.track( + model=self, + img=single_img, + data_sample=det_results[0], + data_preprocessor=self.preprocess_cfg, + rescale=rescale, + **kwargs) + for i in range(len(pred_track_instances.instances_id)): + video_track_instances.append( + np.array([ + frame_id + 1, + pred_track_instances.instances_id[i].cpu(), + pred_track_instances.bboxes[i][0].cpu(), + pred_track_instances.bboxes[i][1].cpu(), + (pred_track_instances.bboxes[i][2] - + pred_track_instances.bboxes[i][0]).cpu(), + (pred_track_instances.bboxes[i][3] - + pred_track_instances.bboxes[i][1]).cpu(), + pred_track_instances.scores[i].cpu() + ])) + video_track_instances = np.array(video_track_instances).reshape(-1, 7) + video_track_instances = self.postprocess_model.forward( + video_track_instances) + for frame_id in range(video_len): + track_data_sample[frame_id].pred_track_instances = \ + InstanceData(bboxes=video_track_instances[ + video_track_instances[:, 0] == frame_id + 1, :]) + + return [track_data_sample] diff --git a/head_extractor/build/lib/mmdet/models/necks/__init__.py b/head_extractor/build/lib/mmdet/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..343fbfefbd871d00e855d1c3cf4b531345e4dcf1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bfp import BFP +from .channel_mapper import ChannelMapper +from .cspnext_pafpn import CSPNeXtPAFPN +from .ct_resnet_neck import CTResNetNeck +from .dilated_encoder import DilatedEncoder +from .dyhead import DyHead +from .fpg import FPG +from .fpn import FPN +from .fpn_carafe import FPN_CARAFE +from .fpn_dropblock import FPN_DropBlock +from .hrfpn import HRFPN +from .nas_fpn import NASFPN +from .nasfcos_fpn import NASFCOS_FPN +from .pafpn import PAFPN +from .rfp import RFP +from .ssd_neck import SSDNeck +from .ssh import SSH +from .yolo_neck import YOLOV3Neck +from .yolox_pafpn import YOLOXPAFPN + +__all__ = [ + 'FPN', 'BFP', 'ChannelMapper', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'PAFPN', + 'NASFCOS_FPN', 'RFP', 'YOLOV3Neck', 'FPG', 'DilatedEncoder', + 'CTResNetNeck', 'SSDNeck', 'YOLOXPAFPN', 'DyHead', 'CSPNeXtPAFPN', 'SSH', + 'FPN_DropBlock' +] diff --git a/head_extractor/build/lib/mmdet/models/necks/bfp.py b/head_extractor/build/lib/mmdet/models/necks/bfp.py new file mode 100644 index 0000000000000000000000000000000000000000..401cdb0f552b06c9e8eb185c3e8ae0ba7112a9d8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/bfp.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
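The per-track rows that StrongSORT.predict assembles above follow a MOTChallenge-style layout; constructing one row by hand (illustrative sketch, numpy only, values are made up):

import numpy as np

x1, y1, x2, y2 = 100., 50., 140., 170.      # tracked box in corner format
row = np.array([1,          # frame index (1-based)
                7,          # track id
                x1, y1,     # top-left corner
                x2 - x1,    # width
                y2 - y1,    # height
                0.98])      # detection score
video_track_instances = row.reshape(-1, 7)  # same (num_rows, 7) shape as above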
+from typing import Tuple + +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import NonLocal2d +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class BFP(BaseModule): + """BFP (Balanced Feature Pyramids) + + BFP takes multi-level features as inputs and gather them into a single one, + then refine the gathered feature and scatter the refined results to + multi-level features. This module is used in Libra R-CNN (CVPR 2019), see + the paper `Libra R-CNN: Towards Balanced Learning for Object Detection + `_ for details. + + Args: + in_channels (int): Number of input channels (feature maps of all levels + should have the same channels). + num_levels (int): Number of input feature levels. + refine_level (int): Index of integration and refine level of BSF in + multi-level features from bottom to top. + refine_type (str): Type of the refine op, currently support + [None, 'conv', 'non_local']. + conv_cfg (:obj:`ConfigDict` or dict, optional): The config dict for + convolution layers. + norm_cfg (:obj:`ConfigDict` or dict, optional): The config dict for + normalization layers. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or + dict], optional): Initialization config dict. + """ + + def __init__( + self, + in_channels: int, + num_levels: int, + refine_level: int = 2, + refine_type: str = None, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert refine_type in [None, 'conv', 'non_local'] + + self.in_channels = in_channels + self.num_levels = num_levels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.refine_level = refine_level + self.refine_type = refine_type + assert 0 <= self.refine_level < self.num_levels + + if self.refine_type == 'conv': + self.refine = ConvModule( + self.in_channels, + self.in_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + elif self.refine_type == 'non_local': + self.refine = NonLocal2d( + self.in_channels, + reduction=1, + use_scale=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function.""" + assert len(inputs) == self.num_levels + + # step 1: gather multi-level features by resize and average + feats = [] + gather_size = inputs[self.refine_level].size()[2:] + for i in range(self.num_levels): + if i < self.refine_level: + gathered = F.adaptive_max_pool2d( + inputs[i], output_size=gather_size) + else: + gathered = F.interpolate( + inputs[i], size=gather_size, mode='nearest') + feats.append(gathered) + + bsf = sum(feats) / len(feats) + + # step 2: refine gathered features + if self.refine_type is not None: + bsf = self.refine(bsf) + + # step 3: scatter refined features to multi-levels by a residual path + outs = [] + for i in range(self.num_levels): + out_size = inputs[i].size()[2:] + if i < self.refine_level: + residual = F.interpolate(bsf, size=out_size, mode='nearest') + else: + residual = F.adaptive_max_pool2d(bsf, output_size=out_size) + outs.append(residual + inputs[i]) + + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/channel_mapper.py b/head_extractor/build/lib/mmdet/models/necks/channel_mapper.py new file mode 100644 index 
0000000000000000000000000000000000000000..74293618f2b8a649328ae4a5a0571809de9991dd --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/channel_mapper.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class ChannelMapper(BaseModule): + """Channel Mapper to reduce/increase channels of backbone features. + + This is used to reduce/increase channels of backbone features. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + kernel_size (int, optional): kernel_size for reducing channels (used + at each scale). Default: 3. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Default: None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Default: None. + act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + activation layer in ConvModule. Default: dict(type='ReLU'). + bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + num_outs (int, optional): Number of output feature maps. There would + be extra_convs when num_outs larger than the length of in_channels. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or dict], + optional): Initialization config dict. + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = ChannelMapper(in_channels, 11, 3).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... 
print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + kernel_size: int = 3, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + act_cfg: OptConfigType = dict(type='ReLU'), + bias: Union[bool, str] = 'auto', + num_outs: int = None, + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.extra_convs = None + if num_outs is None: + num_outs = len(in_channels) + self.convs = nn.ModuleList() + for in_channel in in_channels: + self.convs.append( + ConvModule( + in_channel, + out_channels, + kernel_size, + padding=(kernel_size - 1) // 2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=bias)) + if num_outs > len(in_channels): + self.extra_convs = nn.ModuleList() + for i in range(len(in_channels), num_outs): + if i == len(in_channels): + in_channel = in_channels[-1] + else: + in_channel = out_channels + self.extra_convs.append( + ConvModule( + in_channel, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=bias)) + + def forward(self, inputs: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function.""" + assert len(inputs) == len(self.convs) + outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] + if self.extra_convs: + for i in range(len(self.extra_convs)): + if i == 0: + outs.append(self.extra_convs[0](inputs[-1])) + else: + outs.append(self.extra_convs[i](outs[-1])) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/cspnext_pafpn.py b/head_extractor/build/lib/mmdet/models/necks/cspnext_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..a52ba72d9b3e48c4866fb16507bc2118eb23010e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/cspnext_pafpn.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from ..layers import CSPLayer + + +@MODELS.register_module() +class CSPNeXtPAFPN(BaseModule): + """Path Aggregation Network with CSPNeXt blocks. + + Args: + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_csp_blocks (int): Number of bottlenecks in CSPLayer. + Defaults to 3. + use_depthwise (bool): Whether to use depthwise separable convolution in + blocks. Defaults to False. + expand_ratio (float): Ratio to adjust the number of channels of the + hidden layer. Default: 0.5 + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish') + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None. + """ + + def __init__( + self, + in_channels: Sequence[int], + out_channels: int, + num_csp_blocks: int = 3, + use_depthwise: bool = False, + expand_ratio: float = 0.5, + upsample_cfg: ConfigType = dict(scale_factor=2, mode='nearest'), + conv_cfg: bool = None, + norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001), + act_cfg: ConfigType = dict(type='Swish'), + init_cfg: OptMultiConfig = dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu') + ) -> None: + super().__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + # build top-down blocks + self.upsample = nn.Upsample(**upsample_cfg) + self.reduce_layers = nn.ModuleList() + self.top_down_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.reduce_layers.append( + ConvModule( + in_channels[idx], + in_channels[idx - 1], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.top_down_blocks.append( + CSPLayer( + in_channels[idx - 1] * 2, + in_channels[idx - 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # build bottom-up blocks + self.downsamples = nn.ModuleList() + self.bottom_up_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv( + in_channels[idx], + in_channels[idx], + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottom_up_blocks.append( + CSPLayer( + in_channels[idx] * 2, + in_channels[idx + 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + use_cspnext_block=True, + expand_ratio=expand_ratio, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.out_convs = nn.ModuleList() + for i in range(len(in_channels)): + self.out_convs.append( + conv( + in_channels[i], + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs: Tuple[Tensor, ...]) -> Tuple[Tensor, ...]: + """ + Args: + inputs (tuple[Tensor]): input features. + + Returns: + tuple[Tensor]: YOLOXPAFPN features. 
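Returning to ChannelMapper defined earlier: when num_outs exceeds the number of input scales, stride-2 extra convs extend the pyramid, the first reading the last input map and the rest chaining on the previous output. A shape check (sketch, assuming the vendored mmdet/mmcv are importable):

import torch
from mmdet.models.necks import ChannelMapper

neck = ChannelMapper(in_channels=[8, 16], out_channels=4, num_outs=4).eval()
outs = neck((torch.rand(1, 8, 32, 32), torch.rand(1, 16, 16, 16)))
print([tuple(o.shape) for o in outs])
# [(1, 4, 32, 32), (1, 4, 16, 16), (1, 4, 8, 8), (1, 4, 4, 4)]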
+ """ + assert len(inputs) == len(self.in_channels) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + torch.cat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx]( + torch.cat([downsample_feat, feat_height], 1)) + outs.append(out) + + # out convs + for idx, conv in enumerate(self.out_convs): + outs[idx] = conv(outs[idx]) + + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/ct_resnet_neck.py b/head_extractor/build/lib/mmdet/models/necks/ct_resnet_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..9109fe79290fafecd954f223d5365ef619c0c301 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/ct_resnet_neck.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import OptMultiConfig + + +@MODELS.register_module() +class CTResNetNeck(BaseModule): + """The neck used in `CenterNet `_ for + object classification and box regression. + + Args: + in_channels (int): Number of input channels. + num_deconv_filters (tuple[int]): Number of filters per stage. + num_deconv_kernels (tuple[int]): Number of kernels per stage. + use_dcn (bool): If True, use DCNv2. Defaults to True. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`], optional): Initialization + config dict. 
+ """ + + def __init__(self, + in_channels: int, + num_deconv_filters: Tuple[int, ...], + num_deconv_kernels: Tuple[int, ...], + use_dcn: bool = True, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert len(num_deconv_filters) == len(num_deconv_kernels) + self.fp16_enabled = False + self.use_dcn = use_dcn + self.in_channels = in_channels + self.deconv_layers = self._make_deconv_layer(num_deconv_filters, + num_deconv_kernels) + + def _make_deconv_layer( + self, num_deconv_filters: Tuple[int, ...], + num_deconv_kernels: Tuple[int, ...]) -> nn.Sequential: + """use deconv layers to upsample backbone's output.""" + layers = [] + for i in range(len(num_deconv_filters)): + feat_channels = num_deconv_filters[i] + conv_module = ConvModule( + self.in_channels, + feat_channels, + 3, + padding=1, + conv_cfg=dict(type='DCNv2') if self.use_dcn else None, + norm_cfg=dict(type='BN')) + layers.append(conv_module) + upsample_module = ConvModule( + feat_channels, + feat_channels, + num_deconv_kernels[i], + stride=2, + padding=1, + conv_cfg=dict(type='deconv'), + norm_cfg=dict(type='BN')) + layers.append(upsample_module) + self.in_channels = feat_channels + + return nn.Sequential(*layers) + + def init_weights(self) -> None: + """Initialize the parameters.""" + for m in self.modules(): + if isinstance(m, nn.ConvTranspose2d): + # In order to be consistent with the source code, + # reset the ConvTranspose2d initialization parameters + m.reset_parameters() + # Simulated bilinear upsampling kernel + w = m.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * ( + 1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + # self.use_dcn is False + elif not self.use_dcn and isinstance(m, nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() + + def forward(self, x: Sequence[torch.Tensor]) -> Tuple[torch.Tensor]: + """model forward.""" + assert isinstance(x, (list, tuple)) + outs = self.deconv_layers(x[-1]) + return outs, diff --git a/head_extractor/build/lib/mmdet/models/necks/dilated_encoder.py b/head_extractor/build/lib/mmdet/models/necks/dilated_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..e9beb3ea9b4289da8d0100ae7759927f045829bb --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/dilated_encoder.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, is_norm +from mmengine.model import caffe2_xavier_init, constant_init, normal_init +from torch.nn import BatchNorm2d + +from mmdet.registry import MODELS + + +class Bottleneck(nn.Module): + """Bottleneck block for DilatedEncoder used in `YOLOF. + + `. + + The Bottleneck contains three ConvLayers and one residual connection. + + Args: + in_channels (int): The number of input channels. + mid_channels (int): The number of middle output channels. + dilation (int): Dilation rate. + norm_cfg (dict): Dictionary to construct and config norm layer. 
+ """ + + def __init__(self, + in_channels, + mid_channels, + dilation, + norm_cfg=dict(type='BN', requires_grad=True)): + super(Bottleneck, self).__init__() + self.conv1 = ConvModule( + in_channels, mid_channels, 1, norm_cfg=norm_cfg) + self.conv2 = ConvModule( + mid_channels, + mid_channels, + 3, + padding=dilation, + dilation=dilation, + norm_cfg=norm_cfg) + self.conv3 = ConvModule( + mid_channels, in_channels, 1, norm_cfg=norm_cfg) + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + out = out + identity + return out + + +@MODELS.register_module() +class DilatedEncoder(nn.Module): + """Dilated Encoder for YOLOF `. + + This module contains two types of components: + - the original FPN lateral convolution layer and fpn convolution layer, + which are 1x1 conv + 3x3 conv + - the dilated residual block + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + block_mid_channels (int): The number of middle block output channels + num_residual_blocks (int): The number of residual blocks. + block_dilations (list): The list of residual blocks dilation. + """ + + def __init__(self, in_channels, out_channels, block_mid_channels, + num_residual_blocks, block_dilations): + super(DilatedEncoder, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.block_mid_channels = block_mid_channels + self.num_residual_blocks = num_residual_blocks + self.block_dilations = block_dilations + self._init_layers() + + def _init_layers(self): + self.lateral_conv = nn.Conv2d( + self.in_channels, self.out_channels, kernel_size=1) + self.lateral_norm = BatchNorm2d(self.out_channels) + self.fpn_conv = nn.Conv2d( + self.out_channels, self.out_channels, kernel_size=3, padding=1) + self.fpn_norm = BatchNorm2d(self.out_channels) + encoder_blocks = [] + for i in range(self.num_residual_blocks): + dilation = self.block_dilations[i] + encoder_blocks.append( + Bottleneck( + self.out_channels, + self.block_mid_channels, + dilation=dilation)) + self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks) + + def init_weights(self): + caffe2_xavier_init(self.lateral_conv) + caffe2_xavier_init(self.fpn_conv) + for m in [self.lateral_norm, self.fpn_norm]: + constant_init(m, 1) + for m in self.dilated_encoder_blocks.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, mean=0, std=0.01) + if is_norm(m): + constant_init(m, 1) + + def forward(self, feature): + out = self.lateral_norm(self.lateral_conv(feature[-1])) + out = self.fpn_norm(self.fpn_conv(out)) + return self.dilated_encoder_blocks(out), diff --git a/head_extractor/build/lib/mmdet/models/necks/dyhead.py b/head_extractor/build/lib/mmdet/models/necks/dyhead.py new file mode 100644 index 0000000000000000000000000000000000000000..5f5ae0b285c20558a0c7bcc59cbb7b214684eab2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/dyhead.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.ops.modulated_deform_conv import ModulatedDeformConv2d +from mmengine.model import BaseModule, constant_init, normal_init + +from mmdet.registry import MODELS +from ..layers import DyReLU + +# Reference: +# https://github.com/microsoft/DynamicHead +# https://github.com/jshilong/SEPC + + +class DyDCNv2(nn.Module): + """ModulatedDeformConv2d with normalization layer used in DyHead. 
+ + This module cannot be configured with `conv_cfg=dict(type='DCNv2')` + because DyHead calculates offset and mask from middle-level feature. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int | tuple[int], optional): Stride of the convolution. + Default: 1. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='GN', num_groups=16, requires_grad=True). + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + norm_cfg=dict(type='GN', num_groups=16, requires_grad=True)): + super().__init__() + self.with_norm = norm_cfg is not None + bias = not self.with_norm + self.conv = ModulatedDeformConv2d( + in_channels, out_channels, 3, stride=stride, padding=1, bias=bias) + if self.with_norm: + self.norm = build_norm_layer(norm_cfg, out_channels)[1] + + def forward(self, x, offset, mask): + """Forward function.""" + x = self.conv(x.contiguous(), offset, mask) + if self.with_norm: + x = self.norm(x) + return x + + +class DyHeadBlock(nn.Module): + """DyHead Block with three types of attention. + + HSigmoid arguments in default act_cfg follow official code, not paper. + https://github.com/microsoft/DynamicHead/blob/master/dyhead/dyrelu.py + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + zero_init_offset (bool, optional): Whether to use zero init for + `spatial_conv_offset`. Default: True. + act_cfg (dict, optional): Config dict for the last activation layer of + scale-aware attention. Default: dict(type='HSigmoid', bias=3.0, + divisor=6.0). + """ + + def __init__(self, + in_channels, + out_channels, + zero_init_offset=True, + act_cfg=dict(type='HSigmoid', bias=3.0, divisor=6.0)): + super().__init__() + self.zero_init_offset = zero_init_offset + # (offset_x, offset_y, mask) * kernel_size_y * kernel_size_x + self.offset_and_mask_dim = 3 * 3 * 3 + self.offset_dim = 2 * 3 * 3 + + self.spatial_conv_high = DyDCNv2(in_channels, out_channels) + self.spatial_conv_mid = DyDCNv2(in_channels, out_channels) + self.spatial_conv_low = DyDCNv2(in_channels, out_channels, stride=2) + self.spatial_conv_offset = nn.Conv2d( + in_channels, self.offset_and_mask_dim, 3, padding=1) + self.scale_attn_module = nn.Sequential( + nn.AdaptiveAvgPool2d(1), nn.Conv2d(out_channels, 1, 1), + nn.ReLU(inplace=True), build_activation_layer(act_cfg)) + self.task_attn_module = DyReLU(out_channels) + self._init_weights() + + def _init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, 0, 0.01) + if self.zero_init_offset: + constant_init(self.spatial_conv_offset, 0) + + def forward(self, x): + """Forward function.""" + outs = [] + for level in range(len(x)): + # calculate offset and mask of DCNv2 from middle-level feature + offset_and_mask = self.spatial_conv_offset(x[level]) + offset = offset_and_mask[:, :self.offset_dim, :, :] + mask = offset_and_mask[:, self.offset_dim:, :, :].sigmoid() + + mid_feat = self.spatial_conv_mid(x[level], offset, mask) + sum_feat = mid_feat * self.scale_attn_module(mid_feat) + summed_levels = 1 + if level > 0: + low_feat = self.spatial_conv_low(x[level - 1], offset, mask) + sum_feat += low_feat * self.scale_attn_module(low_feat) + summed_levels += 1 + if level < len(x) - 1: + # this upsample order is weird, but faster than natural order + # https://github.com/microsoft/DynamicHead/issues/25 + high_feat = F.interpolate( + self.spatial_conv_high(x[level + 1], offset, mask), + size=x[level].shape[-2:], + 
mode='bilinear',
+                    align_corners=True)
+                sum_feat += high_feat * self.scale_attn_module(high_feat)
+                summed_levels += 1
+            outs.append(self.task_attn_module(sum_feat / summed_levels))
+
+        return outs
+
+
+@MODELS.register_module()
+class DyHead(BaseModule):
+    """DyHead neck consisting of multiple DyHead Blocks.
+
+    See `Dynamic Head: Unifying Object Detection Heads with Attentions
+    <https://arxiv.org/abs/2106.08322>`_ for details.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        num_blocks (int, optional): Number of DyHead Blocks. Default: 6.
+        zero_init_offset (bool, optional): Whether to use zero init for
+            `spatial_conv_offset`. Default: True.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=6,
+                 zero_init_offset=True,
+                 init_cfg=None):
+        assert init_cfg is None, 'To prevent abnormal initialization ' \
+            'behavior, init_cfg is not allowed to be set'
+        super().__init__(init_cfg=init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_blocks = num_blocks
+        self.zero_init_offset = zero_init_offset
+
+        dyhead_blocks = []
+        for i in range(num_blocks):
+            in_channels = self.in_channels if i == 0 else self.out_channels
+            dyhead_blocks.append(
+                DyHeadBlock(
+                    in_channels,
+                    self.out_channels,
+                    zero_init_offset=zero_init_offset))
+        self.dyhead_blocks = nn.Sequential(*dyhead_blocks)
+
+    def forward(self, inputs):
+        """Forward function."""
+        assert isinstance(inputs, (tuple, list))
+        outs = self.dyhead_blocks(inputs)
+        return tuple(outs)
diff --git a/head_extractor/build/lib/mmdet/models/necks/fpg.py b/head_extractor/build/lib/mmdet/models/necks/fpg.py
new file mode 100644
index 0000000000000000000000000000000000000000..73ee799bb83645ab2556fe871dcd8b1c5bbff89e
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/necks/fpg.py
@@ -0,0 +1,406 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+
+from mmdet.registry import MODELS
+
+
+class Transition(BaseModule):
+    """Base class for transition.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+    """
+
+    def __init__(self, in_channels, out_channels, init_cfg=None):
+        super().__init__(init_cfg)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+
+    def forward(self, x):
+        pass
+
+
+class UpInterpolationConv(Transition):
+    """A transition used for up-sampling.
+
+    Up-samples the input by interpolation, then refines the feature with
+    a convolution layer.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        scale_factor (int): Up-sampling factor. Default: 2.
+        mode (str): Interpolation mode. Default: 'nearest'.
+        align_corners (bool): Whether to align corners when interpolating.
+            Default: None.
+        kernel_size (int): Kernel size for the conv. Default: 3.
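+
+    Example (an illustrative sketch; the channel and spatial sizes below
+    are arbitrary, not taken from any config):
+        >>> import torch
+        >>> up = UpInterpolationConv(8, 8, scale_factor=2)
+        >>> up(torch.rand(1, 8, 10, 10)).shape
+        torch.Size([1, 8, 20, 20])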
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 scale_factor=2,
+                 mode='nearest',
+                 align_corners=None,
+                 kernel_size=3,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, init_cfg)
+        self.mode = mode
+        self.scale_factor = scale_factor
+        self.align_corners = align_corners
+        self.conv = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size,
+            padding=(kernel_size - 1) // 2,
+            **kwargs)
+
+    def forward(self, x):
+        x = F.interpolate(
+            x,
+            scale_factor=self.scale_factor,
+            mode=self.mode,
+            align_corners=self.align_corners)
+        x = self.conv(x)
+        return x
+
+
+class LastConv(Transition):
+    """A transition used for refining the output of the last stage.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        num_inputs (int): Number of input feature maps.
+        kernel_size (int): Kernel size for the conv. Default: 3.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_inputs,
+                 kernel_size=3,
+                 init_cfg=None,
+                 **kwargs):
+        super().__init__(in_channels, out_channels, init_cfg)
+        self.num_inputs = num_inputs
+        self.conv_out = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size,
+            padding=(kernel_size - 1) // 2,
+            **kwargs)
+
+    def forward(self, inputs):
+        assert len(inputs) == self.num_inputs
+        return self.conv_out(inputs[-1])
+
+
+@MODELS.register_module()
+class FPG(BaseModule):
+    """FPG.
+
+    Implementation of `Feature Pyramid Grids (FPG)
+    <https://arxiv.org/abs/2004.03580>`_.
+    This implementation only gives the basic structure stated in the paper.
+    Users can implement different types of transitions to fully explore the
+    potential power of the structure of FPG.
+
+    Args:
+        in_channels (list[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        num_outs (int): Number of output scales.
+        stack_times (int): The number of times the pyramid architecture will
+            be stacked.
+        paths (list[str]): Specify the path order of each stack level.
+            Each element in the list should be either 'bu' (bottom-up) or
+            'td' (top-down).
+        inter_channels (int | list[int], optional): Number of internal
+            (per-level) channels. Default: None, which means `out_channels`
+            at every level.
+        same_up_trans (dict): Transition within the same stack stage that
+            moves one pyramid level up (spatial down-sampling); used on
+            bottom-up paths.
+        same_down_trans (dict): Transition within the same stack stage that
+            moves one pyramid level down; used on top-down paths.
+        across_lateral_trans (dict): Across-pathway same-stage connection.
+        across_down_trans (dict): Across-pathway top-down connection
+            (fuses the next coarser level of the previous pathway).
+        across_up_trans (dict): Across-pathway bottom-up connection
+            (fuses the next finer level of the previous pathway).
+        across_skip_trans (dict): Across-pathway skip connection.
+        output_trans (dict): Transition applied to the outputs of the
+            last stage.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool): Whether to use stride-2 convs (instead of
+            max pooling) to build the extra output levels. Default: False.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
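+
+    Example (a minimal configuration sketch; the values below mirror the
+    FPG configs shipped with MMDetection and are an assumption, not part
+    of this file):
+        >>> neck_cfg = dict(
+        ...     type='FPG',
+        ...     in_channels=[256, 512, 1024, 2048],
+        ...     out_channels=256,
+        ...     inter_channels=256,
+        ...     num_outs=5,
+        ...     stack_times=9,
+        ...     paths=['bu'] * 9,
+        ...     same_down_trans=None,
+        ...     same_up_trans=dict(
+        ...         type='conv', kernel_size=3, stride=2, padding=1),
+        ...     across_lateral_trans=dict(type='conv', kernel_size=1),
+        ...     across_down_trans=dict(
+        ...         type='interpolation_conv', mode='nearest', kernel_size=3),
+        ...     across_up_trans=None,
+        ...     across_skip_trans=dict(type='conv', kernel_size=1),
+        ...     output_trans=dict(type='last_conv', kernel_size=3),
+        ...     skip_inds=[(0, 1, 2, 3), (0, 1, 2), (0, 1), (0, ), ()])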
+    """
+
+    transition_types = {
+        'conv': ConvModule,
+        'interpolation_conv': UpInterpolationConv,
+        'last_conv': LastConv,
+    }
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_outs,
+                 stack_times,
+                 paths,
+                 inter_channels=None,
+                 same_down_trans=None,
+                 same_up_trans=dict(
+                     type='conv', kernel_size=3, stride=2, padding=1),
+                 across_lateral_trans=dict(type='conv', kernel_size=1),
+                 across_down_trans=dict(type='conv', kernel_size=3),
+                 across_up_trans=None,
+                 across_skip_trans=dict(type='identity'),
+                 output_trans=dict(type='last_conv', kernel_size=3),
+                 start_level=0,
+                 end_level=-1,
+                 add_extra_convs=False,
+                 norm_cfg=None,
+                 skip_inds=None,
+                 init_cfg=[
+                     dict(type='Caffe2Xavier', layer='Conv2d'),
+                     dict(
+                         type='Constant',
+                         layer=[
+                             '_BatchNorm', '_InstanceNorm', 'GroupNorm',
+                             'LayerNorm'
+                         ],
+                         val=1.0)
+                 ]):
+        super(FPG, self).__init__(init_cfg)
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.num_ins = len(in_channels)
+        self.num_outs = num_outs
+        if inter_channels is None:
+            self.inter_channels = [out_channels for _ in range(num_outs)]
+        elif isinstance(inter_channels, int):
+            self.inter_channels = [inter_channels for _ in range(num_outs)]
+        else:
+            assert isinstance(inter_channels, list)
+            assert len(inter_channels) == num_outs
+            self.inter_channels = inter_channels
+        self.stack_times = stack_times
+        self.paths = paths
+        assert isinstance(paths, list) and len(paths) == stack_times
+        for d in paths:
+            assert d in ('bu', 'td')
+
+        self.same_down_trans = same_down_trans
+        self.same_up_trans = same_up_trans
+        self.across_lateral_trans = across_lateral_trans
+        self.across_down_trans = across_down_trans
+        self.across_up_trans = across_up_trans
+        self.output_trans = output_trans
+        self.across_skip_trans = across_skip_trans
+
+        self.with_bias = norm_cfg is None
+        # skip inds must be specified if across skip trans is not None
+        if self.across_skip_trans is not None:
+            assert skip_inds is not None
+        self.skip_inds = skip_inds
+        assert len(self.skip_inds[0]) <= self.stack_times
+
+        if end_level == -1 or end_level == self.num_ins - 1:
+            self.backbone_end_level = self.num_ins
+            assert num_outs >= self.num_ins - start_level
+        else:
+            # if end_level is not the last level, no extra level is allowed
+            self.backbone_end_level = end_level + 1
+            assert end_level < self.num_ins
+            assert num_outs == end_level - start_level + 1
+        self.start_level = start_level
+        self.end_level = end_level
+        self.add_extra_convs = add_extra_convs
+
+        # build lateral 1x1 convs to reduce channels
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.start_level, self.backbone_end_level):
+            l_conv = nn.Conv2d(self.in_channels[i],
+                               self.inter_channels[i - self.start_level], 1)
+            self.lateral_convs.append(l_conv)
+
+        extra_levels = num_outs - self.backbone_end_level + self.start_level
+        self.extra_downsamples = nn.ModuleList()
+        for i in range(extra_levels):
+            if self.add_extra_convs:
+                fpn_idx = self.backbone_end_level - self.start_level + i
+                extra_conv = nn.Conv2d(
+                    self.inter_channels[fpn_idx - 1],
+                    self.inter_channels[fpn_idx],
+                    3,
+                    stride=2,
+                    padding=1)
+                self.extra_downsamples.append(extra_conv)
+            else:
+                self.extra_downsamples.append(nn.MaxPool2d(1, stride=2))
+
+        self.fpn_transitions = nn.ModuleList()  # stack times
+        for s in range(self.stack_times):
+            stage_trans = nn.ModuleList()  # num of feature levels
+            for i in range(self.num_outs):
+                # same, across_lateral, across_down, across_up
+                trans = nn.ModuleDict()
+                if s in
self.skip_inds[i]: + stage_trans.append(trans) + continue + # build same-stage down trans (used in bottom-up paths) + if i == 0 or self.same_up_trans is None: + same_up_trans = None + else: + same_up_trans = self.build_trans( + self.same_up_trans, self.inter_channels[i - 1], + self.inter_channels[i]) + trans['same_up'] = same_up_trans + # build same-stage up trans (used in top-down paths) + if i == self.num_outs - 1 or self.same_down_trans is None: + same_down_trans = None + else: + same_down_trans = self.build_trans( + self.same_down_trans, self.inter_channels[i + 1], + self.inter_channels[i]) + trans['same_down'] = same_down_trans + # build across lateral trans + across_lateral_trans = self.build_trans( + self.across_lateral_trans, self.inter_channels[i], + self.inter_channels[i]) + trans['across_lateral'] = across_lateral_trans + # build across down trans + if i == self.num_outs - 1 or self.across_down_trans is None: + across_down_trans = None + else: + across_down_trans = self.build_trans( + self.across_down_trans, self.inter_channels[i + 1], + self.inter_channels[i]) + trans['across_down'] = across_down_trans + # build across up trans + if i == 0 or self.across_up_trans is None: + across_up_trans = None + else: + across_up_trans = self.build_trans( + self.across_up_trans, self.inter_channels[i - 1], + self.inter_channels[i]) + trans['across_up'] = across_up_trans + if self.across_skip_trans is None: + across_skip_trans = None + else: + across_skip_trans = self.build_trans( + self.across_skip_trans, self.inter_channels[i - 1], + self.inter_channels[i]) + trans['across_skip'] = across_skip_trans + # build across_skip trans + stage_trans.append(trans) + self.fpn_transitions.append(stage_trans) + + self.output_transition = nn.ModuleList() # output levels + for i in range(self.num_outs): + trans = self.build_trans( + self.output_trans, + self.inter_channels[i], + self.out_channels, + num_inputs=self.stack_times + 1) + self.output_transition.append(trans) + + self.relu = nn.ReLU(inplace=True) + + def build_trans(self, cfg, in_channels, out_channels, **extra_args): + cfg_ = cfg.copy() + trans_type = cfg_.pop('type') + trans_cls = self.transition_types[trans_type] + return trans_cls(in_channels, out_channels, **cfg_, **extra_args) + + def fuse(self, fuse_dict): + out = None + for item in fuse_dict.values(): + if item is not None: + if out is None: + out = item + else: + out = out + item + return out + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # build all levels from original feature maps + feats = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + for downsample in self.extra_downsamples: + feats.append(downsample(feats[-1])) + + outs = [feats] + + for i in range(self.stack_times): + current_outs = outs[-1] + next_outs = [] + direction = self.paths[i] + for j in range(self.num_outs): + if i in self.skip_inds[j]: + next_outs.append(outs[-1][j]) + continue + # feature level + if direction == 'td': + lvl = self.num_outs - j - 1 + else: + lvl = j + # get transitions + if direction == 'td': + same_trans = self.fpn_transitions[i][lvl]['same_down'] + else: + same_trans = self.fpn_transitions[i][lvl]['same_up'] + across_lateral_trans = self.fpn_transitions[i][lvl][ + 'across_lateral'] + across_down_trans = self.fpn_transitions[i][lvl]['across_down'] + across_up_trans = self.fpn_transitions[i][lvl]['across_up'] + across_skip_trans = self.fpn_transitions[i][lvl]['across_skip'] + # init output + to_fuse = dict( 
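+                    # each entry is a candidate input fused into this level;
+                    # entries left as None are skipped by self.fuse(), and an
+                    # 'across_skip' entry may be added below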
+ same=None, lateral=None, across_up=None, across_down=None) + # same downsample/upsample + if same_trans is not None: + to_fuse['same'] = same_trans(next_outs[-1]) + # across lateral + if across_lateral_trans is not None: + to_fuse['lateral'] = across_lateral_trans( + current_outs[lvl]) + # across downsample + if lvl > 0 and across_up_trans is not None: + to_fuse['across_up'] = across_up_trans(current_outs[lvl - + 1]) + # across upsample + if (lvl < self.num_outs - 1 and across_down_trans is not None): + to_fuse['across_down'] = across_down_trans( + current_outs[lvl + 1]) + if across_skip_trans is not None: + to_fuse['across_skip'] = across_skip_trans(outs[0][lvl]) + x = self.fuse(to_fuse) + next_outs.append(x) + + if direction == 'td': + outs.append(next_outs[::-1]) + else: + outs.append(next_outs) + + # output trans + final_outs = [] + for i in range(self.num_outs): + lvl_out_list = [] + for s in range(len(outs)): + lvl_out_list.append(outs[s][i]) + lvl_out = self.output_transition[i](lvl_out_list) + final_outs.append(lvl_out) + + return final_outs diff --git a/head_extractor/build/lib/mmdet/models/necks/fpn.py b/head_extractor/build/lib/mmdet/models/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..67bd8879641f8539f329e6ffb94f88d25e417244 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/fpn.py @@ -0,0 +1,221 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, MultiConfig, OptConfigType + + +@MODELS.register_module() +class FPN(BaseModule): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Defaults to 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Defaults to -1, which means the + last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Defaults to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Defaults to False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Defaults to False. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Defaults to None. + act_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + activation layer in ConvModule. Defaults to None. + upsample_cfg (:obj:`ConfigDict` or dict, optional): Config dict + for interpolate layer. Defaults to dict(mode='nearest'). 
+ init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + num_outs: int, + start_level: int = 0, + end_level: int = -1, + add_extra_convs: Union[bool, str] = False, + relu_before_extra_convs: bool = False, + no_norm_on_lateral: bool = False, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + act_cfg: OptConfigType = None, + upsample_cfg: ConfigType = dict(mode='nearest'), + init_cfg: MultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + self.add_extra_convs = 'on_input' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + def forward(self, inputs: Tuple[Tensor]) -> tuple: + """Forward function. 
+ + Args: + inputs (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + + Returns: + tuple: Feature maps, each is a 4D-tensor. + """ + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/fpn_carafe.py b/head_extractor/build/lib/mmdet/models/necks/fpn_carafe.py new file mode 100644 index 0000000000000000000000000000000000000000..b393ff7c340c0c343fc4c91a4d87d341f66a3177 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/fpn_carafe.py @@ -0,0 +1,275 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, build_upsample_layer +from mmcv.ops.carafe import CARAFEPack +from mmengine.model import BaseModule, ModuleList, xavier_init + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class FPN_CARAFE(BaseModule): + """FPN_CARAFE is a more flexible implementation of FPN. It allows more + choice for upsample methods during the top-down pathway. + + It can reproduce the performance of ICCV 2019 paper + CARAFE: Content-Aware ReAssembly of FEatures + Please refer to https://arxiv.org/abs/1905.02188 for more details. + + Args: + in_channels (list[int]): Number of channels for each input feature map. + out_channels (int): Output channels of feature pyramids. + num_outs (int): Number of output stages. + start_level (int): Start level of feature pyramids. + (Default: 0) + end_level (int): End level of feature pyramids. + (Default: -1 indicates the last level). + norm_cfg (dict): Dictionary to construct and config norm layer. + activate (str): Type of activation function in ConvModule + (Default: None indicates w/o activation). + order (dict): Order of components in ConvModule. 
+ upsample (str): Type of upsample layer. + upsample_cfg (dict): Dictionary to construct and config upsample layer. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + norm_cfg=None, + act_cfg=None, + order=('conv', 'norm', 'act'), + upsample_cfg=dict( + type='carafe', + up_kernel=5, + up_group=1, + encoder_kernel=3, + encoder_dilation=1), + init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super(FPN_CARAFE, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.with_bias = norm_cfg is None + self.upsample_cfg = upsample_cfg.copy() + self.upsample = self.upsample_cfg.get('type') + self.relu = nn.ReLU(inplace=False) + + self.order = order + assert order in [('conv', 'norm', 'act'), ('act', 'conv', 'norm')] + + assert self.upsample in [ + 'nearest', 'bilinear', 'deconv', 'pixel_shuffle', 'carafe', None + ] + if self.upsample in ['deconv', 'pixel_shuffle']: + assert hasattr( + self.upsample_cfg, + 'upsample_kernel') and self.upsample_cfg.upsample_kernel > 0 + self.upsample_kernel = self.upsample_cfg.pop('upsample_kernel') + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + + self.lateral_convs = ModuleList() + self.fpn_convs = ModuleList() + self.upsample_modules = ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + if i != self.backbone_end_level - 1: + upsample_cfg_ = self.upsample_cfg.copy() + if self.upsample == 'deconv': + upsample_cfg_.update( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=self.upsample_kernel, + stride=2, + padding=(self.upsample_kernel - 1) // 2, + output_padding=(self.upsample_kernel - 1) // 2) + elif self.upsample == 'pixel_shuffle': + upsample_cfg_.update( + in_channels=out_channels, + out_channels=out_channels, + scale_factor=2, + upsample_kernel=self.upsample_kernel) + elif self.upsample == 'carafe': + upsample_cfg_.update(channels=out_channels, scale_factor=2) + else: + # suppress warnings + align_corners = (None + if self.upsample == 'nearest' else False) + upsample_cfg_.update( + scale_factor=2, + mode=self.upsample, + align_corners=align_corners) + upsample_module = build_upsample_layer(upsample_cfg_) + self.upsample_modules.append(upsample_module) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_out_levels = ( + num_outs - self.backbone_end_level + self.start_level) + if extra_out_levels >= 1: + for i in range(extra_out_levels): + 
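# each extra output level is built from the previous one with
+                # a stride-2 3x3 conv; it also gets its own upsample module
+                # and 3x3 fpn conv
+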
in_channels = ( + self.in_channels[self.backbone_end_level - + 1] if i == 0 else out_channels) + extra_l_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + if self.upsample == 'deconv': + upsampler_cfg_ = dict( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=self.upsample_kernel, + stride=2, + padding=(self.upsample_kernel - 1) // 2, + output_padding=(self.upsample_kernel - 1) // 2) + elif self.upsample == 'pixel_shuffle': + upsampler_cfg_ = dict( + in_channels=out_channels, + out_channels=out_channels, + scale_factor=2, + upsample_kernel=self.upsample_kernel) + elif self.upsample == 'carafe': + upsampler_cfg_ = dict( + channels=out_channels, + scale_factor=2, + **self.upsample_cfg) + else: + # suppress warnings + align_corners = (None + if self.upsample == 'nearest' else False) + upsampler_cfg_ = dict( + scale_factor=2, + mode=self.upsample, + align_corners=align_corners) + upsampler_cfg_['type'] = self.upsample + upsample_module = build_upsample_layer(upsampler_cfg_) + extra_fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + bias=self.with_bias, + act_cfg=act_cfg, + inplace=False, + order=self.order) + self.upsample_modules.append(upsample_module) + self.fpn_convs.append(extra_fpn_conv) + self.lateral_convs.append(extra_l_conv) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + """Initialize the weights of module.""" + super(FPN_CARAFE, self).init_weights() + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)): + xavier_init(m, distribution='uniform') + for m in self.modules(): + if isinstance(m, CARAFEPack): + m.init_weights() + + def slice_as(self, src, dst): + """Slice ``src`` as ``dst`` + + Note: + ``src`` should have the same or larger size than ``dst``. + + Args: + src (torch.Tensor): Tensors to be sliced. + dst (torch.Tensor): ``src`` will be sliced to have the same + size as ``dst``. + + Returns: + torch.Tensor: Sliced tensor. 
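+
+        Example (an illustrative note): slicing a ``(1, 256, 33, 33)``
+        ``src`` as a ``(1, 256, 32, 32)`` ``dst`` returns
+        ``src[:, :, :32, :32]``.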
+ """ + assert (src.size(2) >= dst.size(2)) and (src.size(3) >= dst.size(3)) + if src.size(2) == dst.size(2) and src.size(3) == dst.size(3): + return src + else: + return src[:, :, :dst.size(2), :dst.size(3)] + + def tensor_add(self, a, b): + """Add tensors ``a`` and ``b`` that might have different sizes.""" + if a.size() == b.size(): + c = a + b + else: + c = a + self.slice_as(b, a) + return c + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [] + for i, lateral_conv in enumerate(self.lateral_convs): + if i <= self.backbone_end_level - self.start_level: + input = inputs[min(i + self.start_level, len(inputs) - 1)] + else: + input = laterals[-1] + lateral = lateral_conv(input) + laterals.append(lateral) + + # build top-down path + for i in range(len(laterals) - 1, 0, -1): + if self.upsample is not None: + upsample_feat = self.upsample_modules[i - 1](laterals[i]) + else: + upsample_feat = laterals[i] + laterals[i - 1] = self.tensor_add(laterals[i - 1], upsample_feat) + + # build outputs + num_conv_outs = len(self.fpn_convs) + outs = [] + for i in range(num_conv_outs): + out = self.fpn_convs[i](laterals[i]) + outs.append(out) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/fpn_dropblock.py b/head_extractor/build/lib/mmdet/models/necks/fpn_dropblock.py new file mode 100644 index 0000000000000000000000000000000000000000..473af924cdaaecf88aa4a0a6e1500511530b91a2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/fpn_dropblock.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import MODELS +from .fpn import FPN + + +@MODELS.register_module() +class FPN_DropBlock(FPN): + + def __init__(self, + *args, + plugin: Optional[dict] = dict( + type='DropBlock', + drop_prob=0.3, + block_size=3, + warmup_iters=0), + **kwargs) -> None: + super().__init__(*args, **kwargs) + self.plugin = None + if plugin is not None: + self.plugin = MODELS.build(plugin) + + def forward(self, inputs: Tuple[Tensor]) -> tuple: + """Forward function. + + Args: + inputs (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + + Returns: + tuple: Feature maps, each is a 4D-tensor. + """ + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. 
+ if 'scale_factor' in self.upsample_cfg: + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + if self.plugin is not None: + laterals[i - 1] = self.plugin(laterals[i - 1]) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/hrfpn.py b/head_extractor/build/lib/mmdet/models/necks/hrfpn.py new file mode 100644 index 0000000000000000000000000000000000000000..d2627549b4cb8acc6833bc40425e459c28aa5c20 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/hrfpn.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch.utils.checkpoint import checkpoint + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class HRFPN(BaseModule): + """HRFPN (High Resolution Feature Pyramids) + + paper: `High-Resolution Representations for Labeling Pixels and Regions + `_. + + Args: + in_channels (list): number of channels for each branch. + out_channels (int): output channels of feature pyramids. + num_outs (int): number of output stages. + pooling_type (str): pooling for generating feature pyramids + from {MAX, AVG}. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + stride (int): stride of 3x3 convolutional layers + init_cfg (dict or list[dict], optional): Initialization config dict. 
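+
+    Example (an illustrative sketch; the channel and spatial sizes below
+    are arbitrary, not from a real HRNet config):
+        >>> import torch
+        >>> neck = HRFPN(in_channels=[4, 8], out_channels=8, num_outs=3)
+        >>> inputs = [torch.rand(1, 4, 16, 16), torch.rand(1, 8, 8, 8)]
+        >>> outs = neck(inputs)
+        >>> [out.shape[-1] for out in outs]
+        [16, 8, 4]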
+ """ + + def __init__(self, + in_channels, + out_channels, + num_outs=5, + pooling_type='AVG', + conv_cfg=None, + norm_cfg=None, + with_cp=False, + stride=1, + init_cfg=dict(type='Caffe2Xavier', layer='Conv2d')): + super(HRFPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.reduction_conv = ConvModule( + sum(in_channels), + out_channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + act_cfg=None) + + self.fpn_convs = nn.ModuleList() + for i in range(self.num_outs): + self.fpn_convs.append( + ConvModule( + out_channels, + out_channels, + kernel_size=3, + padding=1, + stride=stride, + conv_cfg=self.conv_cfg, + act_cfg=None)) + + if pooling_type == 'MAX': + self.pooling = F.max_pool2d + else: + self.pooling = F.avg_pool2d + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == self.num_ins + outs = [inputs[0]] + for i in range(1, self.num_ins): + outs.append( + F.interpolate(inputs[i], scale_factor=2**i, mode='bilinear')) + out = torch.cat(outs, dim=1) + if out.requires_grad and self.with_cp: + out = checkpoint(self.reduction_conv, out) + else: + out = self.reduction_conv(out) + outs = [out] + for i in range(1, self.num_outs): + outs.append(self.pooling(out, kernel_size=2**i, stride=2**i)) + outputs = [] + + for i in range(self.num_outs): + if outs[i].requires_grad and self.with_cp: + tmp_out = checkpoint(self.fpn_convs[i], outs[i]) + else: + tmp_out = self.fpn_convs[i](outs[i]) + outputs.append(tmp_out) + return tuple(outputs) diff --git a/head_extractor/build/lib/mmdet/models/necks/nas_fpn.py b/head_extractor/build/lib/mmdet/models/necks/nas_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec90cd6eed3aa65a3a192d332cbfd8c16d5bc36 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/nas_fpn.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops.merge_cells import GlobalPoolingCell, SumCell +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptConfigType + + +@MODELS.register_module() +class NASFPN(BaseModule): + """NAS-FPN. + + Implementation of `NAS-FPN: Learning Scalable Feature Pyramid Architecture + for Object Detection `_ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + stack_times (int): The number of times the pyramid architecture will + be stacked. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Defaults to 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Defaults to -1, which means the + last level. + norm_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + normalization layer. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. 
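+
+    Example (an illustrative sketch; shapes are arbitrary and chosen so
+    that exactly five pyramid levels P3-P7 are produced):
+        >>> import torch
+        >>> neck = NASFPN(in_channels=[8, 16, 32], out_channels=8,
+        ...               num_outs=5, stack_times=1)
+        >>> inputs = [torch.rand(1, 8, 64, 64), torch.rand(1, 16, 32, 32),
+        ...           torch.rand(1, 32, 16, 16)]
+        >>> len(neck(inputs))
+        5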
+ """ + + def __init__( + self, + in_channels: List[int], + out_channels: int, + num_outs: int, + stack_times: int, + start_level: int = 0, + end_level: int = -1, + norm_cfg: OptConfigType = None, + init_cfg: MultiConfig = dict(type='Caffe2Xavier', layer='Conv2d') + ) -> None: + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) # num of input feature levels + self.num_outs = num_outs # num of output feature levels + self.stack_times = stack_times + self.norm_cfg = norm_cfg + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + + # add lateral connections + self.lateral_convs = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None) + self.lateral_convs.append(l_conv) + + # add extra downsample layers (stride-2 pooling or conv) + extra_levels = num_outs - self.backbone_end_level + self.start_level + self.extra_downsamples = nn.ModuleList() + for i in range(extra_levels): + extra_conv = ConvModule( + out_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + self.extra_downsamples.append( + nn.Sequential(extra_conv, nn.MaxPool2d(2, 2))) + + # add NAS FPN connections + self.fpn_stages = ModuleList() + for _ in range(self.stack_times): + stage = nn.ModuleDict() + # gp(p6, p4) -> p4_1 + stage['gp_64_4'] = GlobalPoolingCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p4_1, p4) -> p4_2 + stage['sum_44_4'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p4_2, p3) -> p3_out + stage['sum_43_3'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p3_out, p4_2) -> p4_out + stage['sum_34_4'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p5, gp(p4_out, p3_out)) -> p5_out + stage['gp_43_5'] = GlobalPoolingCell(with_out_conv=False) + stage['sum_55_5'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # sum(p7, gp(p5_out, p4_2)) -> p7_out + stage['gp_54_7'] = GlobalPoolingCell(with_out_conv=False) + stage['sum_77_7'] = SumCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + # gp(p7_out, p5_out) -> p6_out + stage['gp_75_6'] = GlobalPoolingCell( + in_channels=out_channels, + out_channels=out_channels, + out_norm_cfg=norm_cfg) + self.fpn_stages.append(stage) + + def forward(self, inputs: Tuple[Tensor]) -> tuple: + """Forward function. + + Args: + inputs (tuple[Tensor]): Features from the upstream network, each + is a 4D-tensor. + + Returns: + tuple: Feature maps, each is a 4D-tensor. 
+ """ + # build P3-P5 + feats = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + # build P6-P7 on top of P5 + for downsample in self.extra_downsamples: + feats.append(downsample(feats[-1])) + + p3, p4, p5, p6, p7 = feats + + for stage in self.fpn_stages: + # gp(p6, p4) -> p4_1 + p4_1 = stage['gp_64_4'](p6, p4, out_size=p4.shape[-2:]) + # sum(p4_1, p4) -> p4_2 + p4_2 = stage['sum_44_4'](p4_1, p4, out_size=p4.shape[-2:]) + # sum(p4_2, p3) -> p3_out + p3 = stage['sum_43_3'](p4_2, p3, out_size=p3.shape[-2:]) + # sum(p3_out, p4_2) -> p4_out + p4 = stage['sum_34_4'](p3, p4_2, out_size=p4.shape[-2:]) + # sum(p5, gp(p4_out, p3_out)) -> p5_out + p5_tmp = stage['gp_43_5'](p4, p3, out_size=p5.shape[-2:]) + p5 = stage['sum_55_5'](p5, p5_tmp, out_size=p5.shape[-2:]) + # sum(p7, gp(p5_out, p4_2)) -> p7_out + p7_tmp = stage['gp_54_7'](p5, p4_2, out_size=p7.shape[-2:]) + p7 = stage['sum_77_7'](p7, p7_tmp, out_size=p7.shape[-2:]) + # gp(p7_out, p5_out) -> p6_out + p6 = stage['gp_75_6'](p7, p5, out_size=p6.shape[-2:]) + + return p3, p4, p5, p6, p7 diff --git a/head_extractor/build/lib/mmdet/models/necks/nasfcos_fpn.py b/head_extractor/build/lib/mmdet/models/necks/nasfcos_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..12d0848f7634bb0113e0b5a16b5b65ba8b7ebb9c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/nasfcos_fpn.py @@ -0,0 +1,170 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.ops.merge_cells import ConcatCell +from mmengine.model import BaseModule, caffe2_xavier_init + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class NASFCOS_FPN(BaseModule): + """FPN structure in NASFPN. + + Implementation of paper `NAS-FCOS: Fast Neural Architecture Search for + Object Detection `_ + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool): It decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=1, + end_level=-1, + add_extra_convs=False, + conv_cfg=None, + norm_cfg=None, + init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super(NASFCOS_FPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + + if end_level == -1 or end_level == self.num_ins - 1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level is not the last level, no extra level is allowed + self.backbone_end_level = end_level + 1 + assert end_level < self.num_ins + assert num_outs == end_level - start_level + 1 + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + + self.adapt_convs = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + adapt_conv = ConvModule( + in_channels[i], + out_channels, + 1, + stride=1, + padding=0, + bias=False, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU', inplace=False)) + self.adapt_convs.append(adapt_conv) + + # C2 is omitted according to the paper + extra_levels = num_outs - self.backbone_end_level + self.start_level + + def build_concat_cell(with_input1_conv, with_input2_conv): + cell_conv_cfg = dict( + kernel_size=1, padding=0, bias=False, groups=out_channels) + return ConcatCell( + in_channels=out_channels, + out_channels=out_channels, + with_out_conv=True, + out_conv_cfg=cell_conv_cfg, + out_norm_cfg=dict(type='BN'), + out_conv_order=('norm', 'act', 'conv'), + with_input1_conv=with_input1_conv, + with_input2_conv=with_input2_conv, + input_conv_cfg=conv_cfg, + input_norm_cfg=norm_cfg, + upsample_mode='nearest') + + # Denote c3=f0, c4=f1, c5=f2 for convince + self.fpn = nn.ModuleDict() + self.fpn['c22_1'] = build_concat_cell(True, True) + self.fpn['c22_2'] = build_concat_cell(True, True) + self.fpn['c32'] = build_concat_cell(True, False) + self.fpn['c02'] = build_concat_cell(True, False) + self.fpn['c42'] = build_concat_cell(True, True) + self.fpn['c36'] = build_concat_cell(True, True) + self.fpn['c61'] = build_concat_cell(True, True) # f9 + self.extra_downsamples = nn.ModuleList() + for i in range(extra_levels): + extra_act_cfg = None if i == 0 \ + else dict(type='ReLU', inplace=False) + self.extra_downsamples.append( + ConvModule( + out_channels, + out_channels, + 3, + stride=2, + padding=1, + act_cfg=extra_act_cfg, + order=('act', 'norm', 'conv'))) + + def forward(self, inputs): + """Forward function.""" + feats = [ + adapt_conv(inputs[i + self.start_level]) + for i, adapt_conv in enumerate(self.adapt_convs) + ] + + for (i, module_name) in enumerate(self.fpn): + idx_1, idx_2 = int(module_name[1]), int(module_name[2]) + res = self.fpn[module_name](feats[idx_1], feats[idx_2]) + feats.append(res) + + ret = [] + for (idx, input_idx) in zip([9, 8, 7], [1, 2, 3]): # add P3, P4, P5 + feats1, feats2 = feats[idx], feats[5] + feats2_resize = F.interpolate( + feats2, + size=feats1.size()[2:], + mode='bilinear', + align_corners=False) + + feats_sum = feats1 + feats2_resize + ret.append( + F.interpolate( + feats_sum, + size=inputs[input_idx].size()[2:], + mode='bilinear', + align_corners=False)) + + for submodule in self.extra_downsamples: + ret.append(submodule(ret[-1])) + + 
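# ret holds P3-P5 followed by the extra downsampled levels,
+        # finest resolution first
+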
return tuple(ret)
+
+    def init_weights(self):
+        """Initialize the weights of module."""
+        super(NASFCOS_FPN, self).init_weights()
+        for module in self.fpn.values():
+            if hasattr(module, 'out_conv'):
+                caffe2_xavier_init(module.out_conv.conv)
+
+        for modules in [
+                self.adapt_convs.modules(),
+                self.extra_downsamples.modules()
+        ]:
+            for module in modules:
+                if isinstance(module, nn.Conv2d):
+                    caffe2_xavier_init(module)
diff --git a/head_extractor/build/lib/mmdet/models/necks/pafpn.py b/head_extractor/build/lib/mmdet/models/necks/pafpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..557638f48a629691f780d3e1466e234bbe987518
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/necks/pafpn.py
@@ -0,0 +1,157 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+
+from mmdet.registry import MODELS
+from .fpn import FPN
+
+
+@MODELS.register_module()
+class PAFPN(FPN):
+    """Path Aggregation Network for Instance Segmentation.
+
+    This is an implementation of the `PAFPN in Path Aggregation Network
+    <https://arxiv.org/abs/1803.01534>`_.
+
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        num_outs (int): Number of output scales.
+        start_level (int): Index of the start input backbone level used to
+            build the feature pyramid. Default: 0.
+        end_level (int): Index of the end input backbone level (exclusive) to
+            build the feature pyramid. Default: -1, which means the last level.
+        add_extra_convs (bool | str): If bool, it decides whether to add conv
+            layers on top of the original feature maps. Defaults to False.
+            If True, it is equivalent to `add_extra_convs='on_input'`.
+            If str, it specifies the source feature map of the extra convs.
+            Only the following options are allowed:
+
+            - 'on_input': Last feat map of neck inputs (i.e. backbone feature).
+            - 'on_lateral': Last feature map after lateral convs.
+            - 'on_output': The last output feature map after fpn convs.
+        relu_before_extra_convs (bool): Whether to apply relu before the extra
+            conv. Default: False.
+        no_norm_on_lateral (bool): Whether to apply norm on lateral.
+            Default: False.
+        conv_cfg (dict): Config dict for convolution layer. Default: None.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (dict): Config dict for activation layer in ConvModule.
+            Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
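+
+    Example (an illustrative sketch; shapes are arbitrary, mirroring the
+    doctest style of the FPN base class):
+        >>> import torch
+        >>> in_channels = [4, 8, 16]
+        >>> scales = [32, 16, 8]
+        >>> inputs = [torch.rand(1, c, s, s)
+        ...           for c, s in zip(in_channels, scales)]
+        >>> neck = PAFPN(in_channels, 8, num_outs=3).eval()
+        >>> outputs = neck(inputs)
+        >>> [out.shape[-1] for out in outputs]
+        [32, 16, 8]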
+ """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + init_cfg=dict( + type='Xavier', layer='Conv2d', distribution='uniform')): + super(PAFPN, self).__init__( + in_channels, + out_channels, + num_outs, + start_level, + end_level, + add_extra_convs, + relu_before_extra_convs, + no_norm_on_lateral, + conv_cfg, + norm_cfg, + act_cfg, + init_cfg=init_cfg) + # add extra bottom up pathway + self.downsample_convs = nn.ModuleList() + self.pafpn_convs = nn.ModuleList() + for i in range(self.start_level + 1, self.backbone_end_level): + d_conv = ConvModule( + out_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + pafpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.downsample_convs.append(d_conv) + self.pafpn_convs.append(pafpn_conv) + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + F.interpolate( + laterals[i], size=prev_shape, mode='nearest') + + # build outputs + # part 1: from original levels + inter_outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + + # part 2: add bottom-up path + for i in range(0, used_backbone_levels - 1): + inter_outs[i + 1] = inter_outs[i + 1] + \ + self.downsample_convs[i](inter_outs[i]) + + outs = [] + outs.append(inter_outs[0]) + outs.extend([ + self.pafpn_convs[i - 1](inter_outs[i]) + for i in range(1, used_backbone_levels) + ]) + + # part 3: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + orig = inputs[self.backbone_end_level - 1] + outs.append(self.fpn_convs[used_backbone_levels](orig)) + elif self.add_extra_convs == 'on_lateral': + outs.append(self.fpn_convs[used_backbone_levels]( + laterals[-1])) + elif self.add_extra_convs == 'on_output': + outs.append(self.fpn_convs[used_backbone_levels](outs[-1])) + else: + raise NotImplementedError + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/rfp.py b/head_extractor/build/lib/mmdet/models/necks/rfp.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec9b3753c5031bb12a2b4c88733f13bf27c44e2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/rfp.py @@ -0,0 +1,134 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule, ModuleList, constant_init, xavier_init + +from mmdet.registry import MODELS +from .fpn import FPN + + +class ASPP(BaseModule): + """ASPP (Atrous Spatial Pyramid Pooling) + + This is an implementation of the ASPP module used in DetectoRS + (https://arxiv.org/pdf/2006.02334.pdf) + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of channels produced by this module + dilations (tuple[int]): Dilations of the four branches. + Default: (1, 3, 6, 1) + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + out_channels, + dilations=(1, 3, 6, 1), + init_cfg=dict(type='Kaiming', layer='Conv2d')): + super().__init__(init_cfg) + assert dilations[-1] == 1 + self.aspp = nn.ModuleList() + for dilation in dilations: + kernel_size = 3 if dilation > 1 else 1 + padding = dilation if dilation > 1 else 0 + conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + dilation=dilation, + padding=padding, + bias=True) + self.aspp.append(conv) + self.gap = nn.AdaptiveAvgPool2d(1) + + def forward(self, x): + avg_x = self.gap(x) + out = [] + for aspp_idx in range(len(self.aspp)): + inp = avg_x if (aspp_idx == len(self.aspp) - 1) else x + out.append(F.relu_(self.aspp[aspp_idx](inp))) + out[-1] = out[-1].expand_as(out[-2]) + out = torch.cat(out, dim=1) + return out + + +@MODELS.register_module() +class RFP(FPN): + """RFP (Recursive Feature Pyramid) + + This is an implementation of RFP in `DetectoRS + `_. Different from standard FPN, the + input of RFP should be multi level features along with origin input image + of backbone. + + Args: + rfp_steps (int): Number of unrolled steps of RFP. + rfp_backbone (dict): Configuration of the backbone for RFP. + aspp_out_channels (int): Number of output channels of ASPP module. + aspp_dilations (tuple[int]): Dilation rates of four branches. + Default: (1, 3, 6, 1) + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + rfp_steps, + rfp_backbone, + aspp_out_channels, + aspp_dilations=(1, 3, 6, 1), + init_cfg=None, + **kwargs): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg, **kwargs) + self.rfp_steps = rfp_steps + # Be careful! Pretrained weights cannot be loaded when use + # nn.ModuleList + self.rfp_modules = ModuleList() + for rfp_idx in range(1, rfp_steps): + rfp_module = MODELS.build(rfp_backbone) + self.rfp_modules.append(rfp_module) + self.rfp_aspp = ASPP(self.out_channels, aspp_out_channels, + aspp_dilations) + self.rfp_weight = nn.Conv2d( + self.out_channels, + 1, + kernel_size=1, + stride=1, + padding=0, + bias=True) + + def init_weights(self): + # Avoid using super().init_weights(), which may alter the default + # initialization of the modules in self.rfp_modules that have missing + # keys in the pretrained checkpoint. 
+ for convs in [self.lateral_convs, self.fpn_convs]: + for m in convs.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + for rfp_idx in range(self.rfp_steps - 1): + self.rfp_modules[rfp_idx].init_weights() + constant_init(self.rfp_weight, 0) + + def forward(self, inputs): + inputs = list(inputs) + assert len(inputs) == len(self.in_channels) + 1 # +1 for input image + img = inputs.pop(0) + # FPN forward + x = super().forward(tuple(inputs)) + for rfp_idx in range(self.rfp_steps - 1): + rfp_feats = [x[0]] + list( + self.rfp_aspp(x[i]) for i in range(1, len(x))) + x_idx = self.rfp_modules[rfp_idx].rfp_forward(img, rfp_feats) + # FPN forward + x_idx = super().forward(x_idx) + x_new = [] + for ft_idx in range(len(x_idx)): + add_weight = torch.sigmoid(self.rfp_weight(x_idx[ft_idx])) + x_new.append(add_weight * x_idx[ft_idx] + + (1 - add_weight) * x[ft_idx]) + x = x_new + return x diff --git a/head_extractor/build/lib/mmdet/models/necks/ssd_neck.py b/head_extractor/build/lib/mmdet/models/necks/ssd_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..17ba319370b988b9c7e2d98c2f10607ff8f8b5c3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/ssd_neck.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class SSDNeck(BaseModule): + """Extra layers of SSD backbone to generate multi-scale feature maps. + + Args: + in_channels (Sequence[int]): Number of input channels per scale. + out_channels (Sequence[int]): Number of output channels per scale. + level_strides (Sequence[int]): Stride of 3x3 conv per level. + level_paddings (Sequence[int]): Padding size of 3x3 conv per level. + l2_norm_scale (float|None): L2 normalization layer init scale. + If None, L2 normalization is not applied to the first input + feature. + last_kernel_size (int): Kernel size of the last conv layer. + Default: 3. + use_depthwise (bool): Whether to use DepthwiseSeparableConv. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: None. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict.
+ """ + + def __init__(self, + in_channels, + out_channels, + level_strides, + level_paddings, + l2_norm_scale=20., + last_kernel_size=3, + use_depthwise=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + init_cfg=[ + dict( + type='Xavier', distribution='uniform', + layer='Conv2d'), + dict(type='Constant', val=1, layer='BatchNorm2d'), + ]): + super(SSDNeck, self).__init__(init_cfg) + assert len(out_channels) > len(in_channels) + assert len(out_channels) - len(in_channels) == len(level_strides) + assert len(level_strides) == len(level_paddings) + assert in_channels == out_channels[:len(in_channels)] + + if l2_norm_scale: + self.l2_norm = L2Norm(in_channels[0], l2_norm_scale) + self.init_cfg += [ + dict( + type='Constant', + val=self.l2_norm.scale, + override=dict(name='l2_norm')) + ] + + self.extra_layers = nn.ModuleList() + extra_layer_channels = out_channels[len(in_channels):] + second_conv = DepthwiseSeparableConvModule if \ + use_depthwise else ConvModule + + for i, (out_channel, stride, padding) in enumerate( + zip(extra_layer_channels, level_strides, level_paddings)): + kernel_size = last_kernel_size \ + if i == len(extra_layer_channels) - 1 else 3 + per_lvl_convs = nn.Sequential( + ConvModule( + out_channels[len(in_channels) - 1 + i], + out_channel // 2, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + second_conv( + out_channel // 2, + out_channel, + kernel_size, + stride=stride, + padding=padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.extra_layers.append(per_lvl_convs) + + def forward(self, inputs): + """Forward function.""" + outs = [feat for feat in inputs] + if hasattr(self, 'l2_norm'): + outs[0] = self.l2_norm(outs[0]) + + feat = outs[-1] + for layer in self.extra_layers: + feat = layer(feat) + outs.append(feat) + return tuple(outs) + + +class L2Norm(nn.Module): + + def __init__(self, n_dims, scale=20., eps=1e-10): + """L2 normalization layer. + + Args: + n_dims (int): Number of dimensions to be normalized + scale (float, optional): Defaults to 20.. + eps (float, optional): Used to avoid division by zero. + Defaults to 1e-10. + """ + super(L2Norm, self).__init__() + self.n_dims = n_dims + self.weight = nn.Parameter(torch.Tensor(self.n_dims)) + self.eps = eps + self.scale = scale + + def forward(self, x): + """Forward function.""" + # normalization layer convert to FP32 in FP16 training + x_float = x.float() + norm = x_float.pow(2).sum(1, keepdim=True).sqrt() + self.eps + return (self.weight[None, :, None, None].float().expand_as(x_float) * + x_float / norm).type_as(x) diff --git a/head_extractor/build/lib/mmdet/models/necks/ssh.py b/head_extractor/build/lib/mmdet/models/necks/ssh.py new file mode 100644 index 0000000000000000000000000000000000000000..75a6561489d8d3634fc34829dafe819bbf066ed4 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/ssh.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +class SSHContextModule(BaseModule): + """This is an implementation of `SSH context module` described in `SSH: + Single Stage Headless Face Detector. + + `_. + + Args: + in_channels (int): Number of input channels used at each scale. + out_channels (int): Number of output channels used at each scale. 
+ conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN'). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert out_channels % 4 == 0 + + self.in_channels = in_channels + self.out_channels = out_channels + + self.conv5x5_1 = ConvModule( + self.in_channels, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + ) + + self.conv5x5_2 = ConvModule( + self.out_channels // 4, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.conv7x7_2 = ConvModule( + self.out_channels // 4, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + ) + + self.conv7x7_3 = ConvModule( + self.out_channels // 4, + self.out_channels // 4, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + ) + + def forward(self, x: torch.Tensor) -> tuple: + conv5x5_1 = self.conv5x5_1(x) + conv5x5 = self.conv5x5_2(conv5x5_1) + conv7x7_2 = self.conv7x7_2(conv5x5_1) + conv7x7 = self.conv7x7_3(conv7x7_2) + + return (conv5x5, conv7x7) + + +class SSHDetModule(BaseModule): + """This is an implementation of `SSH detection module` described in `SSH: + Single Stage Headless Face Detector. + + <https://arxiv.org/pdf/1708.03979.pdf>`_. + + Args: + in_channels (int): Number of input channels used at each scale. + out_channels (int): Number of output channels used at each scale. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None. + norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN'). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + Defaults to None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = None): + super().__init__(init_cfg=init_cfg) + assert out_channels % 4 == 0 + + self.in_channels = in_channels + self.out_channels = out_channels + + self.conv3x3 = ConvModule( + self.in_channels, + self.out_channels // 2, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.context_module = SSHContextModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + conv3x3 = self.conv3x3(x) + conv5x5, conv7x7 = self.context_module(x) + out = torch.cat([conv3x3, conv5x5, conv7x7], dim=1) + out = F.relu(out) + + return out + + +@MODELS.register_module() +class SSH(BaseModule): + """`SSH Neck` used in `SSH: Single Stage Headless Face Detector. + + <https://arxiv.org/pdf/1708.03979.pdf>`_. + + Args: + num_scales (int): The number of scales / stages. + in_channels (list[int]): The number of input channels per scale. + out_channels (list[int]): The number of output channels per scale. + conv_cfg (:obj:`ConfigDict` or dict, optional): Config dict for + convolution layer. Defaults to None.
+ norm_cfg (:obj:`ConfigDict` or dict): Config dict for normalization + layer. Defaults to dict(type='BN'). + init_cfg (:obj:`ConfigDict` or list[:obj:`ConfigDict`] or dict or + list[dict], optional): Initialization config dict. + + Example: + >>> import torch + >>> in_channels = [8, 16, 32, 64] + >>> out_channels = [16, 32, 64, 128] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = SSH(num_scales=4, in_channels=in_channels, + ... out_channels=out_channels) + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 16, 340, 340]) + outputs[1].shape = torch.Size([1, 32, 170, 170]) + outputs[2].shape = torch.Size([1, 64, 84, 84]) + outputs[3].shape = torch.Size([1, 128, 43, 43]) + """ + + def __init__(self, + num_scales: int, + in_channels: List[int], + out_channels: List[int], + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = dict( + type='Xavier', layer='Conv2d', distribution='uniform')): + super().__init__(init_cfg=init_cfg) + assert (num_scales == len(in_channels) == len(out_channels)) + self.num_scales = num_scales + self.in_channels = in_channels + self.out_channels = out_channels + + for idx in range(self.num_scales): + in_c, out_c = self.in_channels[idx], self.out_channels[idx] + self.add_module( + f'ssh_module{idx}', + SSHDetModule( + in_channels=in_c, + out_channels=out_c, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + + def forward(self, inputs: Tuple[torch.Tensor]) -> tuple: + assert len(inputs) == self.num_scales + + outs = [] + for idx, x in enumerate(inputs): + ssh_module = getattr(self, f'ssh_module{idx}') + out = ssh_module(x) + outs.append(out) + + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/yolo_neck.py b/head_extractor/build/lib/mmdet/models/necks/yolo_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..48a6b1a4897c85083aa1e1e7d692263f66de67c3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/yolo_neck.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) 2019 Western Digital Corporation or its affiliates. +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig + + +class DetectionBlock(BaseModule): + """Detection block in YOLO neck. + + Letting out_channels = n, the DetectionBlock contains: + six ConvLayers, one Conv2D layer and one YoloLayer. + The first six ConvLayers are arranged as follows: + 1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn, 3x3x2n. + The Conv2D layer is 1x1x255. + Some blocks have a branch after the fifth ConvLayer. + The input channel is arbitrary (in_channels). + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict( + type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None) -> None: + super(DetectionBlock, self).__init__(init_cfg) + double_out_channels = out_channels * 2 + + # shortcut + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.conv1 = ConvModule(in_channels, out_channels, 1, **cfg) + self.conv2 = ConvModule( + out_channels, double_out_channels, 3, padding=1, **cfg) + self.conv3 = ConvModule(double_out_channels, out_channels, 1, **cfg) + self.conv4 = ConvModule( + out_channels, double_out_channels, 3, padding=1, **cfg) + self.conv5 = ConvModule(double_out_channels, out_channels, 1, **cfg) + + def forward(self, x: Tensor) -> Tensor: + tmp = self.conv1(x) + tmp = self.conv2(tmp) + tmp = self.conv3(tmp) + tmp = self.conv4(tmp) + out = self.conv5(tmp) + return out + + +@MODELS.register_module() +class YOLOV3Neck(BaseModule): + """The neck of YOLOV3. + + It can be treated as a simplified version of FPN. It + will take the result from Darknet backbone and do some upsampling and + concatenation. It will finally output the detection result. + + Note: + The input feats should be from top to bottom. + i.e., from high-lvl to low-lvl + But YOLOV3Neck will process them in reversed order. + i.e., from bottom (high-lvl) to top (low-lvl) + + Args: + num_scales (int): The number of scales / stages. + in_channels (List[int]): The number of input channels per scale. + out_channels (List[int]): The number of output channels per scale. + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None. + norm_cfg (dict, optional): Dictionary to construct and config norm + layer. Default: dict(type='BN', requires_grad=True) + act_cfg (dict, optional): Config dict for activation layer. + Default: dict(type='LeakyReLU', negative_slope=0.1). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + num_scales: int, + in_channels: List[int], + out_channels: List[int], + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN', requires_grad=True), + act_cfg: ConfigType = dict( + type='LeakyReLU', negative_slope=0.1), + init_cfg: OptMultiConfig = None) -> None: + super(YOLOV3Neck, self).__init__(init_cfg) + assert (num_scales == len(in_channels) == len(out_channels)) + self.num_scales = num_scales + self.in_channels = in_channels + self.out_channels = out_channels + + # shortcut + cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg) + + # To support arbitrary scales, the code looks awful, but it works. + # Better solution is welcomed. 
+ self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg) + for i in range(1, self.num_scales): + in_c, out_c = self.in_channels[i], self.out_channels[i] + inter_c = out_channels[i - 1] + self.add_module(f'conv{i}', ConvModule(inter_c, out_c, 1, **cfg)) + # in_c + out_c : High-lvl feats will be cat with low-lvl feats + self.add_module(f'detect{i+1}', + DetectionBlock(in_c + out_c, out_c, **cfg)) + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]: + assert len(feats) == self.num_scales + + # processed from bottom (high-lvl) to top (low-lvl) + outs = [] + out = self.detect1(feats[-1]) + outs.append(out) + + for i, x in enumerate(reversed(feats[:-1])): + conv = getattr(self, f'conv{i+1}') + tmp = conv(out) + + # Cat with low-lvl feats + tmp = F.interpolate(tmp, scale_factor=2) + tmp = torch.cat((tmp, x), 1) + + detect = getattr(self, f'detect{i+2}') + out = detect(tmp) + outs.append(out) + + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/necks/yolox_pafpn.py b/head_extractor/build/lib/mmdet/models/necks/yolox_pafpn.py new file mode 100644 index 0000000000000000000000000000000000000000..8ec3d12bfde8158c1a817fbf223a8eea94798667 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/necks/yolox_pafpn.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from ..layers import CSPLayer + + +@MODELS.register_module() +class YOLOXPAFPN(BaseModule): + """Path Aggregation Network used in YOLOX. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 3 + use_depthwise (bool): Whether to use depthwise separable convolution + in blocks. Default: False + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(scale_factor=2, mode='nearest')` + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', momentum=0.03, eps=0.001) + act_cfg (dict): Config dict for activation layer. + Default: dict(type='Swish') + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None.
+ """ + + def __init__(self, + in_channels, + out_channels, + num_csp_blocks=3, + use_depthwise=False, + upsample_cfg=dict(scale_factor=2, mode='nearest'), + conv_cfg=None, + norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), + act_cfg=dict(type='Swish'), + init_cfg=dict( + type='Kaiming', + layer='Conv2d', + a=math.sqrt(5), + distribution='uniform', + mode='fan_in', + nonlinearity='leaky_relu')): + super(YOLOXPAFPN, self).__init__(init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + + conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule + + # build top-down blocks + self.upsample = nn.Upsample(**upsample_cfg) + self.reduce_layers = nn.ModuleList() + self.top_down_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1, 0, -1): + self.reduce_layers.append( + ConvModule( + in_channels[idx], + in_channels[idx - 1], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.top_down_blocks.append( + CSPLayer( + in_channels[idx - 1] * 2, + in_channels[idx - 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # build bottom-up blocks + self.downsamples = nn.ModuleList() + self.bottom_up_blocks = nn.ModuleList() + for idx in range(len(in_channels) - 1): + self.downsamples.append( + conv( + in_channels[idx], + in_channels[idx], + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottom_up_blocks.append( + CSPLayer( + in_channels[idx] * 2, + in_channels[idx + 1], + num_blocks=num_csp_blocks, + add_identity=False, + use_depthwise=use_depthwise, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.out_convs = nn.ModuleList() + for i in range(len(in_channels)): + self.out_convs.append( + ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs): + """ + Args: + inputs (tuple[Tensor]): input features. + + Returns: + tuple[Tensor]: YOLOXPAFPN features. + """ + assert len(inputs) == len(self.in_channels) + + # top-down path + inner_outs = [inputs[-1]] + for idx in range(len(self.in_channels) - 1, 0, -1): + feat_heigh = inner_outs[0] + feat_low = inputs[idx - 1] + feat_heigh = self.reduce_layers[len(self.in_channels) - 1 - idx]( + feat_heigh) + inner_outs[0] = feat_heigh + + upsample_feat = self.upsample(feat_heigh) + + inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( + torch.cat([upsample_feat, feat_low], 1)) + inner_outs.insert(0, inner_out) + + # bottom-up path + outs = [inner_outs[0]] + for idx in range(len(self.in_channels) - 1): + feat_low = outs[-1] + feat_height = inner_outs[idx + 1] + downsample_feat = self.downsamples[idx](feat_low) + out = self.bottom_up_blocks[idx]( + torch.cat([downsample_feat, feat_height], 1)) + outs.append(out) + + # out convs + for idx, conv in enumerate(self.out_convs): + outs[idx] = conv(outs[idx]) + + return tuple(outs) diff --git a/head_extractor/build/lib/mmdet/models/reid/__init__.py b/head_extractor/build/lib/mmdet/models/reid/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aca617f7dea0b8047891c666ddb684dbbd018c81 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/reid/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base_reid import BaseReID +from .fc_module import FcModule +from .gap import GlobalAveragePooling +from .linear_reid_head import LinearReIDHead + +__all__ = ['BaseReID', 'GlobalAveragePooling', 'LinearReIDHead', 'FcModule'] diff --git a/head_extractor/build/lib/mmdet/models/reid/base_reid.py b/head_extractor/build/lib/mmdet/models/reid/base_reid.py new file mode 100644 index 0000000000000000000000000000000000000000..4c45964394aa1651f846f2a7e63da3ee70b78909 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/reid/base_reid.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +import torch + +try: + import mmpretrain + from mmpretrain.models.classifiers import ImageClassifier +except ImportError: + mmpretrain = None + ImageClassifier = object + +from mmdet.registry import MODELS +from mmdet.structures import ReIDDataSample + + +@MODELS.register_module() +class BaseReID(ImageClassifier): + """Base model for re-identification.""" + + def __init__(self, *args, **kwargs): + if mmpretrain is None: + raise RuntimeError('Please run "pip install openmim" and ' + 'run "mim install mmpretrain" to ' + 'install mmpretrain first.') + super().__init__(*args, **kwargs) + + def forward(self, + inputs: torch.Tensor, + data_samples: Optional[List[ReIDDataSample]] = None, + mode: str = 'tensor'): + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`ReIDDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method handles neither back propagation nor + optimizer updating; both are done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape + (N, C, H, W) or (N, T, C, H, W). + data_samples (List[ReIDDataSample], optional): The annotation + data of every sample. It's required if ``mode="loss"``. + Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of + :obj:`ReIDDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if len(inputs.size()) == 5: + assert inputs.size(0) == 1 + inputs = inputs[0] + return super().forward(inputs, data_samples, mode) diff --git a/head_extractor/build/lib/mmdet/models/reid/fc_module.py b/head_extractor/build/lib/mmdet/models/reid/fc_module.py new file mode 100644 index 0000000000000000000000000000000000000000..76e7efd66e300a242bb250cc6ba5cc68ed722034 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/reid/fc_module.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class FcModule(BaseModule): + """Fully-connected layer module. + + Args: + in_channels (int): Input channels. + out_channels (int): Output channels. + norm_cfg (dict, optional): Configuration of normalization method + after fc. Defaults to None.
+ act_cfg (dict, optional): Configuration of activation method after fc. + Defaults to dict(type='ReLU'). + inplace (bool, optional): Whether to apply the activation module + in-place. Defaults to True. + init_cfg (dict, optional): Initialization config dict. + Defaults to dict(type='Kaiming', layer='Linear'). + """ + + def __init__(self, + in_channels: int, + out_channels: int, + norm_cfg: dict = None, + act_cfg: dict = dict(type='ReLU'), + inplace: bool = True, + init_cfg=dict(type='Kaiming', layer='Linear')): + super(FcModule, self).__init__(init_cfg) + assert norm_cfg is None or isinstance(norm_cfg, dict) + assert act_cfg is None or isinstance(act_cfg, dict) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.inplace = inplace + + self.with_norm = norm_cfg is not None + self.with_activation = act_cfg is not None + + self.fc = nn.Linear(in_channels, out_channels) + # build normalization layers + if self.with_norm: + self.norm_name, norm = build_norm_layer(norm_cfg, out_channels) + self.add_module(self.norm_name, norm) + + # build activation layer + if self.with_activation: + act_cfg_ = act_cfg.copy() + # nn.Tanh has no 'inplace' argument + if act_cfg_['type'] not in [ + 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' + ]: + act_cfg_.setdefault('inplace', inplace) + self.activate = build_activation_layer(act_cfg_) + + @property + def norm(self): + """Normalization.""" + return getattr(self, self.norm_name) + + def forward(self, x, activate=True, norm=True): + """Model forward.""" + x = self.fc(x) + if norm and self.with_norm: + x = self.norm(x) + if activate and self.with_activation: + x = self.activate(x) + return x diff --git a/head_extractor/build/lib/mmdet/models/reid/gap.py b/head_extractor/build/lib/mmdet/models/reid/gap.py new file mode 100644 index 0000000000000000000000000000000000000000..aadc25e7144f2ca9efb66b496bf8ffa5504619ff --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/reid/gap.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.model import BaseModule + +from mmdet.registry import MODELS + + +@MODELS.register_module() +class GlobalAveragePooling(BaseModule): + """Global Average Pooling neck. + + Note that we use `view` to remove the extra dimensions after pooling. We + do not use `squeeze` as it will also remove the batch dimension when the + tensor has a batch dimension of size 1, which can lead to unexpected + errors. + """ + + def __init__(self, kernel_size=None, stride=None): + super(GlobalAveragePooling, self).__init__() + if kernel_size is None and stride is None: + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + else: + self.gap = nn.AvgPool2d(kernel_size, stride) + + def forward(self, inputs): + if isinstance(inputs, tuple): + outs = tuple([self.gap(x) for x in inputs]) + outs = tuple([ + out.view(x.size(0), + torch.tensor(out.size()[1:]).prod()) + for out, x in zip(outs, inputs) + ]) + elif isinstance(inputs, torch.Tensor): + outs = self.gap(inputs) + outs = outs.view( + inputs.size(0), + torch.tensor(outs.size()[1:]).prod()) + else: + raise TypeError('neck inputs should be tuple or torch.Tensor') + return outs diff --git a/head_extractor/build/lib/mmdet/models/reid/linear_reid_head.py b/head_extractor/build/lib/mmdet/models/reid/linear_reid_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f35aaf6c2fc57b60e36017268e2a632df60ed342 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/reid/linear_reid_head.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab.
All rights reserved. +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn + +try: + import mmpretrain + from mmpretrain.evaluation.metrics import Accuracy +except ImportError: + mmpretrain = None + +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.structures import ReIDDataSample +from .fc_module import FcModule + + +@MODELS.register_module() +class LinearReIDHead(BaseModule): + """Linear head for re-identification. + + Args: + num_fcs (int): Number of fcs. + in_channels (int): Number of channels in the input. + fc_channels (int): Number of channels in the fcs. + out_channels (int): Number of channels in the output. + norm_cfg (dict, optional): Configuration of normalization method + after fc. Defaults to None. + act_cfg (dict, optional): Configuration of activation method after fc. + Defaults to None. + num_classes (int, optional): Number of the identities. Defaults to + None. + loss_cls (dict, optional): Cross entropy loss to train the ReID module. + Defaults to None. + loss_triplet (dict, optional): Triplet loss to train the ReID module. + Defaults to None. + topk (int | Tuple[int]): Top-k accuracy. Defaults to ``(1, )``. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to dict(type='Normal', layer='Linear', mean=0, std=0.01, + bias=0). + """ + + def __init__(self, + num_fcs: int, + in_channels: int, + fc_channels: int, + out_channels: int, + norm_cfg: Optional[dict] = None, + act_cfg: Optional[dict] = None, + num_classes: Optional[int] = None, + loss_cls: Optional[dict] = None, + loss_triplet: Optional[dict] = None, + topk: Union[int, Tuple[int]] = (1, ), + init_cfg: Union[dict, List[dict]] = dict( + type='Normal', layer='Linear', mean=0, std=0.01, bias=0)): + if mmpretrain is None: + raise RuntimeError('Please run "pip install openmim" and ' + 'run "mim install mmpretrain" to ' + 'install mmpretrain first.') + super(LinearReIDHead, self).__init__(init_cfg=init_cfg) + + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + for _topk in topk: + assert _topk > 0, 'Top-k should be larger than 0' + self.topk = topk + + if loss_cls is None: + if isinstance(num_classes, int): + warnings.warn('Since cross entropy loss is not set, ' + 'the num_classes will be ignored.') + if loss_triplet is None: + raise ValueError('Please choose at least one loss from ' + 'triplet loss and cross entropy loss.') + elif not isinstance(num_classes, int): + raise TypeError('The num_classes must be a valid int ' + 'if there is cross entropy loss.') + self.loss_cls = MODELS.build(loss_cls) if loss_cls else None + self.loss_triplet = MODELS.build(loss_triplet) \ + if loss_triplet else None + + self.num_fcs = num_fcs + self.in_channels = in_channels + self.fc_channels = fc_channels + self.out_channels = out_channels + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.num_classes = num_classes + + self._init_layers() + + def _init_layers(self): + """Initialize fc layers.""" + self.fcs = nn.ModuleList() + for i in range(self.num_fcs): + in_channels = self.in_channels if i == 0 else self.fc_channels + self.fcs.append( + FcModule(in_channels, self.fc_channels, self.norm_cfg, + self.act_cfg)) + in_channels = self.in_channels if self.num_fcs == 0 else \ + self.fc_channels + self.fc_out = nn.Linear(in_channels, self.out_channels) + if self.loss_cls: + self.bn = nn.BatchNorm1d(self.out_channels) + self.classifier = nn.Linear(self.out_channels, self.num_classes) + + def 
forward(self, feats: Tuple[torch.Tensor]) -> torch.Tensor: + """The forward process.""" + # Multiple stage inputs are acceptable + # but only the last stage will be used. + feats = feats[-1] + + for m in self.fcs: + feats = m(feats) + feats = self.fc_out(feats) + return feats + + def loss(self, feats: Tuple[torch.Tensor], + data_samples: List[ReIDDataSample]) -> dict: + """Calculate losses. + + Args: + feats (tuple[Tensor]): The features extracted from the backbone. + data_samples (List[ReIDDataSample]): The annotation data of + every sample. + + Returns: + dict: a dictionary of loss components + """ + # This part can be traced by torch.fx + feats = self(feats) + + # This part cannot be traced by torch.fx + losses = self.loss_by_feat(feats, data_samples) + return losses + + def loss_by_feat(self, feats: torch.Tensor, + data_samples: List[ReIDDataSample]) -> dict: + """Unpack data samples and compute loss.""" + losses = dict() + gt_label = torch.cat([i.gt_label.label for i in data_samples]) + gt_label = gt_label.to(feats.device) + + if self.loss_triplet: + losses['triplet_loss'] = self.loss_triplet(feats, gt_label) + + if self.loss_cls: + feats_bn = self.bn(feats) + cls_score = self.classifier(feats_bn) + losses['ce_loss'] = self.loss_cls(cls_score, gt_label) + acc = Accuracy.calculate(cls_score, gt_label, topk=self.topk) + losses.update( + {f'accuracy_top-{k}': a + for k, a in zip(self.topk, acc)}) + + return losses + + def predict( + self, + feats: Tuple[torch.Tensor], + data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]: + """Inference without augmentation. + + Args: + feats (Tuple[Tensor]): The features extracted from the backbone. + Multiple stage inputs are acceptable but only the last stage + will be used. + data_samples (List[ReIDDataSample], optional): The annotation + data of every sample. If not None, set ``pred_label`` of + the input data samples. Defaults to None. + + Returns: + List[ReIDDataSample]: A list of data samples which contains the + predicted results. + """ + # This part can be traced by torch.fx + feats = self(feats) + + # This part cannot be traced by torch.fx + data_samples = self.predict_by_feat(feats, data_samples) + + return data_samples + + def predict_by_feat( + self, + feats: torch.Tensor, + data_samples: List[ReIDDataSample] = None) -> List[ReIDDataSample]: + """Add prediction features to data samples.""" + if data_samples is not None: + for data_sample, feat in zip(data_samples, feats): + data_sample.pred_feature = feat + else: + data_samples = [] + for feat in feats: + data_sample = ReIDDataSample() + data_sample.pred_feature = feat + data_samples.append(data_sample) + + return data_samples diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/__init__.py b/head_extractor/build/lib/mmdet/models/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bba5664cc5ae5229ddebcb42f7583364ca9f77d8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved.
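+# The heads re-exported below are registered in mmdet's MODELS registry, so
+# configs normally refer to them by type name instead of importing the
+# classes. A minimal sketch, assuming an installed mmdet; the channel and
+# class counts are illustrative only, not part of the upstream file.
+#
+#     >>> from mmdet.registry import MODELS
+#     >>> head = MODELS.build(dict(
+#     ...     type='Shared2FCBBoxHead',
+#     ...     in_channels=256,
+#     ...     fc_out_channels=1024,
+#     ...     roi_feat_size=7,
+#     ...     num_classes=80))
+#     >>> type(head).__name__
+#     'Shared2FCBBoxHead'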
+from .base_roi_head import BaseRoIHead +from .bbox_heads import (BBoxHead, ConvFCBBoxHead, DIIHead, + DoubleConvFCBBoxHead, SABLHead, SCNetBBoxHead, + Shared2FCBBoxHead, Shared4Conv1FCBBoxHead) +from .cascade_roi_head import CascadeRoIHead +from .double_roi_head import DoubleHeadRoIHead +from .dynamic_roi_head import DynamicRoIHead +from .grid_roi_head import GridRoIHead +from .htc_roi_head import HybridTaskCascadeRoIHead +from .mask_heads import (CoarseMaskHead, FCNMaskHead, FeatureRelayHead, + FusedSemanticHead, GlobalContextHead, GridHead, + HTCMaskHead, MaskIoUHead, MaskPointHead, + SCNetMaskHead, SCNetSemanticHead) +from .mask_scoring_roi_head import MaskScoringRoIHead +from .multi_instance_roi_head import MultiInstanceRoIHead +from .pisa_roi_head import PISARoIHead +from .point_rend_roi_head import PointRendRoIHead +from .roi_extractors import (BaseRoIExtractor, GenericRoIExtractor, + SingleRoIExtractor) +from .scnet_roi_head import SCNetRoIHead +from .shared_heads import ResLayer +from .sparse_roi_head import SparseRoIHead +from .standard_roi_head import StandardRoIHead +from .trident_roi_head import TridentRoIHead + +__all__ = [ + 'BaseRoIHead', 'CascadeRoIHead', 'DoubleHeadRoIHead', 'MaskScoringRoIHead', + 'HybridTaskCascadeRoIHead', 'GridRoIHead', 'ResLayer', 'BBoxHead', + 'ConvFCBBoxHead', 'DIIHead', 'SABLHead', 'Shared2FCBBoxHead', + 'StandardRoIHead', 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', + 'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead', + 'MaskIoUHead', 'BaseRoIExtractor', 'GenericRoIExtractor', + 'SingleRoIExtractor', 'PISARoIHead', 'PointRendRoIHead', 'MaskPointHead', + 'CoarseMaskHead', 'DynamicRoIHead', 'SparseRoIHead', 'TridentRoIHead', + 'SCNetRoIHead', 'SCNetMaskHead', 'SCNetSemanticHead', 'SCNetBBoxHead', + 'FeatureRelayHead', 'GlobalContextHead', 'MultiInstanceRoIHead' +] diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/base_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/base_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..405f80a73ecc5db7343d81ca55518160fcbc2b63 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/base_roi_head.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
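+# BaseRoIHead below is abstract. A hedged sketch (not part of the upstream
+# file) of the hooks a concrete subclass must provide before `with_bbox` /
+# `with_mask` become meaningful; `TinyRoIHead` and its no-op bodies are
+# hypothetical, not part of mmdet.
+#
+#     >>> from mmdet.models.roi_heads import BaseRoIHead
+#     >>> class TinyRoIHead(BaseRoIHead):
+#     ...     def init_bbox_head(self, bbox_roi_extractor, bbox_head):
+#     ...         self.bbox_head = None  # build the real modules here
+#     ...     def init_mask_head(self, mask_roi_extractor, mask_head):
+#     ...         self.mask_head = None
+#     ...     def init_assigner_sampler(self):
+#     ...         pass
+#     ...     def loss(self, x, rpn_results_list, batch_data_samples):
+#     ...         return dict()
+#     >>> TinyRoIHead().with_bbox  # no bbox_head was configured
+#     False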
+from abc import ABCMeta, abstractmethod +from typing import Tuple + +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig + + +class BaseRoIHead(BaseModule, metaclass=ABCMeta): + """Base class for RoIHeads.""" + + def __init__(self, + bbox_roi_extractor: OptMultiConfig = None, + bbox_head: OptMultiConfig = None, + mask_roi_extractor: OptMultiConfig = None, + mask_head: OptMultiConfig = None, + shared_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if shared_head is not None: + self.shared_head = MODELS.build(shared_head) + + if bbox_head is not None: + self.init_bbox_head(bbox_roi_extractor, bbox_head) + + if mask_head is not None: + self.init_mask_head(mask_roi_extractor, mask_head) + + self.init_assigner_sampler() + + @property + def with_bbox(self) -> bool: + """bool: whether the RoI head contains a `bbox_head`""" + return hasattr(self, 'bbox_head') and self.bbox_head is not None + + @property + def with_mask(self) -> bool: + """bool: whether the RoI head contains a `mask_head`""" + return hasattr(self, 'mask_head') and self.mask_head is not None + + @property + def with_shared_head(self) -> bool: + """bool: whether the RoI head contains a `shared_head`""" + return hasattr(self, 'shared_head') and self.shared_head is not None + + @abstractmethod + def init_bbox_head(self, *args, **kwargs): + """Initialize ``bbox_head``""" + pass + + @abstractmethod + def init_mask_head(self, *args, **kwargs): + """Initialize ``mask_head``""" + pass + + @abstractmethod + def init_assigner_sampler(self, *args, **kwargs): + """Initialize assigner and sampler.""" + pass + + @abstractmethod + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList): + """Perform forward propagation and loss calculation of the roi head on + the features of the upstream network.""" + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from upstream network. Each + has shape (N, C, H, W). + rpn_results_list (list[:obj:`InstanceData`]): list of region + proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results to + the original image. Defaults to False. + + Returns: + list[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arranged as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + assert self.with_bbox, 'Bbox head must be implemented.'
+ batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + + # TODO: nms_op in mmcv needs to be enhanced; the bbox results may + # differ when rescale is not applied in bbox_head + + # If it has the mask branch, the bbox branch does not need + # to be scaled to the original image scale, because the mask + # branch will scale both bbox and mask at the same time. + bbox_rescale = rescale if not self.with_mask else False + results_list = self.predict_bbox( + x, + batch_img_metas, + rpn_results_list, + rcnn_test_cfg=self.test_cfg, + rescale=bbox_rescale) + + if self.with_mask: + results_list = self.predict_mask( + x, batch_img_metas, results_list, rescale=rescale) + + return results_list diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/__init__.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d9e742abfecfc9dfe37b78822407fc92e9d64cc3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .bbox_head import BBoxHead +from .convfc_bbox_head import (ConvFCBBoxHead, Shared2FCBBoxHead, + Shared4Conv1FCBBoxHead) +from .dii_head import DIIHead +from .double_bbox_head import DoubleConvFCBBoxHead +from .multi_instance_bbox_head import MultiInstanceBBoxHead +from .sabl_head import SABLHead +from .scnet_bbox_head import SCNetBBoxHead + +__all__ = [ + 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', + 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'SABLHead', 'DIIHead', + 'SCNetBBoxHead', 'MultiInstanceBBoxHead' +] diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/bbox_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3b2e8aae0833ae0351b544099d79d296f082a76e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/bbox_head.py @@ -0,0 +1,708 @@ +# Copyright (c) OpenMMLab. All rights reserved.
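+# A minimal shape sketch for the BBoxHead defined below, assuming an
+# installed mmdet with its default registries; not part of the upstream file.
+# Without `with_avg_pool`, the head expects already-flattened RoI features of
+# size in_channels * roi_feat_size ** 2; all numbers here are illustrative.
+#
+#     >>> import torch
+#     >>> from mmdet.models.roi_heads.bbox_heads import BBoxHead
+#     >>> head = BBoxHead(in_channels=256, roi_feat_size=7, num_classes=80)
+#     >>> roi_feats = torch.rand(4, 256 * 7 * 7)  # 4 flattened RoIs
+#     >>> cls_score, bbox_pred = head(roi_feats)
+#     >>> cls_score.shape  # 80 classes + 1 background
+#     torch.Size([4, 81])
+#     >>> bbox_pred.shape  # 4 deltas per class
+#     torch.Size([4, 320])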
+from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.models.layers import multiclass_nms +from mmdet.models.losses import accuracy +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import empty_instances, multi_apply +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures.bbox import get_box_tensor, scale_boxes +from mmdet.utils import ConfigType, InstanceList, OptMultiConfig + + +@MODELS.register_module() +class BBoxHead(BaseModule): + """Simplest RoI head, with only two fc layers for classification and + regression respectively.""" + + def __init__(self, + with_avg_pool: bool = False, + with_cls: bool = True, + with_reg: bool = True, + roi_feat_size: int = 7, + in_channels: int = 256, + num_classes: int = 80, + bbox_coder: ConfigType = dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + predict_box_type: str = 'hbox', + reg_class_agnostic: bool = False, + reg_decoded_bbox: bool = False, + reg_predictor_cfg: ConfigType = dict(type='Linear'), + cls_predictor_cfg: ConfigType = dict(type='Linear'), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox: ConfigType = dict( + type='SmoothL1Loss', beta=1.0, loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + assert with_cls or with_reg + self.with_avg_pool = with_avg_pool + self.with_cls = with_cls + self.with_reg = with_reg + self.roi_feat_size = _pair(roi_feat_size) + self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] + self.in_channels = in_channels + self.num_classes = num_classes + self.predict_box_type = predict_box_type + self.reg_class_agnostic = reg_class_agnostic + self.reg_decoded_bbox = reg_decoded_bbox + self.reg_predictor_cfg = reg_predictor_cfg + self.cls_predictor_cfg = cls_predictor_cfg + + self.bbox_coder = TASK_UTILS.build(bbox_coder) + self.loss_cls = MODELS.build(loss_cls) + self.loss_bbox = MODELS.build(loss_bbox) + + in_channels = self.in_channels + if self.with_avg_pool: + self.avg_pool = nn.AvgPool2d(self.roi_feat_size) + else: + in_channels *= self.roi_feat_area + if self.with_cls: + # need to add background class + if self.custom_cls_channels: + cls_channels = self.loss_cls.get_cls_channels(self.num_classes) + else: + cls_channels = num_classes + 1 + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() + cls_predictor_cfg_.update( + in_features=in_channels, out_features=cls_channels) + self.fc_cls = MODELS.build(cls_predictor_cfg_) + if self.with_reg: + box_dim = self.bbox_coder.encode_size + out_dim_reg = box_dim if reg_class_agnostic else \ + box_dim * num_classes + reg_predictor_cfg_ = self.reg_predictor_cfg.copy() + if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): + reg_predictor_cfg_.update( + in_features=in_channels, out_features=out_dim_reg) + self.fc_reg = MODELS.build(reg_predictor_cfg_) + self.debug_imgs = None + if init_cfg is None: + self.init_cfg = [] + if self.with_cls: + self.init_cfg += [ + dict( + type='Normal', std=0.01, override=dict(name='fc_cls')) + ] + if self.with_reg: + self.init_cfg += [ + dict( + type='Normal', std=0.001, override=dict(name='fc_reg')) + ] + + # TODO: Create 
a SeesawBBoxHead to simplify logic in BBoxHead + @property + def custom_cls_channels(self) -> bool: + """get custom_cls_channels from loss_cls.""" + return getattr(self.loss_cls, 'custom_cls_channels', False) + + # TODO: Create a SeesawBBoxHead to simplify logic in BBoxHead + @property + def custom_activation(self) -> bool: + """get custom_activation from loss_cls.""" + return getattr(self.loss_cls, 'custom_activation', False) + + # TODO: Create a SeesawBBoxHead to simplify logic in BBoxHead + @property + def custom_accuracy(self) -> bool: + """get custom_accuracy from loss_cls.""" + return getattr(self.loss_cls, 'custom_accuracy', False) + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification scores, has shape + (num_boxes, num_classes + 1). + - bbox_pred (Tensor): Box energies / deltas, has shape + (num_boxes, num_classes * 4). + """ + if self.with_avg_pool: + if x.numel() > 0: + x = self.avg_pool(x) + x = x.view(x.size(0), -1) + else: + # avg_pool does not support empty tensor, + # so use torch.mean instead + x = torch.mean(x, dim=(-1, -2)) + cls_score = self.fc_cls(x) if self.with_cls else None + bbox_pred = self.fc_reg(x) if self.with_reg else None + return cls_score, bbox_pred + + def _get_targets_single(self, pos_priors: Tensor, neg_priors: Tensor, + pos_gt_bboxes: Tensor, pos_gt_labels: Tensor, + cfg: ConfigDict) -> tuple: + """Calculate the ground truth for proposals in a single image + according to the sampling results. + + Args: + pos_priors (Tensor): Contains all the positive boxes, + has shape (num_pos, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + neg_priors (Tensor): Contains all the negative boxes, + has shape (num_neg, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_bboxes (Tensor): Contains gt_boxes for + all positive samples, has shape (num_pos, 4), + the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_labels (Tensor): Contains gt_labels for + all positive samples, has shape (num_pos, ). + cfg (obj:`ConfigDict`): `train_cfg` of R-CNN. + + Returns: + Tuple[Tensor]: Ground truth for proposals + in a single image. Containing the following Tensors: + + - labels (Tensor): Gt_labels for all proposals, has + shape (num_proposals,). + - label_weights (Tensor): Labels_weights for all + proposals, has shape (num_proposals,). + - bbox_targets (Tensor): Regression target for all + proposals, has shape (num_proposals, 4), the + last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights (Tensor): Regression weights for all + proposals, has shape (num_proposals, 4).
+ """ + num_pos = pos_priors.size(0) + num_neg = neg_priors.size(0) + num_samples = num_pos + num_neg + + # original implementation uses new_zeros since BG are set to be 0 + # now use empty & fill because BG cat_id = num_classes, + # FG cat_id = [0, num_classes-1] + labels = pos_priors.new_full((num_samples, ), + self.num_classes, + dtype=torch.long) + reg_dim = pos_gt_bboxes.size(-1) if self.reg_decoded_bbox \ + else self.bbox_coder.encode_size + label_weights = pos_priors.new_zeros(num_samples) + bbox_targets = pos_priors.new_zeros(num_samples, reg_dim) + bbox_weights = pos_priors.new_zeros(num_samples, reg_dim) + if num_pos > 0: + labels[:num_pos] = pos_gt_labels + pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight + label_weights[:num_pos] = pos_weight + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + pos_priors, pos_gt_bboxes) + else: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, both + # the predicted boxes and regression targets should be with + # absolute coordinate format. + pos_bbox_targets = get_box_tensor(pos_gt_bboxes) + bbox_targets[:num_pos, :] = pos_bbox_targets + bbox_weights[:num_pos, :] = 1 + if num_neg > 0: + label_weights[-num_neg:] = 1.0 + + return labels, label_weights, bbox_targets, bbox_weights + + def get_targets(self, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Almost the same as the implementation in bbox_head, we passed + additional parameters pos_inds_list and neg_inds_list to + `_get_targets_single` function. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following list of Tensors: + + - labels (list[Tensor],Tensor): Gt_labels for all + proposals in a batch, each tensor in list has + shape (num_proposals,) when `concat=False`, otherwise + just a single tensor has shape (num_all_proposals,). + - label_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has + shape (num_proposals,) when `concat=False`, otherwise + just a single tensor has shape (num_all_proposals,). + - bbox_targets (list[Tensor],Tensor): Regression target + for all proposals in a batch, each tensor in list + has shape (num_proposals, 4) when `concat=False`, + otherwise just a single tensor has shape + (num_all_proposals, 4), the last dimension 4 represents + [tl_x, tl_y, br_x, br_y]. + - bbox_weights (list[tensor],Tensor): Regression weights for + all proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals, 4). 
+ """ + pos_priors_list = [res.pos_priors for res in sampling_results] + neg_priors_list = [res.neg_priors for res in sampling_results] + pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] + pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results] + labels, label_weights, bbox_targets, bbox_weights = multi_apply( + self._get_targets_single, + pos_priors_list, + neg_priors_list, + pos_gt_bboxes_list, + pos_gt_labels_list, + cfg=rcnn_train_cfg) + + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bbox_targets = torch.cat(bbox_targets, 0) + bbox_weights = torch.cat(bbox_weights, 0) + return labels, label_weights, bbox_targets, bbox_weights + + def loss_and_target(self, + cls_score: Tensor, + bbox_pred: Tensor, + rois: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True, + reduction_override: Optional[str] = None) -> dict: + """Calculate the loss based on the features extracted by the bbox head. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): Regression prediction results, + has shape + (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. Defaults to True. + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None, + + Returns: + dict: A dictionary of loss and targets components. + The targets are only used for cascade rcnn. + """ + + cls_reg_targets = self.get_targets( + sampling_results, rcnn_train_cfg, concat=concat) + losses = self.loss( + cls_score, + bbox_pred, + rois, + *cls_reg_targets, + reduction_override=reduction_override) + + # cls_reg_targets is only for cascade rcnn + return dict(loss_bbox=losses, bbox_targets=cls_reg_targets) + + def loss(self, + cls_score: Tensor, + bbox_pred: Tensor, + rois: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tensor, + bbox_weights: Tensor, + reduction_override: Optional[str] = None) -> dict: + """Calculate the loss based on the network predictions and targets. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): Regression prediction results, + has shape + (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, ). + label_weights (Tensor): Labels_weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, ). 
+ bbox_targets (Tensor): Regression target for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, 4), + the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + bbox_weights (Tensor): Regression weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, 4). + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None, + + Returns: + dict: A dictionary of loss. + """ + + losses = dict() + + if cls_score is not None: + avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.) + if cls_score.numel() > 0: + loss_cls_ = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + reduction_override=reduction_override) + if isinstance(loss_cls_, dict): + losses.update(loss_cls_) + else: + losses['loss_cls'] = loss_cls_ + if self.custom_activation: + acc_ = self.loss_cls.get_accuracy(cls_score, labels) + losses.update(acc_) + else: + losses['acc'] = accuracy(cls_score, labels) + if bbox_pred is not None: + bg_class_ind = self.num_classes + # 0~self.num_classes-1 are FG, self.num_classes is BG + pos_inds = (labels >= 0) & (labels < bg_class_ind) + # do not perform bounding box regression for BG anymore. + if pos_inds.any(): + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, + # `GIouLoss`, `DIouLoss`) is applied directly on + # the decoded bounding boxes, it decodes the + # already encoded coordinates to absolute format. + bbox_pred = self.bbox_coder.decode(rois[:, 1:], bbox_pred) + bbox_pred = get_box_tensor(bbox_pred) + if self.reg_class_agnostic: + pos_bbox_pred = bbox_pred.view( + bbox_pred.size(0), -1)[pos_inds.type(torch.bool)] + else: + pos_bbox_pred = bbox_pred.view( + bbox_pred.size(0), self.num_classes, + -1)[pos_inds.type(torch.bool), + labels[pos_inds.type(torch.bool)]] + losses['loss_bbox'] = self.loss_bbox( + pos_bbox_pred, + bbox_targets[pos_inds.type(torch.bool)], + bbox_weights[pos_inds.type(torch.bool)], + avg_factor=bbox_targets.size(0), + reduction_override=reduction_override) + else: + losses['loss_bbox'] = bbox_pred[pos_inds].sum() + + return losses + + def predict_by_feat(self, + rois: Tuple[Tensor], + cls_scores: Tuple[Tensor], + bbox_preds: Tuple[Tensor], + batch_img_metas: List[dict], + rcnn_test_cfg: Optional[ConfigDict] = None, + rescale: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + bbox results. + + Args: + rois (tuple[Tensor]): Tuple of boxes to be transformed. + Each has shape (num_boxes, 5). last dimension 5 arrange as + (batch_index, x1, y1, x2, y2). + cls_scores (tuple[Tensor]): Tuple of box scores, each has shape + (num_boxes, num_classes + 1). + bbox_preds (tuple[Tensor]): Tuple of box energies / deltas, each + has shape (num_boxes, num_classes * 4). + batch_img_metas (list[dict]): List of image information. + rcnn_test_cfg (obj:`ConfigDict`, optional): `test_cfg` of R-CNN. + Defaults to None. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Instance segmentation + results of each image after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). 
+ - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + assert len(cls_scores) == len(bbox_preds) + result_list = [] + for img_id in range(len(batch_img_metas)): + img_meta = batch_img_metas[img_id] + results = self._predict_by_feat_single( + roi=rois[img_id], + cls_score=cls_scores[img_id], + bbox_pred=bbox_preds[img_id], + img_meta=img_meta, + rescale=rescale, + rcnn_test_cfg=rcnn_test_cfg) + result_list.append(results) + + return result_list + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. + has shape (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image\ + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + if roi.shape[0] == 0: + return empty_instances([img_meta], + roi.device, + task_type='bbox', + instance_results=[results], + box_type=self.predict_box_type, + use_box_type=False, + num_classes=self.num_classes, + score_per_cls=rcnn_test_cfg is None)[0] + + # some loss (Seesaw loss..) may have custom activation + if self.custom_cls_channels: + scores = self.loss_cls.get_activation(cls_score) + else: + scores = F.softmax( + cls_score, dim=-1) if cls_score is not None else None + + img_shape = img_meta['img_shape'] + num_rois = roi.size(0) + # bbox_pred would be None in some detector when with_reg is False, + # e.g. Grid R-CNN. + if bbox_pred is not None: + num_classes = 1 if self.reg_class_agnostic else self.num_classes + roi = roi.repeat_interleave(num_classes, dim=0) + bbox_pred = bbox_pred.view(-1, self.bbox_coder.encode_size) + bboxes = self.bbox_coder.decode( + roi[..., 1:], bbox_pred, max_shape=img_shape) + else: + bboxes = roi[:, 1:].clone() + if img_shape is not None and bboxes.size(-1) == 4: + bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1]) + bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0]) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = [1 / s for s in img_meta['scale_factor']] + bboxes = scale_boxes(bboxes, scale_factor) + + # Get the inside tensor when `bboxes` is a box type + bboxes = get_box_tensor(bboxes) + box_dim = bboxes.size(-1) + bboxes = bboxes.view(num_rois, -1) + + if rcnn_test_cfg is None: + # This means that it is aug test. + # It needs to return the raw results without nms. 
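# Shape walk-through of the decode step above (illustrative numbers only:
# 100 RoIs, 80 classes, class-specific regression, encode_size == 4):
#   roi:       (100, 5)   -> repeat_interleave(80, dim=0) -> (8000, 5)
#   bbox_pred: (100, 320) -> view(-1, 4)                  -> (8000, 4)
#   decode(): each repeated RoI meets one class's delta   -> (8000, 4)
#   view(num_rois, -1): one row per original RoI again    -> (100, 320)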
+ results.bboxes = bboxes + results.scores = scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img, + box_dim=box_dim) + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results + + def refine_bboxes(self, sampling_results: Union[List[SamplingResult], + InstanceList], + bbox_results: dict, + batch_img_metas: List[dict]) -> InstanceList: + """Refine bboxes during training. + + Args: + sampling_results (List[:obj:`SamplingResult`] or + List[:obj:`InstanceData`]): Sampling results. + :obj:`SamplingResult` is the real sampling results + calculate from bbox_head, while :obj:`InstanceData` is + fake sampling results, e.g., in Sparse R-CNN or QueryInst, etc. + bbox_results (dict): Usually is a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + batch_img_metas (List[dict]): List of image information. + + Returns: + list[:obj:`InstanceData`]: Refined bboxes of each image. + + Example: + >>> # xdoctest: +REQUIRES(module:kwarray) + >>> import numpy as np + >>> from mmdet.models.task_modules.samplers. + ... sampling_result import random_boxes + >>> from mmdet.models.task_modules.samplers import SamplingResult + >>> self = BBoxHead(reg_class_agnostic=True) + >>> n_roi = 2 + >>> n_img = 4 + >>> scale = 512 + >>> rng = np.random.RandomState(0) + ... batch_img_metas = [{'img_shape': (scale, scale)} + >>> for _ in range(n_img)] + >>> sampling_results = [SamplingResult.random(rng=10) + ... for _ in range(n_img)] + >>> # Create rois in the expected format + >>> roi_boxes = random_boxes(n_roi, scale=scale, rng=rng) + >>> img_ids = torch.randint(0, n_img, (n_roi,)) + >>> img_ids = img_ids.float() + >>> rois = torch.cat([img_ids[:, None], roi_boxes], dim=1) + >>> # Create other args + >>> labels = torch.randint(0, 81, (scale,)).long() + >>> bbox_preds = random_boxes(n_roi, scale=scale, rng=rng) + >>> cls_score = torch.randn((scale, 81)) + ... # For each image, pretend random positive boxes are gts + >>> bbox_targets = (labels, None, None, None) + ... bbox_results = dict(rois=rois, bbox_pred=bbox_preds, + ... cls_score=cls_score, + ... bbox_targets=bbox_targets) + >>> bboxes_list = self.refine_bboxes(sampling_results, + ... bbox_results, + ... 
batch_img_metas) + >>> print(bboxes_list) + """ + pos_is_gts = [res.pos_is_gt for res in sampling_results] + # bbox_targets is a tuple + labels = bbox_results['bbox_targets'][0] + cls_scores = bbox_results['cls_score'] + rois = bbox_results['rois'] + bbox_preds = bbox_results['bbox_pred'] + if self.custom_activation: + # TODO: Create a SeesawBBoxHead to simplify the logic in BBoxHead + cls_scores = self.loss_cls.get_activation(cls_scores) + if cls_scores.numel() == 0: + return None + if cls_scores.shape[-1] == self.num_classes + 1: + # remove background class + cls_scores = cls_scores[:, :-1] + elif cls_scores.shape[-1] != self.num_classes: + raise ValueError('The last dim of `cls_scores` should equal ' + '`num_classes` or `num_classes + 1`, ' + f'but got {cls_scores.shape[-1]}.') + labels = torch.where(labels == self.num_classes, cls_scores.argmax(1), + labels) + + img_ids = rois[:, 0].long().unique(sorted=True) + assert img_ids.numel() <= len(batch_img_metas) + + results_list = [] + for i in range(len(batch_img_metas)): + inds = torch.nonzero( + rois[:, 0] == i, as_tuple=False).squeeze(dim=1) + num_rois = inds.numel() + + bboxes_ = rois[inds, 1:] + label_ = labels[inds] + bbox_pred_ = bbox_preds[inds] + img_meta_ = batch_img_metas[i] + pos_is_gts_ = pos_is_gts[i] + + bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_, + img_meta_) + # filter gt bboxes + pos_keep = 1 - pos_is_gts_ + keep_inds = pos_is_gts_.new_ones(num_rois) + keep_inds[:len(pos_is_gts_)] = pos_keep + results = InstanceData(bboxes=bboxes[keep_inds.type(torch.bool)]) + results_list.append(results) + + return results_list + + def regress_by_class(self, priors: Tensor, label: Tensor, + bbox_pred: Tensor, img_meta: dict) -> Tensor: + """Regress the bbox for the predicted class. Used in Cascade R-CNN. + + Args: + priors (Tensor): Priors from `rpn_head` or last stage + `bbox_head`, has shape (num_proposals, 4). + label (Tensor): Only used when `self.reg_class_agnostic` + is False, has shape (num_proposals, ). + bbox_pred (Tensor): Regression prediction of + current stage `bbox_head`. When `self.reg_class_agnostic` + is False, it has shape (n, num_classes * 4), otherwise + it has shape (n, 4). + img_meta (dict): Image meta info. + + Returns: + Tensor: Regressed bboxes, the same shape as input rois. + """ + reg_dim = self.bbox_coder.encode_size + if not self.reg_class_agnostic: + label = label * reg_dim + inds = torch.stack([label + i for i in range(reg_dim)], 1) + bbox_pred = torch.gather(bbox_pred, 1, inds) + assert bbox_pred.size()[1] == reg_dim + + max_shape = img_meta['img_shape'] + regressed_bboxes = self.bbox_coder.decode( + priors, bbox_pred, max_shape=max_shape) + return regressed_bboxes diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..cb6aadd86d34af3605d432492931442026432cc8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/convfc_bbox_head.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved.
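A quick illustration of the gather in ``regress_by_class`` above: with class-specific regression, each proposal's predicted label selects its own block of ``encode_size`` columns. A minimal sketch with toy numbers (all names below are illustrative, not part of the module):

.. code-block:: python

    import torch

    reg_dim = 4                       # stands in for bbox_coder.encode_size
    label = torch.tensor([0, 2])      # predicted class per proposal
    # 2 proposals x 3 classes x 4 deltas, flattened to (2, 12)
    bbox_pred = torch.arange(24, dtype=torch.float32).view(2, 12)

    label = label * reg_dim           # start column of each class block
    inds = torch.stack([label + i for i in range(reg_dim)], 1)
    # inds == tensor([[0, 1, 2, 3], [8, 9, 10, 11]])
    per_class_delta = torch.gather(bbox_pred, 1, inds)
    # rows: [0, 1, 2, 3] and [20, 21, 22, 23] -- one block per proposal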
+from typing import Optional, Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from torch import Tensor + +from mmdet.registry import MODELS +from .bbox_head import BBoxHead + + +@MODELS.register_module() +class ConvFCBBoxHead(BBoxHead): + r"""More general bbox head, with shared conv and fc layers and two optional + separated branches. + + .. code-block:: none + + /-> cls convs -> cls fcs -> cls + shared convs -> shared fcs + \-> reg convs -> reg fcs -> reg + """ # noqa: W605 + + def __init__(self, + num_shared_convs: int = 0, + num_shared_fcs: int = 0, + num_cls_convs: int = 0, + num_cls_fcs: int = 0, + num_reg_convs: int = 0, + num_reg_fcs: int = 0, + conv_out_channels: int = 256, + fc_out_channels: int = 1024, + conv_cfg: Optional[Union[dict, ConfigDict]] = None, + norm_cfg: Optional[Union[dict, ConfigDict]] = None, + init_cfg: Optional[Union[dict, ConfigDict]] = None, + *args, + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + assert (num_shared_convs + num_shared_fcs + num_cls_convs + + num_cls_fcs + num_reg_convs + num_reg_fcs > 0) + if num_cls_convs > 0 or num_reg_convs > 0: + assert num_shared_fcs == 0 + if not self.with_cls: + assert num_cls_convs == 0 and num_cls_fcs == 0 + if not self.with_reg: + assert num_reg_convs == 0 and num_reg_fcs == 0 + self.num_shared_convs = num_shared_convs + self.num_shared_fcs = num_shared_fcs + self.num_cls_convs = num_cls_convs + self.num_cls_fcs = num_cls_fcs + self.num_reg_convs = num_reg_convs + self.num_reg_fcs = num_reg_fcs + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + # add shared convs and fcs + self.shared_convs, self.shared_fcs, last_layer_dim = \ + self._add_conv_fc_branch( + self.num_shared_convs, self.num_shared_fcs, self.in_channels, + True) + self.shared_out_channels = last_layer_dim + + # add cls specific branch + self.cls_convs, self.cls_fcs, self.cls_last_dim = \ + self._add_conv_fc_branch( + self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels) + + # add reg specific branch + self.reg_convs, self.reg_fcs, self.reg_last_dim = \ + self._add_conv_fc_branch( + self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels) + + if self.num_shared_fcs == 0 and not self.with_avg_pool: + if self.num_cls_fcs == 0: + self.cls_last_dim *= self.roi_feat_area + if self.num_reg_fcs == 0: + self.reg_last_dim *= self.roi_feat_area + + self.relu = nn.ReLU(inplace=True) + # reconstruct fc_cls and fc_reg since input channels are changed + if self.with_cls: + if self.custom_cls_channels: + cls_channels = self.loss_cls.get_cls_channels(self.num_classes) + else: + cls_channels = self.num_classes + 1 + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() + cls_predictor_cfg_.update( + in_features=self.cls_last_dim, out_features=cls_channels) + self.fc_cls = MODELS.build(cls_predictor_cfg_) + if self.with_reg: + box_dim = self.bbox_coder.encode_size + out_dim_reg = box_dim if self.reg_class_agnostic else \ + box_dim * self.num_classes + reg_predictor_cfg_ = self.reg_predictor_cfg.copy() + if isinstance(reg_predictor_cfg_, (dict, ConfigDict)): + reg_predictor_cfg_.update( + in_features=self.reg_last_dim, out_features=out_dim_reg) + self.fc_reg = MODELS.build(reg_predictor_cfg_) + + if init_cfg is None: + # when init_cfg is None, + # It has been set to + # [[dict(type='Normal', std=0.01, override=dict(name='fc_cls'))], + # [dict(type='Normal', std=0.001, 
override=dict(name='fc_reg'))] + # after `super(ConvFCBBoxHead, self).__init__()` + # we only need to append additional configuration + # for `shared_fcs`, `cls_fcs` and `reg_fcs` + self.init_cfg += [ + dict( + type='Xavier', + distribution='uniform', + override=[ + dict(name='shared_fcs'), + dict(name='cls_fcs'), + dict(name='reg_fcs') + ]) + ] + + def _add_conv_fc_branch(self, + num_branch_convs: int, + num_branch_fcs: int, + in_channels: int, + is_shared: bool = False) -> tuple: + """Add shared or separable branch. + + convs -> avg pool (optional) -> fcs + """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, + self.conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + last_layer_dim = self.conv_out_channels + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + # for shared branch, only consider self.with_avg_pool + # for separated branches, also consider self.num_shared_fcs + if (is_shared + or self.num_shared_fcs == 0) and not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + return branch_convs, branch_fcs, last_layer_dim + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_base_priors * 4. 
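The feature-size bookkeeping in ``_add_conv_fc_branch`` above is easiest to see with concrete numbers. A sketch assuming the common RoI settings (256 input channels, 7x7 RoI features, no average pooling):

.. code-block:: python

    in_channels, roi_feat_area, fc_out = 256, 7 * 7, 1024

    # Shared branch with 0 convs and 2 fcs (the Shared2FCBBoxHead layout
    # defined below): the fc stack sees the flattened RoI map first.
    first_fc_in = in_channels * roi_feat_area   # 256 * 49 = 12544
    # fc1: 12544 -> 1024, fc2: 1024 -> 1024, so shared_out_channels == 1024

    # Conv-first branch (the Shared4Conv1FCBBoxHead layout defined below):
    # four 3x3 convs keep 256 channels, then the single fc again sees
    # 256 * 49 = 12544 flattened inputs and outputs 1024.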
+ """ + # shared part + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + + x = x.flatten(1) + + for fc in self.shared_fcs: + x = self.relu(fc(x)) + # separate branches + x_cls = x + x_reg = x + + for conv in self.cls_convs: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs: + x_reg = self.relu(fc(x_reg)) + + cls_score = self.fc_cls(x_cls) if self.with_cls else None + bbox_pred = self.fc_reg(x_reg) if self.with_reg else None + return cls_score, bbox_pred + + +@MODELS.register_module() +class Shared2FCBBoxHead(ConvFCBBoxHead): + + def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + super().__init__( + num_shared_convs=0, + num_shared_fcs=2, + num_cls_convs=0, + num_cls_fcs=0, + num_reg_convs=0, + num_reg_fcs=0, + fc_out_channels=fc_out_channels, + *args, + **kwargs) + + +@MODELS.register_module() +class Shared4Conv1FCBBoxHead(ConvFCBBoxHead): + + def __init__(self, fc_out_channels: int = 1024, *args, **kwargs) -> None: + super().__init__( + num_shared_convs=4, + num_shared_fcs=1, + num_cls_convs=0, + num_cls_fcs=0, + num_reg_convs=0, + num_reg_fcs=0, + fc_out_channels=fc_out_channels, + *args, + **kwargs) diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/dii_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/dii_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ae9a31bbeb2a8f1da62b457363fa05031d21925a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/dii_head.py @@ -0,0 +1,422 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmengine.config import ConfigDict +from mmengine.model import bias_init_with_prob +from torch import Tensor + +from mmdet.models.losses import accuracy +from mmdet.models.task_modules import SamplingResult +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptConfigType, reduce_mean +from .bbox_head import BBoxHead + + +@MODELS.register_module() +class DIIHead(BBoxHead): + r"""Dynamic Instance Interactive Head for `Sparse R-CNN: End-to-End Object + Detection with Learnable Proposals `_ + + Args: + num_classes (int): Number of classes in the dataset. + Defaults to 80. + num_ffn_fcs (int): The number of fully-connected + layers in FFNs. Defaults to 2. + num_heads (int): The number of attention heads in + MultiheadAttention. Defaults to 8. + num_cls_fcs (int): The number of fully-connected + layers in classification subnet. Defaults to 1. + num_reg_fcs (int): The number of fully-connected + layers in regression subnet. Defaults to 3. + feedforward_channels (int): The hidden dimension + of FFNs. Defaults to 2048. + in_channels (int): Hidden channels of MultiheadAttention. + Defaults to 256. + dropout (float): Probability of dropping a channel. + Defaults to 0.0. + ffn_act_cfg (:obj:`ConfigDict` or dict): The activation config + for FFNs.
+ dynamic_conv_cfg (:obj:`ConfigDict` or dict): The convolution + config for DynamicConv. + loss_iou (:obj:`ConfigDict` or dict): The config for iou or + giou loss. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. Defaults to None. + """ + + def __init__(self, + num_classes: int = 80, + num_ffn_fcs: int = 2, + num_heads: int = 8, + num_cls_fcs: int = 1, + num_reg_fcs: int = 3, + feedforward_channels: int = 2048, + in_channels: int = 256, + dropout: float = 0.0, + ffn_act_cfg: ConfigType = dict(type='ReLU', inplace=True), + dynamic_conv_cfg: ConfigType = dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=7, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + loss_iou: ConfigType = dict(type='GIoULoss', loss_weight=2.0), + init_cfg: OptConfigType = None, + **kwargs) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__( + num_classes=num_classes, + reg_decoded_bbox=True, + reg_class_agnostic=True, + init_cfg=init_cfg, + **kwargs) + self.loss_iou = MODELS.build(loss_iou) + self.in_channels = in_channels + self.fp16_enabled = False + self.attention = MultiheadAttention(in_channels, num_heads, dropout) + self.attention_norm = build_norm_layer(dict(type='LN'), in_channels)[1] + + self.instance_interactive_conv = MODELS.build(dynamic_conv_cfg) + self.instance_interactive_conv_dropout = nn.Dropout(dropout) + self.instance_interactive_conv_norm = build_norm_layer( + dict(type='LN'), in_channels)[1] + + self.ffn = FFN( + in_channels, + feedforward_channels, + num_ffn_fcs, + act_cfg=ffn_act_cfg, + dropout=dropout) + self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1] + + self.cls_fcs = nn.ModuleList() + for _ in range(num_cls_fcs): + self.cls_fcs.append( + nn.Linear(in_channels, in_channels, bias=False)) + self.cls_fcs.append( + build_norm_layer(dict(type='LN'), in_channels)[1]) + self.cls_fcs.append( + build_activation_layer(dict(type='ReLU', inplace=True))) + + # overload the self.fc_cls in BBoxHead + if self.loss_cls.use_sigmoid: + self.fc_cls = nn.Linear(in_channels, self.num_classes) + else: + self.fc_cls = nn.Linear(in_channels, self.num_classes + 1) + + self.reg_fcs = nn.ModuleList() + for _ in range(num_reg_fcs): + self.reg_fcs.append( + nn.Linear(in_channels, in_channels, bias=False)) + self.reg_fcs.append( + build_norm_layer(dict(type='LN'), in_channels)[1]) + self.reg_fcs.append( + build_activation_layer(dict(type='ReLU', inplace=True))) + # overload the self.fc_reg in BBoxHead + self.fc_reg = nn.Linear(in_channels, 4) + + assert self.reg_class_agnostic, 'DIIHead only ' \ + 'support `reg_class_agnostic=True` ' + assert self.reg_decoded_bbox, 'DIIHead only ' \ + 'support `reg_decoded_bbox=True`' + + def init_weights(self) -> None: + """Use xavier initialization for all weight parameters and set the + classification head bias to a specific value when using focal loss.""" + super().init_weights() + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + else: + # adopt the default initialization for + # the weight and bias of the layer norm + pass + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + nn.init.constant_(self.fc_cls.bias, bias_init) + + def forward(self, roi_feat: Tensor, proposal_feat: Tensor) -> tuple: + """Forward function of Dynamic Instance Interactive Head.
+ + Args: + roi_feat (Tensor): Roi-pooling features with shape + (batch_size*num_proposals, feature_dimensions, + pooling_h, pooling_w). + proposal_feat (Tensor): Intermediate feature obtained from the + diihead in the last stage, has shape + (batch_size, num_proposals, feature_dimensions). + + Returns: + tuple[Tensor]: Usually a tuple of classification scores + and bbox prediction and an intermediate feature. + + - cls_scores (Tensor): Classification scores for + all proposals, has shape + (batch_size, num_proposals, num_classes). + - bbox_preds (Tensor): Box energies / deltas for + all proposals, has shape + (batch_size, num_proposals, 4). + - obj_feat (Tensor): Object feature before classification + and regression subnet, has shape + (batch_size, num_proposals, feature_dimensions). + - attn_feats (Tensor): Intermediate feature. + """ + N, num_proposals = proposal_feat.shape[:2] + + # Self attention + proposal_feat = proposal_feat.permute(1, 0, 2) + proposal_feat = self.attention_norm(self.attention(proposal_feat)) + attn_feats = proposal_feat.permute(1, 0, 2) + + # instance interactive + proposal_feat = attn_feats.reshape(-1, self.in_channels) + proposal_feat_iic = self.instance_interactive_conv( + proposal_feat, roi_feat) + proposal_feat = proposal_feat + self.instance_interactive_conv_dropout( + proposal_feat_iic) + obj_feat = self.instance_interactive_conv_norm(proposal_feat) + + # FFN + obj_feat = self.ffn_norm(self.ffn(obj_feat)) + + cls_feat = obj_feat + reg_feat = obj_feat + + for cls_layer in self.cls_fcs: + cls_feat = cls_layer(cls_feat) + for reg_layer in self.reg_fcs: + reg_feat = reg_layer(reg_feat) + + cls_score = self.fc_cls(cls_feat).view( + N, num_proposals, self.num_classes + if self.loss_cls.use_sigmoid else self.num_classes + 1) + bbox_delta = self.fc_reg(reg_feat).view(N, num_proposals, 4) + + return cls_score, bbox_delta, obj_feat.view( + N, num_proposals, self.in_channels), attn_feats + + def loss_and_target(self, + cls_score: Tensor, + bbox_pred: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigType, + imgs_whwh: Tensor, + concat: bool = True, + reduction_override: str = None) -> dict: + """Calculate the loss based on the features extracted by the DIIHead. + + Args: + cls_score (Tensor): Classification prediction + results of all classes, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): Regression prediction results, has shape + (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + imgs_whwh (Tensor): Tensor with shape + (batch_size, num_proposals, 4), the last dimension means + [img_width, img_height, img_width, img_height]. + concat (bool): Whether to concatenate the results of all + the images in a single batch. Defaults to True. + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None. + + Returns: + dict: A dictionary of loss and targets components. + The targets are only used for cascade rcnn.
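One detail of ``forward`` above that is easy to trip over is the pair of permutes around self-attention: the attention module consumes ``(num_proposals, batch, channels)`` while proposal features arrive as ``(batch, num_proposals, channels)``. A standalone sketch with plain ``torch.nn.MultiheadAttention`` (the mmcv wrapper used here behaves analogously but returns a single tensor):

.. code-block:: python

    import torch
    from torch import nn

    attn = nn.MultiheadAttention(embed_dim=256, num_heads=8)
    proposal_feat = torch.randn(2, 100, 256)   # (batch, num_proposals, C)

    x = proposal_feat.permute(1, 0, 2)         # (100, 2, 256): seq first
    out, _ = attn(x, x, x)                     # self-attention
    out = out.permute(1, 0, 2)                 # back to (2, 100, 256)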
+ """ + cls_reg_targets = self.get_targets( + sampling_results=sampling_results, + rcnn_train_cfg=rcnn_train_cfg, + concat=concat) + (labels, label_weights, bbox_targets, bbox_weights) = cls_reg_targets + + losses = dict() + bg_class_ind = self.num_classes + # note that in Sparse R-CNN, num_gt == num_pos + pos_inds = (labels >= 0) & (labels < bg_class_ind) + num_pos = pos_inds.sum().float() + avg_factor = reduce_mean(num_pos) + if cls_score is not None: + if cls_score.numel() > 0: + losses['loss_cls'] = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + reduction_override=reduction_override) + losses['pos_acc'] = accuracy(cls_score[pos_inds], + labels[pos_inds]) + if bbox_pred is not None: + # 0~self.num_classes-1 are FG, self.num_classes is BG + # do not perform bounding box regression for BG anymore. + if pos_inds.any(): + pos_bbox_pred = bbox_pred.reshape(bbox_pred.size(0), + 4)[pos_inds.type(torch.bool)] + imgs_whwh = imgs_whwh.reshape(bbox_pred.size(0), + 4)[pos_inds.type(torch.bool)] + losses['loss_bbox'] = self.loss_bbox( + pos_bbox_pred / imgs_whwh, + bbox_targets[pos_inds.type(torch.bool)] / imgs_whwh, + bbox_weights[pos_inds.type(torch.bool)], + avg_factor=avg_factor) + losses['loss_iou'] = self.loss_iou( + pos_bbox_pred, + bbox_targets[pos_inds.type(torch.bool)], + bbox_weights[pos_inds.type(torch.bool)], + avg_factor=avg_factor) + else: + losses['loss_bbox'] = bbox_pred.sum() * 0 + losses['loss_iou'] = bbox_pred.sum() * 0 + return dict(loss_bbox=losses, bbox_targets=cls_reg_targets) + + def _get_targets_single(self, pos_inds: Tensor, neg_inds: Tensor, + pos_priors: Tensor, neg_priors: Tensor, + pos_gt_bboxes: Tensor, pos_gt_labels: Tensor, + cfg: ConfigDict) -> tuple: + """Calculate the ground truth for proposals in a single image + according to the sampling results. + + Almost the same as the implementation in `bbox_head`, + we add pos_inds and neg_inds to select positive and + negative samples instead of selecting the first num_pos + as positive samples. + + Args: + pos_inds (Tensor): Has length equal to the number of + positive samples; contains the indices of all + positive samples in the original proposal set. + neg_inds (Tensor): Has length equal to the number of + negative samples; contains the indices of all + negative samples in the original proposal set. + pos_priors (Tensor): Contains all the positive boxes, + has shape (num_pos, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + neg_priors (Tensor): Contains all the negative boxes, + has shape (num_neg, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_bboxes (Tensor): Contains gt_boxes for + all positive samples, has shape (num_pos, 4), + the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + pos_gt_labels (Tensor): Contains gt_labels for + all positive samples, has shape (num_pos, ). + cfg (obj:`ConfigDict`): `train_cfg` of R-CNN. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following Tensors: + + - labels (Tensor): Gt_labels for all proposals, has + shape (num_proposals,). + - label_weights (Tensor): Labels_weights for all proposals, has + shape (num_proposals,). + - bbox_targets (Tensor): Regression target for all proposals, + has shape (num_proposals, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights (Tensor): Regression weights for all proposals, + has shape (num_proposals, 4).
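Note the split between the two regression losses above: the L1 term is computed in image-normalized coordinates (prediction and target both divided by ``[img_w, img_h, img_w, img_h]``), which makes it scale-invariant, while the GIoU term operates on absolute boxes. A toy calculation with made-up numbers:

.. code-block:: python

    import torch

    img_w, img_h = 800., 600.
    imgs_whwh = torch.tensor([[img_w, img_h, img_w, img_h]])
    pos_bbox_pred = torch.tensor([[400., 300., 480., 420.]])
    bbox_target = torch.tensor([[410., 310., 470., 430.]])

    l1 = (pos_bbox_pred / imgs_whwh - bbox_target / imgs_whwh).abs()
    # tensor([[0.0125, 0.0167, 0.0125, 0.0167]]): a 10 px error on an
    # 800 px wide image weighs the same as a 5 px error on a 400 px one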
+ """ + num_pos = pos_priors.size(0) + num_neg = neg_priors.size(0) + num_samples = num_pos + num_neg + + # original implementation uses new_zeros since BG are set to be 0 + # now use empty & fill because BG cat_id = num_classes, + # FG cat_id = [0, num_classes-1] + labels = pos_priors.new_full((num_samples, ), + self.num_classes, + dtype=torch.long) + label_weights = pos_priors.new_zeros(num_samples) + bbox_targets = pos_priors.new_zeros(num_samples, 4) + bbox_weights = pos_priors.new_zeros(num_samples, 4) + if num_pos > 0: + labels[pos_inds] = pos_gt_labels + pos_weight = 1.0 if cfg.pos_weight <= 0 else cfg.pos_weight + label_weights[pos_inds] = pos_weight + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + pos_priors, pos_gt_bboxes) + else: + pos_bbox_targets = pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1 + if num_neg > 0: + label_weights[neg_inds] = 1.0 + + return labels, label_weights, bbox_targets, bbox_weights + + def get_targets(self, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Almost the same as the implementation in bbox_head, we passed + additional parameters pos_inds_list and neg_inds_list to + `_get_targets_single` function. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following list of Tensors: + + - labels (list[Tensor],Tensor): Gt_labels for all + proposals in a batch, each tensor in list has + shape (num_proposals,) when `concat=False`, otherwise just + a single tensor has shape (num_all_proposals,). + - label_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has shape + (num_proposals,) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals,). + - bbox_targets (list[Tensor],Tensor): Regression target + for all proposals in a batch, each tensor in list has + shape (num_proposals, 4) when `concat=False`, otherwise + just a single tensor has shape (num_all_proposals, 4), + the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights (list[tensor],Tensor): Regression weights for + all proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals, 4). 
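To make the target layout produced by ``_get_targets_single`` above concrete: background is encoded as ``num_classes`` rather than 0, so the label tensor starts full of ``num_classes`` and only positive slots are overwritten. A toy example with assumed sizes:

.. code-block:: python

    import torch

    num_pos, num_neg, num_classes = 2, 3, 80
    pos_inds = torch.tensor([0, 1])
    neg_inds = torch.tensor([2, 3, 4])
    pos_gt_labels = torch.tensor([5, 17])

    labels = torch.full((num_pos + num_neg, ), num_classes)  # all background
    labels[pos_inds] = pos_gt_labels    # tensor([ 5, 17, 80, 80, 80])
    label_weights = torch.zeros(num_pos + num_neg)
    label_weights[pos_inds] = 1.0       # cfg.pos_weight <= 0 falls back to 1.0
    label_weights[neg_inds] = 1.0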
+ """ + pos_inds_list = [res.pos_inds for res in sampling_results] + neg_inds_list = [res.neg_inds for res in sampling_results] + pos_priors_list = [res.pos_priors for res in sampling_results] + neg_priors_list = [res.neg_priors for res in sampling_results] + pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] + pos_gt_labels_list = [res.pos_gt_labels for res in sampling_results] + labels, label_weights, bbox_targets, bbox_weights = multi_apply( + self._get_targets_single, + pos_inds_list, + neg_inds_list, + pos_priors_list, + neg_priors_list, + pos_gt_bboxes_list, + pos_gt_labels_list, + cfg=rcnn_train_cfg) + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bbox_targets = torch.cat(bbox_targets, 0) + bbox_weights = torch.cat(bbox_weights, 0) + return labels, label_weights, bbox_targets, bbox_weights diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..076c35843375c7aef5e58786d55ebacd281d54a3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/double_bbox_head.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList +from torch import Tensor + +from mmdet.models.backbones.resnet import Bottleneck +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, MultiConfig, OptConfigType, OptMultiConfig +from .bbox_head import BBoxHead + + +class BasicResBlock(BaseModule): + """Basic residual block. + + This block is a little different from the block in the ResNet backbone. + The kernel size of conv1 is 1 in this block while 3 in ResNet BasicBlock. + + Args: + in_channels (int): Channels of the input feature map. + out_channels (int): Channels of the output feature map. + conv_cfg (:obj:`ConfigDict` or dict, optional): The config dict + for convolution layers. + norm_cfg (:obj:`ConfigDict` or dict): The config dict for + normalization layers. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None + """ + + def __init__(self, + in_channels: int, + out_channels: int, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + + # main path + self.conv1 = ConvModule( + in_channels, + in_channels, + kernel_size=3, + padding=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + self.conv2 = ConvModule( + in_channels, + out_channels, + kernel_size=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + # identity path + self.conv_identity = ConvModule( + in_channels, + out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + """Forward function.""" + identity = x + + x = self.conv1(x) + x = self.conv2(x) + + identity = self.conv_identity(identity) + out = x + identity + + out = self.relu(out) + return out + + +@MODELS.register_module() +class DoubleConvFCBBoxHead(BBoxHead): + r"""Bbox head used in Double-Head R-CNN + + .. 
code-block:: none + + /-> cls + /-> shared convs -> + \-> reg + roi features + /-> cls + \-> shared fc -> + \-> reg + """ # noqa: W605 + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 0, + conv_out_channels: int = 1024, + fc_out_channels: int = 1024, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='BN'), + init_cfg: MultiConfig = dict( + type='Normal', + override=[ + dict(type='Normal', name='fc_cls', std=0.01), + dict(type='Normal', name='fc_reg', std=0.001), + dict( + type='Xavier', + name='fc_branch', + distribution='uniform') + ]), + **kwargs) -> None: + kwargs.setdefault('with_avg_pool', True) + super().__init__(init_cfg=init_cfg, **kwargs) + assert self.with_avg_pool + assert num_convs > 0 + assert num_fcs > 0 + self.num_convs = num_convs + self.num_fcs = num_fcs + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + # increase the channel of input features + self.res_block = BasicResBlock(self.in_channels, + self.conv_out_channels) + + # add conv heads + self.conv_branch = self._add_conv_branch() + # add fc heads + self.fc_branch = self._add_fc_branch() + + out_dim_reg = 4 if self.reg_class_agnostic else 4 * self.num_classes + self.fc_reg = nn.Linear(self.conv_out_channels, out_dim_reg) + + self.fc_cls = nn.Linear(self.fc_out_channels, self.num_classes + 1) + self.relu = nn.ReLU() + + def _add_conv_branch(self) -> ModuleList: + """Add the conv branch, which consists of a sequence of conv + layers.""" + branch_convs = ModuleList() + for i in range(self.num_convs): + branch_convs.append( + Bottleneck( + inplanes=self.conv_out_channels, + planes=self.conv_out_channels // 4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + return branch_convs + + def _add_fc_branch(self) -> ModuleList: + """Add the fc branch, which consists of a sequence of fc layers.""" + branch_fcs = ModuleList() + for i in range(self.num_fcs): + fc_in_channels = ( + self.in_channels * + self.roi_feat_area if i == 0 else self.fc_out_channels) + branch_fcs.append(nn.Linear(fc_in_channels, self.fc_out_channels)) + return branch_fcs + + def forward(self, x_cls: Tensor, x_reg: Tensor) -> Tuple[Tensor]: + """Forward features from the upstream network. + + Args: + x_cls (Tensor): Classification features of rois. + x_reg (Tensor): Regression features from the upstream network. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification score predictions of rois. + Each roi predicts num_classes + 1 channels. + - bbox_pred (Tensor): BBox delta predictions of rois. Each roi + predicts 4 * num_classes channels.
+ """ + # conv head + x_conv = self.res_block(x_reg) + + for conv in self.conv_branch: + x_conv = conv(x_conv) + + if self.with_avg_pool: + x_conv = self.avg_pool(x_conv) + + x_conv = x_conv.view(x_conv.size(0), -1) + bbox_pred = self.fc_reg(x_conv) + + # fc head + x_fc = x_cls.view(x_cls.size(0), -1) + for fc in self.fc_branch: + x_fc = self.relu(fc(x_fc)) + + cls_score = self.fc_cls(x_fc) + + return cls_score, bbox_pred diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..38e57d2eddd580b13256da63c9bd8723be98e764 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/multi_instance_bbox_head.py @@ -0,0 +1,626 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor, nn + +from mmdet.models.roi_heads.bbox_heads.bbox_head import BBoxHead +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import empty_instances +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox_overlaps + + +@MODELS.register_module() +class MultiInstanceBBoxHead(BBoxHead): + r"""Bbox head used in CrowdDet. + + .. code-block:: none + + /-> cls convs_1 -> cls fcs_1 -> cls_1 + |-- + | \-> reg convs_1 -> reg fcs_1 -> reg_1 + | + | /-> cls convs_2 -> cls fcs_2 -> cls_2 + shared convs -> shared fcs |-- + | \-> reg convs_2 -> reg fcs_2 -> reg_2 + | + | ... + | + | /-> cls convs_k -> cls fcs_k -> cls_k + |-- + \-> reg convs_k -> reg fcs_k -> reg_k + + + Args: + num_instance (int): The number of branches after shared fcs. + Defaults to 2. + with_refine (bool): Whether to use refine module. Defaults to False. + num_shared_convs (int): The number of shared convs. Defaults to 0. + num_shared_fcs (int): The number of shared fcs. Defaults to 2. + num_cls_convs (int): The number of cls convs. Defaults to 0. + num_cls_fcs (int): The number of cls fcs. Defaults to 0. + num_reg_convs (int): The number of reg convs. Defaults to 0. + num_reg_fcs (int): The number of reg fcs. Defaults to 0. + conv_out_channels (int): The number of conv out channels. + Defaults to 256. + fc_out_channels (int): The number of fc out channels. Defaults to 1024. + init_cfg (dict or list[dict], optional): Initialization config dict. + Defaults to None. 
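Unlike the other heads in this diff, ``DoubleConvFCBBoxHead.forward`` above takes two RoI feature maps: ``x_cls`` feeds the fully connected (classification) branch and ``x_reg`` the convolutional (regression) branch. A shape-only sketch using the defaults in this file and an assumed 80-class setup:

.. code-block:: python

    import torch
    from torch import nn

    n, in_ch, roi, num_classes = 512, 256, 7, 80

    # conv branch: res_block lifts 256 -> 1024 channels, the Bottlenecks
    # keep (n, 1024, 7, 7), pooling collapses to (n, 1024), then fc_reg:
    bbox_pred = nn.Linear(1024, 4 * num_classes)(torch.randn(n, 1024))

    # fc branch: the raw RoI map is flattened to 256 * 49 = 12544 and
    # passed through 1024-d fcs before fc_cls:
    x_cls = torch.randn(n, in_ch, roi, roi)
    x_fc = torch.relu(nn.Linear(in_ch * roi * roi, 1024)(x_cls.flatten(1)))
    cls_score = nn.Linear(1024, num_classes + 1)(x_fc)   # (n, 81)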
+ """ # noqa: W605 + + def __init__(self, + num_instance: int = 2, + with_refine: bool = False, + num_shared_convs: int = 0, + num_shared_fcs: int = 2, + num_cls_convs: int = 0, + num_cls_fcs: int = 0, + num_reg_convs: int = 0, + num_reg_fcs: int = 0, + conv_out_channels: int = 256, + fc_out_channels: int = 1024, + init_cfg: Optional[Union[dict, ConfigDict]] = None, + *args, + **kwargs) -> None: + super().__init__(*args, init_cfg=init_cfg, **kwargs) + assert (num_shared_convs + num_shared_fcs + num_cls_convs + + num_cls_fcs + num_reg_convs + num_reg_fcs > 0) + assert num_instance == 2, 'Currently only 2 instances are supported' + if num_cls_convs > 0 or num_reg_convs > 0: + assert num_shared_fcs == 0 + if not self.with_cls: + assert num_cls_convs == 0 and num_cls_fcs == 0 + if not self.with_reg: + assert num_reg_convs == 0 and num_reg_fcs == 0 + self.num_instance = num_instance + self.num_shared_convs = num_shared_convs + self.num_shared_fcs = num_shared_fcs + self.num_cls_convs = num_cls_convs + self.num_cls_fcs = num_cls_fcs + self.num_reg_convs = num_reg_convs + self.num_reg_fcs = num_reg_fcs + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.with_refine = with_refine + + # add shared convs and fcs + self.shared_convs, self.shared_fcs, last_layer_dim = \ + self._add_conv_fc_branch( + self.num_shared_convs, self.num_shared_fcs, self.in_channels, + True) + self.shared_out_channels = last_layer_dim + self.relu = nn.ReLU(inplace=True) + + if self.with_refine: + refine_model_cfg = { + 'type': 'Linear', + 'in_features': self.shared_out_channels + 20, + 'out_features': self.shared_out_channels + } + self.shared_fcs_ref = MODELS.build(refine_model_cfg) + self.fc_cls_ref = nn.ModuleList() + self.fc_reg_ref = nn.ModuleList() + + self.cls_convs = nn.ModuleList() + self.cls_fcs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.reg_fcs = nn.ModuleList() + self.cls_last_dim = list() + self.reg_last_dim = list() + self.fc_cls = nn.ModuleList() + self.fc_reg = nn.ModuleList() + for k in range(self.num_instance): + # add cls specific branch + cls_convs, cls_fcs, cls_last_dim = self._add_conv_fc_branch( + self.num_cls_convs, self.num_cls_fcs, self.shared_out_channels) + self.cls_convs.append(cls_convs) + self.cls_fcs.append(cls_fcs) + self.cls_last_dim.append(cls_last_dim) + + # add reg specific branch + reg_convs, reg_fcs, reg_last_dim = self._add_conv_fc_branch( + self.num_reg_convs, self.num_reg_fcs, self.shared_out_channels) + self.reg_convs.append(reg_convs) + self.reg_fcs.append(reg_fcs) + self.reg_last_dim.append(reg_last_dim) + + if self.num_shared_fcs == 0 and not self.with_avg_pool: + if self.num_cls_fcs == 0: + self.cls_last_dim *= self.roi_feat_area + if self.num_reg_fcs == 0: + self.reg_last_dim *= self.roi_feat_area + + if self.with_cls: + if self.custom_cls_channels: + cls_channels = self.loss_cls.get_cls_channels( + self.num_classes) + else: + cls_channels = self.num_classes + 1 + cls_predictor_cfg_ = self.cls_predictor_cfg.copy() # deepcopy + cls_predictor_cfg_.update( + in_features=self.cls_last_dim[k], + out_features=cls_channels) + self.fc_cls.append(MODELS.build(cls_predictor_cfg_)) + if self.with_refine: + self.fc_cls_ref.append(MODELS.build(cls_predictor_cfg_)) + + if self.with_reg: + out_dim_reg = (4 if self.reg_class_agnostic else 4 * + self.num_classes) + reg_predictor_cfg_ = self.reg_predictor_cfg.copy() + reg_predictor_cfg_.update( + in_features=self.reg_last_dim[k], out_features=out_dim_reg) + 
self.fc_reg.append(MODELS.build(reg_predictor_cfg_)) + if self.with_refine: + self.fc_reg_ref.append(MODELS.build(reg_predictor_cfg_)) + + if init_cfg is None: + # when init_cfg is None, + # It has been set to + # [[dict(type='Normal', std=0.01, override=dict(name='fc_cls'))], + # [dict(type='Normal', std=0.001, override=dict(name='fc_reg'))] + # after `super(ConvFCBBoxHead, self).__init__()` + # we only need to append additional configuration + # for `shared_fcs`, `cls_fcs` and `reg_fcs` + self.init_cfg += [ + dict( + type='Xavier', + distribution='uniform', + override=[ + dict(name='shared_fcs'), + dict(name='cls_fcs'), + dict(name='reg_fcs') + ]) + ] + + def _add_conv_fc_branch(self, + num_branch_convs: int, + num_branch_fcs: int, + in_channels: int, + is_shared: bool = False) -> tuple: + """Add shared or separable branch. + + convs -> avg pool (optional) -> fcs + """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, self.conv_out_channels, 3, + padding=1)) + last_layer_dim = self.conv_out_channels + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + # for shared branch, only consider self.with_avg_pool + # for separated branches, also consider self.num_shared_fcs + if (is_shared + or self.num_shared_fcs == 0) and not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + return branch_convs, branch_fcs, last_layer_dim + + def forward(self, x: Tuple[Tensor]) -> tuple: + """Forward features from the upstream network. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_score (Tensor): Classification scores for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * num_classes. + - bbox_pred (Tensor): Box energies / deltas for all scale + levels, each is a 4D-tensor, the channels number is + num_base_priors * 4. + - cls_score_ref (Tensor): The cls_score after refine model. + - bbox_pred_ref (Tensor): The bbox_pred after refine model. 
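With ``num_instance`` fixed at 2, the concatenated outputs described above are twice the usual width, and downstream code indexes them in fixed slices. A shape note assuming the single-class CrowdDet setting:

.. code-block:: python

    # num_instance == 2, num_classes == 1 (assumed, the CrowdDet setting):
    #   each branch:  cls (n, 2) == num_classes + 1, reg (n, 4)
    #   concatenated: cls_score (n, 4), bbox_pred (n, 8)
    # which is what ``loss`` later slices as cls_score[:, 0:2] / [:, 2:4]
    # and bbox_pred[:, 0:4] / [:, 4:8].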
+ """ + # shared part + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + + x = x.flatten(1) + for fc in self.shared_fcs: + x = self.relu(fc(x)) + + x_cls = x + x_reg = x + # separate branches + cls_score = list() + bbox_pred = list() + for k in range(self.num_instance): + for conv in self.cls_convs[k]: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs[k]: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs[k]: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs[k]: + x_reg = self.relu(fc(x_reg)) + + cls_score.append(self.fc_cls[k](x_cls) if self.with_cls else None) + bbox_pred.append(self.fc_reg[k](x_reg) if self.with_reg else None) + + if self.with_refine: + x_ref = x + cls_score_ref = list() + bbox_pred_ref = list() + for k in range(self.num_instance): + feat_ref = cls_score[k].softmax(dim=-1) + feat_ref = torch.cat((bbox_pred[k], feat_ref[:, 1][:, None]), + dim=1).repeat(1, 4) + feat_ref = torch.cat((x_ref, feat_ref), dim=1) + feat_ref = F.relu_(self.shared_fcs_ref(feat_ref)) + + cls_score_ref.append(self.fc_cls_ref[k](feat_ref)) + bbox_pred_ref.append(self.fc_reg_ref[k](feat_ref)) + + cls_score = torch.cat(cls_score, dim=1) + bbox_pred = torch.cat(bbox_pred, dim=1) + cls_score_ref = torch.cat(cls_score_ref, dim=1) + bbox_pred_ref = torch.cat(bbox_pred_ref, dim=1) + return cls_score, bbox_pred, cls_score_ref, bbox_pred_ref + + cls_score = torch.cat(cls_score, dim=1) + bbox_pred = torch.cat(bbox_pred, dim=1) + + return cls_score, bbox_pred + + def get_targets(self, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Almost the same as the implementation in bbox_head, we passed + additional parameters pos_inds_list and neg_inds_list to + `_get_targets_single` function. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + concat (bool): Whether to concatenate the results of all + the images in a single batch. + + Returns: + Tuple[Tensor]: Ground truth for proposals in a single image. + Containing the following list of Tensors: + + - labels (list[Tensor],Tensor): Gt_labels for all proposals in a + batch, each tensor in list has shape (num_proposals,) when + `concat=False`, otherwise just a single tensor has shape + (num_all_proposals,). + - label_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has shape + (num_proposals,) when `concat=False`, otherwise just a single + tensor has shape (num_all_proposals,). + - bbox_targets (list[Tensor],Tensor): Regression target for all + proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a single + tensor has shape (num_all_proposals, 4), the last dimension 4 + represents [tl_x, tl_y, br_x, br_y]. + - bbox_weights (list[tensor],Tensor): Regression weights for + all proposals in a batch, each tensor in list has shape + (num_proposals, 4) when `concat=False`, otherwise just a + single tensor has shape (num_all_proposals, 4). 
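The ``shared_out_channels + 20`` input size of the refine branch comes from the feature construction in ``forward`` above: per branch, the 4 box deltas plus 1 foreground score are concatenated and repeated 4 times. A sketch with assumed sizes:

.. code-block:: python

    import torch

    n, shared_out = 512, 1024
    x = torch.randn(n, shared_out)               # shared fc output
    bbox_pred_k = torch.randn(n, 4)              # one branch's deltas
    score_fg = torch.randn(n, 2).softmax(-1)[:, 1][:, None]   # (n, 1)

    feat_ref = torch.cat((bbox_pred_k, score_fg), dim=1)      # (n, 5)
    feat_ref = feat_ref.repeat(1, 4)                          # (n, 20)
    feat_ref = torch.cat((x, feat_ref), dim=1)                # (n, 1044)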
+ """ + labels = [] + bbox_targets = [] + bbox_weights = [] + label_weights = [] + for i in range(len(sampling_results)): + sample_bboxes = torch.cat([ + sampling_results[i].pos_gt_bboxes, + sampling_results[i].neg_gt_bboxes + ]) + sample_priors = sampling_results[i].priors + sample_priors = sample_priors.repeat(1, self.num_instance).reshape( + -1, 4) + sample_bboxes = sample_bboxes.reshape(-1, 4) + + if not self.reg_decoded_bbox: + _bbox_targets = self.bbox_coder.encode(sample_priors, + sample_bboxes) + else: + _bbox_targets = sample_priors + _bbox_targets = _bbox_targets.reshape(-1, self.num_instance * 4) + _bbox_weights = torch.ones(_bbox_targets.shape) + _labels = torch.cat([ + sampling_results[i].pos_gt_labels, + sampling_results[i].neg_gt_labels + ]) + _labels_weights = torch.ones(_labels.shape) + + bbox_targets.append(_bbox_targets) + bbox_weights.append(_bbox_weights) + labels.append(_labels) + label_weights.append(_labels_weights) + + if concat: + labels = torch.cat(labels, 0) + label_weights = torch.cat(label_weights, 0) + bbox_targets = torch.cat(bbox_targets, 0) + bbox_weights = torch.cat(bbox_weights, 0) + return labels, label_weights, bbox_targets, bbox_weights + + def loss(self, cls_score: Tensor, bbox_pred: Tensor, rois: Tensor, + labels: Tensor, label_weights: Tensor, bbox_targets: Tensor, + bbox_weights: Tensor, **kwargs) -> dict: + """Calculate the loss based on the network predictions and targets. + + Args: + cls_score (Tensor): Classification prediction results of all class, + has shape (batch_size * num_proposals_single_image, + (num_classes + 1) * k), k represents the number of prediction + boxes generated by each proposal box. + bbox_pred (Tensor): Regression prediction results, has shape + (batch_size * num_proposals_single_image, 4 * k), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, k). + label_weights (Tensor): Labels_weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, k). + bbox_targets (Tensor): Regression target for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, + 4 * k), the last dimension 4 represents [tl_x, tl_y, br_x, + br_y]. + bbox_weights (Tensor): Regression weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, + 4 * k). + + Returns: + dict: A dictionary of loss. + """ + losses = dict() + if bbox_pred.numel(): + loss_0 = self.emd_loss(bbox_pred[:, 0:4], cls_score[:, 0:2], + bbox_pred[:, 4:8], cls_score[:, 2:4], + bbox_targets, labels) + loss_1 = self.emd_loss(bbox_pred[:, 4:8], cls_score[:, 2:4], + bbox_pred[:, 0:4], cls_score[:, 0:2], + bbox_targets, labels) + loss = torch.cat([loss_0, loss_1], dim=1) + _, min_indices = loss.min(dim=1) + loss_emd = loss[torch.arange(loss.shape[0]), min_indices] + loss_emd = loss_emd.mean() + else: + loss_emd = bbox_pred.sum() + losses['loss_rcnn_emd'] = loss_emd + return losses + + def emd_loss(self, bbox_pred_0: Tensor, cls_score_0: Tensor, + bbox_pred_1: Tensor, cls_score_1: Tensor, targets: Tensor, + labels: Tensor) -> Tensor: + """Calculate the emd loss. 
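The min-of-two-assignments selection in ``loss`` above, spelled out with toy per-proposal losses:

.. code-block:: python

    import torch

    loss_0 = torch.tensor([[1.2], [0.4]])   # branches matched in order (0, 1)
    loss_1 = torch.tensor([[0.7], [0.9]])   # branches matched in order (1, 0)

    loss = torch.cat([loss_0, loss_1], dim=1)   # (num_proposals, 2)
    _, min_indices = loss.min(dim=1)            # cheaper matching per proposal
    loss_emd = loss[torch.arange(loss.shape[0]), min_indices]
    # tensor([0.7000, 0.4000]) -> .mean() == 0.55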
+ + Note: + This implementation is modified from https://github.com/Purkialo/ + CrowdDet/blob/master/lib/det_oprs/loss_opr.py + + Args: + bbox_pred_0 (Tensor): Part of regression prediction results, has + shape (batch_size * num_proposals_single_image, 4), the last + dimension 4 represents [tl_x, tl_y, br_x, br_y]. + cls_score_0 (Tensor): Part of classification prediction results, + has shape (batch_size * num_proposals_single_image, + (num_classes + 1)), where 1 represents the background. + bbox_pred_1 (Tensor): The other part of regression prediction + results, has shape (batch_size*num_proposals_single_image, 4). + cls_score_1 (Tensor):The other part of classification prediction + results, has shape (batch_size * num_proposals_single_image, + (num_classes + 1)). + targets (Tensor):Regression target for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, + 4 * k), the last dimension 4 represents [tl_x, tl_y, br_x, + br_y], k represents the number of prediction boxes generated + by each proposal box. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, k). + + Returns: + torch.Tensor: The calculated loss. + """ + + bbox_pred = torch.cat([bbox_pred_0, bbox_pred_1], + dim=1).reshape(-1, bbox_pred_0.shape[-1]) + cls_score = torch.cat([cls_score_0, cls_score_1], + dim=1).reshape(-1, cls_score_0.shape[-1]) + targets = targets.reshape(-1, 4) + labels = labels.long().flatten() + + # masks + valid_masks = labels >= 0 + fg_masks = labels > 0 + + # multiple class + bbox_pred = bbox_pred.reshape(-1, self.num_classes, 4) + fg_gt_classes = labels[fg_masks] + bbox_pred = bbox_pred[fg_masks, fg_gt_classes - 1, :] + + # loss for regression + loss_bbox = self.loss_bbox(bbox_pred, targets[fg_masks]) + loss_bbox = loss_bbox.sum(dim=1) + + # loss for classification + labels = labels * valid_masks + loss_cls = self.loss_cls(cls_score, labels) + + loss_cls[fg_masks] = loss_cls[fg_masks] + loss_bbox + loss = loss_cls.reshape(-1, 2).sum(dim=1) + return loss.reshape(-1, 1) + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tensor, + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tensor): Box energies / deltas. has shape + (num_boxes, num_classes * 4). + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
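The reshape at the top of ``emd_loss`` above interleaves the two branch predictions so that each proposal contributes two consecutive rows, matching the flattened ``(2n, 4)`` targets; the closing ``loss.reshape(-1, 2).sum(dim=1)`` then re-pairs them. In toy form:

.. code-block:: python

    import torch

    bbox_pred_0 = torch.tensor([[0., 0., 0., 0.], [2., 2., 2., 2.]])
    bbox_pred_1 = torch.tensor([[1., 1., 1., 1.], [3., 3., 3., 3.]])

    paired = torch.cat([bbox_pred_0, bbox_pred_1], dim=1).reshape(-1, 4)
    # tensor([[0., 0., 0., 0.],    # branch 0, proposal 0
    #         [1., 1., 1., 1.],    # branch 1, proposal 0
    #         [2., 2., 2., 2.],    # branch 0, proposal 1
    #         [3., 3., 3., 3.]])   # branch 1, proposal 1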
+ """ + + cls_score = cls_score.reshape(-1, self.num_classes + 1) + bbox_pred = bbox_pred.reshape(-1, 4) + roi = roi.repeat_interleave(self.num_instance, dim=0) + + results = InstanceData() + if roi.shape[0] == 0: + return empty_instances([img_meta], + roi.device, + task_type='bbox', + instance_results=[results])[0] + + scores = cls_score.softmax(dim=-1) if cls_score is not None else None + img_shape = img_meta['img_shape'] + bboxes = self.bbox_coder.decode( + roi[..., 1:], bbox_pred, max_shape=img_shape) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + bboxes = (bboxes.view(bboxes.size(0), -1, 4) / scale_factor).view( + bboxes.size()[0], -1) + + if rcnn_test_cfg is None: + # This means that it is aug test. + # It needs to return the raw results without nms. + results.bboxes = bboxes + results.scores = scores + else: + roi_idx = np.tile( + np.arange(bboxes.shape[0] / self.num_instance)[:, None], + (1, self.num_instance)).reshape(-1, 1)[:, 0] + roi_idx = torch.from_numpy(roi_idx).to(bboxes.device).reshape( + -1, 1) + bboxes = torch.cat([bboxes, roi_idx], dim=1) + det_bboxes, det_scores = self.set_nms( + bboxes, scores[:, 1], rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms['iou_threshold'], rcnn_test_cfg.max_per_img) + + results.bboxes = det_bboxes[:, :-1] + results.scores = det_scores + results.labels = torch.zeros_like(det_scores) + + return results + + @staticmethod + def set_nms(bboxes: Tensor, + scores: Tensor, + score_thr: float, + iou_threshold: float, + max_num: int = -1) -> Tuple[Tensor, Tensor]: + """NMS for multi-instance prediction. Please refer to + https://github.com/Purkialo/CrowdDet for more details. + + Args: + bboxes (Tensor): predict bboxes. + scores (Tensor): The score of each predict bbox. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + iou_threshold (float): IoU threshold to be considered as + conflicted. + max_num (int, optional): if there are more than max_num bboxes + after NMS, only top max_num will be kept. Default to -1. + + Returns: + Tuple[Tensor, Tensor]: (bboxes, scores). 
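What ``set_nms`` below changes relative to plain NMS: a lower-scored box that overlaps a kept box beyond ``iou_threshold`` still survives if it shares that box's ``roi_idx``, since such a pair was emitted by one proposal to cover two crowded instances. A self-contained illustration (the ``iou`` helper is ours, not part of the module):

.. code-block:: python

    import torch

    def iou(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        """Plain IoU between two (4, ) boxes, for illustration only."""
        lt = torch.max(a[:2], b[:2])
        rb = torch.min(a[2:], b[2:])
        wh = (rb - lt).clamp(min=0)
        inter = wh[0] * wh[1]
        area_a = (a[2] - a[0]) * (a[3] - a[1])
        area_b = (b[2] - b[0]) * (b[3] - b[1])
        return inter / (area_a + area_b - inter)

    a = torch.tensor([0., 0., 10., 10.])   # score 0.9, roi_idx 0: kept
    b = torch.tensor([1., 1., 11., 11.])   # score 0.8, roi_idx 0: kept too
    c = torch.tensor([1., 1., 11., 11.])   # score 0.7, roi_idx 1: suppressed
    print(iou(a, b))                       # ~0.68, above a typical 0.5 thr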
+ """ + + bboxes = bboxes[scores > score_thr] + scores = scores[scores > score_thr] + + ordered_scores, order = scores.sort(descending=True) + ordered_bboxes = bboxes[order] + roi_idx = ordered_bboxes[:, -1] + + keep = torch.ones(len(ordered_bboxes)) == 1 + ruler = torch.arange(len(ordered_bboxes)) + + keep = keep.to(bboxes.device) + ruler = ruler.to(bboxes.device) + + while ruler.shape[0] > 0: + basement = ruler[0] + ruler = ruler[1:] + idx = roi_idx[basement] + # calculate the body overlap + basement_bbox = ordered_bboxes[:, :4][basement].reshape(-1, 4) + ruler_bbox = ordered_bboxes[:, :4][ruler].reshape(-1, 4) + overlap = bbox_overlaps(basement_bbox, ruler_bbox) + indices = torch.where(overlap > iou_threshold)[1] + loc = torch.where(roi_idx[ruler][indices] == idx) + # the mask won't change in the step + mask = keep[ruler[indices][loc]] + keep[ruler[indices]] = False + keep[ruler[indices][loc][mask]] = True + ruler[~keep[ruler]] = -1 + ruler = ruler[ruler > 0] + + keep = keep[order.sort()[1]] + return bboxes[keep][:max_num, :], scores[keep][:max_num] diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/sabl_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/sabl_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9a9ee6aba9669514ec8ce7218e8c97e026830f6c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/sabl_head.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.layers import multiclass_nms +from mmdet.models.losses import accuracy +from mmdet.models.task_modules import SamplingResult +from mmdet.models.utils import multi_apply +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig +from .bbox_head import BBoxHead + + +@MODELS.register_module() +class SABLHead(BBoxHead): + """Side-Aware Boundary Localization (SABL) for RoI-Head. + + Side-Aware features are extracted by conv layers + with an attention mechanism. + Boundary Localization with Bucketing and Bucketing Guided Rescoring + are implemented in BucketingBBoxCoder. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. + + Args: + cls_in_channels (int): Input channels of cls RoI feature. \ + Defaults to 256. + reg_in_channels (int): Input channels of reg RoI feature. \ + Defaults to 256. + roi_feat_size (int): Size of RoI features. Defaults to 7. + reg_feat_up_ratio (int): Upsample ratio of reg features. \ + Defaults to 2. + reg_pre_kernel (int): Kernel of 2D conv layers before \ + attention pooling. Defaults to 3. + reg_post_kernel (int): Kernel of 1D conv layers after \ + attention pooling. Defaults to 3. + reg_pre_num (int): Number of pre convs. Defaults to 2. + reg_post_num (int): Number of post convs. Defaults to 1. + num_classes (int): Number of classes in dataset. Defaults to 80. + cls_out_channels (int): Hidden channels in cls fcs. Defaults to 1024. + reg_offset_out_channels (int): Hidden and output channel \ + of reg offset branch. Defaults to 256. + reg_cls_out_channels (int): Hidden and output channel \ + of reg cls branch. Defaults to 256. + num_cls_fcs (int): Number of fcs for cls branch. Defaults to 1. 
+        num_reg_fcs (int): Number of fcs for reg branch. Defaults to 0.
+        reg_class_agnostic (bool): Class agnostic regression or not. \
+            Defaults to True.
+        norm_cfg (dict): Config of norm layers. Defaults to None.
+        bbox_coder (dict): Config of bbox coder. Defaults to
+            'BucketingBBoxCoder'.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox_cls (dict): Config of classification loss for bbox branch.
+        loss_bbox_reg (dict): Config of regression loss for bbox branch.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 num_classes: int,
+                 cls_in_channels: int = 256,
+                 reg_in_channels: int = 256,
+                 roi_feat_size: int = 7,
+                 reg_feat_up_ratio: int = 2,
+                 reg_pre_kernel: int = 3,
+                 reg_post_kernel: int = 3,
+                 reg_pre_num: int = 2,
+                 reg_post_num: int = 1,
+                 cls_out_channels: int = 1024,
+                 reg_offset_out_channels: int = 256,
+                 reg_cls_out_channels: int = 256,
+                 num_cls_fcs: int = 1,
+                 num_reg_fcs: int = 0,
+                 reg_class_agnostic: bool = True,
+                 norm_cfg: OptConfigType = None,
+                 bbox_coder: ConfigType = dict(
+                     type='BucketingBBoxCoder',
+                     num_buckets=14,
+                     scale_factor=1.7),
+                 loss_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=False,
+                     loss_weight=1.0),
+                 loss_bbox_cls: ConfigType = dict(
+                     type='CrossEntropyLoss',
+                     use_sigmoid=True,
+                     loss_weight=1.0),
+                 loss_bbox_reg: ConfigType = dict(
+                     type='SmoothL1Loss', beta=0.1, loss_weight=1.0),
+                 init_cfg: OptMultiConfig = None) -> None:
+        super(BBoxHead, self).__init__(init_cfg=init_cfg)
+        self.cls_in_channels = cls_in_channels
+        self.reg_in_channels = reg_in_channels
+        self.roi_feat_size = roi_feat_size
+        self.reg_feat_up_ratio = int(reg_feat_up_ratio)
+        self.num_buckets = bbox_coder['num_buckets']
+        assert self.reg_feat_up_ratio // 2 >= 1
+        self.up_reg_feat_size = roi_feat_size * self.reg_feat_up_ratio
+        assert self.up_reg_feat_size == bbox_coder['num_buckets']
+        self.reg_pre_kernel = reg_pre_kernel
+        self.reg_post_kernel = reg_post_kernel
+        self.reg_pre_num = reg_pre_num
+        self.reg_post_num = reg_post_num
+        self.num_classes = num_classes
+        self.cls_out_channels = cls_out_channels
+        self.reg_offset_out_channels = reg_offset_out_channels
+        self.reg_cls_out_channels = reg_cls_out_channels
+        self.num_cls_fcs = num_cls_fcs
+        self.num_reg_fcs = num_reg_fcs
+        self.reg_class_agnostic = reg_class_agnostic
+        assert self.reg_class_agnostic
+        self.norm_cfg = norm_cfg
+
+        self.bbox_coder = TASK_UTILS.build(bbox_coder)
+        self.loss_cls = MODELS.build(loss_cls)
+        self.loss_bbox_cls = MODELS.build(loss_bbox_cls)
+        self.loss_bbox_reg = MODELS.build(loss_bbox_reg)
+
+        self.cls_fcs = self._add_fc_branch(self.num_cls_fcs,
+                                           self.cls_in_channels,
+                                           self.roi_feat_size,
+                                           self.cls_out_channels)
+
+        self.side_num = int(np.ceil(self.num_buckets / 2))
+
+        if self.reg_feat_up_ratio > 1:
+            self.upsample_x = nn.ConvTranspose1d(
+                reg_in_channels,
+                reg_in_channels,
+                self.reg_feat_up_ratio,
+                stride=self.reg_feat_up_ratio)
+            self.upsample_y = nn.ConvTranspose1d(
+                reg_in_channels,
+                reg_in_channels,
+                self.reg_feat_up_ratio,
+                stride=self.reg_feat_up_ratio)
+
+        self.reg_pre_convs = nn.ModuleList()
+        for i in range(self.reg_pre_num):
+            reg_pre_conv = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=reg_pre_kernel,
+                padding=reg_pre_kernel // 2,
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_pre_convs.append(reg_pre_conv)
+
+        self.reg_post_conv_xs = nn.ModuleList()
+        for i in range(self.reg_post_num):
+            reg_post_conv_x = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=(1, reg_post_kernel),
+                padding=(0, reg_post_kernel // 2),
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_post_conv_xs.append(reg_post_conv_x)
+        self.reg_post_conv_ys = nn.ModuleList()
+        for i in range(self.reg_post_num):
+            reg_post_conv_y = ConvModule(
+                reg_in_channels,
+                reg_in_channels,
+                kernel_size=(reg_post_kernel, 1),
+                padding=(reg_post_kernel // 2, 0),
+                norm_cfg=norm_cfg,
+                act_cfg=dict(type='ReLU'))
+            self.reg_post_conv_ys.append(reg_post_conv_y)
+
+        self.reg_conv_att_x = nn.Conv2d(reg_in_channels, 1, 1)
+        self.reg_conv_att_y = nn.Conv2d(reg_in_channels, 1, 1)
+
+        self.fc_cls = nn.Linear(self.cls_out_channels, self.num_classes + 1)
+        self.relu = nn.ReLU(inplace=True)
+
+        self.reg_cls_fcs = self._add_fc_branch(self.num_reg_fcs,
+                                               self.reg_in_channels, 1,
+                                               self.reg_cls_out_channels)
+        self.reg_offset_fcs = self._add_fc_branch(self.num_reg_fcs,
+                                                  self.reg_in_channels, 1,
+                                                  self.reg_offset_out_channels)
+        self.fc_reg_cls = nn.Linear(self.reg_cls_out_channels, 1)
+        self.fc_reg_offset = nn.Linear(self.reg_offset_out_channels, 1)
+
+        if init_cfg is None:
+            self.init_cfg = [
+                dict(
+                    type='Xavier',
+                    layer='Linear',
+                    distribution='uniform',
+                    override=[
+                        dict(type='Normal', name='reg_conv_att_x', std=0.01),
+                        dict(type='Normal', name='reg_conv_att_y', std=0.01),
+                        dict(type='Normal', name='fc_reg_cls', std=0.01),
+                        dict(type='Normal', name='fc_cls', std=0.01),
+                        dict(type='Normal', name='fc_reg_offset', std=0.001)
+                    ])
+            ]
+            if self.reg_feat_up_ratio > 1:
+                self.init_cfg += [
+                    dict(
+                        type='Kaiming',
+                        distribution='normal',
+                        override=[
+                            dict(name='upsample_x'),
+                            dict(name='upsample_y')
+                        ])
+                ]
+
+    def _add_fc_branch(self, num_branch_fcs: int, in_channels: int,
+                       roi_feat_size: int,
+                       fc_out_channels: int) -> nn.ModuleList:
+        """build fc layers."""
+        in_channels = in_channels * roi_feat_size * roi_feat_size
+        branch_fcs = nn.ModuleList()
+        for i in range(num_branch_fcs):
+            fc_in_channels = (in_channels if i == 0 else fc_out_channels)
+            branch_fcs.append(nn.Linear(fc_in_channels, fc_out_channels))
+        return branch_fcs
+
+    def cls_forward(self, cls_x: Tensor) -> Tensor:
+        """forward of classification fc layers."""
+        cls_x = cls_x.view(cls_x.size(0), -1)
+        for fc in self.cls_fcs:
+            cls_x = self.relu(fc(cls_x))
+        cls_score = self.fc_cls(cls_x)
+        return cls_score
+
+    def attention_pool(self, reg_x: Tensor) -> tuple:
+        """Extract direction-specific features fx and fy with attention
+        mechanism."""
+        reg_fx = reg_x
+        reg_fy = reg_x
+        reg_fx_att = self.reg_conv_att_x(reg_fx).sigmoid()
+        reg_fy_att = self.reg_conv_att_y(reg_fy).sigmoid()
+        reg_fx_att = reg_fx_att / reg_fx_att.sum(dim=2).unsqueeze(2)
+        reg_fy_att = reg_fy_att / reg_fy_att.sum(dim=3).unsqueeze(3)
+        reg_fx = (reg_fx * reg_fx_att).sum(dim=2)
+        reg_fy = (reg_fy * reg_fy_att).sum(dim=3)
+        return reg_fx, reg_fy
+
+    def side_aware_feature_extractor(self, reg_x: Tensor) -> tuple:
+        """Refine and extract side-aware features without splitting them."""
+        for reg_pre_conv in self.reg_pre_convs:
+            reg_x = reg_pre_conv(reg_x)
+        reg_fx, reg_fy = self.attention_pool(reg_x)
+
+        if self.reg_post_num > 0:
+            reg_fx = reg_fx.unsqueeze(2)
+            reg_fy = reg_fy.unsqueeze(3)
+            for i in range(self.reg_post_num):
+                reg_fx = self.reg_post_conv_xs[i](reg_fx)
+                reg_fy = self.reg_post_conv_ys[i](reg_fy)
+            reg_fx = reg_fx.squeeze(2)
+            reg_fy = reg_fy.squeeze(3)
+        if self.reg_feat_up_ratio > 1:
+            reg_fx = self.relu(self.upsample_x(reg_fx))
+            reg_fy = self.relu(self.upsample_y(reg_fy))
+        reg_fx =
torch.transpose(reg_fx, 1, 2) + reg_fy = torch.transpose(reg_fy, 1, 2) + return reg_fx.contiguous(), reg_fy.contiguous() + + def reg_pred(self, x: Tensor, offset_fcs: nn.ModuleList, + cls_fcs: nn.ModuleList) -> tuple: + """Predict bucketing estimation (cls_pred) and fine regression (offset + pred) with side-aware features.""" + x_offset = x.view(-1, self.reg_in_channels) + x_cls = x.view(-1, self.reg_in_channels) + + for fc in offset_fcs: + x_offset = self.relu(fc(x_offset)) + for fc in cls_fcs: + x_cls = self.relu(fc(x_cls)) + offset_pred = self.fc_reg_offset(x_offset) + cls_pred = self.fc_reg_cls(x_cls) + + offset_pred = offset_pred.view(x.size(0), -1) + cls_pred = cls_pred.view(x.size(0), -1) + + return offset_pred, cls_pred + + def side_aware_split(self, feat: Tensor) -> Tensor: + """Split side-aware features aligned with orders of bucketing + targets.""" + l_end = int(np.ceil(self.up_reg_feat_size / 2)) + r_start = int(np.floor(self.up_reg_feat_size / 2)) + feat_fl = feat[:, :l_end] + feat_fr = feat[:, r_start:].flip(dims=(1, )) + feat_fl = feat_fl.contiguous() + feat_fr = feat_fr.contiguous() + feat = torch.cat([feat_fl, feat_fr], dim=-1) + return feat + + def bbox_pred_split(self, bbox_pred: tuple, + num_proposals_per_img: Sequence[int]) -> tuple: + """Split batch bbox prediction back to each image.""" + bucket_cls_preds, bucket_offset_preds = bbox_pred + bucket_cls_preds = bucket_cls_preds.split(num_proposals_per_img, 0) + bucket_offset_preds = bucket_offset_preds.split( + num_proposals_per_img, 0) + bbox_pred = tuple(zip(bucket_cls_preds, bucket_offset_preds)) + return bbox_pred + + def reg_forward(self, reg_x: Tensor) -> tuple: + """forward of regression branch.""" + outs = self.side_aware_feature_extractor(reg_x) + edge_offset_preds = [] + edge_cls_preds = [] + reg_fx = outs[0] + reg_fy = outs[1] + offset_pred_x, cls_pred_x = self.reg_pred(reg_fx, self.reg_offset_fcs, + self.reg_cls_fcs) + offset_pred_y, cls_pred_y = self.reg_pred(reg_fy, self.reg_offset_fcs, + self.reg_cls_fcs) + offset_pred_x = self.side_aware_split(offset_pred_x) + offset_pred_y = self.side_aware_split(offset_pred_y) + cls_pred_x = self.side_aware_split(cls_pred_x) + cls_pred_y = self.side_aware_split(cls_pred_y) + edge_offset_preds = torch.cat([offset_pred_x, offset_pred_y], dim=-1) + edge_cls_preds = torch.cat([cls_pred_x, cls_pred_y], dim=-1) + + return edge_cls_preds, edge_offset_preds + + def forward(self, x: Tensor) -> tuple: + """Forward features from the upstream network.""" + bbox_pred = self.reg_forward(x) + cls_score = self.cls_forward(x) + + return cls_score, bbox_pred + + def get_targets(self, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict, + concat: bool = True) -> tuple: + """Calculate the ground truth for all samples in a batch according to + the sampling_results.""" + pos_proposals = [res.pos_bboxes for res in sampling_results] + neg_proposals = [res.neg_bboxes for res in sampling_results] + pos_gt_bboxes = [res.pos_gt_bboxes for res in sampling_results] + pos_gt_labels = [res.pos_gt_labels for res in sampling_results] + cls_reg_targets = self.bucket_target( + pos_proposals, + neg_proposals, + pos_gt_bboxes, + pos_gt_labels, + rcnn_train_cfg, + concat=concat) + (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) = cls_reg_targets + return (labels, label_weights, (bucket_cls_targets, + bucket_offset_targets), + (bucket_cls_weights, bucket_offset_weights)) + + def bucket_target(self, + pos_proposals_list: list, 
+                      neg_proposals_list: list,
+                      pos_gt_bboxes_list: list,
+                      pos_gt_labels_list: list,
+                      rcnn_train_cfg: ConfigDict,
+                      concat: bool = True) -> tuple:
+        """Compute bucketing estimation targets and fine regression targets
+        for a batch of images."""
+        (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+         bucket_offset_targets, bucket_offset_weights) = multi_apply(
+             self._bucket_target_single,
+             pos_proposals_list,
+             neg_proposals_list,
+             pos_gt_bboxes_list,
+             pos_gt_labels_list,
+             cfg=rcnn_train_cfg)
+
+        if concat:
+            labels = torch.cat(labels, 0)
+            label_weights = torch.cat(label_weights, 0)
+            bucket_cls_targets = torch.cat(bucket_cls_targets, 0)
+            bucket_cls_weights = torch.cat(bucket_cls_weights, 0)
+            bucket_offset_targets = torch.cat(bucket_offset_targets, 0)
+            bucket_offset_weights = torch.cat(bucket_offset_weights, 0)
+        return (labels, label_weights, bucket_cls_targets, bucket_cls_weights,
+                bucket_offset_targets, bucket_offset_weights)
+
+    def _bucket_target_single(self, pos_proposals: Tensor,
+                              neg_proposals: Tensor, pos_gt_bboxes: Tensor,
+                              pos_gt_labels: Tensor,
+                              cfg: ConfigDict) -> tuple:
+        """Compute bucketing estimation targets and fine regression targets
+        for a single image.
+
+        Args:
+            pos_proposals (Tensor): positive proposals of a single image,
+                Shape (n_pos, 4)
+            neg_proposals (Tensor): negative proposals of a single image,
+                Shape (n_neg, 4).
+            pos_gt_bboxes (Tensor): gt bboxes assigned to positive proposals
+                of a single image, Shape (n_pos, 4).
+            pos_gt_labels (Tensor): gt labels assigned to positive proposals
+                of a single image, Shape (n_pos, ).
+            cfg (dict): Config of calculating targets
+
+        Returns:
+            tuple:
+
+            - labels (Tensor): Labels in a single image. Shape (n,).
+            - label_weights (Tensor): Label weights in a single image.
+              Shape (n,)
+            - bucket_cls_targets (Tensor): Bucket cls targets in
+              a single image. Shape (n, num_buckets*2).
+            - bucket_cls_weights (Tensor): Bucket cls weights in
+              a single image. Shape (n, num_buckets*2).
+            - bucket_offset_targets (Tensor): Bucket offset targets
+              in a single image. Shape (n, num_buckets*2).
+            - bucket_offset_weights (Tensor): Bucket offset weights
+              in a single image. Shape (n, num_buckets*2).
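+
+        A shape sketch with the default ``num_buckets=14`` (hypothetical
+        numbers): each box side keeps ``side_num = ceil(num_buckets / 2)``
+        buckets, which is where the ``4 * side_num`` columns in the code
+        below come from and equals the ``num_buckets * 2`` above.
+
+        Example:
+            >>> import numpy as np
+            >>> side_num = int(np.ceil(14 / 2))
+            >>> 4 * side_num
+            28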
+ """ + num_pos = pos_proposals.size(0) + num_neg = neg_proposals.size(0) + num_samples = num_pos + num_neg + labels = pos_gt_bboxes.new_full((num_samples, ), + self.num_classes, + dtype=torch.long) + label_weights = pos_proposals.new_zeros(num_samples) + bucket_cls_targets = pos_proposals.new_zeros(num_samples, + 4 * self.side_num) + bucket_cls_weights = pos_proposals.new_zeros(num_samples, + 4 * self.side_num) + bucket_offset_targets = pos_proposals.new_zeros( + num_samples, 4 * self.side_num) + bucket_offset_weights = pos_proposals.new_zeros( + num_samples, 4 * self.side_num) + if num_pos > 0: + labels[:num_pos] = pos_gt_labels + label_weights[:num_pos] = 1.0 + (pos_bucket_offset_targets, pos_bucket_offset_weights, + pos_bucket_cls_targets, + pos_bucket_cls_weights) = self.bbox_coder.encode( + pos_proposals, pos_gt_bboxes) + bucket_cls_targets[:num_pos, :] = pos_bucket_cls_targets + bucket_cls_weights[:num_pos, :] = pos_bucket_cls_weights + bucket_offset_targets[:num_pos, :] = pos_bucket_offset_targets + bucket_offset_weights[:num_pos, :] = pos_bucket_offset_weights + if num_neg > 0: + label_weights[-num_neg:] = 1.0 + return (labels, label_weights, bucket_cls_targets, bucket_cls_weights, + bucket_offset_targets, bucket_offset_weights) + + def loss(self, + cls_score: Tensor, + bbox_pred: Tuple[Tensor, Tensor], + rois: Tensor, + labels: Tensor, + label_weights: Tensor, + bbox_targets: Tuple[Tensor, Tensor], + bbox_weights: Tuple[Tensor, Tensor], + reduction_override: Optional[str] = None) -> dict: + """Calculate the loss based on the network predictions and targets. + + Args: + cls_score (Tensor): Classification prediction + results of all class, has shape + (batch_size * num_proposals_single_image, num_classes) + bbox_pred (Tensor): A tuple of regression prediction results + containing `bucket_cls_preds and` `bucket_offset_preds`. + rois (Tensor): RoIs with the shape + (batch_size * num_proposals_single_image, 5) where the first + column indicates batch id of each RoI. + labels (Tensor): Gt_labels for all proposals in a batch, has + shape (batch_size * num_proposals_single_image, ). + label_weights (Tensor): Labels_weights for all proposals in a + batch, has shape (batch_size * num_proposals_single_image, ). + bbox_targets (Tuple[Tensor, Tensor]): A tuple of regression target + containing `bucket_cls_targets` and `bucket_offset_targets`. + the last dimension 4 represents [tl_x, tl_y, br_x, br_y]. + bbox_weights (Tuple[Tensor, Tensor]): A tuple of regression + weights containing `bucket_cls_weights` and + `bucket_offset_weights`. + reduction_override (str, optional): The reduction + method used to override the original reduction + method of the loss. Options are "none", + "mean" and "sum". Defaults to None, + + Returns: + dict: A dictionary of loss. + """ + losses = dict() + if cls_score is not None: + avg_factor = max(torch.sum(label_weights > 0).float().item(), 1.) 
+ losses['loss_cls'] = self.loss_cls( + cls_score, + labels, + label_weights, + avg_factor=avg_factor, + reduction_override=reduction_override) + losses['acc'] = accuracy(cls_score, labels) + + if bbox_pred is not None: + bucket_cls_preds, bucket_offset_preds = bbox_pred + bucket_cls_targets, bucket_offset_targets = bbox_targets + bucket_cls_weights, bucket_offset_weights = bbox_weights + # edge cls + bucket_cls_preds = bucket_cls_preds.view(-1, self.side_num) + bucket_cls_targets = bucket_cls_targets.view(-1, self.side_num) + bucket_cls_weights = bucket_cls_weights.view(-1, self.side_num) + losses['loss_bbox_cls'] = self.loss_bbox_cls( + bucket_cls_preds, + bucket_cls_targets, + bucket_cls_weights, + avg_factor=bucket_cls_targets.size(0), + reduction_override=reduction_override) + + losses['loss_bbox_reg'] = self.loss_bbox_reg( + bucket_offset_preds, + bucket_offset_targets, + bucket_offset_weights, + avg_factor=bucket_offset_targets.size(0), + reduction_override=reduction_override) + + return losses + + def _predict_by_feat_single( + self, + roi: Tensor, + cls_score: Tensor, + bbox_pred: Tuple[Tensor, Tensor], + img_meta: dict, + rescale: bool = False, + rcnn_test_cfg: Optional[ConfigDict] = None) -> InstanceData: + """Transform a single image's features extracted from the head into + bbox results. + + Args: + roi (Tensor): Boxes to be transformed. Has shape (num_boxes, 5). + last dimension 5 arrange as (batch_index, x1, y1, x2, y2). + cls_score (Tensor): Box scores, has shape + (num_boxes, num_classes + 1). + bbox_pred (Tuple[Tensor, Tensor]): Box cls preds and offset preds. + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + Defaults to None + + Returns: + :obj:`InstanceData`: Detection results of each image + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results = InstanceData() + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + scores = F.softmax(cls_score, dim=1) if cls_score is not None else None + img_shape = img_meta['img_shape'] + if bbox_pred is not None: + bboxes, confidences = self.bbox_coder.decode( + roi[:, 1:], bbox_pred, img_shape) + else: + bboxes = roi[:, 1:].clone() + confidences = None + if img_shape is not None: + bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1] - 1) + bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0] - 1) + + if rescale and bboxes.size(0) > 0: + assert img_meta.get('scale_factor') is not None + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + bboxes = (bboxes.view(bboxes.size(0), -1, 4) / scale_factor).view( + bboxes.size()[0], -1) + + if rcnn_test_cfg is None: + results.bboxes = bboxes + results.scores = scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, + scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img, + score_factors=confidences) + results.bboxes = det_bboxes[:, :4] + results.scores = det_bboxes[:, -1] + results.labels = det_labels + return results + + def refine_bboxes(self, sampling_results: List[SamplingResult], + bbox_results: dict, + batch_img_metas: List[dict]) -> InstanceList: + """Refine bboxes during training. 
+
+        Args:
+            sampling_results (List[:obj:`SamplingResult`]): Sampling results.
+            bbox_results (dict): Usually is a dictionary with keys:
+
+                - `cls_score` (Tensor): Classification scores.
+                - `bbox_pred` (Tensor): Box energies / deltas.
+                - `rois` (Tensor): RoIs with the shape (n, 5) where the first
+                  column indicates batch id of each RoI.
+                - `bbox_targets` (tuple): Ground truth for proposals in a
+                  single image. Containing the following list of Tensors:
+                  (labels, label_weights, bbox_targets, bbox_weights)
+            batch_img_metas (List[dict]): List of image information.
+
+        Returns:
+            list[:obj:`InstanceData`]: Refined bboxes of each image.
+        """
+        pos_is_gts = [res.pos_is_gt for res in sampling_results]
+        # bbox_targets is a tuple
+        labels = bbox_results['bbox_targets'][0]
+        cls_scores = bbox_results['cls_score']
+        rois = bbox_results['rois']
+        bbox_preds = bbox_results['bbox_pred']
+
+        if cls_scores.numel() == 0:
+            return None
+
+        labels = torch.where(labels == self.num_classes,
+                             cls_scores[:, :-1].argmax(1), labels)
+
+        img_ids = rois[:, 0].long().unique(sorted=True)
+        assert img_ids.numel() <= len(batch_img_metas)
+
+        results_list = []
+        for i in range(len(batch_img_metas)):
+            inds = torch.nonzero(
+                rois[:, 0] == i, as_tuple=False).squeeze(dim=1)
+            num_rois = inds.numel()
+
+            bboxes_ = rois[inds, 1:]
+            label_ = labels[inds]
+            edge_cls_preds, edge_offset_preds = bbox_preds
+            edge_cls_preds_ = edge_cls_preds[inds]
+            edge_offset_preds_ = edge_offset_preds[inds]
+            bbox_pred_ = (edge_cls_preds_, edge_offset_preds_)
+            img_meta_ = batch_img_metas[i]
+            pos_is_gts_ = pos_is_gts[i]
+
+            bboxes = self.regress_by_class(bboxes_, label_, bbox_pred_,
+                                           img_meta_)
+            # filter gt bboxes
+            pos_keep = 1 - pos_is_gts_
+            keep_inds = pos_is_gts_.new_ones(num_rois)
+            keep_inds[:len(pos_is_gts_)] = pos_keep
+            results = InstanceData(bboxes=bboxes[keep_inds.type(torch.bool)])
+            results_list.append(results)
+
+        return results_list
+
+    def regress_by_class(self, rois: Tensor, label: Tensor, bbox_pred: tuple,
+                         img_meta: dict) -> Tensor:
+        """Regress the bbox for the predicted class. Used in Cascade R-CNN.
+
+        Args:
+            rois (Tensor): shape (n, 4) or (n, 5)
+            label (Tensor): shape (n, )
+            bbox_pred (Tuple[Tensor]): shape [(n, num_buckets * 2), \
+                (n, num_buckets * 2)]
+            img_meta (dict): Image meta info.
+
+        Returns:
+            Tensor: Regressed bboxes, the same shape as input rois.
+        """
+        assert rois.size(1) == 4 or rois.size(1) == 5
+
+        if rois.size(1) == 4:
+            new_rois, _ = self.bbox_coder.decode(rois, bbox_pred,
+                                                 img_meta['img_shape'])
+        else:
+            bboxes, _ = self.bbox_coder.decode(rois[:, 1:], bbox_pred,
+                                               img_meta['img_shape'])
+            new_rois = torch.cat((rois[:, [0]], bboxes), dim=1)
+
+        return new_rois
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..790b08fb207970927c7925cb8b3fb365bc183dc4
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/bbox_heads/scnet_bbox_head.py
@@ -0,0 +1,101 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple, Union
+
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from .convfc_bbox_head import ConvFCBBoxHead
+
+
+@MODELS.register_module()
+class SCNetBBoxHead(ConvFCBBoxHead):
+    """BBox head for `SCNet `_.
+
+    This inherits ``ConvFCBBoxHead`` with a modified forward() function,
+    allowing us to get the intermediate shared feature.
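+
+    A plain-torch sketch of the pattern (hypothetical sizes, not the real
+    head): one shared trunk feeds both branches, and the shared feature can
+    be appended to the outputs on request.
+
+    Example:
+        >>> import torch
+        >>> import torch.nn as nn
+        >>> shared = nn.Linear(16, 8)
+        >>> cls_branch, reg_branch = nn.Linear(8, 3), nn.Linear(8, 4)
+        >>> x_shared = shared(torch.rand(2, 16)).relu()
+        >>> out = (cls_branch(x_shared), reg_branch(x_shared))
+        >>> out = out + (x_shared, )  # as with return_shared_feat=True
+        >>> len(out)
+        3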
+ """ + + def _forward_shared(self, x: Tensor) -> Tensor: + """Forward function for shared part. + + Args: + x (Tensor): Input feature. + + Returns: + Tensor: Shared feature. + """ + if self.num_shared_convs > 0: + for conv in self.shared_convs: + x = conv(x) + + if self.num_shared_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + + x = x.flatten(1) + + for fc in self.shared_fcs: + x = self.relu(fc(x)) + + return x + + def _forward_cls_reg(self, x: Tensor) -> Tuple[Tensor]: + """Forward function for classification and regression parts. + + Args: + x (Tensor): Input feature. + + Returns: + tuple[Tensor]: + + - cls_score (Tensor): classification prediction. + - bbox_pred (Tensor): bbox prediction. + """ + x_cls = x + x_reg = x + + for conv in self.cls_convs: + x_cls = conv(x_cls) + if x_cls.dim() > 2: + if self.with_avg_pool: + x_cls = self.avg_pool(x_cls) + x_cls = x_cls.flatten(1) + for fc in self.cls_fcs: + x_cls = self.relu(fc(x_cls)) + + for conv in self.reg_convs: + x_reg = conv(x_reg) + if x_reg.dim() > 2: + if self.with_avg_pool: + x_reg = self.avg_pool(x_reg) + x_reg = x_reg.flatten(1) + for fc in self.reg_fcs: + x_reg = self.relu(fc(x_reg)) + + cls_score = self.fc_cls(x_cls) if self.with_cls else None + bbox_pred = self.fc_reg(x_reg) if self.with_reg else None + + return cls_score, bbox_pred + + def forward( + self, + x: Tensor, + return_shared_feat: bool = False) -> Union[Tensor, Tuple[Tensor]]: + """Forward function. + + Args: + x (Tensor): input features + return_shared_feat (bool): If True, return cls-reg-shared feature. + + Return: + out (tuple[Tensor]): contain ``cls_score`` and ``bbox_pred``, + if ``return_shared_feat`` is True, append ``x_shared`` to the + returned tuple. + """ + x_shared = self._forward_shared(x) + out = self._forward_cls_reg(x_shared) + + if return_shared_feat: + out += (x_shared, ) + + return out diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/cascade_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/cascade_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..81db671113a63beb7849abdc0e432a738ee46f5e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/cascade_roi_head.py @@ -0,0 +1,568 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.model import ModuleList +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.test_time_augs import merge_aug_masks +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi, get_box_tensor +from mmdet.utils import (ConfigType, InstanceList, MultiConfig, OptConfigType, + OptMultiConfig) +from ..utils.misc import empty_instances, unpack_gt_instances +from .base_roi_head import BaseRoIHead + + +@MODELS.register_module() +class CascadeRoIHead(BaseRoIHead): + """Cascade roi head including one bbox head and one mask head. 
+ + https://arxiv.org/abs/1712.00726 + """ + + def __init__(self, + num_stages: int, + stage_loss_weights: Union[List[float], Tuple[float]], + bbox_roi_extractor: OptMultiConfig = None, + bbox_head: OptMultiConfig = None, + mask_roi_extractor: OptMultiConfig = None, + mask_head: OptMultiConfig = None, + shared_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None) -> None: + assert bbox_roi_extractor is not None + assert bbox_head is not None + assert shared_head is None, \ + 'Shared head is not supported in Cascade RCNN anymore' + + self.num_stages = num_stages + self.stage_loss_weights = stage_loss_weights + super().__init__( + bbox_roi_extractor=bbox_roi_extractor, + bbox_head=bbox_head, + mask_roi_extractor=mask_roi_extractor, + mask_head=mask_head, + shared_head=shared_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + + def init_bbox_head(self, bbox_roi_extractor: MultiConfig, + bbox_head: MultiConfig) -> None: + """Initialize box head and box roi extractor. + + Args: + bbox_roi_extractor (:obj:`ConfigDict`, dict or list): + Config of box roi extractor. + bbox_head (:obj:`ConfigDict`, dict or list): Config + of box in box head. + """ + self.bbox_roi_extractor = ModuleList() + self.bbox_head = ModuleList() + if not isinstance(bbox_roi_extractor, list): + bbox_roi_extractor = [ + bbox_roi_extractor for _ in range(self.num_stages) + ] + if not isinstance(bbox_head, list): + bbox_head = [bbox_head for _ in range(self.num_stages)] + assert len(bbox_roi_extractor) == len(bbox_head) == self.num_stages + for roi_extractor, head in zip(bbox_roi_extractor, bbox_head): + self.bbox_roi_extractor.append(MODELS.build(roi_extractor)) + self.bbox_head.append(MODELS.build(head)) + + def init_mask_head(self, mask_roi_extractor: MultiConfig, + mask_head: MultiConfig) -> None: + """Initialize mask head and mask roi extractor. + + Args: + mask_head (dict): Config of mask in mask head. + mask_roi_extractor (:obj:`ConfigDict`, dict or list): + Config of mask roi extractor. + """ + self.mask_head = nn.ModuleList() + if not isinstance(mask_head, list): + mask_head = [mask_head for _ in range(self.num_stages)] + assert len(mask_head) == self.num_stages + for head in mask_head: + self.mask_head.append(MODELS.build(head)) + if mask_roi_extractor is not None: + self.share_roi_extractor = False + self.mask_roi_extractor = ModuleList() + if not isinstance(mask_roi_extractor, list): + mask_roi_extractor = [ + mask_roi_extractor for _ in range(self.num_stages) + ] + assert len(mask_roi_extractor) == self.num_stages + for roi_extractor in mask_roi_extractor: + self.mask_roi_extractor.append(MODELS.build(roi_extractor)) + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler for each stage.""" + self.bbox_assigner = [] + self.bbox_sampler = [] + if self.train_cfg is not None: + for idx, rcnn_train_cfg in enumerate(self.train_cfg): + self.bbox_assigner.append( + TASK_UTILS.build(rcnn_train_cfg.assigner)) + self.current_stage = idx + self.bbox_sampler.append( + TASK_UTILS.build( + rcnn_train_cfg.sampler, + default_args=dict(context=self))) + + def _bbox_forward(self, stage: int, x: Tuple[Tensor], + rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. 
+ rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def bbox_loss(self, stage: int, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(stage, x, rois) + bbox_results.update(rois=rois) + + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg[stage]) + bbox_results.update(bbox_loss_and_target) + + return bbox_results + + def _mask_forward(self, stage: int, x: Tuple[Tensor], + rois: Tensor) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + """ + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + rois) + # do not support caffe_c4 model anymore + mask_preds = mask_head(mask_feats) + + mask_results = dict(mask_preds=mask_preds) + return mask_results + + def mask_loss(self, stage: int, x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. 
+ - `loss_mask` (dict): A dictionary of mask loss components. + """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(stage, x, pos_rois) + + mask_head = self.mask_head[stage] + + mask_loss_and_target = mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[stage]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + # TODO: May add a new function in baseroihead + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + num_imgs = len(batch_data_samples) + losses = dict() + results_list = rpn_results_list + for stage in range(self.num_stages): + self.current_stage = stage + + stage_loss_weight = self.stage_loss_weights[stage] + + # assign gts and sample proposals + sampling_results = [] + if self.with_bbox or self.with_mask: + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss(stage, x, sampling_results) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(stage, x, sampling_results, + batch_gt_instances) + for name, value in mask_results['loss_mask'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # refine bboxes + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results, bbox_results, batch_img_metas) + # Empty proposal + if results_list is None: + break + return losses + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False, + **kwargs) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. 
+ rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, + rois.device, + task_type='bbox', + box_type=self.bbox_head[-1].predict_box_type, + num_classes=self.bbox_head[-1].num_classes, + score_per_cls=rcnn_test_cfg is None) + + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img, + **kwargs) + + results_list = self.bbox_head[-1].predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rescale=rescale, + rcnn_test_cfg=rcnn_test_cfg) + return results_list + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: List[InstanceData], + rescale: bool = False) -> List[InstanceData]: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). 
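+
+        A sketch of the multi-stage merge with hypothetical sizes: each
+        stage's mask logits are sigmoided and the per-stage maps are then
+        averaged, which is roughly what ``merge_aug_masks`` amounts to here
+        (ignoring flip handling).
+
+        Example:
+            >>> import torch
+            >>> stage_preds = [torch.rand(4, 1, 28, 28) for _ in range(3)]
+            >>> merged = torch.stack(
+            ...     [p.sigmoid() for p in stage_preds]).mean(dim=0)
+            >>> merged.shape
+            torch.Size([4, 1, 28, 28])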
+ """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + num_mask_rois_per_img = [len(res) for res in results_list] + aug_masks = [] + for stage in range(self.num_stages): + mask_results = self._mask_forward(stage, x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + aug_masks.append([m.sigmoid().detach() for m in mask_preds]) + + merged_masks = [] + for i in range(len(batch_img_metas)): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results_list = self.mask_head[-1].predict_by_feat( + mask_preds=merged_masks, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale, + activate_map=True) + return results_list + + def _refine_roi(self, x: Tuple[Tensor], rois: Tensor, + batch_img_metas: List[dict], + num_proposals_per_img: Sequence[int], **kwargs) -> tuple: + """Multi-stage refinement of RoI. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): shape (n, 5), [batch_ind, x1, y1, x2, y2] + batch_img_metas (list[dict]): List of image information. + num_proposals_per_img (sequence[int]): number of proposals + in each image. + + Returns: + tuple: + + - rois (Tensor): Refined RoI. + - cls_scores (list[Tensor]): Average predicted + cls score per image. + - bbox_preds (list[Tensor]): Bbox branch predictions + for the last stage of per image. + """ + # "ms" in variable names means multi-stage + ms_scores = [] + for stage in range(self.num_stages): + bbox_results = self._bbox_forward( + stage=stage, x=x, rois=rois, **kwargs) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'] + bbox_preds = bbox_results['bbox_pred'] + + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + ms_scores.append(cls_scores) + + # some detector with_reg is False, bbox_preds will be None + if bbox_preds is not None: + # TODO move this to a sabl_roi_head + # the bbox prediction of some detectors like SABL is not Tensor + if isinstance(bbox_preds, torch.Tensor): + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + else: + bbox_preds = self.bbox_head[stage].bbox_pred_split( + bbox_preds, num_proposals_per_img) + else: + bbox_preds = (None, ) * len(batch_img_metas) + + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + if bbox_head.custom_activation: + cls_scores = [ + bbox_head.loss_cls.get_activation(s) + for s in cls_scores + ] + refine_rois_list = [] + for i in range(len(batch_img_metas)): + if rois[i].shape[0] > 0: + bbox_label = cls_scores[i][:, :-1].argmax(dim=1) + # Refactor `bbox_head.regress_by_class` to only accept + # box tensor without img_idx concatenated. 
+ refined_bboxes = bbox_head.regress_by_class( + rois[i][:, 1:], bbox_label, bbox_preds[i], + batch_img_metas[i]) + refined_bboxes = get_box_tensor(refined_bboxes) + refined_rois = torch.cat( + [rois[i][:, [0]], refined_bboxes], dim=1) + refine_rois_list.append(refined_rois) + rois = torch.cat(refine_rois_list) + + # average scores of each image by stages + cls_scores = [ + sum([score[i] for score in ms_scores]) / float(len(ms_scores)) + for i in range(len(batch_img_metas)) + ] + return rois, cls_scores, bbox_preds + + def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + results = () + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + rois, cls_scores, bbox_preds = self._refine_roi( + x, rois, batch_img_metas, num_proposals_per_img) + results = results + (cls_scores, bbox_preds) + # mask head + if self.with_mask: + aug_masks = [] + rois = torch.cat(rois) + for stage in range(self.num_stages): + mask_results = self._mask_forward(stage, x, rois) + mask_preds = mask_results['mask_preds'] + mask_preds = mask_preds.split(num_proposals_per_img, 0) + aug_masks.append([m.sigmoid().detach() for m in mask_preds]) + + merged_masks = [] + for i in range(len(batch_img_metas)): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results = results + (merged_masks, ) + return results diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/double_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/double_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f9464ff55bafcca9f3545a3a72dde1eb3939cece --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/double_roi_head.py @@ -0,0 +1,53 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class DoubleHeadRoIHead(StandardRoIHead): + """RoI head for `Double Head RCNN `_. + + Args: + reg_roi_scale_factor (float): The scale factor to extend the rois + used to extract the regression features. + """ + + def __init__(self, reg_roi_scale_factor: float, **kwargs): + super().__init__(**kwargs) + self.reg_roi_scale_factor = reg_roi_scale_factor + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. 
+ - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_cls_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + bbox_reg_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], + rois, + roi_scale_factor=self.reg_roi_scale_factor) + if self.with_shared_head: + bbox_cls_feats = self.shared_head(bbox_cls_feats) + bbox_reg_feats = self.shared_head(bbox_reg_feats) + cls_score, bbox_pred = self.bbox_head(bbox_cls_feats, bbox_reg_feats) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + bbox_feats=bbox_cls_feats) + return bbox_results diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/dynamic_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/dynamic_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3c7f7bd2f68cab0fcdec725501f74b65274eb30e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/dynamic_roi_head.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +import torch +from torch import Tensor + +from mmdet.models.losses import SmoothL1Loss +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList +from ..utils.misc import unpack_gt_instances +from .standard_roi_head import StandardRoIHead + +EPS = 1e-15 + + +@MODELS.register_module() +class DynamicRoIHead(StandardRoIHead): + """RoI head for `Dynamic R-CNN `_.""" + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + assert isinstance(self.bbox_head.loss_bbox, SmoothL1Loss) + # the IoU history of the past `update_iter_interval` iterations + self.iou_history = [] + # the beta history of the past `update_iter_interval` iterations + self.beta_history = [] + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Forward function for training. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + cur_iou = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + # record the `iou_topk`-th largest IoU in an image + iou_topk = min(self.train_cfg.dynamic_rcnn.iou_topk, + len(assign_result.max_overlaps)) + ious, _ = torch.topk(assign_result.max_overlaps, iou_topk) + cur_iou.append(ious[-1].item()) + sampling_results.append(sampling_result) + # average the current IoUs over images + cur_iou = np.mean(cur_iou) + self.iou_history.append(cur_iou) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + # update IoU threshold and SmoothL1 beta + update_iter_interval = self.train_cfg.dynamic_rcnn.update_iter_interval + if len(self.iou_history) % update_iter_interval == 0: + new_iou_thr, new_beta = self.update_hyperparameters() + + return losses + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. 
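+
+        A sketch of the recorded statistic with hypothetical numbers: the
+        ``beta_topk``-th smallest mean absolute target of the positive
+        samples is stored and later used to shrink the SmoothL1 beta.
+
+        Example:
+            >>> import torch
+            >>> cur_target = torch.tensor([0.3, 0.1, 0.7, 0.2])
+            >>> round(torch.kthvalue(cur_target, 2)[0].item(), 2)
+            0.2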
+ """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + + # record the `beta_topk`-th smallest target + # `bbox_targets[2]` and `bbox_targets[3]` stand for bbox_targets + # and bbox_weights, respectively + bbox_targets = bbox_loss_and_target['bbox_targets'] + pos_inds = bbox_targets[3][:, 0].nonzero().squeeze(1) + num_pos = len(pos_inds) + num_imgs = len(sampling_results) + if num_pos > 0: + cur_target = bbox_targets[2][pos_inds, :2].abs().mean(dim=1) + beta_topk = min(self.train_cfg.dynamic_rcnn.beta_topk * num_imgs, + num_pos) + cur_target = torch.kthvalue(cur_target, beta_topk)[0].item() + self.beta_history.append(cur_target) + + return bbox_results + + def update_hyperparameters(self): + """Update hyperparameters like IoU thresholds for assigner and beta for + SmoothL1 loss based on the training statistics. + + Returns: + tuple[float]: the updated ``iou_thr`` and ``beta``. + """ + new_iou_thr = max(self.train_cfg.dynamic_rcnn.initial_iou, + np.mean(self.iou_history)) + self.iou_history = [] + self.bbox_assigner.pos_iou_thr = new_iou_thr + self.bbox_assigner.neg_iou_thr = new_iou_thr + self.bbox_assigner.min_pos_iou = new_iou_thr + if (not self.beta_history) or (np.median(self.beta_history) < EPS): + # avoid 0 or too small value for new_beta + new_beta = self.bbox_head.loss_bbox.beta + else: + new_beta = min(self.train_cfg.dynamic_rcnn.initial_beta, + np.median(self.beta_history)) + self.beta_history = [] + self.bbox_head.loss_bbox.beta = new_beta + return new_iou_thr, new_beta diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/grid_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/grid_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9eda7f01bcd4e44faca14b61ec4956ee2c372ad6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/grid_roi_head.py @@ -0,0 +1,280 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils.misc import unpack_gt_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class GridRoIHead(StandardRoIHead): + """Implementation of `Grid RoI Head `_ + + Args: + grid_roi_extractor (:obj:`ConfigDict` or dict): Config of + roi extractor. 
+        grid_head (:obj:`ConfigDict` or dict): Config of grid head
+    """
+
+    def __init__(self, grid_roi_extractor: ConfigType, grid_head: ConfigType,
+                 **kwargs) -> None:
+        assert grid_head is not None
+        super().__init__(**kwargs)
+        if grid_roi_extractor is not None:
+            self.grid_roi_extractor = MODELS.build(grid_roi_extractor)
+            self.share_roi_extractor = False
+        else:
+            self.share_roi_extractor = True
+            self.grid_roi_extractor = self.bbox_roi_extractor
+        self.grid_head = MODELS.build(grid_head)
+
+    def _random_jitter(self,
+                       sampling_results: List[SamplingResult],
+                       batch_img_metas: List[dict],
+                       amplitude: float = 0.15) -> List[SamplingResult]:
+        """Randomly jitter positive proposals for training.
+
+        Args:
+            sampling_results (List[obj:SamplingResult]): Assign results of
+                all images in a batch after sampling.
+            batch_img_metas (list[dict]): List of image information.
+            amplitude (float): Amplitude of random offset. Defaults to 0.15.
+
+        Returns:
+            list[obj:SamplingResult]: SamplingResults after random jittering.
+        """
+        for sampling_result, img_meta in zip(sampling_results,
+                                             batch_img_metas):
+            bboxes = sampling_result.pos_priors
+            random_offsets = bboxes.new_empty(bboxes.shape[0], 4).uniform_(
+                -amplitude, amplitude)
+            # before jittering
+            cxcy = (bboxes[:, 2:4] + bboxes[:, :2]) / 2
+            wh = (bboxes[:, 2:4] - bboxes[:, :2]).abs()
+            # after jittering
+            new_cxcy = cxcy + wh * random_offsets[:, :2]
+            new_wh = wh * (1 + random_offsets[:, 2:])
+            # xywh to xyxy
+            new_x1y1 = (new_cxcy - new_wh / 2)
+            new_x2y2 = (new_cxcy + new_wh / 2)
+            new_bboxes = torch.cat([new_x1y1, new_x2y2], dim=1)
+            # clip bboxes
+            max_shape = img_meta['img_shape']
+            if max_shape is not None:
+                new_bboxes[:, 0::2].clamp_(min=0, max=max_shape[1] - 1)
+                new_bboxes[:, 1::2].clamp_(min=0, max=max_shape[0] - 1)
+
+            sampling_result.pos_priors = new_bboxes
+        return sampling_results
+
+    # TODO: Forward is incorrect and needs to be refactored.
+    def forward(self,
+                x: Tuple[Tensor],
+                rpn_results_list: InstanceList,
+                batch_data_samples: SampleList = None) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (Tuple[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item
+                contains the meta information of each image and
+                corresponding annotations.
+
+        Returns:
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
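+
+        A shape sketch with hypothetical sizes: this forward only feeds the
+        first 100 RoIs to the grid and mask branches.
+
+        Example:
+            >>> import torch
+            >>> rois = torch.rand(250, 5)
+            >>> rois[:100].shape
+            torch.Size([100, 5])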
+ """ + results = () + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + results = results + (bbox_results['cls_score'], ) + if self.bbox_head.with_reg: + results = results + (bbox_results['bbox_pred'], ) + + # grid head + grid_rois = rois[:100] + grid_feats = self.grid_roi_extractor( + x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + self.grid_head.test_mode = True + grid_preds = self.grid_head(grid_feats) + results = results + (grid_preds, ) + + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + results = results + (mask_results['mask_preds'], ) + return results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList, **kwargs) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results, batch_img_metas) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + return losses + + def bbox_loss(self, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_img_metas: Optional[List[dict]] = None) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list[:obj:`SamplingResult`]): Sampling results. + batch_img_metas (list[dict], optional): Meta information of each + image, e.g., image size, scaling factor, etc. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. 
+ """ + assert batch_img_metas is not None + bbox_results = super().bbox_loss(x, sampling_results) + + # Grid head forward and loss + sampling_results = self._random_jitter(sampling_results, + batch_img_metas) + pos_rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + + # GN in head does not support zero shape input + if pos_rois.shape[0] == 0: + return bbox_results + + grid_feats = self.grid_roi_extractor( + x[:self.grid_roi_extractor.num_inputs], pos_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + # Accelerate training + max_sample_num_grid = self.train_cfg.get('max_num_grid', 192) + sample_idx = torch.randperm( + grid_feats.shape[0])[:min(grid_feats.shape[0], max_sample_num_grid + )] + grid_feats = grid_feats[sample_idx] + grid_pred = self.grid_head(grid_feats) + + loss_grid = self.grid_head.loss(grid_pred, sample_idx, + sampling_results, self.train_cfg) + + bbox_results['loss_bbox'].update(loss_grid) + return bbox_results + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (:obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape \ + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last \ + dimension 4 arrange as (x1, y1, x2, y2). + """ + results_list = super().predict_bbox( + x, + batch_img_metas=batch_img_metas, + rpn_results_list=rpn_results_list, + rcnn_test_cfg=rcnn_test_cfg, + rescale=False) + + grid_rois = bbox2roi([res.bboxes for res in results_list]) + if grid_rois.shape[0] != 0: + grid_feats = self.grid_roi_extractor( + x[:len(self.grid_roi_extractor.featmap_strides)], grid_rois) + if self.with_shared_head: + grid_feats = self.shared_head(grid_feats) + self.grid_head.test_mode = True + grid_preds = self.grid_head(grid_feats) + results_list = self.grid_head.predict_by_feat( + grid_preds=grid_preds, + results_list=results_list, + batch_img_metas=batch_img_metas, + rescale=rescale) + + return results_list diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/htc_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/htc_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..0fdd99ddd5ce4d9d42345d1f1d14ecbcae658124 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/htc_roi_head.py @@ -0,0 +1,581 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmdet.models.test_time_augs import merge_aug_masks
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import InstanceList, OptConfigType
+from ..layers import adaptive_avg_pool2d
+from ..task_modules.samplers import SamplingResult
+from ..utils import empty_instances, unpack_gt_instances
+from .cascade_roi_head import CascadeRoIHead
+
+
+@MODELS.register_module()
+class HybridTaskCascadeRoIHead(CascadeRoIHead):
+    """Hybrid task cascade roi head including one bbox head and one mask head.
+
+    https://arxiv.org/abs/1901.07518
+
+    Args:
+        num_stages (int): Number of cascade stages.
+        stage_loss_weights (list[float]): Loss weight for every stage.
+        semantic_roi_extractor (:obj:`ConfigDict` or dict, optional):
+            Config of semantic roi extractor. Defaults to None.
+        semantic_head (:obj:`ConfigDict` or dict, optional):
+            Config of semantic head. Defaults to None.
+        interleaved (bool): Whether to interleave the box branch and mask
+            branch. If True, the mask branch can take the refined bounding
+            box predictions. Defaults to True.
+        mask_info_flow (bool): Whether to turn on the mask information flow,
+            which means feeding the mask features of the preceding stage
+            to the current stage. Defaults to True.
+    """
+
+    def __init__(self,
+                 num_stages: int,
+                 stage_loss_weights: List[float],
+                 semantic_roi_extractor: OptConfigType = None,
+                 semantic_head: OptConfigType = None,
+                 semantic_fusion: Tuple[str] = ('bbox', 'mask'),
+                 interleaved: bool = True,
+                 mask_info_flow: bool = True,
+                 **kwargs) -> None:
+        super().__init__(
+            num_stages=num_stages,
+            stage_loss_weights=stage_loss_weights,
+            **kwargs)
+        assert self.with_bbox
+        assert not self.with_shared_head  # shared head is not supported
+
+        if semantic_head is not None:
+            self.semantic_roi_extractor = MODELS.build(semantic_roi_extractor)
+            self.semantic_head = MODELS.build(semantic_head)
+
+        self.semantic_fusion = semantic_fusion
+        self.interleaved = interleaved
+        self.mask_info_flow = mask_info_flow
+
+    # TODO move to base_roi_head later
+    @property
+    def with_semantic(self) -> bool:
+        """bool: whether the head has semantic head"""
+        return hasattr(self,
+                       'semantic_head') and self.semantic_head is not None
+
+    def _bbox_forward(
+            self,
+            stage: int,
+            x: Tuple[Tensor],
+            rois: Tensor,
+            semantic_feat: Optional[Tensor] = None) -> Dict[str, Tensor]:
+        """Box head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            semantic_feat (Tensor, optional): Semantic feature. Defaults to
+                None.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+            - `cls_score` (Tensor): Classification scores.
+            - `bbox_pred` (Tensor): Box energies / deltas.
+            - `bbox_feats` (Tensor): Extract bbox RoI features.
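+
+            When ``semantic_feat`` is given and 'bbox' is in
+            ``self.semantic_fusion``, the pooled semantic RoI feature is
+            resized with ``adaptive_avg_pool2d`` to match ``bbox_feats``
+            and added element-wise.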
+ """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + if self.with_semantic and 'bbox' in self.semantic_fusion: + bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: + bbox_semantic_feat = adaptive_avg_pool2d( + bbox_semantic_feat, bbox_feats.shape[-2:]) + bbox_feats += bbox_semantic_feat + cls_score, bbox_pred = bbox_head(bbox_feats) + + bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred) + return bbox_results + + def bbox_loss(self, + stage: int, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + semantic_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + semantic_feat (Tensor, optional): Semantic feature. Defaults to + None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward( + stage, x, rois, semantic_feat=semantic_feat) + bbox_results.update(rois=rois) + + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg[stage]) + bbox_results.update(bbox_loss_and_target) + return bbox_results + + def _mask_forward(self, + stage: int, + x: Tuple[Tensor], + rois: Tensor, + semantic_feat: Optional[Tensor] = None, + training: bool = True) -> Dict[str, Tensor]: + """Mask head forward function used only in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + semantic_feat (Tensor, optional): Semantic feature. Defaults to + None. + training (bool): Mask Forward is different between training and + testing. If True, use the mask forward in training. + Defaults to True. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. 
+ """ + mask_roi_extractor = self.mask_roi_extractor[stage] + mask_head = self.mask_head[stage] + mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs], + rois) + + # semantic feature fusion + # element-wise sum for original features and pooled semantic features + if self.with_semantic and 'mask' in self.semantic_fusion: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats = mask_feats + mask_semantic_feat + + # mask information flow + # forward all previous mask heads to obtain last_feat, and fuse it + # with the normal mask feature + if training: + if self.mask_info_flow: + last_feat = None + for i in range(stage): + last_feat = self.mask_head[i]( + mask_feats, last_feat, return_logits=False) + mask_preds = mask_head( + mask_feats, last_feat, return_feat=False) + else: + mask_preds = mask_head(mask_feats, return_feat=False) + + mask_results = dict(mask_preds=mask_preds) + else: + aug_masks = [] + last_feat = None + for i in range(self.num_stages): + mask_head = self.mask_head[i] + if self.mask_info_flow: + mask_preds, last_feat = mask_head(mask_feats, last_feat) + else: + mask_preds = mask_head(mask_feats) + aug_masks.append(mask_preds) + + mask_results = dict(mask_preds=aug_masks) + + return mask_results + + def mask_loss(self, + stage: int, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + semantic_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + semantic_feat (Tensor, optional): Semantic feature. Defaults to + None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. + """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward( + stage=stage, + x=x, + rois=pos_rois, + semantic_feat=semantic_feat, + training=True) + + mask_head = self.mask_head[stage] + mask_loss_and_target = mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[stage]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + # semantic segmentation part + # 2 outputs: segmentation prediction and embedded features + losses = dict() + if self.with_semantic: + gt_semantic_segs = [ + data_sample.gt_sem_seg.sem_seg + for data_sample in batch_data_samples + ] + gt_semantic_segs = torch.stack(gt_semantic_segs) + semantic_pred, semantic_feat = self.semantic_head(x) + loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_segs) + losses['loss_semantic_seg'] = loss_seg + else: + semantic_feat = None + + results_list = rpn_results_list + num_imgs = len(batch_img_metas) + for stage in range(self.num_stages): + self.current_stage = stage + + stage_loss_weight = self.stage_loss_weights[stage] + + # assign gts and sample proposals + sampling_results = [] + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + if 'bboxes' in results: + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss( + stage=stage, + x=x, + sampling_results=sampling_results, + semantic_feat=semantic_feat) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # mask head forward and loss + if self.with_mask: + # interleaved execution: use regressed bboxes by the box branch + # to train the mask branch + if self.interleaved: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results, bbox_results, batch_img_metas) + # re-assign and sample 512 RoIs from 512 RoIs + sampling_results = [] + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + mask_results = self.mask_loss( + stage=stage, + x=x, + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + semantic_feat=semantic_feat) + for name, value in mask_results['loss_mask'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # refine bboxes (same as Cascade R-CNN) + if stage < self.num_stages - 1 and not self.interleaved: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results=sampling_results, + bbox_results=bbox_results, + batch_img_metas=batch_img_metas) + + return losses + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + 
        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Features from upstream network. Each
+                has shape (N, C, H, W).
+            rpn_results_list (list[:obj:`InstanceData`]): list of region
+                proposals.
+            batch_data_samples (List[:obj:`DetDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+            rescale (bool): Whether to rescale the results to
+                the original image. Defaults to False.
+
+        Returns:
+            list[obj:`InstanceData`]: Detection results of each image.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        # TODO: nms_op in mmcv need be enhanced, the bbox result may get
+        # difference when not rescale in bbox_head
+
+        # If it has the mask branch, the bbox branch does not need
+        # to be scaled to the original image scale, because the mask
+        # branch will scale both bbox and mask at the same time.
+        bbox_rescale = rescale if not self.with_mask else False
+        results_list = self.predict_bbox(
+            x=x,
+            semantic_feat=semantic_feat,
+            batch_img_metas=batch_img_metas,
+            rpn_results_list=rpn_results_list,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=bbox_rescale)
+
+        if self.with_mask:
+            results_list = self.predict_mask(
+                x=x,
+                semantic_feat=semantic_feat,
+                batch_img_metas=batch_img_metas,
+                results_list=results_list,
+                rescale=rescale)
+
+        return results_list
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     semantic_feat: Tensor,
+                     batch_img_metas: List[dict],
+                     results_list: InstanceList,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale level.
+            semantic_feat (Tensor): Semantic feature.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+ """ + num_imgs = len(batch_img_metas) + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas=batch_img_metas, + device=mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + num_mask_rois_per_img = [len(res) for res in results_list] + mask_results = self._mask_forward( + stage=-1, + x=x, + rois=mask_rois, + semantic_feat=semantic_heat, + training=False) + # split batch mask prediction back to each image + aug_masks = [[ + mask.sigmoid().detach() + for mask in mask_preds.split(num_mask_rois_per_img, 0) + ] for mask_preds in mask_results['mask_preds']] + + merged_masks = [] + for i in range(num_imgs): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + + results_list = self.mask_head[-1].predict_by_feat( + mask_preds=merged_masks, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale, + activate_map=True) + + return results_list + + def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + results = () + batch_img_metas = [ + data_samples.metainfo for data_samples in batch_data_samples + ] + num_imgs = len(batch_img_metas) + + if self.with_semantic: + _, semantic_feat = self.semantic_head(x) + else: + semantic_feat = None + + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + rois, cls_scores, bbox_preds = self._refine_roi( + x=x, + rois=rois, + semantic_feat=semantic_feat, + batch_img_metas=batch_img_metas, + num_proposals_per_img=num_proposals_per_img) + results = results + (cls_scores, bbox_preds) + # mask head + if self.with_mask: + rois = torch.cat(rois) + mask_results = self._mask_forward( + stage=-1, + x=x, + rois=rois, + semantic_feat=semantic_feat, + training=False) + aug_masks = [[ + mask.sigmoid().detach() + for mask in mask_preds.split(num_proposals_per_img, 0) + ] for mask_preds in mask_results['mask_preds']] + + merged_masks = [] + for i in range(num_imgs): + aug_mask = [mask[i] for mask in aug_masks] + merged_mask = merge_aug_masks(aug_mask, batch_img_metas[i]) + merged_masks.append(merged_mask) + results = results + (merged_masks, ) + return results diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/__init__.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48a5d4227be41b8985403251e1803f78cf500636 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .coarse_mask_head import CoarseMaskHead
+from .dynamic_mask_head import DynamicMaskHead
+from .fcn_mask_head import FCNMaskHead
+from .feature_relay_head import FeatureRelayHead
+from .fused_semantic_head import FusedSemanticHead
+from .global_context_head import GlobalContextHead
+from .grid_head import GridHead
+from .htc_mask_head import HTCMaskHead
+from .mask_point_head import MaskPointHead
+from .maskiou_head import MaskIoUHead
+from .scnet_mask_head import SCNetMaskHead
+from .scnet_semantic_head import SCNetSemanticHead
+
+__all__ = [
+    'FCNMaskHead', 'HTCMaskHead', 'FusedSemanticHead', 'GridHead',
+    'MaskIoUHead', 'CoarseMaskHead', 'MaskPointHead', 'SCNetMaskHead',
+    'SCNetSemanticHead', 'GlobalContextHead', 'FeatureRelayHead',
+    'DynamicMaskHead'
+]
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1caa901228f2439492b82d1890eba468963eb28d
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/coarse_mask_head.py
@@ -0,0 +1,110 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.cnn import ConvModule, Linear
+from mmengine.model import ModuleList
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig
+from .fcn_mask_head import FCNMaskHead
+
+
+@MODELS.register_module()
+class CoarseMaskHead(FCNMaskHead):
+    """Coarse mask head used in PointRend.
+
+    Compared with standard ``FCNMaskHead``, ``CoarseMaskHead`` will downsample
+    the input feature map instead of upsampling it.
+
+    Args:
+        num_convs (int): Number of conv layers in the head. Defaults to 0.
+        num_fcs (int): Number of fc layers in the head. Defaults to 2.
+        fc_out_channels (int): Number of output channels of fc layer.
+            Defaults to 1024.
+        downsample_factor (int): The factor that feature map is downsampled by.
+            Defaults to 2.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
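+
+    Example:
+        >>> # A minimal sketch with the default 256 input channels; the
+        >>> # registry must be initialised so the default loss can be built.
+        >>> import torch
+        >>> self = CoarseMaskHead(num_classes=7)
+        >>> inputs = torch.rand(2, self.in_channels, 14, 14)
+        >>> mask_preds = self.forward(inputs)
+        >>> assert mask_preds.shape == (2, 7, 7, 7)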
+ """ + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 2, + fc_out_channels: int = 1024, + downsample_factor: int = 2, + init_cfg: MultiConfig = dict( + type='Xavier', + override=[ + dict(name='fcs'), + dict(type='Constant', val=0.001, name='fc_logits') + ]), + *arg, + **kwarg) -> None: + super().__init__( + *arg, + num_convs=num_convs, + upsample_cfg=dict(type=None), + init_cfg=None, + **kwarg) + self.init_cfg = init_cfg + self.num_fcs = num_fcs + assert self.num_fcs > 0 + self.fc_out_channels = fc_out_channels + self.downsample_factor = downsample_factor + assert self.downsample_factor >= 1 + # remove conv_logit + delattr(self, 'conv_logits') + + if downsample_factor > 1: + downsample_in_channels = ( + self.conv_out_channels + if self.num_convs > 0 else self.in_channels) + self.downsample_conv = ConvModule( + downsample_in_channels, + self.conv_out_channels, + kernel_size=downsample_factor, + stride=downsample_factor, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + else: + self.downsample_conv = None + + self.output_size = (self.roi_feat_size[0] // downsample_factor, + self.roi_feat_size[1] // downsample_factor) + self.output_area = self.output_size[0] * self.output_size[1] + + last_layer_dim = self.conv_out_channels * self.output_area + + self.fcs = ModuleList() + for i in range(num_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + self.fcs.append(Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + output_channels = self.num_classes * self.output_area + self.fc_logits = Linear(last_layer_dim, output_channels) + + def init_weights(self) -> None: + """Initialize weights.""" + super(FCNMaskHead, self).init_weights() + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tensor): Extract mask RoI features. + + Returns: + Tensor: Predicted foreground masks. + """ + for conv in self.convs: + x = conv(x) + + if self.downsample_conv is not None: + x = self.downsample_conv(x) + + x = x.flatten(1) + for fc in self.fcs: + x = self.relu(fc(x)) + mask_preds = self.fc_logits(x).view( + x.size(0), self.num_classes, *self.output_size) + return mask_preds diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f33612b1b141668d0463435975c14a26fbe5a0cd --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/dynamic_mask_head.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmengine.config import ConfigDict +from torch import Tensor + +from mmdet.models.task_modules import SamplingResult +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, OptConfigType, reduce_mean +from .fcn_mask_head import FCNMaskHead + + +@MODELS.register_module() +class DynamicMaskHead(FCNMaskHead): + r"""Dynamic Mask Head for + `Instances as Queries `_ + + Args: + num_convs (int): Number of convolution layer. + Defaults to 4. + roi_feat_size (int): The output size of RoI extractor, + Defaults to 14. + in_channels (int): Input feature channels. + Defaults to 256. + conv_kernel_size (int): Kernel size of convolution layers. + Defaults to 3. + conv_out_channels (int): Output channels of convolution layers. + Defaults to 256. 
+ num_classes (int): Number of classes. + Defaults to 80 + class_agnostic (int): Whether generate class agnostic prediction. + Defaults to False. + dropout (float): Probability of drop the channel. + Defaults to 0.0 + upsample_cfg (:obj:`ConfigDict` or dict): The config for + upsample layer. + conv_cfg (:obj:`ConfigDict` or dict, optional): The convolution + layer config. + norm_cfg (:obj:`ConfigDict` or dict, optional): The norm layer config. + dynamic_conv_cfg (:obj:`ConfigDict` or dict): The dynamic convolution + layer config. + loss_mask (:obj:`ConfigDict` or dict): The config for mask loss. + """ + + def __init__(self, + num_convs: int = 4, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_kernel_size: int = 3, + conv_out_channels: int = 256, + num_classes: int = 80, + class_agnostic: bool = False, + upsample_cfg: ConfigType = dict( + type='deconv', scale_factor=2), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + dynamic_conv_cfg: ConfigType = dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + input_feat_shape=14, + with_proj=False, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')), + loss_mask: ConfigType = dict( + type='DiceLoss', loss_weight=8.0), + **kwargs) -> None: + super().__init__( + num_convs=num_convs, + roi_feat_size=roi_feat_size, + in_channels=in_channels, + conv_kernel_size=conv_kernel_size, + conv_out_channels=conv_out_channels, + num_classes=num_classes, + class_agnostic=class_agnostic, + upsample_cfg=upsample_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + loss_mask=loss_mask, + **kwargs) + assert class_agnostic is False, \ + 'DynamicMaskHead only support class_agnostic=False' + self.fp16_enabled = False + + self.instance_interactive_conv = MODELS.build(dynamic_conv_cfg) + + def init_weights(self) -> None: + """Use xavier initialization for all weight parameter and set + classification head bias as a specific value when use focal loss.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + nn.init.constant_(self.conv_logits.bias, 0.) + + def forward(self, roi_feat: Tensor, proposal_feat: Tensor) -> Tensor: + """Forward function of DynamicMaskHead. + + Args: + roi_feat (Tensor): Roi-pooling features with shape + (batch_size*num_proposals, feature_dimensions, + pooling_h , pooling_w). + proposal_feat (Tensor): Intermediate feature get from + diihead in last stage, has shape + (batch_size*num_proposals, feature_dimensions) + + Returns: + mask_preds (Tensor): Predicted foreground masks with shape + (batch_size*num_proposals, num_classes, pooling_h*2, pooling_w*2). + """ + + proposal_feat = proposal_feat.reshape(-1, self.in_channels) + proposal_feat_iic = self.instance_interactive_conv( + proposal_feat, roi_feat) + + x = proposal_feat_iic.permute(0, 2, 1).reshape(roi_feat.size()) + + for conv in self.convs: + x = conv(x) + if self.upsample is not None: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_preds = self.conv_logits(x) + return mask_preds + + def loss_and_target(self, mask_preds: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (Tensor): Predicted foreground masks, has shape + (num_pos, num_classes, h, w). + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. 
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN.
+
+        Returns:
+            dict: A dictionary of loss and targets components.
+        """
+        mask_targets = self.get_targets(
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            rcnn_train_cfg=rcnn_train_cfg)
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+
+        num_pos = pos_labels.new_ones(pos_labels.size()).float().sum()
+        avg_factor = torch.clamp(reduce_mean(num_pos), min=1.).item()
+        loss = dict()
+        if mask_preds.size(0) == 0:
+            loss_mask = mask_preds.sum()
+        else:
+            loss_mask = self.loss_mask(
+                mask_preds[torch.arange(num_pos).long(), pos_labels,
+                           ...].sigmoid(),
+                mask_targets,
+                avg_factor=avg_factor)
+        loss['loss_mask'] = loss_mask
+        return dict(loss_mask=loss, mask_targets=mask_targets)
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a089dfafcb69784f2fc266f0945e6d56b0466d3
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/fcn_mask_head.py
@@ -0,0 +1,474 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, build_conv_layer, build_upsample_layer
+from mmcv.ops.carafe import CARAFEPack
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule, ModuleList
+from mmengine.structures import InstanceData
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.task_modules.samplers import SamplingResult
+from mmdet.models.utils import empty_instances
+from mmdet.registry import MODELS
+from mmdet.structures.mask import mask_target
+from mmdet.utils import ConfigType, InstanceList, OptConfigType, OptMultiConfig
+
+BYTES_PER_FLOAT = 4
+# TODO: This memory limit may be too much or too little. It would be better to
+# determine it based on available resources.
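+# For reference: pasting N masks onto an (img_h, img_w) canvas needs
+# N * img_h * img_w * BYTES_PER_FLOAT bytes, e.g. 100 masks on a 1333x800
+# image take ~0.43 GB and therefore fit in a single chunk under this limit.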
+GPU_MEM_LIMIT = 1024**3 # 1 GB memory limit + + +@MODELS.register_module() +class FCNMaskHead(BaseModule): + + def __init__(self, + num_convs: int = 4, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_kernel_size: int = 3, + conv_out_channels: int = 256, + num_classes: int = 80, + class_agnostic: int = False, + upsample_cfg: ConfigType = dict( + type='deconv', scale_factor=2), + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + predictor_cfg: ConfigType = dict(type='Conv'), + loss_mask: ConfigType = dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.upsample_cfg = upsample_cfg.copy() + if self.upsample_cfg['type'] not in [ + None, 'deconv', 'nearest', 'bilinear', 'carafe' + ]: + raise ValueError( + f'Invalid upsample method {self.upsample_cfg["type"]}, ' + 'accepted methods are "deconv", "nearest", "bilinear", ' + '"carafe"') + self.num_convs = num_convs + # WARN: roi_feat_size is reserved and not used + self.roi_feat_size = _pair(roi_feat_size) + self.in_channels = in_channels + self.conv_kernel_size = conv_kernel_size + self.conv_out_channels = conv_out_channels + self.upsample_method = self.upsample_cfg.get('type') + self.scale_factor = self.upsample_cfg.pop('scale_factor', None) + self.num_classes = num_classes + self.class_agnostic = class_agnostic + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.predictor_cfg = predictor_cfg + self.loss_mask = MODELS.build(loss_mask) + + self.convs = ModuleList() + for i in range(self.num_convs): + in_channels = ( + self.in_channels if i == 0 else self.conv_out_channels) + padding = (self.conv_kernel_size - 1) // 2 + self.convs.append( + ConvModule( + in_channels, + self.conv_out_channels, + self.conv_kernel_size, + padding=padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + upsample_in_channels = ( + self.conv_out_channels if self.num_convs > 0 else in_channels) + upsample_cfg_ = self.upsample_cfg.copy() + if self.upsample_method is None: + self.upsample = None + elif self.upsample_method == 'deconv': + upsample_cfg_.update( + in_channels=upsample_in_channels, + out_channels=self.conv_out_channels, + kernel_size=self.scale_factor, + stride=self.scale_factor) + self.upsample = build_upsample_layer(upsample_cfg_) + elif self.upsample_method == 'carafe': + upsample_cfg_.update( + channels=upsample_in_channels, scale_factor=self.scale_factor) + self.upsample = build_upsample_layer(upsample_cfg_) + else: + # suppress warnings + align_corners = (None + if self.upsample_method == 'nearest' else False) + upsample_cfg_.update( + scale_factor=self.scale_factor, + mode=self.upsample_method, + align_corners=align_corners) + self.upsample = build_upsample_layer(upsample_cfg_) + + out_channels = 1 if self.class_agnostic else self.num_classes + logits_in_channel = ( + self.conv_out_channels + if self.upsample_method == 'deconv' else upsample_in_channels) + self.conv_logits = build_conv_layer(self.predictor_cfg, + logits_in_channel, out_channels, 1) + self.relu = nn.ReLU(inplace=True) + self.debug_imgs = None + + def init_weights(self) -> None: + """Initialize the weights.""" + super().init_weights() + for m in [self.upsample, self.conv_logits]: + if m is None: + continue + elif isinstance(m, CARAFEPack): + m.init_weights() + elif hasattr(m, 'weight') and hasattr(m, 'bias'): + nn.init.kaiming_normal_( + 
m.weight, mode='fan_out', nonlinearity='relu') + nn.init.constant_(m.bias, 0) + + def forward(self, x: Tensor) -> Tensor: + """Forward features from the upstream network. + + Args: + x (Tensor): Extract mask RoI features. + + Returns: + Tensor: Predicted foreground masks. + """ + for conv in self.convs: + x = conv(x) + if self.upsample is not None: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_preds = self.conv_logits(x) + return mask_preds + + def get_targets(self, sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> Tensor: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + Tensor: Mask target of each positive proposals in the image. + """ + pos_proposals = [res.pos_priors for res in sampling_results] + pos_assigned_gt_inds = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + gt_masks = [res.masks for res in batch_gt_instances] + mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds, + gt_masks, rcnn_train_cfg) + return mask_targets + + def loss_and_target(self, mask_preds: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss based on the features extracted by the mask head. + + Args: + mask_preds (Tensor): Predicted foreground masks, has shape + (num_pos, num_classes, h, w). + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + rcnn_train_cfg (obj:ConfigDict): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss and targets components. + """ + mask_targets = self.get_targets( + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=rcnn_train_cfg) + + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + + loss = dict() + if mask_preds.size(0) == 0: + loss_mask = mask_preds.sum() + else: + if self.class_agnostic: + loss_mask = self.loss_mask(mask_preds, mask_targets, + torch.zeros_like(pos_labels)) + else: + loss_mask = self.loss_mask(mask_preds, mask_targets, + pos_labels) + loss['loss_mask'] = loss_mask + # TODO: which algorithm requires mask_targets? + return dict(loss_mask=loss, mask_targets=mask_targets) + + def predict_by_feat(self, + mask_preds: Tuple[Tensor], + results_list: List[InstanceData], + batch_img_metas: List[dict], + rcnn_test_cfg: ConfigDict, + rescale: bool = False, + activate_map: bool = False) -> InstanceList: + """Transform a batch of output features extracted from the head into + mask results. + + Args: + mask_preds (tuple[Tensor]): Tuple of predicted foreground masks, + each has shape (n, num_classes, h, w). + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + batch_img_metas (list[dict]): List of image information. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head. + rescale (bool): If True, return boxes in original image space. + Defaults to False. 
+            activate_map (bool): Whether to get results with test-time
+                augmentation applied. If True, the `mask_preds` will not be
+                processed with sigmoid. Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process. Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instance, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert len(mask_preds) == len(results_list) == len(batch_img_metas)
+
+        for img_id in range(len(batch_img_metas)):
+            img_meta = batch_img_metas[img_id]
+            results = results_list[img_id]
+            bboxes = results.bboxes
+            if bboxes.shape[0] == 0:
+                results_list[img_id] = empty_instances(
+                    [img_meta],
+                    bboxes.device,
+                    task_type='mask',
+                    instance_results=[results],
+                    mask_thr_binary=rcnn_test_cfg.mask_thr_binary)[0]
+            else:
+                im_mask = self._predict_by_feat_single(
+                    mask_preds=mask_preds[img_id],
+                    bboxes=bboxes,
+                    labels=results.labels,
+                    img_meta=img_meta,
+                    rcnn_test_cfg=rcnn_test_cfg,
+                    rescale=rescale,
+                    activate_map=activate_map)
+                results.masks = im_mask
+        return results_list
+
+    def _predict_by_feat_single(self,
+                                mask_preds: Tensor,
+                                bboxes: Tensor,
+                                labels: Tensor,
+                                img_meta: dict,
+                                rcnn_test_cfg: ConfigDict,
+                                rescale: bool = False,
+                                activate_map: bool = False) -> Tensor:
+        """Get segmentation masks from mask_preds and bboxes.
+
+        Args:
+            mask_preds (Tensor): Predicted foreground masks, has shape
+                (n, num_classes, h, w).
+            bboxes (Tensor): Predicted bboxes, has shape (n, 4)
+            labels (Tensor): Labels of bboxes, has shape (n, )
+            img_meta (dict): image information.
+            rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of Bbox Head.
+                Defaults to None.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+            activate_map (bool): Whether to get results with test-time
+                augmentation applied. If True, the `mask_preds` will not be
+                processed with sigmoid. Defaults to False.
+
+        Returns:
+            Tensor: Encoded masks, has shape (n, img_w, img_h)
+
+        Example:
+            >>> from mmengine.config import Config
+            >>> from mmdet.models.roi_heads.mask_heads.fcn_mask_head import *  # NOQA
+            >>> N = 7  # N = number of extracted ROIs
+            >>> C, H, W = 11, 32, 32
+            >>> # Create example instance of FCN Mask Head.
+            >>> self = FCNMaskHead(num_classes=C, num_convs=0)
+            >>> inputs = torch.rand(N, self.in_channels, H, W)
+            >>> mask_preds = self.forward(inputs)
+            >>> # Each input is associated with some bounding box
+            >>> bboxes = torch.Tensor([[1, 1, 42, 42]] * N)
+            >>> labels = torch.randint(0, C, size=(N,))
+            >>> rcnn_test_cfg = Config({'mask_thr_binary': 0, })
+            >>> ori_shape = (H * 4, W * 4)
+            >>> scale_factor = (1, 1)
+            >>> rescale = False
+            >>> img_meta = {'scale_factor': scale_factor,
+            ...             'ori_shape': ori_shape}
+            >>> # Encoded masks are a list for each category.
+            >>> encoded_masks = self._predict_by_feat_single(
+            ...     mask_preds, bboxes, labels,
+            ...
img_meta, rcnn_test_cfg, rescale) + >>> assert encoded_masks.size()[0] == N + >>> assert encoded_masks.size()[1:] == ori_shape + """ + scale_factor = bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + img_h, img_w = img_meta['ori_shape'][:2] + device = bboxes.device + + if not activate_map: + mask_preds = mask_preds.sigmoid() + else: + # In AugTest, has been activated before + mask_preds = bboxes.new_tensor(mask_preds) + + if rescale: # in-placed rescale the bboxes + bboxes /= scale_factor + else: + w_scale, h_scale = scale_factor[0, 0], scale_factor[0, 1] + img_h = np.round(img_h * h_scale.item()).astype(np.int32) + img_w = np.round(img_w * w_scale.item()).astype(np.int32) + + N = len(mask_preds) + # The actual implementation split the input into chunks, + # and paste them chunk by chunk. + if device.type == 'cpu': + # CPU is most efficient when they are pasted one by one with + # skip_empty=True, so that it performs minimal number of + # operations. + num_chunks = N + else: + # GPU benefits from parallelism for larger chunks, + # but may have memory issue + # the types of img_w and img_h are np.int32, + # when the image resolution is large, + # the calculation of num_chunks will overflow. + # so we need to change the types of img_w and img_h to int. + # See https://github.com/open-mmlab/mmdetection/pull/5191 + num_chunks = int( + np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / + GPU_MEM_LIMIT)) + assert (num_chunks <= + N), 'Default GPU_MEM_LIMIT is too small; try increasing it' + chunks = torch.chunk(torch.arange(N, device=device), num_chunks) + + threshold = rcnn_test_cfg.mask_thr_binary + im_mask = torch.zeros( + N, + img_h, + img_w, + device=device, + dtype=torch.bool if threshold >= 0 else torch.uint8) + + if not self.class_agnostic: + mask_preds = mask_preds[range(N), labels][:, None] + + for inds in chunks: + masks_chunk, spatial_inds = _do_paste_mask( + mask_preds[inds], + bboxes[inds], + img_h, + img_w, + skip_empty=device.type == 'cpu') + + if threshold >= 0: + masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool) + else: + # for visualization and debugging + masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8) + + im_mask[(inds, ) + spatial_inds] = masks_chunk + return im_mask + + +def _do_paste_mask(masks: Tensor, + boxes: Tensor, + img_h: int, + img_w: int, + skip_empty: bool = True) -> tuple: + """Paste instance masks according to boxes. + + This implementation is modified from + https://github.com/facebookresearch/detectron2/ + + Args: + masks (Tensor): N, 1, H, W + boxes (Tensor): N, 4 + img_h (int): Height of the image to be pasted. + img_w (int): Width of the image to be pasted. + skip_empty (bool): Only paste masks within the region that + tightly bound all boxes, and returns the results this region only. + An important optimization for CPU. + + Returns: + tuple: (Tensor, tuple). The first item is mask tensor, the second one + is the slice object. + + If skip_empty == False, the whole image will be pasted. It will + return a mask of shape (N, img_h, img_w) and an empty tuple. + + If skip_empty == True, only area around the mask will be pasted. + A mask of shape (N, h', w') and its start and end coordinates + in the original image will be returned. + """ + # On GPU, paste all masks together (up to chunk size) + # by using the entire image to sample the masks + # Compared to pasting them one by one, + # this has more operations but is faster on COCO-scale dataset. 
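+    # Each output pixel centre (x + 0.5) is mapped into box-normalised
+    # coordinates for ``F.grid_sample``: (x - x0) / (x1 - x0) * 2 - 1, so
+    # the left edge of a box lands at -1 and the right edge at +1.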
+    device = masks.device
+    if skip_empty:
+        x0_int, y0_int = torch.clamp(
+            boxes.min(dim=0).values.floor()[:2] - 1,
+            min=0).to(dtype=torch.int32)
+        x1_int = torch.clamp(
+            boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+        y1_int = torch.clamp(
+            boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+    else:
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = img_w, img_h
+    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+    N = masks.shape[0]
+
+    img_y = torch.arange(y0_int, y1_int, device=device).to(torch.float32) + 0.5
+    img_x = torch.arange(x0_int, x1_int, device=device).to(torch.float32) + 0.5
+    img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+    img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+    # img_x, img_y have shapes (N, w), (N, h)
+    # IsInf op is not supported with ONNX<=1.7.0
+    if not torch.onnx.is_in_onnx_export():
+        if torch.isinf(img_x).any():
+            inds = torch.where(torch.isinf(img_x))
+            img_x[inds] = 0
+        if torch.isinf(img_y).any():
+            inds = torch.where(torch.isinf(img_y))
+            img_y[inds] = 0
+
+    gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+    gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+    grid = torch.stack([gx, gy], dim=3)
+
+    img_masks = F.grid_sample(
+        masks.to(dtype=torch.float32), grid, align_corners=False)
+
+    if skip_empty:
+        return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+    else:
+        return img_masks[:, 0], ()
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/feature_relay_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/feature_relay_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c34561fa5fd749329eda164465ce9787278d357
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/feature_relay_head.py
@@ -0,0 +1,68 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig
+
+
+@MODELS.register_module()
+class FeatureRelayHead(BaseModule):
+    """Feature Relay Head used in `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+    Args:
+        in_channels (int): number of input channels. Defaults to 1024.
+        out_conv_channels (int): number of output channels before
+            classification layer. Defaults to 256.
+        roi_feat_size (int): roi feat size at box head. Defaults to 7.
+        scale_factor (int): scale factor to match roi feat size
+            at mask head. Defaults to 2.
+        init_cfg (:obj:`ConfigDict` or dict or list[dict] or
+            list[:obj:`ConfigDict`]): Initialization config dict. Defaults to
+            dict(type='Kaiming', layer='Linear').
+    """
+
+    def __init__(
+        self,
+        in_channels: int = 1024,
+        out_conv_channels: int = 256,
+        roi_feat_size: int = 7,
+        scale_factor: int = 2,
+        init_cfg: MultiConfig = dict(type='Kaiming', layer='Linear')
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        assert isinstance(roi_feat_size, int)
+
+        self.in_channels = in_channels
+        self.out_conv_channels = out_conv_channels
+        self.roi_feat_size = roi_feat_size
+        self.out_channels = (roi_feat_size**2) * out_conv_channels
+        self.scale_factor = scale_factor
+        self.fp16_enabled = False
+
+        self.fc = nn.Linear(self.in_channels, self.out_channels)
+        self.upsample = nn.Upsample(
+            scale_factor=scale_factor, mode='bilinear', align_corners=True)
+
+    def forward(self, x: Tensor) -> Optional[Tensor]:
+        """Forward function.
+
+        Args:
+            x (Tensor): Input feature.
+
+        Returns:
+            Optional[Tensor]: Output feature.
                When the first dim of input is 0, None is returned.
+        """
+        N, _ = x.shape
+        if N > 0:
+            out_C = self.out_conv_channels
+            out_HW = self.roi_feat_size
+            x = self.fc(x)
+            x = x.reshape(N, out_C, out_HW, out_HW)
+            x = self.upsample(x)
+            return x
+        return None
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d20beb2975a563f03e7b6b2afcef287cb41af05a
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/fused_semantic_head.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Tuple
+
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.config import ConfigDict
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.utils import MultiConfig, OptConfigType
+
+
+@MODELS.register_module()
+class FusedSemanticHead(BaseModule):
+    r"""Multi-level fused semantic segmentation head.
+
+    .. code-block:: none
+
+        in_1 -> 1x1 conv ---
+                            |
+        in_2 -> 1x1 conv -- |
+                           ||
+        in_3 -> 1x1 conv - ||
+                          |||                  /-> 1x1 conv (mask prediction)
+        in_4 -> 1x1 conv -----> 3x3 convs (*4)
+                            |                  \-> 1x1 conv (feature)
+        in_5 -> 1x1 conv ---
+    """  # noqa: W605
+
+    def __init__(
+        self,
+        num_ins: int,
+        fusion_level: int,
+        seg_scale_factor=1 / 8,
+        num_convs: int = 4,
+        in_channels: int = 256,
+        conv_out_channels: int = 256,
+        num_classes: int = 183,
+        conv_cfg: OptConfigType = None,
+        norm_cfg: OptConfigType = None,
+        ignore_label: int = None,
+        loss_weight: float = None,
+        loss_seg: ConfigDict = dict(
+            type='CrossEntropyLoss', ignore_index=255, loss_weight=0.2),
+        init_cfg: MultiConfig = dict(
+            type='Kaiming', override=dict(name='conv_logits'))
+    ) -> None:
+        super().__init__(init_cfg=init_cfg)
+        self.num_ins = num_ins
+        self.fusion_level = fusion_level
+        self.seg_scale_factor = seg_scale_factor
+        self.num_convs = num_convs
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.num_classes = num_classes
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        self.fp16_enabled = False
+
+        self.lateral_convs = nn.ModuleList()
+        for i in range(self.num_ins):
+            self.lateral_convs.append(
+                ConvModule(
+                    self.in_channels,
+                    self.in_channels,
+                    1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg,
+                    inplace=False))
+
+        self.convs = nn.ModuleList()
+        for i in range(self.num_convs):
+            in_channels = self.in_channels if i == 0 else conv_out_channels
+            self.convs.append(
+                ConvModule(
+                    in_channels,
+                    conv_out_channels,
+                    3,
+                    padding=1,
+                    conv_cfg=self.conv_cfg,
+                    norm_cfg=self.norm_cfg))
+        self.conv_embedding = ConvModule(
+            conv_out_channels,
+            conv_out_channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg)
+        self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1)
+        if ignore_label:
+            loss_seg['ignore_index'] = ignore_label
+        if loss_weight:
+            loss_seg['loss_weight'] = loss_weight
+        if ignore_label or loss_weight:
+            warnings.warn('``ignore_label`` and ``loss_weight`` would be '
+                          'deprecated soon. Please set ``ignore_index`` and '
+                          '``loss_weight`` in ``loss_seg`` instead.')
+        self.criterion = MODELS.build(loss_seg)
+
+    def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]:
+        """Forward function.
+
+        Args:
+            feats (tuple[Tensor]): Multi scale feature maps.
+ + Returns: + tuple[Tensor]: + + - mask_preds (Tensor): Predicted mask logits. + - x (Tensor): Fused feature. + """ + x = self.lateral_convs[self.fusion_level](feats[self.fusion_level]) + fused_size = tuple(x.shape[-2:]) + for i, feat in enumerate(feats): + if i != self.fusion_level: + feat = F.interpolate( + feat, size=fused_size, mode='bilinear', align_corners=True) + # fix runtime error of "+=" inplace operation in PyTorch 1.10 + x = x + self.lateral_convs[i](feat) + + for i in range(self.num_convs): + x = self.convs[i](x) + + mask_preds = self.conv_logits(x) + x = self.conv_embedding(x) + return mask_preds, x + + def loss(self, mask_preds: Tensor, labels: Tensor) -> Tensor: + """Loss function. + + Args: + mask_preds (Tensor): Predicted mask logits. + labels (Tensor): Ground truth. + + Returns: + Tensor: Semantic segmentation loss. + """ + labels = F.interpolate( + labels.float(), scale_factor=self.seg_scale_factor, mode='nearest') + labels = labels.squeeze(1).long() + loss_semantic_seg = self.criterion(mask_preds, labels) + return loss_semantic_seg diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/global_context_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/global_context_head.py new file mode 100644 index 0000000000000000000000000000000000000000..cb947ea582227d2b74112cbb930e1a3f85b77ff5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/global_context_head.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.models.layers import ResLayer, SimplifiedBasicBlock +from mmdet.registry import MODELS +from mmdet.utils import MultiConfig, OptConfigType + + +@MODELS.register_module() +class GlobalContextHead(BaseModule): + """Global context head used in `SCNet `_. + + Args: + num_convs (int, optional): number of convolutional layer in GlbCtxHead. + Defaults to 4. + in_channels (int, optional): number of input channels. Defaults to 256. + conv_out_channels (int, optional): number of output channels before + classification layer. Defaults to 256. + num_classes (int, optional): number of classes. Defaults to 80. + loss_weight (float, optional): global context loss weight. + Defaults to 1. + conv_cfg (dict, optional): config to init conv layer. Defaults to None. + norm_cfg (dict, optional): config to init norm layer. Defaults to None. + conv_to_res (bool, optional): if True, 2 convs will be grouped into + 1 `SimplifiedBasicBlock` using a skip connection. + Defaults to False. + init_cfg (:obj:`ConfigDict` or dict or list[dict] or + list[:obj:`ConfigDict`]): Initialization config dict. Defaults to + dict(type='Normal', std=0.01, override=dict(name='fc')). 
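# --- Editor's note (not part of the diff): the fusion pattern of
# FusedSemanticHead.forward above, reduced to its core: resize every level to
# the fusion level and sum. Toy shapes; the real head also applies 1x1
# lateral convs before summing.
import torch
import torch.nn.functional as F

feats = [torch.randn(1, 256, s, s) for s in (64, 32, 16, 8, 4)]  # toy FPN levels
fusion_level = 1
fused_size = tuple(feats[fusion_level].shape[-2:])
x = feats[fusion_level]
for i, feat in enumerate(feats):
    if i != fusion_level:
        x = x + F.interpolate(
            feat, size=fused_size, mode='bilinear', align_corners=True)
# x: (1, 256, 32, 32) fused map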
+ """ + + def __init__( + self, + num_convs: int = 4, + in_channels: int = 256, + conv_out_channels: int = 256, + num_classes: int = 80, + loss_weight: float = 1.0, + conv_cfg: OptConfigType = None, + norm_cfg: OptConfigType = None, + conv_to_res: bool = False, + init_cfg: MultiConfig = dict( + type='Normal', std=0.01, override=dict(name='fc')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_convs = num_convs + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.num_classes = num_classes + self.loss_weight = loss_weight + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.conv_to_res = conv_to_res + self.fp16_enabled = False + + if self.conv_to_res: + num_res_blocks = num_convs // 2 + self.convs = ResLayer( + SimplifiedBasicBlock, + in_channels, + self.conv_out_channels, + num_res_blocks, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + self.num_convs = num_res_blocks + else: + self.convs = nn.ModuleList() + for i in range(self.num_convs): + in_channels = self.in_channels if i == 0 else conv_out_channels + self.convs.append( + ConvModule( + in_channels, + conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Linear(conv_out_channels, num_classes) + + self.criterion = nn.BCEWithLogitsLoss() + + def forward(self, feats: Tuple[Tensor]) -> Tuple[Tensor]: + """Forward function. + + Args: + feats (Tuple[Tensor]): Multi-scale feature maps. + + Returns: + Tuple[Tensor]: + + - mc_pred (Tensor): Multi-class prediction. + - x (Tensor): Global context feature. + """ + x = feats[-1] + for i in range(self.num_convs): + x = self.convs[i](x) + x = self.pool(x) + + # multi-class prediction + mc_pred = x.reshape(x.size(0), -1) + mc_pred = self.fc(mc_pred) + + return mc_pred, x + + def loss(self, pred: Tensor, labels: List[Tensor]) -> Tensor: + """Loss function. + + Args: + pred (Tensor): Logits. + labels (list[Tensor]): Grouth truths. + + Returns: + Tensor: Loss. + """ + labels = [lbl.unique() for lbl in labels] + targets = pred.new_zeros(pred.size()) + for i, label in enumerate(labels): + targets[i, label] = 1.0 + loss = self.loss_weight * self.criterion(pred, targets) + return loss diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/grid_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/grid_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d9514ae7bcfc1b7d5613fa0107e9bd087e13dd46 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/grid_head.py @@ -0,0 +1,490 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType + + +@MODELS.register_module() +class GridHead(BaseModule): + """Implementation of `Grid Head `_ + + Args: + grid_points (int): The number of grid points. Defaults to 9. + num_convs (int): The number of convolution layers. Defaults to 8. + roi_feat_size (int): RoI feature size. Default to 14. + in_channels (int): The channel number of inputs features. + Defaults to 256. 
+ conv_kernel_size (int): The kernel size of convolution layers. + Defaults to 3. + point_feat_channels (int): The number of channels of each point + features. Defaults to 64. + class_agnostic (bool): Whether use class agnostic classification. + If so, the output channels of logits will be 1. Defaults to False. + loss_grid (:obj:`ConfigDict` or dict): Config of grid loss. + conv_cfg (:obj:`ConfigDict` or dict, optional): dictionary to + construct and config conv layer. + norm_cfg (:obj:`ConfigDict` or dict): dictionary to construct and + config norm layer. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict]): Initialization config dict. + """ + + def __init__( + self, + grid_points: int = 9, + num_convs: int = 8, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_kernel_size: int = 3, + point_feat_channels: int = 64, + deconv_kernel_size: int = 4, + class_agnostic: bool = False, + loss_grid: ConfigType = dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=15), + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict(type='GN', num_groups=36), + init_cfg: MultiConfig = [ + dict(type='Kaiming', layer=['Conv2d', 'Linear']), + dict( + type='Normal', + layer='ConvTranspose2d', + std=0.001, + override=dict( + type='Normal', + name='deconv2', + std=0.001, + bias=-np.log(0.99 / 0.01))) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.grid_points = grid_points + self.num_convs = num_convs + self.roi_feat_size = roi_feat_size + self.in_channels = in_channels + self.conv_kernel_size = conv_kernel_size + self.point_feat_channels = point_feat_channels + self.conv_out_channels = self.point_feat_channels * self.grid_points + self.class_agnostic = class_agnostic + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + if isinstance(norm_cfg, dict) and norm_cfg['type'] == 'GN': + assert self.conv_out_channels % norm_cfg['num_groups'] == 0 + + assert self.grid_points >= 4 + self.grid_size = int(np.sqrt(self.grid_points)) + if self.grid_size * self.grid_size != self.grid_points: + raise ValueError('grid_points must be a square number') + + # the predicted heatmap is half of whole_map_size + if not isinstance(self.roi_feat_size, int): + raise ValueError('Only square RoIs are supported in Grid R-CNN') + self.whole_map_size = self.roi_feat_size * 4 + + # compute point-wise sub-regions + self.sub_regions = self.calc_sub_regions() + + self.convs = [] + for i in range(self.num_convs): + in_channels = ( + self.in_channels if i == 0 else self.conv_out_channels) + stride = 2 if i == 0 else 1 + padding = (self.conv_kernel_size - 1) // 2 + self.convs.append( + ConvModule( + in_channels, + self.conv_out_channels, + self.conv_kernel_size, + stride=stride, + padding=padding, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=True)) + self.convs = nn.Sequential(*self.convs) + + self.deconv1 = nn.ConvTranspose2d( + self.conv_out_channels, + self.conv_out_channels, + kernel_size=deconv_kernel_size, + stride=2, + padding=(deconv_kernel_size - 2) // 2, + groups=grid_points) + self.norm1 = nn.GroupNorm(grid_points, self.conv_out_channels) + self.deconv2 = nn.ConvTranspose2d( + self.conv_out_channels, + grid_points, + kernel_size=deconv_kernel_size, + stride=2, + padding=(deconv_kernel_size - 2) // 2, + groups=grid_points) + + # find the 4-neighbor of each grid point + self.neighbor_points = [] + grid_size = self.grid_size + for i in range(grid_size): # i-th column + for j in range(grid_size): # j-th row + neighbors = [] + if i > 0: # left: (i - 1, j) +
neighbors.append((i - 1) * grid_size + j) + if j > 0: # up: (i, j - 1) + neighbors.append(i * grid_size + j - 1) + if j < grid_size - 1: # down: (i, j + 1) + neighbors.append(i * grid_size + j + 1) + if i < grid_size - 1: # right: (i + 1, j) + neighbors.append((i + 1) * grid_size + j) + self.neighbor_points.append(tuple(neighbors)) + # total edges in the grid + self.num_edges = sum([len(p) for p in self.neighbor_points]) + + self.forder_trans = nn.ModuleList() # first-order feature transition + self.sorder_trans = nn.ModuleList() # second-order feature transition + for neighbors in self.neighbor_points: + fo_trans = nn.ModuleList() + so_trans = nn.ModuleList() + for _ in range(len(neighbors)): + # each transition module consists of a 5x5 depth-wise conv and + # 1x1 conv. + fo_trans.append( + nn.Sequential( + nn.Conv2d( + self.point_feat_channels, + self.point_feat_channels, + 5, + stride=1, + padding=2, + groups=self.point_feat_channels), + nn.Conv2d(self.point_feat_channels, + self.point_feat_channels, 1))) + so_trans.append( + nn.Sequential( + nn.Conv2d( + self.point_feat_channels, + self.point_feat_channels, + 5, + 1, + 2, + groups=self.point_feat_channels), + nn.Conv2d(self.point_feat_channels, + self.point_feat_channels, 1))) + self.forder_trans.append(fo_trans) + self.sorder_trans.append(so_trans) + + self.loss_grid = MODELS.build(loss_grid) + + def forward(self, x: Tensor) -> Dict[str, Tensor]: + """forward function of ``GridHead``. + + Args: + x (Tensor): RoI features, has shape + (num_rois, num_channels, roi_feat_size, roi_feat_size). + + Returns: + Dict[str, Tensor]: Return a dict including fused and unfused + heatmap. + """ + assert x.shape[-1] == x.shape[-2] == self.roi_feat_size + # RoI feature transformation, downsample 2x + x = self.convs(x) + + c = self.point_feat_channels + # first-order fusion + x_fo = [None for _ in range(self.grid_points)] + for i, points in enumerate(self.neighbor_points): + x_fo[i] = x[:, i * c:(i + 1) * c] + for j, point_idx in enumerate(points): + x_fo[i] = x_fo[i] + self.forder_trans[i][j]( + x[:, point_idx * c:(point_idx + 1) * c]) + + # second-order fusion + x_so = [None for _ in range(self.grid_points)] + for i, points in enumerate(self.neighbor_points): + x_so[i] = x[:, i * c:(i + 1) * c] + for j, point_idx in enumerate(points): + x_so[i] = x_so[i] + self.sorder_trans[i][j](x_fo[point_idx]) + + # predicted heatmap with fused features + x2 = torch.cat(x_so, dim=1) + x2 = self.deconv1(x2) + x2 = F.relu(self.norm1(x2), inplace=True) + heatmap = self.deconv2(x2) + + # predicted heatmap with original features (applicable during training) + if self.training: + x1 = x + x1 = self.deconv1(x1) + x1 = F.relu(self.norm1(x1), inplace=True) + heatmap_unfused = self.deconv2(x1) + else: + heatmap_unfused = heatmap + + return dict(fused=heatmap, unfused=heatmap_unfused) + + def calc_sub_regions(self) -> List[Tuple[float]]: + """Compute point specific representation regions. + + See `Grid R-CNN Plus `_ for details. 
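# --- Editor's note (not part of the diff): the column-major 4-neighbourhood
# that drives the first/second-order fusion above, evaluated standalone for
# the default 3x3 grid.
grid_size = 3
neighbor_points = []
for i in range(grid_size):        # i-th column
    for j in range(grid_size):    # j-th row
        nbrs = []
        if i > 0:
            nbrs.append((i - 1) * grid_size + j)   # left
        if j > 0:
            nbrs.append(i * grid_size + j - 1)     # up
        if j < grid_size - 1:
            nbrs.append(i * grid_size + j + 1)     # down
        if i < grid_size - 1:
            nbrs.append((i + 1) * grid_size + j)   # right
        neighbor_points.append(tuple(nbrs))
assert neighbor_points[4] == (1, 3, 5, 7)  # the centre point sees all four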
+ """ + # to make it consistent with the original implementation, half_size + # is computed as 2 * quarter_size, which is smaller + half_size = self.whole_map_size // 4 * 2 + sub_regions = [] + for i in range(self.grid_points): + x_idx = i // self.grid_size + y_idx = i % self.grid_size + if x_idx == 0: + sub_x1 = 0 + elif x_idx == self.grid_size - 1: + sub_x1 = half_size + else: + ratio = x_idx / (self.grid_size - 1) - 0.25 + sub_x1 = max(int(ratio * self.whole_map_size), 0) + + if y_idx == 0: + sub_y1 = 0 + elif y_idx == self.grid_size - 1: + sub_y1 = half_size + else: + ratio = y_idx / (self.grid_size - 1) - 0.25 + sub_y1 = max(int(ratio * self.whole_map_size), 0) + sub_regions.append( + (sub_x1, sub_y1, sub_x1 + half_size, sub_y1 + half_size)) + return sub_regions + + def get_targets(self, sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict) -> Tensor: + """Calculate the ground truth for all samples in a batch according to + the sampling_results.". + + Args: + sampling_results (List[:obj:`SamplingResult`]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (:obj:`ConfigDict`): `train_cfg` of RCNN. + + Returns: + Tensor: Grid heatmap targets. + """ + # mix all samples (across images) together. + pos_bboxes = torch.cat([res.pos_bboxes for res in sampling_results], + dim=0).cpu() + pos_gt_bboxes = torch.cat( + [res.pos_gt_bboxes for res in sampling_results], dim=0).cpu() + assert pos_bboxes.shape == pos_gt_bboxes.shape + + # expand pos_bboxes to 2x of original size + x1 = pos_bboxes[:, 0] - (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2 + y1 = pos_bboxes[:, 1] - (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2 + x2 = pos_bboxes[:, 2] + (pos_bboxes[:, 2] - pos_bboxes[:, 0]) / 2 + y2 = pos_bboxes[:, 3] + (pos_bboxes[:, 3] - pos_bboxes[:, 1]) / 2 + pos_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + pos_bbox_ws = (pos_bboxes[:, 2] - pos_bboxes[:, 0]).unsqueeze(-1) + pos_bbox_hs = (pos_bboxes[:, 3] - pos_bboxes[:, 1]).unsqueeze(-1) + + num_rois = pos_bboxes.shape[0] + map_size = self.whole_map_size + # this is not the final target shape + targets = torch.zeros((num_rois, self.grid_points, map_size, map_size), + dtype=torch.float) + + # pre-compute interpolation factors for all grid points. + # the first item is the factor of x-dim, and the second is y-dim. 
+ # for a 9-point grid, factors are like (1, 0), (0.5, 0.5), (0, 1) + factors = [] + for j in range(self.grid_points): + x_idx = j // self.grid_size + y_idx = j % self.grid_size + factors.append((1 - x_idx / (self.grid_size - 1), + 1 - y_idx / (self.grid_size - 1))) + + radius = rcnn_train_cfg.pos_radius + radius2 = radius**2 + for i in range(num_rois): + # ignore small bboxes + if (pos_bbox_ws[i] <= self.grid_size + or pos_bbox_hs[i] <= self.grid_size): + continue + # for each grid point, mark a small circle as positive + for j in range(self.grid_points): + factor_x, factor_y = factors[j] + gridpoint_x = factor_x * pos_gt_bboxes[i, 0] + ( + 1 - factor_x) * pos_gt_bboxes[i, 2] + gridpoint_y = factor_y * pos_gt_bboxes[i, 1] + ( + 1 - factor_y) * pos_gt_bboxes[i, 3] + + cx = int((gridpoint_x - pos_bboxes[i, 0]) / pos_bbox_ws[i] * + map_size) + cy = int((gridpoint_y - pos_bboxes[i, 1]) / pos_bbox_hs[i] * + map_size) + + for x in range(cx - radius, cx + radius + 1): + for y in range(cy - radius, cy + radius + 1): + if x >= 0 and x < map_size and y >= 0 and y < map_size: + if (x - cx)**2 + (y - cy)**2 <= radius2: + targets[i, j, y, x] = 1 + # reduce the target heatmap size by a half + # proposed in Grid R-CNN Plus (https://arxiv.org/abs/1906.05688). + sub_targets = [] + for i in range(self.grid_points): + sub_x1, sub_y1, sub_x2, sub_y2 = self.sub_regions[i] + sub_targets.append(targets[:, [i], sub_y1:sub_y2, sub_x1:sub_x2]) + sub_targets = torch.cat(sub_targets, dim=1) + sub_targets = sub_targets.to(sampling_results[0].pos_bboxes.device) + return sub_targets + + def loss(self, grid_pred: Tensor, sample_idx: Tensor, + sampling_results: List[SamplingResult], + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss based on the features extracted by the grid head. + + Args: + grid_pred (dict[str, Tensor]): Outputs of grid_head forward. + sample_idx (Tensor): The sampling index of ``grid_pred``. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN. + + Returns: + dict: A dictionary of loss and targets components. + """ + grid_targets = self.get_targets(sampling_results, rcnn_train_cfg) + grid_targets = grid_targets[sample_idx] + + loss_fused = self.loss_grid(grid_pred['fused'], grid_targets) + loss_unfused = self.loss_grid(grid_pred['unfused'], grid_targets) + loss_grid = loss_fused + loss_unfused + return dict(loss_grid=loss_grid) + + def predict_by_feat(self, + grid_preds: Dict[str, Tensor], + results_list: List[InstanceData], + batch_img_metas: List[dict], + rescale: bool = False) -> InstanceList: + """Adjust the predicted bboxes from bbox head. + + Args: + grid_preds (dict[str, Tensor]): dictionary outputted by forward + function. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + batch_img_metas (list[dict]): List of image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape \ + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), the last \ + dimension 4 arrange as (x1, y1, x2, y2). 
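# --- Editor's note (not part of the diff): in get_targets above, each
# ground-truth grid point is stamped onto the target heatmap as a small disc
# of radius rcnn_train_cfg.pos_radius; sketched here with toy values.
import torch

map_size, radius = 56, 2
target = torch.zeros(map_size, map_size)
cx, cy = 20, 31                       # grid point in heatmap coordinates
for x in range(cx - radius, cx + radius + 1):
    for y in range(cy - radius, cy + radius + 1):
        if (0 <= x < map_size and 0 <= y < map_size
                and (x - cx) ** 2 + (y - cy) ** 2 <= radius ** 2):
            target[y, x] = 1
assert int(target.sum()) == 13        # a discrete disc for radius 2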
+ """ + num_roi_per_img = tuple(res.bboxes.size(0) for res in results_list) + grid_preds = { + k: v.split(num_roi_per_img, 0) + for k, v in grid_preds.items() + } + + for i, results in enumerate(results_list): + if len(results) != 0: + bboxes = self._predict_by_feat_single( + grid_pred=grid_preds['fused'][i], + bboxes=results.bboxes, + img_meta=batch_img_metas[i], + rescale=rescale) + results.bboxes = bboxes + return results_list + + def _predict_by_feat_single(self, + grid_pred: Tensor, + bboxes: Tensor, + img_meta: dict, + rescale: bool = False) -> Tensor: + """Adjust ``bboxes`` according to ``grid_pred``. + + Args: + grid_pred (Tensor): Grid fused heatmap. + bboxes (Tensor): Predicted bboxes, has shape (n, 4) + img_meta (dict): image information. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + Tensor: adjusted bboxes. + """ + assert bboxes.size(0) == grid_pred.size(0) + grid_pred = grid_pred.sigmoid() + + R, c, h, w = grid_pred.shape + half_size = self.whole_map_size // 4 * 2 + assert h == w == half_size + assert c == self.grid_points + + # find the point with max scores in the half-sized heatmap + grid_pred = grid_pred.view(R * c, h * w) + pred_scores, pred_position = grid_pred.max(dim=1) + xs = pred_position % w + ys = pred_position // w + + # get the position in the whole heatmap instead of half-sized heatmap + for i in range(self.grid_points): + xs[i::self.grid_points] += self.sub_regions[i][0] + ys[i::self.grid_points] += self.sub_regions[i][1] + + # reshape to (num_rois, grid_points) + pred_scores, xs, ys = tuple( + map(lambda x: x.view(R, c), [pred_scores, xs, ys])) + + # get expanded pos_bboxes + widths = (bboxes[:, 2] - bboxes[:, 0]).unsqueeze(-1) + heights = (bboxes[:, 3] - bboxes[:, 1]).unsqueeze(-1) + x1 = (bboxes[:, 0, None] - widths / 2) + y1 = (bboxes[:, 1, None] - heights / 2) + # map the grid point to the absolute coordinates + abs_xs = (xs.float() + 0.5) / w * widths + x1 + abs_ys = (ys.float() + 0.5) / h * heights + y1 + + # get the grid points indices that fall on the bbox boundaries + x1_inds = [i for i in range(self.grid_size)] + y1_inds = [i * self.grid_size for i in range(self.grid_size)] + x2_inds = [ + self.grid_points - self.grid_size + i + for i in range(self.grid_size) + ] + y2_inds = [(i + 1) * self.grid_size - 1 for i in range(self.grid_size)] + + # voting of all grid points on some boundary + bboxes_x1 = (abs_xs[:, x1_inds] * pred_scores[:, x1_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, x1_inds].sum(dim=1, keepdim=True)) + bboxes_y1 = (abs_ys[:, y1_inds] * pred_scores[:, y1_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, y1_inds].sum(dim=1, keepdim=True)) + bboxes_x2 = (abs_xs[:, x2_inds] * pred_scores[:, x2_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, x2_inds].sum(dim=1, keepdim=True)) + bboxes_y2 = (abs_ys[:, y2_inds] * pred_scores[:, y2_inds]).sum( + dim=1, keepdim=True) / ( + pred_scores[:, y2_inds].sum(dim=1, keepdim=True)) + + bboxes = torch.cat([bboxes_x1, bboxes_y1, bboxes_x2, bboxes_y2], dim=1) + bboxes[:, [0, 2]].clamp_(min=0, max=img_meta['img_shape'][1]) + bboxes[:, [1, 3]].clamp_(min=0, max=img_meta['img_shape'][0]) + + if rescale: + assert img_meta.get('scale_factor') is not None + bboxes /= bboxes.new_tensor(img_meta['scale_factor']).repeat( + (1, 2)) + + return bboxes diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/htc_mask_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/htc_mask_head.py new file mode 100644 
index 0000000000000000000000000000000000000000..73ac1e6e5f115927e1a2accdd693aae512cac753 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/htc_mask_head.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +from mmcv.cnn import ConvModule +from torch import Tensor + +from mmdet.registry import MODELS +from .fcn_mask_head import FCNMaskHead + + +@MODELS.register_module() +class HTCMaskHead(FCNMaskHead): + """Mask head for HTC. + + Args: + with_conv_res (bool): Whether add conv layer for ``res_feat``. + Defaults to True. + """ + + def __init__(self, with_conv_res: bool = True, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.with_conv_res = with_conv_res + if self.with_conv_res: + self.conv_res = ConvModule( + self.conv_out_channels, + self.conv_out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + + def forward(self, + x: Tensor, + res_feat: Optional[Tensor] = None, + return_logits: bool = True, + return_feat: bool = True) -> Union[Tensor, List[Tensor]]: + """ + Args: + x (Tensor): Feature map. + res_feat (Tensor, optional): Feature for residual connection. + Defaults to None. + return_logits (bool): Whether return mask logits. Defaults to True. + return_feat (bool): Whether return feature map. Defaults to True. + + Returns: + Union[Tensor, List[Tensor]]: The return result is one of three + results: res_feat, logits, or [logits, res_feat]. + """ + assert not (not return_logits and not return_feat) + if res_feat is not None: + assert self.with_conv_res + res_feat = self.conv_res(res_feat) + x = x + res_feat + for conv in self.convs: + x = conv(x) + res_feat = x + outs = [] + if return_logits: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_preds = self.conv_logits(x) + outs.append(mask_preds) + if return_feat: + outs.append(res_feat) + return outs if len(outs) > 1 else outs[0] diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/mask_point_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/mask_point_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2084f59f07b48bf2e5b05bb7af61172df8737478 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/mask_point_head.py @@ -0,0 +1,284 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa + +from typing import List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.models.utils import (get_uncertain_point_coords_with_randomness, + get_uncertainty) +from mmdet.registry import MODELS +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList, MultiConfig, OptConfigType + + +@MODELS.register_module() +class MaskPointHead(BaseModule): + """A mask point head used in PointRend. + + ``MaskPointHead`` use shared multi-layer perceptron (equivalent to + nn.Conv1d) to predict the logit of input points. The fine-grained feature + and coarse feature will be concatenated together for prediction. + + Args: + num_fcs (int): Number of fc layers in the head. Defaults to 3.
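# --- Editor's note (not part of the diff): "shared MLP as nn.Conv1d" in the
# MaskPointHead docstring above means the same weights are applied
# independently at every sampled point; a quick sketch of one such layer.
import torch
import torch.nn as nn

num_rois, in_ch, num_classes, num_points = 4, 256, 80, 196
fine = torch.randn(num_rois, in_ch, num_points)          # fine-grained feats
coarse = torch.randn(num_rois, num_classes, num_points)  # coarse mask logits
fc = nn.Conv1d(in_ch + num_classes, 256, kernel_size=1)
x = fc(torch.cat([fine, coarse], dim=1))                 # (4, 256, 196), per point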
+ in_channels (int): Number of input channels. Defaults to 256. + fc_channels (int): Number of fc channels. Defaults to 256. + num_classes (int): Number of classes for logits. Defaults to 80. + class_agnostic (bool): Whether use class agnostic classification. + If so, the output channels of logits will be 1. Defaults to False. + coarse_pred_each_layer (bool): Whether concatenate coarse feature with + the output of each fc layer. Defaults to True. + conv_cfg (:obj:`ConfigDict` or dict): Dictionary to construct + and config conv layer. Defaults to dict(type='Conv1d'). + norm_cfg (:obj:`ConfigDict` or dict, optional): Dictionary to construct + and config norm layer. Defaults to None. + loss_point (:obj:`ConfigDict` or dict): Dictionary to construct and + config loss layer of point head. Defaults to + dict(type='CrossEntropyLoss', use_mask=True, loss_weight=1.0). + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__( + self, + num_classes: int, + num_fcs: int = 3, + in_channels: int = 256, + fc_channels: int = 256, + class_agnostic: bool = False, + coarse_pred_each_layer: bool = True, + conv_cfg: ConfigType = dict(type='Conv1d'), + norm_cfg: OptConfigType = None, + act_cfg: ConfigType = dict(type='ReLU'), + loss_point: ConfigType = dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0), + init_cfg: MultiConfig = dict( + type='Normal', std=0.001, override=dict(name='fc_logits')) + ) -> None: + super().__init__(init_cfg=init_cfg) + self.num_fcs = num_fcs + self.in_channels = in_channels + self.fc_channels = fc_channels + self.num_classes = num_classes + self.class_agnostic = class_agnostic + self.coarse_pred_each_layer = coarse_pred_each_layer + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.loss_point = MODELS.build(loss_point) + + fc_in_channels = in_channels + num_classes + self.fcs = nn.ModuleList() + for _ in range(num_fcs): + fc = ConvModule( + fc_in_channels, + fc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.fcs.append(fc) + fc_in_channels = fc_channels + fc_in_channels += num_classes if self.coarse_pred_each_layer else 0 + + out_channels = 1 if self.class_agnostic else self.num_classes + self.fc_logits = nn.Conv1d( + fc_in_channels, out_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, fine_grained_feats: Tensor, + coarse_feats: Tensor) -> Tensor: + """Classify each point based on fine-grained and coarse feats. + + Args: + fine_grained_feats (Tensor): Fine grained feature sampled from FPN, + shape (num_rois, in_channels, num_points). + coarse_feats (Tensor): Coarse feature sampled from CoarseMaskHead, + shape (num_rois, num_classes, num_points). + + Returns: + Tensor: Point classification results, + shape (num_rois, num_class, num_points). + """ + + x = torch.cat([fine_grained_feats, coarse_feats], dim=1) + for fc in self.fcs: + x = fc(x) + if self.coarse_pred_each_layer: + x = torch.cat((x, coarse_feats), dim=1) + return self.fc_logits(x) + + def get_targets(self, rois: Tensor, rel_roi_points: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + cfg: ConfigType) -> Tensor: + """Get training targets of MaskPointHead for all images. + + Args: + rois (Tensor): Region of Interest, shape (num_rois, 5). + rel_roi_points (Tensor): Points coordinates relative to RoI, shape + (num_rois, num_points, 2).
+ sampling_results (list[:obj:`SamplingResult`]): Sampling result after + sampling and assignment. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + cfg (obj:`ConfigDict` or dict): Training cfg. + + Returns: + Tensor: Point target, shape (num_rois, num_points). + """ + + num_imgs = len(sampling_results) + rois_list = [] + rel_roi_points_list = [] + for batch_ind in range(num_imgs): + inds = (rois[:, 0] == batch_ind) + rois_list.append(rois[inds]) + rel_roi_points_list.append(rel_roi_points[inds]) + pos_assigned_gt_inds_list = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + cfg_list = [cfg for _ in range(num_imgs)] + + point_targets = map(self._get_targets_single, rois_list, + rel_roi_points_list, pos_assigned_gt_inds_list, + batch_gt_instances, cfg_list) + point_targets = list(point_targets) + + if len(point_targets) > 0: + point_targets = torch.cat(point_targets) + + return point_targets + + def _get_targets_single(self, rois: Tensor, rel_roi_points: Tensor, + pos_assigned_gt_inds: Tensor, + gt_instances: InstanceData, + cfg: ConfigType) -> Tensor: + """Get training target of MaskPointHead for each image.""" + num_pos = rois.size(0) + num_points = cfg.num_points + if num_pos > 0: + gt_masks_th = ( + gt_instances.masks.to_tensor(rois.dtype, + rois.device).index_select( + 0, pos_assigned_gt_inds)) + gt_masks_th = gt_masks_th.unsqueeze(1) + rel_img_points = rel_roi_point_to_rel_img_point( + rois, rel_roi_points, gt_masks_th) + point_targets = point_sample(gt_masks_th, + rel_img_points).squeeze(1) + else: + point_targets = rois.new_zeros((0, num_points)) + return point_targets + + def loss_and_target(self, point_pred: Tensor, rel_roi_points: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + cfg: ConfigType) -> dict: + """Calculate loss for MaskPointHead. + + Args: + point_pred (Tensor): Point prediction result, shape + (num_rois, num_classes, num_points). + rel_roi_points (Tensor): Points coordinates relative to RoI, shape + (num_rois, num_points, 2). + sampling_results (list[:obj:`SamplingResult`]): Sampling result after + sampling and assignment. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + cfg (obj:`ConfigDict` or dict): Training cfg. + + Returns: + dict: a dictionary of point loss and point target. + """ + rois = bbox2roi([res.pos_bboxes for res in sampling_results]) + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + + point_target = self.get_targets(rois, rel_roi_points, sampling_results, + batch_gt_instances, cfg) + if self.class_agnostic: + loss_point = self.loss_point(point_pred, point_target, + torch.zeros_like(pos_labels)) + else: + loss_point = self.loss_point(point_pred, point_target, pos_labels) + + return dict(loss_point=loss_point, point_target=point_target) + + def get_roi_rel_points_train(self, mask_preds: Tensor, labels: Tensor, + cfg: ConfigType) -> Tensor: + """Get ``num_points`` most uncertain points with random points during + train. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + '_get_uncertainty()' function that takes point's logit prediction as + input. + + Args: + mask_preds (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction.
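# --- Editor's note (not part of the diff): get_roi_rel_points_train delegates
# to get_uncertain_point_coords_with_randomness (defined in mmdet.models.utils,
# not shown in this diff). A common uncertainty measure in PointRend-style
# heads is -|logit|, i.e. points closest to the decision boundary; a hedged,
# class-agnostic sketch of that selection step only.
import torch

mask_logits = torch.randn(3, 1, 28, 28)          # (num_rois, 1, H, W)
uncertainty = -mask_logits.abs().flatten(1)      # larger = less certain
point_indices = uncertainty.topk(48, dim=1).indices
ys, xs = point_indices // 28, point_indices % 28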
+ labels (Tensor): The ground truth class for each instance. + cfg (:obj:`ConfigDict` or dict): Training config of point head. + + Returns: + point_coords (Tensor): A tensor of shape (num_rois, num_points, 2) + that contains the coordinates of sampled points. + """ + point_coords = get_uncertain_point_coords_with_randomness( + mask_preds, labels, cfg.num_points, cfg.oversample_ratio, + cfg.importance_sample_ratio) + return point_coords + + def get_roi_rel_points_test(self, mask_preds: Tensor, label_preds: Tensor, + cfg: ConfigType) -> Tuple[Tensor, Tensor]: + """Get ``num_points`` most uncertain points during test. + + Args: + mask_preds (Tensor): A tensor of shape (num_rois, num_classes, + mask_height, mask_width) for class-specific or class-agnostic + prediction. + label_preds (Tensor): The predicted class for each instance. + cfg (:obj:`ConfigDict` or dict): Testing config of point head. + + Returns: + tuple: + + - point_indices (Tensor): A tensor of shape (num_rois, num_points) + that contains indices from [0, mask_height x mask_width) of the + most uncertain points. + - point_coords (Tensor): A tensor of shape (num_rois, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the [mask_height, mask_width] grid. + """ + num_points = cfg.subdivision_num_points + uncertainty_map = get_uncertainty(mask_preds, label_preds) + num_rois, _, mask_height, mask_width = uncertainty_map.shape + + # During ONNX exporting, the type of each element of 'shape' is + # `Tensor(float)`, while it is `float` during PyTorch inference. + if isinstance(mask_height, torch.Tensor): + h_step = 1.0 / mask_height.float() + w_step = 1.0 / mask_width.float() + else: + h_step = 1.0 / mask_height + w_step = 1.0 / mask_width + # cast to int to avoid dynamic K for TopK op in ONNX + mask_size = int(mask_height * mask_width) + uncertainty_map = uncertainty_map.view(num_rois, mask_size) + num_points = min(mask_size, num_points) + point_indices = uncertainty_map.topk(num_points, dim=1)[1] + xs = w_step / 2.0 + (point_indices % mask_width).float() * w_step + ys = h_step / 2.0 + (point_indices // mask_width).float() * h_step + point_coords = torch.stack([xs, ys], dim=2) + return point_indices, point_coords diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/maskiou_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/maskiou_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8901871e754c491f7bc94eb68a27fa1b50e29148 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/maskiou_head.py @@ -0,0 +1,277 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import Conv2d, Linear, MaxPool2d +from mmengine.config import ConfigDict +from mmengine.model import BaseModule +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.models.task_modules.samplers import SamplingResult +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, InstanceList, OptMultiConfig + + +@MODELS.register_module() +class MaskIoUHead(BaseModule): + """Mask IoU Head. + + This head predicts the IoU of predicted masks and corresponding gt masks. + + Args: + num_convs (int): The number of convolution layers. Defaults to 4. + num_fcs (int): The number of fully connected layers. Defaults to 2. + roi_feat_size (int): RoI feature size.
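# --- Editor's note (not part of the diff): the tail of
# get_roi_rel_points_test above maps flattened top-k indices to cell-centre
# coordinates in [0, 1] x [0, 1]; verified here on a toy 4x6 map.
import torch

H, W = 4, 6
idx = torch.tensor([0, 5, 23])          # flattened heatmap positions
h_step, w_step = 1.0 / H, 1.0 / W
xs = w_step / 2.0 + (idx % W).float() * w_step
ys = h_step / 2.0 + (idx // W).float() * h_step
coords = torch.stack([xs, ys], dim=1)   # idx 0 -> (0.0833, 0.1250)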
Default to 14. + in_channels (int): The channel number of inputs features. + Defaults to 256. + conv_out_channels (int): The feature channels of convolution layers. + Defaults to 256. + fc_out_channels (int): The feature channels of fully connected layers. + Defaults to 1024. + num_classes (int): Number of categories excluding the background + category. Defaults to 80. + loss_iou (:obj:`ConfigDict` or dict): IoU loss. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. + """ + + def __init__( + self, + num_convs: int = 4, + num_fcs: int = 2, + roi_feat_size: int = 14, + in_channels: int = 256, + conv_out_channels: int = 256, + fc_out_channels: int = 1024, + num_classes: int = 80, + loss_iou: ConfigType = dict(type='MSELoss', loss_weight=0.5), + init_cfg: OptMultiConfig = [ + dict(type='Kaiming', override=dict(name='convs')), + dict(type='Caffe2Xavier', override=dict(name='fcs')), + dict(type='Normal', std=0.01, override=dict(name='fc_mask_iou')) + ] + ) -> None: + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.fc_out_channels = fc_out_channels + self.num_classes = num_classes + + self.convs = nn.ModuleList() + for i in range(num_convs): + if i == 0: + # concatenation of mask feature and mask prediction + in_channels = self.in_channels + 1 + else: + in_channels = self.conv_out_channels + stride = 2 if i == num_convs - 1 else 1 + self.convs.append( + Conv2d( + in_channels, + self.conv_out_channels, + 3, + stride=stride, + padding=1)) + + roi_feat_size = _pair(roi_feat_size) + pooled_area = (roi_feat_size[0] // 2) * (roi_feat_size[1] // 2) + self.fcs = nn.ModuleList() + for i in range(num_fcs): + in_channels = ( + self.conv_out_channels * + pooled_area if i == 0 else self.fc_out_channels) + self.fcs.append(Linear(in_channels, self.fc_out_channels)) + + self.fc_mask_iou = Linear(self.fc_out_channels, self.num_classes) + self.relu = nn.ReLU() + self.max_pool = MaxPool2d(2, 2) + self.loss_iou = MODELS.build(loss_iou) + + def forward(self, mask_feat: Tensor, mask_preds: Tensor) -> Tensor: + """Forward function. + + Args: + mask_feat (Tensor): Mask features from upstream models. + mask_preds (Tensor): Mask predictions from mask head. + + Returns: + Tensor: Mask IoU predictions. + """ + mask_preds = mask_preds.sigmoid() + mask_pred_pooled = self.max_pool(mask_preds.unsqueeze(1)) + + x = torch.cat((mask_feat, mask_pred_pooled), 1) + + for conv in self.convs: + x = self.relu(conv(x)) + x = x.flatten(1) + for fc in self.fcs: + x = self.relu(fc(x)) + mask_iou = self.fc_mask_iou(x) + return mask_iou + + def loss_and_target(self, mask_iou_pred: Tensor, mask_preds: Tensor, + mask_targets: Tensor, + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + rcnn_train_cfg: ConfigDict) -> dict: + """Calculate the loss and targets of MaskIoUHead. + + Args: + mask_iou_pred (Tensor): Mask IoU predictions results, has shape + (num_pos, num_classes) + mask_preds (Tensor): Mask predictions from mask head, has shape + (num_pos, mask_size, mask_size). + mask_targets (Tensor): The ground truth masks assigned with + predictions, has shape + (num_pos, mask_size, mask_size). + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It includes ``masks`` inside. + rcnn_train_cfg (obj:`ConfigDict`): `train_cfg` of RCNN. 
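# --- Editor's note (not part of the diff): input assembly for
# MaskIoUHead.forward above: the mask prediction is sigmoid-ed, 2x max-pooled
# back to the RoI feature resolution and stacked on as one extra channel.
import torch
import torch.nn as nn

mask_feat = torch.randn(5, 256, 14, 14)    # RoI mask features
mask_preds = torch.randn(5, 28, 28)        # per-RoI mask logits
pooled = nn.MaxPool2d(2, 2)(mask_preds.sigmoid().unsqueeze(1))  # (5, 1, 14, 14)
x = torch.cat((mask_feat, pooled), dim=1)  # (5, 257, 14, 14) into the convs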
+ + Returns: + dict: A dictionary of loss and targets components. + The targets are only used for cascade rcnn. + """ + mask_iou_targets = self.get_targets( + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + mask_preds=mask_preds, + mask_targets=mask_targets, + rcnn_train_cfg=rcnn_train_cfg) + + pos_inds = mask_iou_targets > 0 + if pos_inds.sum() > 0: + loss_mask_iou = self.loss_iou(mask_iou_pred[pos_inds], + mask_iou_targets[pos_inds]) + else: + loss_mask_iou = mask_iou_pred.sum() * 0 + return dict(loss_mask_iou=loss_mask_iou) + + def get_targets(self, sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, mask_preds: Tensor, + mask_targets: Tensor, + rcnn_train_cfg: ConfigDict) -> Tensor: + """Compute target of mask IoU. + + Mask IoU target is the IoU of the predicted mask (inside a bbox) and + the gt mask of corresponding gt mask (the whole instance). + The intersection area is computed inside the bbox, and the gt mask area + is computed with two steps, firstly we compute the gt area inside the + bbox, then divide it by the area ratio of gt area inside the bbox and + the gt area of the whole instance. + + Args: + sampling_results (list[:obj:`SamplingResult`]): sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It includes ``masks`` inside. + mask_preds (Tensor): Predicted masks of each positive proposal, + shape (num_pos, h, w). + mask_targets (Tensor): Gt mask of each positive proposal, + binary map of the shape (num_pos, h, w). + rcnn_train_cfg (obj:`ConfigDict`): Training config for R-CNN part. + + Returns: + Tensor: mask iou target (length == num positive). + """ + pos_proposals = [res.pos_priors for res in sampling_results] + pos_assigned_gt_inds = [ + res.pos_assigned_gt_inds for res in sampling_results + ] + gt_masks = [res.masks for res in batch_gt_instances] + + # compute the area ratio of gt areas inside the proposals and + # the whole instance + area_ratios = map(self._get_area_ratio, pos_proposals, + pos_assigned_gt_inds, gt_masks) + area_ratios = torch.cat(list(area_ratios)) + assert mask_targets.size(0) == area_ratios.size(0) + + mask_preds = (mask_preds > rcnn_train_cfg.mask_thr_binary).float() + mask_pred_areas = mask_preds.sum((-1, -2)) + + # mask_preds and mask_targets are binary maps + overlap_areas = (mask_preds * mask_targets).sum((-1, -2)) + + # compute the mask area of the whole instance + gt_full_areas = mask_targets.sum((-1, -2)) / (area_ratios + 1e-7) + + mask_iou_targets = overlap_areas / ( + mask_pred_areas + gt_full_areas - overlap_areas) + return mask_iou_targets + + def _get_area_ratio(self, pos_proposals: Tensor, + pos_assigned_gt_inds: Tensor, + gt_masks: InstanceData) -> Tensor: + """Compute area ratio of the gt mask inside the proposal and the gt + mask of the corresponding instance. + + Args: + pos_proposals (Tensor): Positive proposals, has shape (num_pos, 4). + pos_assigned_gt_inds (Tensor): positive proposals assigned ground + truth index. + gt_masks (BitmapMask or PolygonMask): Gt masks (the whole instance) + of each image, with the same shape of the input image. + + Returns: + Tensor: The area ratio of the gt mask inside the proposal and the + gt mask of the corresponding instance. 
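# --- Editor's note (not part of the diff): the target arithmetic described in
# get_targets above, with toy tensors: the intersection lives inside the box,
# and the in-box gt area is inflated back to the full-instance area via the
# precomputed area ratio.
import torch

mask_preds = (torch.rand(2, 28, 28) > 0.5).float()    # binarized predictions
mask_targets = (torch.rand(2, 28, 28) > 0.5).float()  # gt cropped to the boxes
area_ratios = torch.tensor([0.8, 0.6])                # in-box area / full area
overlap = (mask_preds * mask_targets).sum((-1, -2))
gt_full = mask_targets.sum((-1, -2)) / (area_ratios + 1e-7)
iou_targets = overlap / (mask_preds.sum((-1, -2)) + gt_full - overlap)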
+ """ + num_pos = pos_proposals.size(0) + if num_pos > 0: + area_ratios = [] + proposals_np = pos_proposals.cpu().numpy() + pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() + # compute mask areas of gt instances (batch processing for speedup) + gt_instance_mask_area = gt_masks.areas + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + + # crop the gt mask inside the proposal + bbox = proposals_np[i, :].astype(np.int32) + gt_mask_in_proposal = gt_mask.crop(bbox) + + ratio = gt_mask_in_proposal.areas[0] / ( + gt_instance_mask_area[pos_assigned_gt_inds[i]] + 1e-7) + area_ratios.append(ratio) + area_ratios = torch.from_numpy(np.stack(area_ratios)).float().to( + pos_proposals.device) + else: + area_ratios = pos_proposals.new_zeros((0, )) + return area_ratios + + def predict_by_feat(self, mask_iou_preds: Tuple[Tensor], + results_list: InstanceList) -> InstanceList: + """Predict the mask iou and calculate it into ``results.scores``. + + Args: + mask_iou_preds (Tensor): Mask IoU predictions results, has shape + (num_proposals, num_classes) + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + assert len(mask_iou_preds) == len(results_list) + for results, mask_iou_pred in zip(results_list, mask_iou_preds): + labels = results.labels + scores = results.scores + results.scores = scores * mask_iou_pred[range(labels.size(0)), + labels] + return results_list diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd30c337c37f4e280980e459c126df177fe7efa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/scnet_mask_head.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.models.layers import ResLayer, SimplifiedBasicBlock +from mmdet.registry import MODELS +from .fcn_mask_head import FCNMaskHead + + +@MODELS.register_module() +class SCNetMaskHead(FCNMaskHead): + """Mask head for `SCNet `_. + + Args: + conv_to_res (bool, optional): if True, change the conv layers to + ``SimplifiedBasicBlock``. + """ + + def __init__(self, conv_to_res: bool = True, **kwargs) -> None: + super().__init__(**kwargs) + self.conv_to_res = conv_to_res + if conv_to_res: + assert self.conv_kernel_size == 3 + self.num_res_blocks = self.num_convs // 2 + self.convs = ResLayer( + SimplifiedBasicBlock, + self.in_channels, + self.conv_out_channels, + self.num_res_blocks, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py new file mode 100644 index 0000000000000000000000000000000000000000..55c5c8e4fae7d4e941a770d985c7253fd70f2226 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_heads/scnet_semantic_head.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.models.layers import ResLayer, SimplifiedBasicBlock +from mmdet.registry import MODELS +from .fused_semantic_head import FusedSemanticHead + + +@MODELS.register_module() +class SCNetSemanticHead(FusedSemanticHead): + """Mask head for `SCNet <https://arxiv.org/abs/2012.10150>`_. + + Args: + conv_to_res (bool, optional): if True, change the conv layers to + ``SimplifiedBasicBlock``. + """ + + def __init__(self, conv_to_res: bool = True, **kwargs) -> None: + super().__init__(**kwargs) + self.conv_to_res = conv_to_res + if self.conv_to_res: + num_res_blocks = self.num_convs // 2 + self.convs = ResLayer( + SimplifiedBasicBlock, + self.in_channels, + self.conv_out_channels, + num_res_blocks, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + self.num_convs = num_res_blocks diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/mask_scoring_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/mask_scoring_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6545c0ed41ee7ad17b5f1b841f8bc8d65a7b6391 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/mask_scoring_roi_head.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils.misc import empty_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class MaskScoringRoIHead(StandardRoIHead): + """Mask Scoring RoIHead for `Mask Scoring RCNN + <https://arxiv.org/abs/1903.00241>`_. + + Args: + mask_iou_head (:obj:`ConfigDict` or dict): The config of mask_iou_head. + """ + + def __init__(self, mask_iou_head: ConfigType, **kwargs): + assert mask_iou_head is not None + super().__init__(**kwargs) + self.mask_iou_head = MODELS.build(mask_iou_head) + + def forward(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList = None) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns: + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward.
+ """ + results = () + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + results = results + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + results = results + (mask_results['mask_preds'], ) + + # mask iou head + cls_score = bbox_results['cls_score'][:100] + mask_preds = mask_results['mask_preds'] + mask_feats = mask_results['mask_feats'] + _, labels = cls_score[:, :self.bbox_head.num_classes].max(dim=1) + mask_iou_preds = self.mask_iou_head( + mask_feats, mask_preds[range(labels.size(0)), labels]) + results = results + (mask_iou_preds, ) + + return results + + def mask_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], bbox_feats, + batch_gt_instances: InstanceList) -> dict: + """Perform forward propagation and loss calculation of the mask head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + bbox_feats (Tensor): Extract bbox RoI features. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `mask_feats` (Tensor): Extract mask RoI features. + - `mask_targets` (Tensor): Mask target of each positive\ + proposals in the image. + - `loss_mask` (dict): A dictionary of mask loss components. + - `loss_mask_iou` (Tensor): mask iou loss. + """ + if not self.share_roi_extractor: + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + + mask_results = self._mask_forward( + x, pos_inds=pos_inds, bbox_feats=bbox_feats) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg) + mask_targets = mask_loss_and_target['mask_targets'] + mask_results.update(loss_mask=mask_loss_and_target['loss_mask']) + if mask_results['loss_mask'] is None: + return mask_results + + # mask iou head forward and loss + pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results]) + pos_mask_pred = mask_results['mask_preds'][ + range(mask_results['mask_preds'].size(0)), pos_labels] + mask_iou_pred = self.mask_iou_head(mask_results['mask_feats'], + pos_mask_pred) + pos_mask_iou_pred = mask_iou_pred[range(mask_iou_pred.size(0)), + pos_labels] + + loss_mask_iou = self.mask_iou_head.loss_and_target( + pos_mask_iou_pred, pos_mask_pred, mask_targets, sampling_results, + batch_gt_instances, self.train_cfg) + mask_results['loss_mask'].update(loss_mask_iou) + return mask_results + + def predict_mask(self, + x: Tensor, + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the 
features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + mask_feats = mask_results['mask_feats'] + # get mask scores with mask iou head + labels = torch.cat([res.labels for res in results_list]) + mask_iou_preds = self.mask_iou_head( + mask_feats, mask_preds[range(labels.size(0)), labels]) + # split batch mask prediction back to each image + num_mask_rois_per_img = [len(res) for res in results_list] + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + mask_iou_preds = mask_iou_preds.split(num_mask_rois_per_img, 0) + + # TODO: Handle the case where rescale is false + results_list = self.mask_head.predict_by_feat( + mask_preds=mask_preds, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + results_list = self.mask_iou_head.predict_by_feat( + mask_iou_preds=mask_iou_preds, results_list=results_list) + return results_list diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/multi_instance_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/multi_instance_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..fee55b0a5d341c03165649f59737fd34d85c207e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/multi_instance_roi_head.py @@ -0,0 +1,226 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class MultiInstanceRoIHead(StandardRoIHead): + """The roi head for Multi-instance prediction.""" + + def __init__(self, num_instance: int = 2, *args, **kwargs) -> None: + self.num_instance = num_instance + super().__init__(*args, **kwargs) + + def init_bbox_head(self, bbox_roi_extractor: ConfigType, + bbox_head: ConfigType) -> None: + """Initialize box head and box roi extractor. + + Args: + bbox_roi_extractor (dict or ConfigDict): Config of box + roi extractor. + bbox_head (dict or ConfigDict): Config of box in box head. 
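# --- Editor's note (not part of the diff): predict_mask above splits batched
# RoI outputs back per image by detection count; Tensor.split handles empty
# images naturally.
import torch

mask_preds = torch.randn(7, 80, 28, 28)   # all RoIs of the batch
num_mask_rois_per_img = [3, 0, 4]         # per-image detection counts
per_img = mask_preds.split(num_mask_rois_per_img, 0)
# -> shapes (3, 80, 28, 28), (0, 80, 28, 28), (4, 80, 28, 28)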
+ """ + self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor) + self.bbox_head = MODELS.build(bbox_head) + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `cls_score_ref` (Tensor): The cls_score after refine model. + - `bbox_pred_ref` (Tensor): The bbox_pred after refine model. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + bbox_results = self.bbox_head(bbox_feats) + + if self.bbox_head.with_refine: + bbox_results = dict( + cls_score=bbox_results[0], + bbox_pred=bbox_results[1], + cls_score_ref=bbox_results[2], + bbox_pred_ref=bbox_results[3], + bbox_feats=bbox_feats) + else: + bbox_results = dict( + cls_score=bbox_results[0], + bbox_pred=bbox_results[1], + bbox_feats=bbox_feats) + + return bbox_results + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + # If there is a refining process, add refine loss. + if 'cls_score_ref' in bbox_results: + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + bbox_loss_and_target_ref = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score_ref'], + bbox_pred=bbox_results['bbox_pred_ref'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results['loss_bbox']['loss_rcnn_emd_ref'] = \ + bbox_loss_and_target_ref['loss_bbox']['loss_rcnn_emd'] + else: + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + + return bbox_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. 
It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + sampling_results = [] + for i in range(len(batch_data_samples)): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + batch_gt_instances_ignore=batch_gt_instances_ignore[i]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results) + losses.update(bbox_results['loss_bbox']) + + return losses + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, rois.device, task_type='bbox') + + bbox_results = self._bbox_forward(x, rois) + + # split batch bbox prediction back to each image + if 'cls_score_ref' in bbox_results: + cls_scores = bbox_results['cls_score_ref'] + bbox_preds = bbox_results['bbox_pred_ref'] + else: + cls_scores = bbox_results['cls_score'] + bbox_preds = bbox_results['bbox_pred'] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + + if bbox_preds is not None: + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + else: + bbox_preds = (None, ) * len(proposals) + + result_list = self.bbox_head.predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=rcnn_test_cfg, + rescale=rescale) + return result_list diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/pisa_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/pisa_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..45d59879da73b48df790c55d40a4a88f1d099111 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/pisa_roi_head.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
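For orientation, a minimal config sketch showing how the `MultiInstanceRoIHead` defined above could be instantiated. Every value here is illustrative, and the `MultiInstanceBBoxHead` companion head and its fields are assumptions for the sketch, not part of this diff:

roi_head = dict(
    type='MultiInstanceRoIHead',
    num_instance=2,  # decode two overlapping instances per proposal (CrowdHuman-style)
    bbox_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
        out_channels=256,
        featmap_strides=[4, 8, 16, 32]),
    bbox_head=dict(
        type='MultiInstanceBBoxHead',  # assumed companion head with an EMD-style loss
        with_refine=False,
        num_classes=1))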
+from typing import List, Tuple + +from torch import Tensor + +from mmdet.models.task_modules import SamplingResult +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList +from ..losses.pisa_loss import carl_loss, isr_p +from ..utils import unpack_gt_instances +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class PISARoIHead(StandardRoIHead): + r"""The RoI head for `Prime Sample Attention in Object Detection + `_.""" + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + neg_label_weights = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + if isinstance(sampling_result, tuple): + sampling_result, neg_label_weight = sampling_result + sampling_results.append(sampling_result) + neg_label_weights.append(neg_label_weight) + + losses = dict() + # bbox head forward and loss + if self.with_bbox: + bbox_results = self.bbox_loss( + x, sampling_results, neg_label_weights=neg_label_weights) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + return losses + + def bbox_loss(self, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + neg_label_weights: List[Tensor] = None) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. 
+ """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + bbox_targets = self.bbox_head.get_targets(sampling_results, + self.train_cfg) + + # neg_label_weights obtained by sampler is image-wise, mapping back to + # the corresponding location in label weights + if neg_label_weights[0] is not None: + label_weights = bbox_targets[1] + cur_num_rois = 0 + for i in range(len(sampling_results)): + num_pos = sampling_results[i].pos_inds.size(0) + num_neg = sampling_results[i].neg_inds.size(0) + label_weights[cur_num_rois + num_pos:cur_num_rois + num_pos + + num_neg] = neg_label_weights[i] + cur_num_rois += num_pos + num_neg + + cls_score = bbox_results['cls_score'] + bbox_pred = bbox_results['bbox_pred'] + + # Apply ISR-P + isr_cfg = self.train_cfg.get('isr', None) + if isr_cfg is not None: + bbox_targets = isr_p( + cls_score, + bbox_pred, + bbox_targets, + rois, + sampling_results, + self.bbox_head.loss_cls, + self.bbox_head.bbox_coder, + **isr_cfg, + num_class=self.bbox_head.num_classes) + loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, rois, + *bbox_targets) + + # Add CARL Loss + carl_cfg = self.train_cfg.get('carl', None) + if carl_cfg is not None: + loss_carl = carl_loss( + cls_score, + bbox_targets[0], + bbox_pred, + bbox_targets[2], + self.bbox_head.loss_bbox, + **carl_cfg, + num_class=self.bbox_head.num_classes) + loss_bbox.update(loss_carl) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/point_rend_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/point_rend_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6a0641549631e243c3db25039b01fed64fb1e0d1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/point_rend_roi_head.py @@ -0,0 +1,236 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend  # noqa
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from mmcv.ops import point_sample, rel_roi_point_to_rel_img_point
+from torch import Tensor
+
+from mmdet.registry import MODELS
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList
+from ..task_modules.samplers import SamplingResult
+from ..utils import empty_instances
+from .standard_roi_head import StandardRoIHead
+
+
+@MODELS.register_module()
+class PointRendRoIHead(StandardRoIHead):
+    """`PointRend <https://arxiv.org/abs/1912.08193>`_."""
+
+    def __init__(self, point_head: ConfigType, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        assert self.with_bbox and self.with_mask
+        self.init_point_head(point_head)
+
+    def init_point_head(self, point_head: ConfigType) -> None:
+        """Initialize ``point_head``"""
+        self.point_head = MODELS.build(point_head)
+
+    def mask_loss(self, x: Tuple[Tensor],
+                  sampling_results: List[SamplingResult], bbox_feats: Tensor,
+                  batch_gt_instances: InstanceList) -> dict:
+        """Run forward function and calculate loss for mask head and point
+        head in training."""
+        mask_results = super().mask_loss(
+            x=x,
+            sampling_results=sampling_results,
+            bbox_feats=bbox_feats,
+            batch_gt_instances=batch_gt_instances)
+
+        mask_point_results = self._mask_point_loss(
+            x=x,
+            sampling_results=sampling_results,
+            mask_preds=mask_results['mask_preds'],
+            batch_gt_instances=batch_gt_instances)
+        mask_results['loss_mask'].update(
+            loss_point=mask_point_results['loss_point'])
+
+        return mask_results
+
+    def _mask_point_loss(self, x: Tuple[Tensor],
+                         sampling_results: List[SamplingResult],
+                         mask_preds: Tensor,
+                         batch_gt_instances: InstanceList) -> dict:
+        """Run forward function and calculate loss for point head in
+        training."""
+        pos_labels = torch.cat([res.pos_gt_labels for res in sampling_results])
+        rel_roi_points = self.point_head.get_roi_rel_points_train(
+            mask_preds, pos_labels, cfg=self.train_cfg)
+        rois = bbox2roi([res.pos_bboxes for res in sampling_results])
+
+        fine_grained_point_feats = self._get_fine_grained_point_feats(
+            x, rois, rel_roi_points)
+        coarse_point_feats = point_sample(mask_preds, rel_roi_points)
+        mask_point_pred = self.point_head(fine_grained_point_feats,
+                                          coarse_point_feats)
+
+        loss_and_target = self.point_head.loss_and_target(
+            point_pred=mask_point_pred,
+            rel_roi_points=rel_roi_points,
+            sampling_results=sampling_results,
+            batch_gt_instances=batch_gt_instances,
+            cfg=self.train_cfg)
+
+        return loss_and_target
+
+    def _mask_point_forward_test(self, x: Tuple[Tensor], rois: Tensor,
+                                 label_preds: Tensor,
+                                 mask_preds: Tensor) -> Tensor:
+        """Mask refining process with point head in testing.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            rois (Tensor): shape (num_rois, 5).
+            label_preds (Tensor): The predicted class for each RoI.
+            mask_preds (Tensor): The predicted coarse masks of
+                shape (num_rois, num_classes, small_size, small_size).
+
+        Returns:
+            Tensor: The refined masks of shape (num_rois, num_classes,
+            large_size, large_size).
+        """
+        refined_mask_pred = mask_preds.clone()
+        for subdivision_step in range(self.test_cfg.subdivision_steps):
+            refined_mask_pred = F.interpolate(
+                refined_mask_pred,
+                scale_factor=self.test_cfg.scale_factor,
+                mode='bilinear',
+                align_corners=False)
+            # If `subdivision_num_points` is greater than or equal to the
+            # resolution of the next step, then we can skip this step
+            num_rois, channels, mask_height, mask_width = \
+                refined_mask_pred.shape
+            if (self.test_cfg.subdivision_num_points >=
+                    self.test_cfg.scale_factor**2 * mask_height * mask_width
+                    and
+                    subdivision_step < self.test_cfg.subdivision_steps - 1):
+                continue
+            point_indices, rel_roi_points = \
+                self.point_head.get_roi_rel_points_test(
+                    refined_mask_pred, label_preds, cfg=self.test_cfg)
+
+            fine_grained_point_feats = self._get_fine_grained_point_feats(
+                x=x, rois=rois, rel_roi_points=rel_roi_points)
+            coarse_point_feats = point_sample(mask_preds, rel_roi_points)
+            mask_point_pred = self.point_head(fine_grained_point_feats,
+                                              coarse_point_feats)
+
+            point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1)
+            refined_mask_pred = refined_mask_pred.reshape(
+                num_rois, channels, mask_height * mask_width)
+            refined_mask_pred = refined_mask_pred.scatter_(
+                2, point_indices, mask_point_pred)
+            refined_mask_pred = refined_mask_pred.view(num_rois, channels,
+                                                       mask_height, mask_width)
+
+        return refined_mask_pred
+
+    def _get_fine_grained_point_feats(self, x: Tuple[Tensor], rois: Tensor,
+                                      rel_roi_points: Tensor) -> Tensor:
+        """Sample fine grained feats from each level feature map and
+        concatenate them together.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            rois (Tensor): shape (num_rois, 5).
+            rel_roi_points (Tensor): A tensor of shape (num_rois, num_points,
+                2) that contains [0, 1] x [0, 1] normalized coordinates of the
+                most uncertain points from the [mask_height, mask_width] grid.
+
+        Returns:
+            Tensor: The fine grained features for each point,
+            has shape (num_rois, feats_channels, num_points).
+        """
+        assert rois.shape[0] > 0, 'RoI is an empty tensor.'
+        num_imgs = x[0].shape[0]
+        fine_grained_feats = []
+        for idx in range(self.mask_roi_extractor.num_inputs):
+            feats = x[idx]
+            spatial_scale = 1. / float(
+                self.mask_roi_extractor.featmap_strides[idx])
+            point_feats = []
+            for batch_ind in range(num_imgs):
+                # unravel batch dim
+                feat = feats[batch_ind].unsqueeze(0)
+                inds = (rois[:, 0].long() == batch_ind)
+                if inds.any():
+                    rel_img_points = rel_roi_point_to_rel_img_point(
+                        rois=rois[inds],
+                        rel_roi_points=rel_roi_points[inds],
+                        img=feat.shape[2:],
+                        spatial_scale=spatial_scale).unsqueeze(0)
+                    point_feat = point_sample(feat, rel_img_points)
+                    point_feat = point_feat.squeeze(0).transpose(0, 1)
+                    point_feats.append(point_feat)
+            fine_grained_feats.append(torch.cat(point_feats, dim=0))
+        return torch.cat(fine_grained_feats, dim=1)
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     batch_img_metas: List[dict],
+                     results_list: InstanceList,
+                     rescale: bool = False) -> InstanceList:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+ Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + # don't need to consider aug_test. + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + num_mask_rois_per_img = [len(res) for res in results_list] + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + + # refine mask_preds + mask_rois = mask_rois.split(num_mask_rois_per_img, 0) + mask_preds_refined = [] + for i in range(len(batch_img_metas)): + labels = results_list[i].labels + x_i = [xx[[i]] for xx in x] + mask_rois_i = mask_rois[i] + mask_rois_i[:, 0] = 0 + mask_pred_i = self._mask_point_forward_test( + x_i, mask_rois_i, labels, mask_preds[i]) + mask_preds_refined.append(mask_pred_i) + + # TODO: Handle the case where rescale is false + results_list = self.mask_head.predict_by_feat( + mask_preds=mask_preds_refined, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + return results_list diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/__init__.py b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0f60214991b0ed14cdbc3964aee15356c6aaf2aa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_roi_extractor import BaseRoIExtractor +from .generic_roi_extractor import GenericRoIExtractor +from .single_level_roi_extractor import SingleRoIExtractor + +__all__ = ['BaseRoIExtractor', 'SingleRoIExtractor', 'GenericRoIExtractor'] diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..a8de0518818aba8d9aac7b807e3215d0da6c9b99 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/base_roi_extractor.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Optional, Tuple + +import torch +import torch.nn as nn +from mmcv import ops +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.utils import ConfigType, OptMultiConfig + + +class BaseRoIExtractor(BaseModule, metaclass=ABCMeta): + """Base class for RoI extractor. + + Args: + roi_layer (:obj:`ConfigDict` or dict): Specify RoI layer type and + arguments. + out_channels (int): Output channels of RoI layers. + featmap_strides (list[int]): Strides of input feature maps. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. 
+ """ + + def __init__(self, + roi_layer: ConfigType, + out_channels: int, + featmap_strides: List[int], + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides) + self.out_channels = out_channels + self.featmap_strides = featmap_strides + + @property + def num_inputs(self) -> int: + """int: Number of input feature maps.""" + return len(self.featmap_strides) + + def build_roi_layers(self, layer_cfg: ConfigType, + featmap_strides: List[int]) -> nn.ModuleList: + """Build RoI operator to extract feature from each level feature map. + + Args: + layer_cfg (:obj:`ConfigDict` or dict): Dictionary to construct and + config RoI layer operation. Options are modules under + ``mmcv/ops`` such as ``RoIAlign``. + featmap_strides (list[int]): The stride of input feature map w.r.t + to the original image size, which would be used to scale RoI + coordinate (original image coordinate system) to feature + coordinate system. + + Returns: + :obj:`nn.ModuleList`: The RoI extractor modules for each level + feature map. + """ + + cfg = layer_cfg.copy() + layer_type = cfg.pop('type') + if isinstance(layer_type, str): + assert hasattr(ops, layer_type) + layer_cls = getattr(ops, layer_type) + else: + layer_cls = layer_type + roi_layers = nn.ModuleList( + [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides]) + return roi_layers + + def roi_rescale(self, rois: Tensor, scale_factor: float) -> Tensor: + """Scale RoI coordinates by scale factor. + + Args: + rois (Tensor): RoI (Region of Interest), shape (n, 5) + scale_factor (float): Scale factor that RoI will be multiplied by. + + Returns: + Tensor: Scaled RoI. + """ + + cx = (rois[:, 1] + rois[:, 3]) * 0.5 + cy = (rois[:, 2] + rois[:, 4]) * 0.5 + w = rois[:, 3] - rois[:, 1] + h = rois[:, 4] - rois[:, 2] + new_w = w * scale_factor + new_h = h * scale_factor + x1 = cx - new_w * 0.5 + x2 = cx + new_w * 0.5 + y1 = cy - new_h * 0.5 + y2 = cy + new_h * 0.5 + new_rois = torch.stack((rois[:, 0], x1, y1, x2, y2), dim=-1) + return new_rois + + @abstractmethod + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: Optional[float] = None) -> Tensor: + """Extractor ROI feats. + + Args: + feats (Tuple[Tensor]): Multi-scale features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + roi_scale_factor (Optional[float]): RoI scale factor. + Defaults to None. + + Returns: + Tensor: RoI feature. + """ + pass diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..39d4c90135d853404d564391f029558841ac9cac --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/generic_roi_extractor.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +from mmcv.cnn.bricks import build_plugin_layer +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType +from .base_roi_extractor import BaseRoIExtractor + + +@MODELS.register_module() +class GenericRoIExtractor(BaseRoIExtractor): + """Extract RoI features from all level feature maps levels. + + This is the implementation of `A novel Region of Interest Extraction Layer + for Instance Segmentation `_. 
+ + Args: + aggregation (str): The method to aggregate multiple feature maps. + Options are 'sum', 'concat'. Defaults to 'sum'. + pre_cfg (:obj:`ConfigDict` or dict): Specify pre-processing modules. + Defaults to None. + post_cfg (:obj:`ConfigDict` or dict): Specify post-processing modules. + Defaults to None. + kwargs (keyword arguments): Arguments that are the same + as :class:`BaseRoIExtractor`. + """ + + def __init__(self, + aggregation: str = 'sum', + pre_cfg: OptConfigType = None, + post_cfg: OptConfigType = None, + **kwargs) -> None: + super().__init__(**kwargs) + + assert aggregation in ['sum', 'concat'] + + self.aggregation = aggregation + self.with_post = post_cfg is not None + self.with_pre = pre_cfg is not None + # build pre/post processing modules + if self.with_post: + self.post_module = build_plugin_layer(post_cfg, '_post_module')[1] + if self.with_pre: + self.pre_module = build_plugin_layer(pre_cfg, '_pre_module')[1] + + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: Optional[float] = None) -> Tensor: + """Extractor ROI feats. + + Args: + feats (Tuple[Tensor]): Multi-scale features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + roi_scale_factor (Optional[float]): RoI scale factor. + Defaults to None. + + Returns: + Tensor: RoI feature. + """ + out_size = self.roi_layers[0].output_size + num_levels = len(feats) + roi_feats = feats[0].new_zeros( + rois.size(0), self.out_channels, *out_size) + + # some times rois is an empty tensor + if roi_feats.shape[0] == 0: + return roi_feats + + if num_levels == 1: + return self.roi_layers[0](feats[0], rois) + + if roi_scale_factor is not None: + rois = self.roi_rescale(rois, roi_scale_factor) + + # mark the starting channels for concat mode + start_channels = 0 + for i in range(num_levels): + roi_feats_t = self.roi_layers[i](feats[i], rois) + end_channels = start_channels + roi_feats_t.size(1) + if self.with_pre: + # apply pre-processing to a RoI extracted from each layer + roi_feats_t = self.pre_module(roi_feats_t) + if self.aggregation == 'sum': + # and sum them all + roi_feats += roi_feats_t + else: + # and concat them along channel dimension + roi_feats[:, start_channels:end_channels] = roi_feats_t + # update channels starting position + start_channels = end_channels + # check if concat channels match at the end + if self.aggregation == 'concat': + assert start_channels == self.out_channels + + if self.with_post: + # apply post-processing before return the result + roi_feats = self.post_module(roi_feats) + return roi_feats diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..59229e0b0b0a18dff81abca6f5c20cb50b0d542c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/roi_extractors/single_level_roi_extractor.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.utils import ConfigType, OptMultiConfig +from .base_roi_extractor import BaseRoIExtractor + + +@MODELS.register_module() +class SingleRoIExtractor(BaseRoIExtractor): + """Extract RoI features from a single level feature map. 
+ + If there are multiple input feature levels, each RoI is mapped to a level + according to its scale. The mapping rule is proposed in + `FPN `_. + + Args: + roi_layer (:obj:`ConfigDict` or dict): Specify RoI layer type and + arguments. + out_channels (int): Output channels of RoI layers. + featmap_strides (List[int]): Strides of input feature maps. + finest_scale (int): Scale threshold of mapping to level 0. + Defaults to 56. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. + """ + + def __init__(self, + roi_layer: ConfigType, + out_channels: int, + featmap_strides: List[int], + finest_scale: int = 56, + init_cfg: OptMultiConfig = None) -> None: + super().__init__( + roi_layer=roi_layer, + out_channels=out_channels, + featmap_strides=featmap_strides, + init_cfg=init_cfg) + self.finest_scale = finest_scale + + def map_roi_levels(self, rois: Tensor, num_levels: int) -> Tensor: + """Map rois to corresponding feature levels by scales. + + - scale < finest_scale * 2: level 0 + - finest_scale * 2 <= scale < finest_scale * 4: level 1 + - finest_scale * 4 <= scale < finest_scale * 8: level 2 + - scale >= finest_scale * 8: level 3 + + Args: + rois (Tensor): Input RoIs, shape (k, 5). + num_levels (int): Total level number. + + Returns: + Tensor: Level index (0-based) of each RoI, shape (k, ) + """ + scale = torch.sqrt( + (rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2])) + target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6)) + target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long() + return target_lvls + + def forward(self, + feats: Tuple[Tensor], + rois: Tensor, + roi_scale_factor: Optional[float] = None): + """Extractor ROI feats. + + Args: + feats (Tuple[Tensor]): Multi-scale features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + roi_scale_factor (Optional[float]): RoI scale factor. + Defaults to None. + + Returns: + Tensor: RoI feature. + """ + # convert fp32 to fp16 when amp is on + rois = rois.type_as(feats[0]) + out_size = self.roi_layers[0].output_size + num_levels = len(feats) + roi_feats = feats[0].new_zeros( + rois.size(0), self.out_channels, *out_size) + + # TODO: remove this when parrots supports + if torch.__version__ == 'parrots': + roi_feats.requires_grad = True + + if num_levels == 1: + if len(rois) == 0: + return roi_feats + return self.roi_layers[0](feats[0], rois) + + target_lvls = self.map_roi_levels(rois, num_levels) + + if roi_scale_factor is not None: + rois = self.roi_rescale(rois, roi_scale_factor) + + for i in range(num_levels): + mask = target_lvls == i + inds = mask.nonzero(as_tuple=False).squeeze(1) + if inds.numel() > 0: + rois_ = rois[inds] + roi_feats_t = self.roi_layers[i](feats[i], rois_) + roi_feats[inds] = roi_feats_t + else: + # Sometimes some pyramid levels will not be used for RoI + # feature extraction and this will cause an incomplete + # computation graph in one GPU, which is different from those + # in other GPUs and will cause a hanging error. + # Therefore, we add it to ensure each feature pyramid is + # included in the computation graph to avoid runtime bugs. + roi_feats += sum( + x.view(-1)[0] + for x in self.parameters()) * 0. + feats[i].sum() * 0. 
+ return roi_feats diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/scnet_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/scnet_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e6d2bc1915bae38011cc75a720e48ed53b51ddb5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/scnet_roi_head.py @@ -0,0 +1,677 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList, OptConfigType +from ..layers import adaptive_avg_pool2d +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .cascade_roi_head import CascadeRoIHead + + +@MODELS.register_module() +class SCNetRoIHead(CascadeRoIHead): + """RoIHead for `SCNet `_. + + Args: + num_stages (int): number of cascade stages. + stage_loss_weights (list): loss weight of cascade stages. + semantic_roi_extractor (dict): config to init semantic roi extractor. + semantic_head (dict): config to init semantic head. + feat_relay_head (dict): config to init feature_relay_head. + glbctx_head (dict): config to init global context head. + """ + + def __init__(self, + num_stages: int, + stage_loss_weights: List[float], + semantic_roi_extractor: OptConfigType = None, + semantic_head: OptConfigType = None, + feat_relay_head: OptConfigType = None, + glbctx_head: OptConfigType = None, + **kwargs) -> None: + super().__init__( + num_stages=num_stages, + stage_loss_weights=stage_loss_weights, + **kwargs) + assert self.with_bbox and self.with_mask + assert not self.with_shared_head # shared head is not supported + + if semantic_head is not None: + self.semantic_roi_extractor = MODELS.build(semantic_roi_extractor) + self.semantic_head = MODELS.build(semantic_head) + + if feat_relay_head is not None: + self.feat_relay_head = MODELS.build(feat_relay_head) + + if glbctx_head is not None: + self.glbctx_head = MODELS.build(glbctx_head) + + def init_mask_head(self, mask_roi_extractor: ConfigType, + mask_head: ConfigType) -> None: + """Initialize ``mask_head``""" + if mask_roi_extractor is not None: + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) + self.mask_head = MODELS.build(mask_head) + + # TODO move to base_roi_head later + @property + def with_semantic(self) -> bool: + """bool: whether the head has semantic head""" + return hasattr(self, + 'semantic_head') and self.semantic_head is not None + + @property + def with_feat_relay(self) -> bool: + """bool: whether the head has feature relay head""" + return (hasattr(self, 'feat_relay_head') + and self.feat_relay_head is not None) + + @property + def with_glbctx(self) -> bool: + """bool: whether the head has global context head""" + return hasattr(self, 'glbctx_head') and self.glbctx_head is not None + + def _fuse_glbctx(self, roi_feats: Tensor, glbctx_feat: Tensor, + rois: Tensor) -> Tensor: + """Fuse global context feats with roi feats. + + Args: + roi_feats (Tensor): RoI features. + glbctx_feat (Tensor): Global context feature.. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + Tensor: Fused feature. 
+ """ + assert roi_feats.size(0) == rois.size(0) + # RuntimeError: isDifferentiableType(variable.scalar_type()) + # INTERNAL ASSERT FAILED if detach() is not used when calling + # roi_head.predict(). + img_inds = torch.unique(rois[:, 0].detach().cpu(), sorted=True).long() + fused_feats = torch.zeros_like(roi_feats) + for img_id in img_inds: + inds = (rois[:, 0] == img_id.item()) + fused_feats[inds] = roi_feats[inds] + glbctx_feat[img_id] + return fused_feats + + def _slice_pos_feats(self, feats: Tensor, + sampling_results: List[SamplingResult]) -> Tensor: + """Get features from pos rois. + + Args: + feats (Tensor): Input features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + Tensor: Sliced features. + """ + num_rois = [res.priors.size(0) for res in sampling_results] + num_pos_rois = [res.pos_priors.size(0) for res in sampling_results] + inds = torch.zeros(sum(num_rois), dtype=torch.bool) + start = 0 + for i in range(len(num_rois)): + start = 0 if i == 0 else start + num_rois[i - 1] + stop = start + num_pos_rois[i] + inds[start:stop] = 1 + sliced_feats = feats[inds] + return sliced_feats + + def _bbox_forward(self, + stage: int, + x: Tuple[Tensor], + rois: Tensor, + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None) -> dict: + """Box head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + """ + bbox_roi_extractor = self.bbox_roi_extractor[stage] + bbox_head = self.bbox_head[stage] + bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs], + rois) + if self.with_semantic and semantic_feat is not None: + bbox_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if bbox_semantic_feat.shape[-2:] != bbox_feats.shape[-2:]: + bbox_semantic_feat = adaptive_avg_pool2d( + bbox_semantic_feat, bbox_feats.shape[-2:]) + bbox_feats += bbox_semantic_feat + if self.with_glbctx and glbctx_feat is not None: + bbox_feats = self._fuse_glbctx(bbox_feats, glbctx_feat, rois) + cls_score, bbox_pred, relayed_feat = bbox_head( + bbox_feats, return_shared_feat=True) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + relayed_feat=relayed_feat) + return bbox_results + + def _mask_forward(self, + x: Tuple[Tensor], + rois: Tensor, + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None, + relayed_feat: Optional[Tensor] = None) -> dict: + """Mask head forward function used in both training and testing. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): Tuple of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + relayed_feat (Tensor): Relayed feature. Defaults to None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. 
+ """ + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + if self.with_semantic and semantic_feat is not None: + mask_semantic_feat = self.semantic_roi_extractor([semantic_feat], + rois) + if mask_semantic_feat.shape[-2:] != mask_feats.shape[-2:]: + mask_semantic_feat = F.adaptive_avg_pool2d( + mask_semantic_feat, mask_feats.shape[-2:]) + mask_feats += mask_semantic_feat + if self.with_glbctx and glbctx_feat is not None: + mask_feats = self._fuse_glbctx(mask_feats, glbctx_feat, rois) + if self.with_feat_relay and relayed_feat is not None: + mask_feats = mask_feats + relayed_feat + mask_preds = self.mask_head(mask_feats) + mask_results = dict(mask_preds=mask_preds) + + return mask_results + + def bbox_loss(self, + stage: int, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for box head in training. + + Args: + stage (int): The current stage in Cascade RoI Head. + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + - `rois` (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + - `bbox_targets` (tuple): Ground truth for proposals in a + single image. Containing the following list of Tensors: + (labels, label_weights, bbox_targets, bbox_weights) + """ + bbox_head = self.bbox_head[stage] + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward( + stage, + x, + rois, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat) + bbox_results.update(rois=rois) + + bbox_loss_and_target = bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg[stage]) + + bbox_results.update(bbox_loss_and_target) + return bbox_results + + def mask_loss(self, + x: Tuple[Tensor], + sampling_results: List[SamplingResult], + batch_gt_instances: InstanceList, + semantic_feat: Optional[Tensor] = None, + glbctx_feat: Optional[Tensor] = None, + relayed_feat: Optional[Tensor] = None) -> dict: + """Run forward function and calculate loss for mask head in training. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + semantic_feat (Tensor): Semantic feature. Defaults to None. + glbctx_feat (Tensor): Global context feature. Defaults to None. + relayed_feat (Tensor): Relayed feature. Defaults to None. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `loss_mask` (dict): A dictionary of mask loss components. 
+ """ + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward( + x, + pos_rois, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat, + relayed_feat=relayed_feat) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[-1]) + mask_results.update(mask_loss_and_target) + + return mask_results + + def semantic_loss(self, x: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + """Semantic segmentation loss. + + Args: + x (Tuple[Tensor]): Tuple of multi-level img features. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: Usually returns a dictionary with keys: + + - `semantic_feat` (Tensor): Semantic feature. + - `loss_seg` (dict): Semantic segmentation loss. + """ + gt_semantic_segs = [ + data_sample.gt_sem_seg.sem_seg + for data_sample in batch_data_samples + ] + gt_semantic_segs = torch.stack(gt_semantic_segs) + semantic_pred, semantic_feat = self.semantic_head(x) + loss_seg = self.semantic_head.loss(semantic_pred, gt_semantic_segs) + + semantic_results = dict(loss_seg=loss_seg, semantic_feat=semantic_feat) + + return semantic_results + + def global_context_loss(self, x: Tuple[Tensor], + batch_gt_instances: InstanceList) -> dict: + """Global context loss. + + Args: + x (Tuple[Tensor]): Tuple of multi-level img features. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `glbctx_feat` (Tensor): Global context feature. + - `loss_glbctx` (dict): Global context loss. + """ + gt_labels = [ + gt_instances.labels for gt_instances in batch_gt_instances + ] + mc_pred, glbctx_feat = self.glbctx_head(x) + loss_glbctx = self.glbctx_head.loss(mc_pred, gt_labels) + global_context_results = dict( + loss_glbctx=loss_glbctx, glbctx_feat=glbctx_feat) + + return global_context_results + + def loss(self, x: Tensor, rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + losses = dict() + + # semantic segmentation branch + if self.with_semantic: + semantic_results = self.semantic_loss( + x=x, batch_data_samples=batch_data_samples) + losses['loss_semantic_seg'] = semantic_results['loss_seg'] + semantic_feat = semantic_results['semantic_feat'] + else: + semantic_feat = None + + # global context branch + if self.with_glbctx: + global_context_results = self.global_context_loss( + x=x, batch_gt_instances=batch_gt_instances) + losses['loss_glbctx'] = global_context_results['loss_glbctx'] + glbctx_feat = global_context_results['glbctx_feat'] + else: + glbctx_feat = None + + results_list = rpn_results_list + num_imgs = len(batch_img_metas) + for stage in range(self.num_stages): + stage_loss_weight = self.stage_loss_weights[stage] + + # assign gts and sample proposals + sampling_results = [] + bbox_assigner = self.bbox_assigner[stage] + bbox_sampler = self.bbox_sampler[stage] + for i in range(num_imgs): + results = results_list[i] + # rename rpn_results.bboxes to rpn_results.priors + results.priors = results.pop('bboxes') + + assign_result = bbox_assigner.assign( + results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, + results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + # bbox head forward and loss + bbox_results = self.bbox_loss( + stage=stage, + x=x, + sampling_results=sampling_results, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + # refine bboxes + if stage < self.num_stages - 1: + bbox_head = self.bbox_head[stage] + with torch.no_grad(): + results_list = bbox_head.refine_bboxes( + sampling_results=sampling_results, + bbox_results=bbox_results, + batch_img_metas=batch_img_metas) + + if self.with_feat_relay: + relayed_feat = self._slice_pos_feats(bbox_results['relayed_feat'], + sampling_results) + relayed_feat = self.feat_relay_head(relayed_feat) + else: + relayed_feat = None + + # mask head forward and loss + mask_results = self.mask_loss( + x=x, + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + semantic_feat=semantic_feat, + glbctx_feat=glbctx_feat, + relayed_feat=relayed_feat) + mask_stage_loss_weight = sum(self.stage_loss_weights) + losses['loss_mask'] = mask_stage_loss_weight * mask_results[ + 'loss_mask']['loss_mask'] + + return losses + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Features from upstream network. Each + has shape (N, C, H, W). + rpn_results_list (list[:obj:`InstanceData`]): list of region + proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results to + the original image. Defaults to False. 
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        assert self.with_bbox, 'Bbox head must be implemented.'
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        if self.with_glbctx:
+            _, glbctx_feat = self.glbctx_head(x)
+        else:
+            glbctx_feat = None
+
+        # TODO: nms_op in mmcv needs to be enhanced; the bbox results may
+        # differ when rescale is not applied in bbox_head
+
+        # If it has the mask branch, the bbox branch does not need
+        # to be scaled to the original image scale, because the mask
+        # branch will scale both bbox and mask at the same time.
+        bbox_rescale = rescale if not self.with_mask else False
+        results_list = self.predict_bbox(
+            x=x,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat,
+            batch_img_metas=batch_img_metas,
+            rpn_results_list=rpn_results_list,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=bbox_rescale)
+
+        if self.with_mask:
+            results_list = self.predict_mask(
+                x=x,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat,
+                batch_img_metas=batch_img_metas,
+                results_list=results_list,
+                rescale=rescale)
+
+        return results_list
+
+    def predict_mask(self,
+                     x: Tuple[Tensor],
+                     semantic_feat: Tensor,
+                     glbctx_feat: Tensor,
+                     batch_img_metas: List[dict],
+                     results_list: List[InstanceData],
+                     rescale: bool = False) -> List[InstanceData]:
+        """Perform forward propagation of the mask head and predict detection
+        results on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Feature maps of all scale levels.
+            semantic_feat (Tensor): Semantic feature.
+            glbctx_feat (Tensor): Global context feature.
+            batch_img_metas (list[dict]): List of image information.
+            results_list (list[:obj:`InstanceData`]): Detection results of
+                each image.
+            rescale (bool): If True, return boxes in original image space.
+                Defaults to False.
+
+        Returns:
+            list[:obj:`InstanceData`]: Detection results of each image
+            after the post process.
+            Each item usually contains following keys.
+
+            - scores (Tensor): Classification scores, has a shape
+              (num_instances, )
+            - labels (Tensor): Labels of bboxes, has a shape
+              (num_instances, ).
+            - bboxes (Tensor): Has a shape (num_instances, 4),
+              the last dimension 4 arranged as (x1, y1, x2, y2).
+            - masks (Tensor): Has a shape (num_instances, H, W).
+        """
+        bboxes = [res.bboxes for res in results_list]
+        mask_rois = bbox2roi(bboxes)
+        if mask_rois.shape[0] == 0:
+            results_list = empty_instances(
+                batch_img_metas=batch_img_metas,
+                device=mask_rois.device,
+                task_type='mask',
+                instance_results=results_list,
+                mask_thr_binary=self.test_cfg.mask_thr_binary)
+            return results_list
+
+        bboxes_results = self._bbox_forward(
+            stage=-1,
+            x=x,
+            rois=mask_rois,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat)
+        relayed_feat = bboxes_results['relayed_feat']
+        relayed_feat = self.feat_relay_head(relayed_feat)
+
+        mask_results = self._mask_forward(
+            x=x,
+            rois=mask_rois,
+            semantic_feat=semantic_feat,
+            glbctx_feat=glbctx_feat,
+            relayed_feat=relayed_feat)
+        mask_preds = mask_results['mask_preds']
+
+        # split batch mask prediction back to each image
+        num_bbox_per_img = tuple(len(_bbox) for _bbox in bboxes)
+        mask_preds = mask_preds.split(num_bbox_per_img, 0)
+
+        results_list = self.mask_head.predict_by_feat(
+            mask_preds=mask_preds,
+            results_list=results_list,
+            batch_img_metas=batch_img_metas,
+            rcnn_test_cfg=self.test_cfg,
+            rescale=rescale)
+
+        return results_list
+
+    def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList,
+                batch_data_samples: SampleList) -> tuple:
+        """Network forward process. Usually includes backbone, neck and head
+        forward without any post-processing.
+
+        Args:
+            x (List[Tensor]): Multi-level features that may have different
+                resolutions.
+            rpn_results_list (list[:obj:`InstanceData`]): List of region
+                proposals.
+            batch_data_samples (list[:obj:`DetDataSample`]): Each item contains
+                the meta information of each image and corresponding
+                annotations.
+
+        Returns:
+            tuple: A tuple of features from ``bbox_head`` and ``mask_head``
+            forward.
+        """
+        results = ()
+        batch_img_metas = [
+            data_samples.metainfo for data_samples in batch_data_samples
+        ]
+
+        if self.with_semantic:
+            _, semantic_feat = self.semantic_head(x)
+        else:
+            semantic_feat = None
+
+        if self.with_glbctx:
+            _, glbctx_feat = self.glbctx_head(x)
+        else:
+            glbctx_feat = None
+
+        proposals = [rpn_results.bboxes for rpn_results in rpn_results_list]
+        num_proposals_per_img = tuple(len(p) for p in proposals)
+        rois = bbox2roi(proposals)
+        # bbox head
+        if self.with_bbox:
+            rois, cls_scores, bbox_preds = self._refine_roi(
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat,
+                batch_img_metas=batch_img_metas,
+                num_proposals_per_img=num_proposals_per_img)
+            results = results + (cls_scores, bbox_preds)
+        # mask head
+        if self.with_mask:
+            rois = torch.cat(rois)
+            bboxes_results = self._bbox_forward(
+                stage=-1,
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat)
+            relayed_feat = bboxes_results['relayed_feat']
+            relayed_feat = self.feat_relay_head(relayed_feat)
+            mask_results = self._mask_forward(
+                x=x,
+                rois=rois,
+                semantic_feat=semantic_feat,
+                glbctx_feat=glbctx_feat,
+                relayed_feat=relayed_feat)
+            mask_preds = mask_results['mask_preds']
+            mask_preds = mask_preds.split(num_proposals_per_img, 0)
+            results = results + (mask_preds, )
+        return results
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/shared_heads/__init__.py b/head_extractor/build/lib/mmdet/models/roi_heads/shared_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d56636ab34d1dd2592828238099bcdccf179d6d3
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/shared_heads/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .res_layer import ResLayer
+
+__all__ = ['ResLayer']
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/shared_heads/res_layer.py b/head_extractor/build/lib/mmdet/models/roi_heads/shared_heads/res_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9210cb928fec92135a195d44d13a8588382b947
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/shared_heads/res_layer.py
@@ -0,0 +1,79 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmdet.models.backbones import ResNet
+from mmdet.models.layers import ResLayer as _ResLayer
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class ResLayer(BaseModule):
+
+    def __init__(self,
+                 depth,
+                 stage=3,
+                 stride=2,
+                 dilation=1,
+                 style='pytorch',
+                 norm_cfg=dict(type='BN', requires_grad=True),
+                 norm_eval=True,
+                 with_cp=False,
+                 dcn=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super(ResLayer, self).__init__(init_cfg)
+
+        self.norm_eval = norm_eval
+        self.norm_cfg = norm_cfg
+        self.stage = stage
+        self.fp16_enabled = False
+        block, stage_blocks = ResNet.arch_settings[depth]
+        stage_block = stage_blocks[stage]
+        planes = 64 * 2**stage
+        inplanes = 64 * 2**(stage - 1) * block.expansion
+
+        res_layer = _ResLayer(
+            block,
+            inplanes,
+            planes,
+            stage_block,
+            stride=stride,
+            dilation=dilation,
+            style=style,
+            with_cp=with_cp,
+            norm_cfg=self.norm_cfg,
+            dcn=dcn)
+        self.add_module(f'layer{stage + 1}', res_layer)
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+    def forward(self, x):
+        res_layer = getattr(self, f'layer{self.stage + 1}')
+        out = res_layer(x)
+        return out
+
+    def train(self, mode=True):
+        super(ResLayer, self).train(mode)
+        if self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.BatchNorm2d):
+                    m.eval()
diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/sparse_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/sparse_roi_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..19c3e1e335ca4e4a9d5befcbffcf4665b459cb5a
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/roi_heads/sparse_roi_head.py
@@ -0,0 +1,601 @@
+# Copyright (c) OpenMMLab. All rights reserved.
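A minimal usage sketch for the `ResLayer` shared head above, assuming an environment with mmdet and torch installed; for depth=50 and stage=3 it wraps the res5 block stack, so (N, 1024, 14, 14) RoI features come out as (N, 2048, 7, 7):

import torch

from mmdet.models.roi_heads.shared_heads import ResLayer

shared_head = ResLayer(depth=50, stage=3, stride=2)
shared_head.eval()  # keeps BatchNorm frozen, matching norm_eval=True
roi_feats = torch.randn(4, 1024, 14, 14)  # (num_rois, C, H, W) RoI features
with torch.no_grad():
    out = shared_head(roi_feats)  # res5 block stack
assert out.shape == (4, 2048, 7, 7)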
+from typing import List, Tuple
+
+import torch
+from mmengine.config import ConfigDict
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmdet.models.task_modules.samplers import PseudoSampler
+from mmdet.registry import MODELS
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import bbox2roi
+from mmdet.utils import ConfigType, InstanceList, OptConfigType
+from ..utils.misc import empty_instances, unpack_gt_instances
+from .cascade_roi_head import CascadeRoIHead
+
+
+@MODELS.register_module()
+class SparseRoIHead(CascadeRoIHead):
+    r"""The RoIHead for `Sparse R-CNN: End-to-End Object Detection with
+    Learnable Proposals <https://arxiv.org/abs/2011.12450>`_
+    and `Instances as Queries <https://arxiv.org/abs/2105.01928>`_
+
+    Args:
+        num_stages (int): Number of stages in the whole iterative process.
+            Defaults to 6.
+        stage_loss_weights (Tuple[float]): The loss
+            weight of each stage. By default all stages have
+            the same weight 1.
+        proposal_feature_channel (int): Channel number of the per-proposal
+            features. Defaults to 256.
+        bbox_roi_extractor (:obj:`ConfigDict` or dict): Config of box
+            roi extractor.
+        mask_roi_extractor (:obj:`ConfigDict` or dict): Config of mask
+            roi extractor.
+        bbox_head (:obj:`ConfigDict` or dict): Config of box head.
+        mask_head (:obj:`ConfigDict` or dict): Config of mask head.
+        train_cfg (:obj:`ConfigDict` or dict, Optional): Configuration
+            information in train stage. Defaults to None.
+        test_cfg (:obj:`ConfigDict` or dict, Optional): Configuration
+            information in test stage. Defaults to None.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict. Defaults to None.
+    """
+
+    def __init__(self,
+                 num_stages: int = 6,
+                 stage_loss_weights: Tuple[float] = (1, 1, 1, 1, 1, 1),
+                 proposal_feature_channel: int = 256,
+                 bbox_roi_extractor: ConfigType = dict(
+                     type='SingleRoIExtractor',
+                     roi_layer=dict(
+                         type='RoIAlign', output_size=7, sampling_ratio=2),
+                     out_channels=256,
+                     featmap_strides=[4, 8, 16, 32]),
+                 mask_roi_extractor: OptConfigType = None,
+                 bbox_head: ConfigType = dict(
+                     type='DIIHead',
+                     num_classes=80,
+                     num_fcs=2,
+                     num_heads=8,
+                     num_cls_fcs=1,
+                     num_reg_fcs=3,
+                     feedforward_channels=2048,
+                     hidden_channels=256,
+                     dropout=0.0,
+                     roi_feat_size=7,
+                     ffn_act_cfg=dict(type='ReLU', inplace=True)),
+                 mask_head: OptConfigType = None,
+                 train_cfg: OptConfigType = None,
+                 test_cfg: OptConfigType = None,
+                 init_cfg: OptConfigType = None) -> None:
+        assert bbox_roi_extractor is not None
+        assert bbox_head is not None
+        assert len(stage_loss_weights) == num_stages
+        self.num_stages = num_stages
+        self.stage_loss_weights = stage_loss_weights
+        self.proposal_feature_channel = proposal_feature_channel
+        super().__init__(
+            num_stages=num_stages,
+            stage_loss_weights=stage_loss_weights,
+            bbox_roi_extractor=bbox_roi_extractor,
+            mask_roi_extractor=mask_roi_extractor,
+            bbox_head=bbox_head,
+            mask_head=mask_head,
+            train_cfg=train_cfg,
+            test_cfg=test_cfg,
+            init_cfg=init_cfg)
+        # train_cfg would be None when running test.py
+        if train_cfg is not None:
+            for stage in range(num_stages):
+                assert isinstance(self.bbox_sampler[stage], PseudoSampler), \
+                    'Sparse R-CNN and QueryInst only support `PseudoSampler`'
+
+    def bbox_loss(self, stage: int, x: Tuple[Tensor],
+                  results_list: InstanceList, object_feats: Tensor,
+                  batch_img_metas: List[dict],
+                  batch_gt_instances: InstanceList) -> dict:
+        """Perform forward propagation and loss calculation of the bbox head
+        on the features of the upstream network.
+
+        Args:
+            stage (int): The current stage in the iterative process.
+            x (tuple[Tensor]): List of multi-level img features.
+            results_list (List[:obj:`InstanceData`]): List of region
+                proposals.
+            object_feats (Tensor): The object feature extracted from
+                the previous stage.
+            batch_img_metas (list[dict]): Meta information of each image.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+
+        Returns:
+            dict[str, Tensor]: Usually returns a dictionary with keys:
+
+            - `cls_score` (Tensor): Classification scores.
+            - `bbox_pred` (Tensor): Box energies / deltas.
+            - `bbox_feats` (Tensor): Extracted bbox RoI features.
+            - `loss_bbox` (dict): A dictionary of bbox loss components.
+        """
+        proposal_list = [res.bboxes for res in results_list]
+        rois = bbox2roi(proposal_list)
+        bbox_results = self._bbox_forward(stage, x, rois, object_feats,
+                                          batch_img_metas)
+        imgs_whwh = torch.cat(
+            [res.imgs_whwh[None, ...] for res in results_list])
+        cls_pred_list = bbox_results['detached_cls_scores']
+        proposal_list = bbox_results['detached_proposals']
+
+        sampling_results = []
+        bbox_head = self.bbox_head[stage]
+        for i in range(len(batch_img_metas)):
+            pred_instances = InstanceData()
+            # TODO: Enhance the logic
+            pred_instances.bboxes = proposal_list[i]  # for assigner
+            pred_instances.scores = cls_pred_list[i]
+            pred_instances.priors = proposal_list[i]  # for sampler
+
+            assign_result = self.bbox_assigner[stage].assign(
+                pred_instances=pred_instances,
+                gt_instances=batch_gt_instances[i],
+                gt_instances_ignore=None,
+                img_meta=batch_img_metas[i])
+
+            sampling_result = self.bbox_sampler[stage].sample(
+                assign_result, pred_instances, batch_gt_instances[i])
+            sampling_results.append(sampling_result)
+
+        bbox_results.update(sampling_results=sampling_results)
+
+        cls_score = bbox_results['cls_score']
+        decoded_bboxes = bbox_results['decoded_bboxes']
+        cls_score = cls_score.view(-1, cls_score.size(-1))
+        decoded_bboxes = decoded_bboxes.view(-1, 4)
+        bbox_loss_and_target = bbox_head.loss_and_target(
+            cls_score,
+            decoded_bboxes,
+            sampling_results,
+            self.train_cfg[stage],
+            imgs_whwh=imgs_whwh,
+            concat=True)
+        bbox_results.update(bbox_loss_and_target)
+
+        # propose for the new proposal_list
+        proposal_list = []
+        for idx in range(len(batch_img_metas)):
+            results = InstanceData()
+            results.imgs_whwh = results_list[idx].imgs_whwh
+            results.bboxes = bbox_results['detached_proposals'][idx]
+            proposal_list.append(results)
+        bbox_results.update(results_list=proposal_list)
+        return bbox_results
+
+    def _bbox_forward(self, stage: int, x: Tuple[Tensor], rois: Tensor,
+                      object_feats: Tensor,
+                      batch_img_metas: List[dict]) -> dict:
+        """Box head forward function used in both training and testing.
+        Returns all regression and classification results and an
+        intermediate feature.
+
+        Args:
+            stage (int): The current stage in the iterative process.
+            x (tuple[Tensor]): List of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+                Each dimension means (img_index, x1, y1, x2, y2).
+            object_feats (Tensor): The object feature extracted from
+                the previous stage.
+            batch_img_metas (list[dict]): Meta information of each image.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of bbox head outputs,
+            containing the following results:
+
+            - cls_score (Tensor): The score of each class, has
+              shape (batch_size, num_proposals, num_classes)
+              when using focal loss or
+              (batch_size, num_proposals, num_classes+1)
+              otherwise.
+            - decoded_bboxes (Tensor): The regression results
+              with shape (batch_size, num_proposal, 4).
+              The last dimension 4 represents
+              [tl_x, tl_y, br_x, br_y].
+            - object_feats (Tensor): The object feature extracted
+              from the current stage.
+            - detached_cls_scores (list[Tensor]): The detached
+              classification results, length is batch_size, and
+              each tensor has shape (num_proposal, num_classes).
+            - detached_proposals (list[tensor]): The detached
+              regression results, length is batch_size, and each
+              tensor has shape (num_proposal, 4). The last
+              dimension 4 represents [tl_x, tl_y, br_x, br_y].
+        """
+        num_imgs = len(batch_img_metas)
+        bbox_roi_extractor = self.bbox_roi_extractor[stage]
+        bbox_head = self.bbox_head[stage]
+        bbox_feats = bbox_roi_extractor(x[:bbox_roi_extractor.num_inputs],
+                                        rois)
+        cls_score, bbox_pred, object_feats, attn_feats = bbox_head(
+            bbox_feats, object_feats)
+
+        fake_bbox_results = dict(
+            rois=rois,
+            bbox_targets=(rois.new_zeros(len(rois), dtype=torch.long), None),
+            bbox_pred=bbox_pred.view(-1, bbox_pred.size(-1)),
+            cls_score=cls_score.view(-1, cls_score.size(-1)))
+        fake_sampling_results = [
+            InstanceData(pos_is_gt=rois.new_zeros(object_feats.size(1)))
+            for _ in range(len(batch_img_metas))
+        ]
+
+        results_list = bbox_head.refine_bboxes(
+            sampling_results=fake_sampling_results,
+            bbox_results=fake_bbox_results,
+            batch_img_metas=batch_img_metas)
+        proposal_list = [res.bboxes for res in results_list]
+        bbox_results = dict(
+            cls_score=cls_score,
+            decoded_bboxes=torch.cat(proposal_list),
+            object_feats=object_feats,
+            attn_feats=attn_feats,
+            # detach then use it in label assign
+            detached_cls_scores=[
+                cls_score[i].detach() for i in range(num_imgs)
+            ],
+            detached_proposals=[item.detach() for item in proposal_list])
+
+        return bbox_results
+
+    def _mask_forward(self, stage: int, x: Tuple[Tensor], rois: Tensor,
+                      attn_feats) -> dict:
+        """Mask head forward function used in both training and testing.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            rois (Tensor): RoIs with the shape (n, 5) where the first
+                column indicates batch id of each RoI.
+            attn_feats (Tensor): Intermediate feature obtained from the last
+                DIIHead, has shape
+                (batch_size*num_proposals, feature_dimensions)
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+            - `mask_preds` (Tensor): Mask prediction.
+        """
+        mask_roi_extractor = self.mask_roi_extractor[stage]
+        mask_head = self.mask_head[stage]
+        mask_feats = mask_roi_extractor(x[:mask_roi_extractor.num_inputs],
+                                        rois)
+        # do not support caffe_c4 model anymore
+        mask_preds = mask_head(mask_feats, attn_feats)
+
+        mask_results = dict(mask_preds=mask_preds)
+        return mask_results
+
+    def mask_loss(self, stage: int, x: Tuple[Tensor], bbox_results: dict,
+                  batch_gt_instances: InstanceList,
+                  rcnn_train_cfg: ConfigDict) -> dict:
+        """Run forward function and calculate loss for mask head in training.
+
+        Args:
+            stage (int): The current stage in Cascade RoI Head.
+            x (tuple[Tensor]): Tuple of multi-level img features.
+            bbox_results (dict): Results obtained from `bbox_loss`.
+            batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+                gt_instance. It usually includes ``bboxes``, ``labels``, and
+                ``masks`` attributes.
+            rcnn_train_cfg (:obj:`ConfigDict`): `train_cfg` of RCNN.
+
+        Returns:
+            dict: Usually returns a dictionary with keys:
+
+            - `mask_preds` (Tensor): Mask prediction.
+            - `loss_mask` (dict): A dictionary of mask loss components.
+ """ + attn_feats = bbox_results['attn_feats'] + sampling_results = bbox_results['sampling_results'] + + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + + attn_feats = torch.cat([ + feats[res.pos_inds] + for (feats, res) in zip(attn_feats, sampling_results) + ]) + mask_results = self._mask_forward(stage, x, pos_rois, attn_feats) + + mask_loss_and_target = self.mask_head[stage].loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=rcnn_train_cfg) + mask_results.update(mask_loss_and_target) + + return mask_results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (List[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict: a dictionary of loss components of all stage. + """ + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, batch_img_metas \ + = outputs + + object_feats = torch.cat( + [res.pop('features')[None, ...] for res in rpn_results_list]) + results_list = rpn_results_list + losses = {} + for stage in range(self.num_stages): + stage_loss_weight = self.stage_loss_weights[stage] + + # bbox head forward and loss + bbox_results = self.bbox_loss( + stage=stage, + x=x, + object_feats=object_feats, + results_list=results_list, + batch_img_metas=batch_img_metas, + batch_gt_instances=batch_gt_instances) + + for name, value in bbox_results['loss_bbox'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + if self.with_mask: + mask_results = self.mask_loss( + stage=stage, + x=x, + bbox_results=bbox_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg[stage]) + + for name, value in mask_results['loss_mask'].items(): + losses[f's{stage}.{name}'] = ( + value * stage_loss_weight if 'loss' in name else value) + + object_feats = bbox_results['object_feats'] + results_list = bbox_results['results_list'] + return losses + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x(tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). 
+ """ + proposal_list = [res.bboxes for res in rpn_results_list] + object_feats = torch.cat( + [res.pop('features')[None, ...] for res in rpn_results_list]) + if all([proposal.shape[0] == 0 for proposal in proposal_list]): + # There is no proposal in the whole batch + return empty_instances( + batch_img_metas, x[0].device, task_type='bbox') + + for stage in range(self.num_stages): + rois = bbox2roi(proposal_list) + bbox_results = self._bbox_forward(stage, x, rois, object_feats, + batch_img_metas) + object_feats = bbox_results['object_feats'] + cls_score = bbox_results['cls_score'] + proposal_list = bbox_results['detached_proposals'] + + num_classes = self.bbox_head[-1].num_classes + + if self.bbox_head[-1].loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + else: + cls_score = cls_score.softmax(-1)[..., :-1] + + topk_inds_list = [] + results_list = [] + for img_id in range(len(batch_img_metas)): + cls_score_per_img = cls_score[img_id] + scores_per_img, topk_inds = cls_score_per_img.flatten(0, 1).topk( + self.test_cfg.max_per_img, sorted=False) + labels_per_img = topk_inds % num_classes + bboxes_per_img = proposal_list[img_id][topk_inds // num_classes] + topk_inds_list.append(topk_inds) + if rescale and bboxes_per_img.size(0) > 0: + assert batch_img_metas[img_id].get('scale_factor') is not None + scale_factor = bboxes_per_img.new_tensor( + batch_img_metas[img_id]['scale_factor']).repeat((1, 2)) + bboxes_per_img = ( + bboxes_per_img.view(bboxes_per_img.size(0), -1, 4) / + scale_factor).view(bboxes_per_img.size()[0], -1) + + results = InstanceData() + results.bboxes = bboxes_per_img + results.scores = scores_per_img + results.labels = labels_per_img + results_list.append(results) + if self.with_mask: + for img_id in range(len(batch_img_metas)): + # add positive information in InstanceData to predict + # mask results in `mask_head`. + proposals = bbox_results['detached_proposals'][img_id] + topk_inds = topk_inds_list[img_id] + attn_feats = bbox_results['attn_feats'][img_id] + + results_list[img_id].proposals = proposals + results_list[img_id].topk_inds = topk_inds + results_list[img_id].attn_feats = attn_feats + return results_list + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. Each item usually contains following keys: + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - proposal (Tensor): Bboxes predicted from bbox_head, + has a shape (num_instances, 4). + - topk_inds (Tensor): Topk indices of each image, has + shape (num_instances, ) + - attn_feats (Tensor): Intermediate feature get from the last + diihead, has shape (num_instances, feature_dimensions) + + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. 
+ + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + proposal_list = [res.pop('proposals') for res in results_list] + topk_inds_list = [res.pop('topk_inds') for res in results_list] + attn_feats = torch.cat( + [res.pop('attn_feats')[None, ...] for res in results_list]) + + rois = bbox2roi(proposal_list) + + if rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + last_stage = self.num_stages - 1 + mask_results = self._mask_forward(last_stage, x, rois, attn_feats) + + num_imgs = len(batch_img_metas) + mask_results['mask_preds'] = mask_results['mask_preds'].reshape( + num_imgs, -1, *mask_results['mask_preds'].size()[1:]) + num_classes = self.bbox_head[-1].num_classes + + mask_preds = [] + for img_id in range(num_imgs): + topk_inds = topk_inds_list[img_id] + masks_per_img = mask_results['mask_preds'][img_id].flatten( + 0, 1)[topk_inds] + masks_per_img = masks_per_img[:, None, + ...].repeat(1, num_classes, 1, 1) + mask_preds.append(masks_per_img) + results_list = self.mask_head[-1].predict_by_feat( + mask_preds, + results_list, + batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + + return results_list + + # TODO: Need to refactor later + def forward(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: SampleList) -> tuple: + """Network forward process. Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (List[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + outputs = unpack_gt_instances(batch_data_samples) + (batch_gt_instances, batch_gt_instances_ignore, + batch_img_metas) = outputs + + all_stage_bbox_results = [] + object_feats = torch.cat( + [res.pop('features')[None, ...] 
for res in rpn_results_list]) + results_list = rpn_results_list + if self.with_bbox: + for stage in range(self.num_stages): + bbox_results = self.bbox_loss( + stage=stage, + x=x, + results_list=results_list, + object_feats=object_feats, + batch_img_metas=batch_img_metas, + batch_gt_instances=batch_gt_instances) + bbox_results.pop('loss_bbox') + # torch.jit does not support obj:SamplingResult + bbox_results.pop('results_list') + bbox_res = bbox_results.copy() + bbox_res.pop('sampling_results') + all_stage_bbox_results.append((bbox_res, )) + + if self.with_mask: + attn_feats = bbox_results['attn_feats'] + sampling_results = bbox_results['sampling_results'] + + pos_rois = bbox2roi( + [res.pos_priors for res in sampling_results]) + + attn_feats = torch.cat([ + feats[res.pos_inds] + for (feats, res) in zip(attn_feats, sampling_results) + ]) + mask_results = self._mask_forward(stage, x, pos_rois, + attn_feats) + all_stage_bbox_results[-1] += (mask_results, ) + return tuple(all_stage_bbox_results) diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/standard_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/standard_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8d168eba0fb2ccf6aa89bde5c637160f10aea83a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/standard_roi_head.py @@ -0,0 +1,419 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample, SampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import ConfigType, InstanceList +from ..task_modules.samplers import SamplingResult +from ..utils import empty_instances, unpack_gt_instances +from .base_roi_head import BaseRoIHead + + +@MODELS.register_module() +class StandardRoIHead(BaseRoIHead): + """Simplest base roi head including one bbox head and one mask head.""" + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.bbox_sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + + def init_bbox_head(self, bbox_roi_extractor: ConfigType, + bbox_head: ConfigType) -> None: + """Initialize box head and box roi extractor. + + Args: + bbox_roi_extractor (dict or ConfigDict): Config of box + roi extractor. + bbox_head (dict or ConfigDict): Config of box in box head. + """ + self.bbox_roi_extractor = MODELS.build(bbox_roi_extractor) + self.bbox_head = MODELS.build(bbox_head) + + def init_mask_head(self, mask_roi_extractor: ConfigType, + mask_head: ConfigType) -> None: + """Initialize mask head and mask roi extractor. + + Args: + mask_roi_extractor (dict or ConfigDict): Config of mask roi + extractor. + mask_head (dict or ConfigDict): Config of mask in mask head. + """ + if mask_roi_extractor is not None: + self.mask_roi_extractor = MODELS.build(mask_roi_extractor) + self.share_roi_extractor = False + else: + self.share_roi_extractor = True + self.mask_roi_extractor = self.bbox_roi_extractor + self.mask_head = MODELS.build(mask_head) + + # TODO: Need to refactor later + def forward(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList = None) -> tuple: + """Network forward process. 
Usually includes backbone, neck and head + forward without any post-processing. + + Args: + x (List[Tensor]): Multi-level features that may have different + resolutions. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): Each item contains + the meta information of each image and corresponding + annotations. + + Returns + tuple: A tuple of features from ``bbox_head`` and ``mask_head`` + forward. + """ + results = () + proposals = [rpn_results.bboxes for rpn_results in rpn_results_list] + rois = bbox2roi(proposals) + # bbox head + if self.with_bbox: + bbox_results = self._bbox_forward(x, rois) + results = results + (bbox_results['cls_score'], + bbox_results['bbox_pred']) + # mask head + if self.with_mask: + mask_rois = rois[:100] + mask_results = self._mask_forward(x, mask_rois) + results = results + (mask_results['mask_preds'], ) + return results + + def loss(self, x: Tuple[Tensor], rpn_results_list: InstanceList, + batch_data_samples: List[DetDataSample]) -> dict: + """Perform forward propagation and loss calculation of the detection + roi on the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + dict[str, Tensor]: A dictionary of loss components + """ + assert len(rpn_results_list) == len(batch_data_samples) + outputs = unpack_gt_instances(batch_data_samples) + batch_gt_instances, batch_gt_instances_ignore, _ = outputs + + # assign gts and sample proposals + num_imgs = len(batch_data_samples) + sampling_results = [] + for i in range(num_imgs): + # rename rpn_results.bboxes to rpn_results.priors + rpn_results = rpn_results_list[i] + rpn_results.priors = rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in x]) + sampling_results.append(sampling_result) + + losses = dict() + # bbox head loss + if self.with_bbox: + bbox_results = self.bbox_loss(x, sampling_results) + losses.update(bbox_results['loss_bbox']) + + # mask head forward and loss + if self.with_mask: + mask_results = self.mask_loss(x, sampling_results, + bbox_results['bbox_feats'], + batch_gt_instances) + losses.update(mask_results['loss_mask']) + + return losses + + def _bbox_forward(self, x: Tuple[Tensor], rois: Tensor) -> dict: + """Box head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): List of multi-level img features. + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. 
+ """ + # TODO: a more flexible way to decide which feature maps to use + bbox_feats = self.bbox_roi_extractor( + x[:self.bbox_roi_extractor.num_inputs], rois) + if self.with_shared_head: + bbox_feats = self.shared_head(bbox_feats) + cls_score, bbox_pred = self.bbox_head(bbox_feats) + + bbox_results = dict( + cls_score=cls_score, bbox_pred=bbox_pred, bbox_feats=bbox_feats) + return bbox_results + + def bbox_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult]) -> dict: + """Perform forward propagation and loss calculation of the bbox head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): List of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `cls_score` (Tensor): Classification scores. + - `bbox_pred` (Tensor): Box energies / deltas. + - `bbox_feats` (Tensor): Extract bbox RoI features. + - `loss_bbox` (dict): A dictionary of bbox loss components. + """ + rois = bbox2roi([res.priors for res in sampling_results]) + bbox_results = self._bbox_forward(x, rois) + + bbox_loss_and_target = self.bbox_head.loss_and_target( + cls_score=bbox_results['cls_score'], + bbox_pred=bbox_results['bbox_pred'], + rois=rois, + sampling_results=sampling_results, + rcnn_train_cfg=self.train_cfg) + + bbox_results.update(loss_bbox=bbox_loss_and_target['loss_bbox']) + return bbox_results + + def mask_loss(self, x: Tuple[Tensor], + sampling_results: List[SamplingResult], bbox_feats: Tensor, + batch_gt_instances: InstanceList) -> dict: + """Perform forward propagation and loss calculation of the mask head on + the features of the upstream network. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. + sampling_results (list["obj:`SamplingResult`]): Sampling results. + bbox_feats (Tensor): Extract bbox RoI features. + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``bboxes``, ``labels``, and + ``masks`` attributes. + + Returns: + dict: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `mask_feats` (Tensor): Extract mask RoI features. + - `mask_targets` (Tensor): Mask target of each positive\ + proposals in the image. + - `loss_mask` (dict): A dictionary of mask loss components. + """ + if not self.share_roi_extractor: + pos_rois = bbox2roi([res.pos_priors for res in sampling_results]) + mask_results = self._mask_forward(x, pos_rois) + else: + pos_inds = [] + device = bbox_feats.device + for res in sampling_results: + pos_inds.append( + torch.ones( + res.pos_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds.append( + torch.zeros( + res.neg_priors.shape[0], + device=device, + dtype=torch.uint8)) + pos_inds = torch.cat(pos_inds) + + mask_results = self._mask_forward( + x, pos_inds=pos_inds, bbox_feats=bbox_feats) + + mask_loss_and_target = self.mask_head.loss_and_target( + mask_preds=mask_results['mask_preds'], + sampling_results=sampling_results, + batch_gt_instances=batch_gt_instances, + rcnn_train_cfg=self.train_cfg) + + mask_results.update(loss_mask=mask_loss_and_target['loss_mask']) + return mask_results + + def _mask_forward(self, + x: Tuple[Tensor], + rois: Tensor = None, + pos_inds: Optional[Tensor] = None, + bbox_feats: Optional[Tensor] = None) -> dict: + """Mask head forward function used in both training and testing. + + Args: + x (tuple[Tensor]): Tuple of multi-level img features. 
+ rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + pos_inds (Tensor, optional): Indices of positive samples. + Defaults to None. + bbox_feats (Tensor): Extract bbox RoI features. Defaults to None. + + Returns: + dict[str, Tensor]: Usually returns a dictionary with keys: + + - `mask_preds` (Tensor): Mask prediction. + - `mask_feats` (Tensor): Extract mask RoI features. + """ + assert ((rois is not None) ^ + (pos_inds is not None and bbox_feats is not None)) + if rois is not None: + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], rois) + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + else: + assert bbox_feats is not None + mask_feats = bbox_feats[pos_inds] + + mask_preds = self.mask_head(mask_feats) + mask_results = dict(mask_preds=mask_preds, mask_feats=mask_feats) + return mask_results + + def predict_bbox(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + rpn_results_list: InstanceList, + rcnn_test_cfg: ConfigType, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the bbox head and predict detection + results on the features of the upstream network. + + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + rcnn_test_cfg (obj:`ConfigDict`): `test_cfg` of R-CNN. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + proposals = [res.bboxes for res in rpn_results_list] + rois = bbox2roi(proposals) + + if rois.shape[0] == 0: + return empty_instances( + batch_img_metas, + rois.device, + task_type='bbox', + box_type=self.bbox_head.predict_box_type, + num_classes=self.bbox_head.num_classes, + score_per_cls=rcnn_test_cfg is None) + + bbox_results = self._bbox_forward(x, rois) + + # split batch bbox prediction back to each image + cls_scores = bbox_results['cls_score'] + bbox_preds = bbox_results['bbox_pred'] + num_proposals_per_img = tuple(len(p) for p in proposals) + rois = rois.split(num_proposals_per_img, 0) + cls_scores = cls_scores.split(num_proposals_per_img, 0) + + # some detector with_reg is False, bbox_preds will be None + if bbox_preds is not None: + # TODO move this to a sabl_roi_head + # the bbox prediction of some detectors like SABL is not Tensor + if isinstance(bbox_preds, torch.Tensor): + bbox_preds = bbox_preds.split(num_proposals_per_img, 0) + else: + bbox_preds = self.bbox_head.bbox_pred_split( + bbox_preds, num_proposals_per_img) + else: + bbox_preds = (None, ) * len(proposals) + + result_list = self.bbox_head.predict_by_feat( + rois=rois, + cls_scores=cls_scores, + bbox_preds=bbox_preds, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=rcnn_test_cfg, + rescale=rescale) + return result_list + + def predict_mask(self, + x: Tuple[Tensor], + batch_img_metas: List[dict], + results_list: InstanceList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the mask head and predict detection + results on the features of the upstream network. 
+ + Args: + x (tuple[Tensor]): Feature maps of all scale level. + batch_img_metas (list[dict]): List of image information. + results_list (list[:obj:`InstanceData`]): Detection results of + each image. + rescale (bool): If True, return boxes in original image space. + Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + after the post process. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). + """ + # don't need to consider aug_test. + bboxes = [res.bboxes for res in results_list] + mask_rois = bbox2roi(bboxes) + if mask_rois.shape[0] == 0: + results_list = empty_instances( + batch_img_metas, + mask_rois.device, + task_type='mask', + instance_results=results_list, + mask_thr_binary=self.test_cfg.mask_thr_binary) + return results_list + + mask_results = self._mask_forward(x, mask_rois) + mask_preds = mask_results['mask_preds'] + # split batch mask prediction back to each image + num_mask_rois_per_img = [len(res) for res in results_list] + mask_preds = mask_preds.split(num_mask_rois_per_img, 0) + + # TODO: Handle the case where rescale is false + results_list = self.mask_head.predict_by_feat( + mask_preds=mask_preds, + results_list=results_list, + batch_img_metas=batch_img_metas, + rcnn_test_cfg=self.test_cfg, + rescale=rescale) + return results_list diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/test_mixins.py b/head_extractor/build/lib/mmdet/models/roi_heads/test_mixins.py new file mode 100644 index 0000000000000000000000000000000000000000..940490454d9cf1fde4d69c1f890c173b92d522a1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/test_mixins.py @@ -0,0 +1,171 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
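# Editor's note: a hedged, self-contained sketch (not part of this patch) of
# the test-time-augmentation idea the mixins below rely on: predictions made
# on a flipped or rescaled view are mapped back to the original image frame
# before being merged (mmdet does this with `bbox_mapping` and
# `merge_aug_bboxes`). The helper below only handles horizontal flip and a
# scalar scale factor; it is an illustration, not the library implementation.
import torch


def map_back(bboxes: torch.Tensor, img_width: float, scale: float,
             hflip: bool) -> torch.Tensor:
    """Undo horizontal flip and rescaling for (N, 4) x1,y1,x2,y2 boxes."""
    bboxes = bboxes.clone()
    if hflip:
        x1 = img_width - bboxes[:, 2]
        x2 = img_width - bboxes[:, 0]
        bboxes[:, 0], bboxes[:, 2] = x1, x2
    return bboxes / scale


# Two augmented views of one object; the second view was flipped on a
# 200-pixel-wide image. After mapping back, both agree and can be averaged,
# which is the same merging rule merge_aug_bboxes applies.
aug_boxes = [torch.tensor([[10., 10., 50., 40.]]),
             torch.tensor([[150., 10., 190., 40.]])]
restored = [map_back(aug_boxes[0], 200., 1.0, hflip=False),
            map_back(aug_boxes[1], 200., 1.0, hflip=True)]
merged = torch.stack(restored).mean(0)  # -> [[10., 10., 50., 40.]]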
+# TODO: delete this file after refactor +import sys + +import torch + +from mmdet.models.layers import multiclass_nms +from mmdet.models.test_time_augs import merge_aug_bboxes, merge_aug_masks +from mmdet.structures.bbox import bbox2roi, bbox_mapping + +if sys.version_info >= (3, 7): + from mmdet.utils.contextmanagers import completed + + +class BBoxTestMixin: + + if sys.version_info >= (3, 7): + # TODO: Currently not supported + async def async_test_bboxes(self, + x, + img_metas, + proposals, + rcnn_test_cfg, + rescale=False, + **kwargs): + """Asynchronized test for box head without augmentation.""" + rois = bbox2roi(proposals) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + if self.with_shared_head: + roi_feats = self.shared_head(roi_feats) + sleep_interval = rcnn_test_cfg.get('async_sleep_interval', 0.017) + + async with completed( + __name__, 'bbox_head_forward', + sleep_interval=sleep_interval): + cls_score, bbox_pred = self.bbox_head(roi_feats) + + img_shape = img_metas[0]['img_shape'] + scale_factor = img_metas[0]['scale_factor'] + det_bboxes, det_labels = self.bbox_head.get_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=rescale, + cfg=rcnn_test_cfg) + return det_bboxes, det_labels + + # TODO: Currently not supported + def aug_test_bboxes(self, feats, img_metas, rpn_results_list, + rcnn_test_cfg): + """Test det bboxes with test time augmentation.""" + aug_bboxes = [] + aug_scores = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + # TODO more flexible + proposals = bbox_mapping(rpn_results_list[0][:, :4], img_shape, + scale_factor, flip, flip_direction) + rois = bbox2roi([proposals]) + bbox_results = self.bbox_forward(x, rois) + bboxes, scores = self.bbox_head.get_bboxes( + rois, + bbox_results['cls_score'], + bbox_results['bbox_pred'], + img_shape, + scale_factor, + rescale=False, + cfg=None) + aug_bboxes.append(bboxes) + aug_scores.append(scores) + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, rcnn_test_cfg) + if merged_bboxes.shape[0] == 0: + # There is no proposal in the single image + det_bboxes = merged_bboxes.new_zeros(0, 5) + det_labels = merged_bboxes.new_zeros((0, ), dtype=torch.long) + else: + det_bboxes, det_labels = multiclass_nms(merged_bboxes, + merged_scores, + rcnn_test_cfg.score_thr, + rcnn_test_cfg.nms, + rcnn_test_cfg.max_per_img) + return det_bboxes, det_labels + + +class MaskTestMixin: + + if sys.version_info >= (3, 7): + # TODO: Currently not supported + async def async_test_mask(self, + x, + img_metas, + det_bboxes, + det_labels, + rescale=False, + mask_test_cfg=None): + """Asynchronized test for mask head without augmentation.""" + # image shape of the first image in the batch (only one) + ori_shape = img_metas[0]['ori_shape'] + scale_factor = img_metas[0]['scale_factor'] + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + if rescale and not isinstance(scale_factor, + (float, torch.Tensor)): + scale_factor = det_bboxes.new_tensor(scale_factor) + _bboxes = ( + det_bboxes[:, :4] * + scale_factor if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + 
x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + + if self.with_shared_head: + mask_feats = self.shared_head(mask_feats) + if mask_test_cfg and \ + mask_test_cfg.get('async_sleep_interval'): + sleep_interval = mask_test_cfg['async_sleep_interval'] + else: + sleep_interval = 0.035 + async with completed( + __name__, + 'mask_head_forward', + sleep_interval=sleep_interval): + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_results( + mask_pred, _bboxes, det_labels, self.test_cfg, ori_shape, + scale_factor, rescale) + return segm_result + + # TODO: Currently not supported + def aug_test_mask(self, feats, img_metas, det_bboxes, det_labels): + """Test for mask head with test time augmentation.""" + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes)] + else: + aug_masks = [] + for x, img_meta in zip(feats, img_metas): + img_shape = img_meta[0]['img_shape'] + scale_factor = img_meta[0]['scale_factor'] + flip = img_meta[0]['flip'] + flip_direction = img_meta[0]['flip_direction'] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, + scale_factor, flip, flip_direction) + mask_rois = bbox2roi([_bboxes]) + mask_results = self._mask_forward(x, mask_rois) + # convert to numpy array to save memory + aug_masks.append( + mask_results['mask_pred'].sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, img_metas, self.test_cfg) + + ori_shape = img_metas[0][0]['ori_shape'] + scale_factor = det_bboxes.new_ones(4) + segm_result = self.mask_head.get_results( + merged_masks, + det_bboxes, + det_labels, + self.test_cfg, + ori_shape, + scale_factor=scale_factor, + rescale=False) + return segm_result diff --git a/head_extractor/build/lib/mmdet/models/roi_heads/trident_roi_head.py b/head_extractor/build/lib/mmdet/models/roi_heads/trident_roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5215327296282a8e7ca502f3321aced8a4f840b7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/roi_heads/trident_roi_head.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from mmcv.ops import batched_nms +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import InstanceList +from .standard_roi_head import StandardRoIHead + + +@MODELS.register_module() +class TridentRoIHead(StandardRoIHead): + """Trident roi head. + + Args: + num_branch (int): Number of branches in TridentNet. + test_branch_idx (int): In inference, all 3 branches will be used + if `test_branch_idx==-1`, otherwise only branch with index + `test_branch_idx` will be used. + """ + + def __init__(self, num_branch: int, test_branch_idx: int, + **kwargs) -> None: + self.num_branch = num_branch + self.test_branch_idx = test_branch_idx + super().__init__(**kwargs) + + def merge_trident_bboxes(self, + trident_results: InstanceList) -> InstanceData: + """Merge bbox predictions of each branch. + + Args: + trident_results (List[:obj:`InstanceData`]): A list of InstanceData + predicted from every branch. + + Returns: + :obj:`InstanceData`: merged InstanceData. 
+ """ + bboxes = torch.cat([res.bboxes for res in trident_results]) + scores = torch.cat([res.scores for res in trident_results]) + labels = torch.cat([res.labels for res in trident_results]) + + nms_cfg = self.test_cfg['nms'] + results = InstanceData() + if bboxes.numel() == 0: + results.bboxes = bboxes + results.scores = scores + results.labels = labels + else: + det_bboxes, keep = batched_nms(bboxes, scores, labels, nms_cfg) + results.bboxes = det_bboxes[:, :-1] + results.scores = det_bboxes[:, -1] + results.labels = labels[keep] + + if self.test_cfg['max_per_img'] > 0: + results = results[:self.test_cfg['max_per_img']] + return results + + def predict(self, + x: Tuple[Tensor], + rpn_results_list: InstanceList, + batch_data_samples: SampleList, + rescale: bool = False) -> InstanceList: + """Perform forward propagation of the roi head and predict detection + results on the features of the upstream network. + + - Compute prediction bbox and label per branch. + - Merge predictions of each branch according to scores of + bboxes, i.e., bboxes with higher score are kept to give + top-k prediction. + + Args: + x (tuple[Tensor]): Features from upstream network. Each + has shape (N, C, H, W). + rpn_results_list (list[:obj:`InstanceData`]): list of region + proposals. + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): Whether to rescale the results to + the original image. Defaults to True. + + Returns: + list[obj:`InstanceData`]: Detection results of each image. + Each item usually contains following keys. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + """ + results_list = super().predict( + x=x, + rpn_results_list=rpn_results_list, + batch_data_samples=batch_data_samples, + rescale=rescale) + + num_branch = self.num_branch \ + if self.training or self.test_branch_idx == -1 else 1 + + merged_results_list = [] + for i in range(len(batch_data_samples) // num_branch): + merged_results_list.append( + self.merge_trident_bboxes(results_list[i * num_branch:(i + 1) * + num_branch])) + return merged_results_list diff --git a/head_extractor/build/lib/mmdet/models/seg_heads/__init__.py b/head_extractor/build/lib/mmdet/models/seg_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b489a905b1e9b6cef2e8b9575600990563128e4e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/seg_heads/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .panoptic_fpn_head import PanopticFPNHead # noqa: F401,F403 +from .panoptic_fusion_heads import * # noqa: F401,F403 diff --git a/head_extractor/build/lib/mmdet/models/seg_heads/base_semantic_head.py b/head_extractor/build/lib/mmdet/models/seg_heads/base_semantic_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1db71549d89766c45012517c20cef443f4760419 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/seg_heads/base_semantic_head.py @@ -0,0 +1,113 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod +from typing import Dict, List, Tuple, Union + +import torch.nn.functional as F +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptMultiConfig + + +@MODELS.register_module() +class BaseSemanticHead(BaseModule, metaclass=ABCMeta): + """Base module of Semantic Head. + + Args: + num_classes (int): the number of classes. + seg_rescale_factor (float): the rescale factor for ``gt_sem_seg``, + which equals to ``1 / output_strides``. The output_strides is + for ``seg_preds``. Defaults to 1 / 4. + init_cfg (Optional[Union[:obj:`ConfigDict`, dict]]): the initialization + config. + loss_seg (Union[:obj:`ConfigDict`, dict]): the loss of the semantic + head. + """ + + def __init__(self, + num_classes: int, + seg_rescale_factor: float = 1 / 4., + loss_seg: ConfigType = dict( + type='CrossEntropyLoss', + ignore_index=255, + loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + super().__init__(init_cfg=init_cfg) + self.loss_seg = MODELS.build(loss_seg) + self.num_classes = num_classes + self.seg_rescale_factor = seg_rescale_factor + + @abstractmethod + def forward(self, x: Union[Tensor, Tuple[Tensor]]) -> Dict[str, Tensor]: + """Placeholder of forward function. + + Args: + x (Tensor): Feature maps. + + Returns: + Dict[str, Tensor]: A dictionary, including features + and predicted scores. Required keys: 'seg_preds' + and 'feats'. + """ + pass + + @abstractmethod + def loss(self, x: Union[Tensor, Tuple[Tensor]], + batch_data_samples: SampleList) -> Dict[str, Tensor]: + """ + Args: + x (Union[Tensor, Tuple[Tensor]]): Feature maps. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Args: + x (Tensor): Feature maps. + + Returns: + Dict[str, Tensor]: The loss of semantic head. + """ + pass + + def predict(self, + x: Union[Tensor, Tuple[Tensor]], + batch_img_metas: List[dict], + rescale: bool = False) -> List[Tensor]: + """Test without Augmentation. + + Args: + x (Union[Tensor, Tuple[Tensor]]): Feature maps. + batch_img_metas (List[dict]): List of image information. + rescale (bool): Whether to rescale the results. + Defaults to False. + + Returns: + list[Tensor]: semantic segmentation logits. + """ + seg_preds = self.forward(x)['seg_preds'] + seg_preds = F.interpolate( + seg_preds, + size=batch_img_metas[0]['batch_input_shape'], + mode='bilinear', + align_corners=False) + seg_preds = [seg_preds[i] for i in range(len(batch_img_metas))] + + if rescale: + seg_pred_list = [] + for i in range(len(batch_img_metas)): + h, w = batch_img_metas[i]['img_shape'] + seg_pred = seg_preds[i][:, :h, :w] + + h, w = batch_img_metas[i]['ori_shape'] + seg_pred = F.interpolate( + seg_pred[None], + size=(h, w), + mode='bilinear', + align_corners=False)[0] + seg_pred_list.append(seg_pred) + else: + seg_pred_list = seg_preds + + return seg_pred_list diff --git a/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fpn_head.py b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8d8b901360922f6cdb9f8d15b60dac8d7514ee75 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fpn_head.py @@ -0,0 +1,174 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
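# Editor's note: a tiny worked example (not part of this patch) of the label
# remapping done by `PanopticFPNHead._set_things_to_void` below, using 3
# thing classes and 2 stuff classes instead of the COCO defaults: stuff
# labels are shifted to start at 0, and every thing label collapses into the
# single extra channel `num_stuff_classes`.
import torch

num_things, num_stuff = 3, 2
gt = torch.tensor([0, 2, 3, 4, 255])     # raw semantic labels (255 = ignore)
fg = gt < num_things
bg = (gt >= num_things) & (gt < num_things + num_stuff)
new = gt.clone()
new[bg] = gt[bg] - num_things            # stuff: 3, 4 -> 0, 1
new[fg] = num_stuff                      # things: 0..2 -> 2 (merged class)
assert new.tolist() == [2, 2, 0, 1, 255]  # ignore label passes through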
+from typing import Dict, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import ModuleList +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig +from ..layers import ConvUpsample +from ..utils import interpolate_as +from .base_semantic_head import BaseSemanticHead + + +@MODELS.register_module() +class PanopticFPNHead(BaseSemanticHead): + """PanopticFPNHead used in Panoptic FPN. + + In this head, the number of output channels is ``num_stuff_classes + + 1``, including all stuff classes and one thing class. The stuff + classes will be reset from ``0`` to ``num_stuff_classes - 1``, the + thing classes will be merged to ``num_stuff_classes``-th channel. + + Arg: + num_things_classes (int): Number of thing classes. Default: 80. + num_stuff_classes (int): Number of stuff classes. Default: 53. + in_channels (int): Number of channels in the input feature + map. + inner_channels (int): Number of channels in inner features. + start_level (int): The start level of the input features + used in PanopticFPN. + end_level (int): The end level of the used features, the + ``end_level``-th layer will not be used. + conv_cfg (Optional[Union[ConfigDict, dict]]): Dictionary to construct + and config conv layer. + norm_cfg (Union[ConfigDict, dict]): Dictionary to construct and config + norm layer. Use ``GN`` by default. + init_cfg (Optional[Union[ConfigDict, dict]]): Initialization config + dict. + loss_seg (Union[ConfigDict, dict]): the loss of the semantic head. + """ + + def __init__(self, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + in_channels: int = 256, + inner_channels: int = 128, + start_level: int = 0, + end_level: int = 4, + conv_cfg: OptConfigType = None, + norm_cfg: ConfigType = dict( + type='GN', num_groups=32, requires_grad=True), + loss_seg: ConfigType = dict( + type='CrossEntropyLoss', ignore_index=-1, + loss_weight=1.0), + init_cfg: OptMultiConfig = None) -> None: + seg_rescale_factor = 1 / 2**(start_level + 2) + super().__init__( + num_classes=num_stuff_classes + 1, + seg_rescale_factor=seg_rescale_factor, + loss_seg=loss_seg, + init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + # Used feature layers are [start_level, end_level) + self.start_level = start_level + self.end_level = end_level + self.num_stages = end_level - start_level + self.inner_channels = inner_channels + + self.conv_upsample_layers = ModuleList() + for i in range(start_level, end_level): + self.conv_upsample_layers.append( + ConvUpsample( + in_channels, + inner_channels, + num_layers=i if i > 0 else 1, + num_upsample=i if i > 0 else 0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + )) + self.conv_logits = nn.Conv2d(inner_channels, self.num_classes, 1) + + def _set_things_to_void(self, gt_semantic_seg: Tensor) -> Tensor: + """Merge thing classes to one class. + + In PanopticFPN, the background labels will be reset from `0` to + `self.num_stuff_classes-1`, the foreground labels will be merged to + `self.num_stuff_classes`-th channel. 
+ """ + gt_semantic_seg = gt_semantic_seg.int() + fg_mask = gt_semantic_seg < self.num_things_classes + bg_mask = (gt_semantic_seg >= self.num_things_classes) * ( + gt_semantic_seg < self.num_things_classes + self.num_stuff_classes) + + new_gt_seg = torch.clone(gt_semantic_seg) + new_gt_seg = torch.where(bg_mask, + gt_semantic_seg - self.num_things_classes, + new_gt_seg) + new_gt_seg = torch.where(fg_mask, + fg_mask.int() * self.num_stuff_classes, + new_gt_seg) + return new_gt_seg + + def loss(self, x: Union[Tensor, Tuple[Tensor]], + batch_data_samples: SampleList) -> Dict[str, Tensor]: + """ + Args: + x (Union[Tensor, Tuple[Tensor]]): Feature maps. + batch_data_samples (list[:obj:`DetDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`. + + Returns: + Dict[str, Tensor]: The loss of semantic head. + """ + seg_preds = self(x)['seg_preds'] + gt_semantic_segs = [ + data_sample.gt_sem_seg.sem_seg + for data_sample in batch_data_samples + ] + + gt_semantic_segs = torch.stack(gt_semantic_segs) + if self.seg_rescale_factor != 1.0: + gt_semantic_segs = F.interpolate( + gt_semantic_segs.float(), + scale_factor=self.seg_rescale_factor, + mode='nearest').squeeze(1) + + # Things classes will be merged to one class in PanopticFPN. + gt_semantic_segs = self._set_things_to_void(gt_semantic_segs) + + if seg_preds.shape[-2:] != gt_semantic_segs.shape[-2:]: + seg_preds = interpolate_as(seg_preds, gt_semantic_segs) + seg_preds = seg_preds.permute((0, 2, 3, 1)) + + loss_seg = self.loss_seg( + seg_preds.reshape(-1, self.num_classes), # => [NxHxW, C] + gt_semantic_segs.reshape(-1).long()) + + return dict(loss_seg=loss_seg) + + def init_weights(self) -> None: + """Initialize weights.""" + super().init_weights() + nn.init.normal_(self.conv_logits.weight.data, 0, 0.01) + self.conv_logits.bias.data.zero_() + + def forward(self, x: Tuple[Tensor]) -> Dict[str, Tensor]: + """Forward. + + Args: + x (Tuple[Tensor]): Multi scale Feature maps. + + Returns: + dict[str, Tensor]: semantic segmentation predictions and + feature maps. + """ + # the number of subnets must be not more than + # the length of features. + assert self.num_stages <= len(x) + + feats = [] + for i, layer in enumerate(self.conv_upsample_layers): + f = layer(x[self.start_level + i]) + feats.append(f) + + seg_feats = torch.sum(torch.stack(feats, dim=0), dim=0) + seg_preds = self.conv_logits(seg_feats) + out = dict(seg_preds=seg_preds, seg_feats=seg_feats) + return out diff --git a/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..41625a61d6d1c38c633062c24b1e3455bd3ae2df --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base_panoptic_fusion_head import \ + BasePanopticFusionHead # noqa: F401,F403 +from .heuristic_fusion_head import HeuristicFusionHead # noqa: F401,F403 +from .maskformer_fusion_head import MaskFormerFusionHead # noqa: F401,F403 diff --git a/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f6b20e1cd144eaebd042b8017f143c0a643adde1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/base_panoptic_fusion_head.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +from mmengine.model import BaseModule + +from mmdet.registry import MODELS +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class BasePanopticFusionHead(BaseModule, metaclass=ABCMeta): + """Base class for panoptic heads.""" + + def __init__(self, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + test_cfg: OptConfigType = None, + loss_panoptic: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super().__init__(init_cfg=init_cfg) + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.num_classes = num_things_classes + num_stuff_classes + self.test_cfg = test_cfg + + if loss_panoptic: + self.loss_panoptic = MODELS.build(loss_panoptic) + else: + self.loss_panoptic = None + + @property + def with_loss(self) -> bool: + """bool: whether the panoptic head contains loss function.""" + return self.loss_panoptic is not None + + @abstractmethod + def loss(self, **kwargs): + """Loss function.""" + + @abstractmethod + def predict(self, **kwargs): + """Predict function.""" diff --git a/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7a4a4200edd97f42e9a138e14a1d07328ad9b139 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/heuristic_fusion_head.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from mmengine.structures import InstanceData, PixelData +from torch import Tensor + +from mmdet.evaluation.functional import INSTANCE_OFFSET +from mmdet.registry import MODELS +from mmdet.utils import InstanceList, OptConfigType, OptMultiConfig, PixelList +from .base_panoptic_fusion_head import BasePanopticFusionHead + + +@MODELS.register_module() +class HeuristicFusionHead(BasePanopticFusionHead): + """Fusion Head with Heuristic method.""" + + def __init__(self, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super().__init__( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + test_cfg=test_cfg, + loss_panoptic=None, + init_cfg=init_cfg, + **kwargs) + + def loss(self, **kwargs) -> dict: + """HeuristicFusionHead has no training loss.""" + return dict() + + def _lay_masks(self, + mask_results: InstanceData, + overlap_thr: float = 0.5) -> Tensor: + """Lay instance masks to a result map. 
+ + Args: + mask_results (:obj:`InstanceData`): Instance segmentation results, + each contains ``bboxes``, ``labels``, ``scores`` and ``masks``. + overlap_thr (float): Threshold to determine whether two masks + overlap. default: 0.5. + + Returns: + Tensor: The result map, (H, W). + """ + bboxes = mask_results.bboxes + scores = mask_results.scores + labels = mask_results.labels + masks = mask_results.masks + + num_insts = bboxes.shape[0] + id_map = torch.zeros( + masks.shape[-2:], device=bboxes.device, dtype=torch.long) + if num_insts == 0: + return id_map, labels + + # Sort by score to use heuristic fusion + order = torch.argsort(-scores) + bboxes = bboxes[order] + labels = labels[order] + segm_masks = masks[order] + + instance_id = 1 + left_labels = [] + for idx in range(bboxes.shape[0]): + _cls = labels[idx] + _mask = segm_masks[idx] + instance_id_map = torch.ones_like( + _mask, dtype=torch.long) * instance_id + area = _mask.sum() + if area == 0: + continue + + pasted = id_map > 0 + intersect = (_mask * pasted).sum() + if (intersect / (area + 1e-5)) > overlap_thr: + continue + + _part = _mask * (~pasted) + id_map = torch.where(_part, instance_id_map, id_map) + left_labels.append(_cls) + instance_id += 1 + + if len(left_labels) > 0: + instance_labels = torch.stack(left_labels) + else: + instance_labels = bboxes.new_zeros((0, ), dtype=torch.long) + assert instance_id == (len(instance_labels) + 1) + return id_map, instance_labels + + def _predict_single(self, mask_results: InstanceData, seg_preds: Tensor, + **kwargs) -> PixelData: + """Fuse the results of instance and semantic segmentations. + + Args: + mask_results (:obj:`InstanceData`): Instance segmentation results, + each contains ``bboxes``, ``labels``, ``scores`` and ``masks``. + seg_preds (Tensor): The semantic segmentation results, + (num_stuff + 1, H, W). + + Returns: + Tensor: The panoptic segmentation result, (H, W). + """ + id_map, labels = self._lay_masks(mask_results, + self.test_cfg.mask_overlap) + + seg_results = seg_preds.argmax(dim=0) + seg_results = seg_results + self.num_things_classes + + pan_results = seg_results + instance_id = 1 + for idx in range(len(mask_results)): + _mask = id_map == (idx + 1) + if _mask.sum() == 0: + continue + _cls = labels[idx] + # simply trust detection + segment_id = _cls + instance_id * INSTANCE_OFFSET + pan_results[_mask] = segment_id + instance_id += 1 + + ids, counts = torch.unique( + pan_results % INSTANCE_OFFSET, return_counts=True) + stuff_ids = ids[ids >= self.num_things_classes] + stuff_counts = counts[ids >= self.num_things_classes] + ignore_stuff_ids = stuff_ids[ + stuff_counts < self.test_cfg.stuff_area_limit] + + assert pan_results.ndim == 2 + pan_results[(pan_results.unsqueeze(2) == ignore_stuff_ids.reshape( + 1, 1, -1)).any(dim=2)] = self.num_classes + + pan_results = PixelData(sem_seg=pan_results[None].int()) + return pan_results + + def predict(self, mask_results_list: InstanceList, + seg_preds_list: List[Tensor], **kwargs) -> PixelList: + """Predict results by fusing the results of instance and semantic + segmentations. + + Args: + mask_results_list (list[:obj:`InstanceData`]): Instance + segmentation results, each contains ``bboxes``, ``labels``, + ``scores`` and ``masks``. + seg_preds_list (Tensor): List of semantic segmentation results. + + Returns: + List[PixelData]: Panoptic segmentation result. 
+ """ + results_list = [ + self._predict_single(mask_results_list[i], seg_preds_list[i]) + for i in range(len(mask_results_list)) + ] + + return results_list diff --git a/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1b76e6b45bb9be2584f8b3eca2e5e1c0809249fa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/seg_heads/panoptic_fusion_heads/maskformer_fusion_head.py @@ -0,0 +1,266 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData, PixelData +from torch import Tensor + +from mmdet.evaluation.functional import INSTANCE_OFFSET +from mmdet.registry import MODELS +from mmdet.structures import SampleList +from mmdet.structures.mask import mask2bbox +from mmdet.utils import OptConfigType, OptMultiConfig +from .base_panoptic_fusion_head import BasePanopticFusionHead + + +@MODELS.register_module() +class MaskFormerFusionHead(BasePanopticFusionHead): + """MaskFormer fusion head which postprocesses results for panoptic + segmentation, instance segmentation and semantic segmentation.""" + + def __init__(self, + num_things_classes: int = 80, + num_stuff_classes: int = 53, + test_cfg: OptConfigType = None, + loss_panoptic: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs): + super().__init__( + num_things_classes=num_things_classes, + num_stuff_classes=num_stuff_classes, + test_cfg=test_cfg, + loss_panoptic=loss_panoptic, + init_cfg=init_cfg, + **kwargs) + + def loss(self, **kwargs): + """MaskFormerFusionHead has no training loss.""" + return dict() + + def panoptic_postprocess(self, mask_cls: Tensor, + mask_pred: Tensor) -> PixelData: + """Panoptic segmengation inference. + + Args: + mask_cls (Tensor): Classfication outputs of shape + (num_queries, cls_out_channels) for a image. + Note `cls_out_channels` should includes + background. + mask_pred (Tensor): Mask outputs of shape + (num_queries, h, w) for a image. + + Returns: + :obj:`PixelData`: Panoptic segment result of shape \ + (h, w), each element in Tensor means: \ + ``segment_id = _cls + instance_id * INSTANCE_OFFSET``. 
+ """ + object_mask_thr = self.test_cfg.get('object_mask_thr', 0.8) + iou_thr = self.test_cfg.get('iou_thr', 0.8) + filter_low_score = self.test_cfg.get('filter_low_score', False) + + scores, labels = F.softmax(mask_cls, dim=-1).max(-1) + mask_pred = mask_pred.sigmoid() + + keep = labels.ne(self.num_classes) & (scores > object_mask_thr) + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_masks = mask_pred[keep] + + cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks + + h, w = cur_masks.shape[-2:] + panoptic_seg = torch.full((h, w), + self.num_classes, + dtype=torch.int32, + device=cur_masks.device) + if cur_masks.shape[0] == 0: + # We didn't detect any mask :( + pass + else: + cur_mask_ids = cur_prob_masks.argmax(0) + instance_id = 1 + for k in range(cur_classes.shape[0]): + pred_class = int(cur_classes[k].item()) + isthing = pred_class < self.num_things_classes + mask = cur_mask_ids == k + mask_area = mask.sum().item() + original_area = (cur_masks[k] >= 0.5).sum().item() + + if filter_low_score: + mask = mask & (cur_masks[k] >= 0.5) + + if mask_area > 0 and original_area > 0: + if mask_area / original_area < iou_thr: + continue + + if not isthing: + # different stuff regions of same class will be + # merged here, and stuff share the instance_id 0. + panoptic_seg[mask] = pred_class + else: + panoptic_seg[mask] = ( + pred_class + instance_id * INSTANCE_OFFSET) + instance_id += 1 + + return PixelData(sem_seg=panoptic_seg[None]) + + def semantic_postprocess(self, mask_cls: Tensor, + mask_pred: Tensor) -> PixelData: + """Semantic segmengation postprocess. + + Args: + mask_cls (Tensor): Classfication outputs of shape + (num_queries, cls_out_channels) for a image. + Note `cls_out_channels` should includes + background. + mask_pred (Tensor): Mask outputs of shape + (num_queries, h, w) for a image. + + Returns: + :obj:`PixelData`: Semantic segment result. + """ + # TODO add semantic segmentation result + raise NotImplementedError + + def instance_postprocess(self, mask_cls: Tensor, + mask_pred: Tensor) -> InstanceData: + """Instance segmengation postprocess. + + Args: + mask_cls (Tensor): Classfication outputs of shape + (num_queries, cls_out_channels) for a image. + Note `cls_out_channels` should includes + background. + mask_pred (Tensor): Mask outputs of shape + (num_queries, h, w) for a image. + + Returns: + :obj:`InstanceData`: Instance segmentation results. + + - scores (Tensor): Classification scores, has a shape + (num_instance, ) + - labels (Tensor): Labels of bboxes, has a shape + (num_instances, ). + - bboxes (Tensor): Has a shape (num_instances, 4), + the last dimension 4 arrange as (x1, y1, x2, y2). + - masks (Tensor): Has a shape (num_instances, H, W). 
+ """ + max_per_image = self.test_cfg.get('max_per_image', 100) + num_queries = mask_cls.shape[0] + # shape (num_queries, num_class) + scores = F.softmax(mask_cls, dim=-1)[:, :-1] + # shape (num_queries * num_class, ) + labels = torch.arange(self.num_classes, device=mask_cls.device).\ + unsqueeze(0).repeat(num_queries, 1).flatten(0, 1) + scores_per_image, top_indices = scores.flatten(0, 1).topk( + max_per_image, sorted=False) + labels_per_image = labels[top_indices] + + query_indices = top_indices // self.num_classes + mask_pred = mask_pred[query_indices] + + # extract things + is_thing = labels_per_image < self.num_things_classes + scores_per_image = scores_per_image[is_thing] + labels_per_image = labels_per_image[is_thing] + mask_pred = mask_pred[is_thing] + + mask_pred_binary = (mask_pred > 0).float() + mask_scores_per_image = (mask_pred.sigmoid() * + mask_pred_binary).flatten(1).sum(1) / ( + mask_pred_binary.flatten(1).sum(1) + 1e-6) + det_scores = scores_per_image * mask_scores_per_image + mask_pred_binary = mask_pred_binary.bool() + bboxes = mask2bbox(mask_pred_binary) + + results = InstanceData() + results.bboxes = bboxes + results.labels = labels_per_image + results.scores = det_scores + results.masks = mask_pred_binary + return results + + def predict(self, + mask_cls_results: Tensor, + mask_pred_results: Tensor, + batch_data_samples: SampleList, + rescale: bool = False, + **kwargs) -> List[dict]: + """Test segment without test-time aumengtation. + + Only the output of last decoder layers was used. + + Args: + mask_cls_results (Tensor): Mask classification logits, + shape (batch_size, num_queries, cls_out_channels). + Note `cls_out_channels` should includes background. + mask_pred_results (Tensor): Mask logits, shape + (batch_size, num_queries, h, w). + batch_data_samples (List[:obj:`DetDataSample`]): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + rescale (bool): If True, return boxes in + original image space. Default False. + + Returns: + list[dict]: Instance segmentation \ + results and panoptic segmentation results for each \ + image. + + .. code-block:: none + + [ + { + 'pan_results': PixelData, + 'ins_results': InstanceData, + # semantic segmentation results are not supported yet + 'sem_results': PixelData + }, + ... + ] + """ + batch_img_metas = [ + data_sample.metainfo for data_sample in batch_data_samples + ] + panoptic_on = self.test_cfg.get('panoptic_on', True) + semantic_on = self.test_cfg.get('semantic_on', False) + instance_on = self.test_cfg.get('instance_on', False) + assert not semantic_on, 'segmantic segmentation '\ + 'results are not supported yet.' 
+ + results = [] + for mask_cls_result, mask_pred_result, meta in zip( + mask_cls_results, mask_pred_results, batch_img_metas): + # remove padding + img_height, img_width = meta['img_shape'][:2] + mask_pred_result = mask_pred_result[:, :img_height, :img_width] + + if rescale: + # return result in original resolution + ori_height, ori_width = meta['ori_shape'][:2] + mask_pred_result = F.interpolate( + mask_pred_result[:, None], + size=(ori_height, ori_width), + mode='bilinear', + align_corners=False)[:, 0] + + result = dict() + if panoptic_on: + pan_results = self.panoptic_postprocess( + mask_cls_result, mask_pred_result) + result['pan_results'] = pan_results + + if instance_on: + ins_results = self.instance_postprocess( + mask_cls_result, mask_pred_result) + result['ins_results'] = ins_results + + if semantic_on: + sem_results = self.semantic_postprocess( + mask_cls_result, mask_pred_result) + result['sem_results'] = sem_results + + results.append(result) + + return results diff --git a/head_extractor/build/lib/mmdet/models/task_modules/__init__.py b/head_extractor/build/lib/mmdet/models/task_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7bfd8f058ed656760e0b1a3fd6118f31a799cb11 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import * # noqa: F401,F403 +from .builder import (ANCHOR_GENERATORS, BBOX_ASSIGNERS, BBOX_CODERS, + BBOX_SAMPLERS, IOU_CALCULATORS, MATCH_COSTS, + PRIOR_GENERATORS, build_anchor_generator, build_assigner, + build_bbox_coder, build_iou_calculator, build_match_cost, + build_prior_generator, build_sampler) +from .coders import * # noqa: F401,F403 +from .prior_generators import * # noqa: F401,F403 +from .samplers import * # noqa: F401,F403 +from .tracking import * # noqa: F401,F403 + +__all__ = [ + 'ANCHOR_GENERATORS', 'PRIOR_GENERATORS', 'BBOX_ASSIGNERS', 'BBOX_SAMPLERS', + 'MATCH_COSTS', 'BBOX_CODERS', 'IOU_CALCULATORS', 'build_anchor_generator', + 'build_prior_generator', 'build_assigner', 'build_sampler', + 'build_iou_calculator', 'build_match_cost', 'build_bbox_coder' +] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/__init__.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e564f24c95b1cc6be8a35a1a309ebf10e582032 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .approx_max_iou_assigner import ApproxMaxIoUAssigner +from .assign_result import AssignResult +from .atss_assigner import ATSSAssigner +from .base_assigner import BaseAssigner +from .center_region_assigner import CenterRegionAssigner +from .dynamic_soft_label_assigner import DynamicSoftLabelAssigner +from .grid_assigner import GridAssigner +from .hungarian_assigner import HungarianAssigner +from .iou2d_calculator import BboxOverlaps2D, BboxOverlaps2D_GLIP +from .match_cost import (BBoxL1Cost, BinaryFocalLossCost, ClassificationCost, + CrossEntropyLossCost, DiceCost, FocalLossCost, + IoUCost) +from .max_iou_assigner import MaxIoUAssigner +from .multi_instance_assigner import MultiInstanceAssigner +from .point_assigner import PointAssigner +from .region_assigner import RegionAssigner +from .sim_ota_assigner import SimOTAAssigner +from .task_aligned_assigner import TaskAlignedAssigner +from .topk_hungarian_assigner import TopkHungarianAssigner +from .uniform_assigner import UniformAssigner + +__all__ = [ + 'BaseAssigner', 'BinaryFocalLossCost', 'MaxIoUAssigner', + 'ApproxMaxIoUAssigner', 'AssignResult', 'PointAssigner', 'ATSSAssigner', + 'CenterRegionAssigner', 'GridAssigner', 'HungarianAssigner', + 'RegionAssigner', 'UniformAssigner', 'SimOTAAssigner', + 'TaskAlignedAssigner', 'TopkHungarianAssigner', 'BBoxL1Cost', + 'ClassificationCost', 'CrossEntropyLossCost', 'DiceCost', 'FocalLossCost', + 'IoUCost', 'BboxOverlaps2D', 'DynamicSoftLabelAssigner', + 'MultiInstanceAssigner', 'BboxOverlaps2D_GLIP' ] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..471d54e578d640da242355b54cebe05658309ca2 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/approx_max_iou_assigner.py @@ -0,0 +1,162 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .max_iou_assigner import MaxIoUAssigner + + +@TASK_UTILS.register_module() +class ApproxMaxIoUAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposal will be assigned an integer indicating the ground truth + index. (semi-positive index: gt label (0-based), -1: background) + + - -1: negative sample, no assigned gt + - semi-positive integer: positive sample, index (0-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + match_low_quality (bool): Whether to allow low quality matches. 
This is + usually allowed for RPN and single stage detectors, but not allowed + in the second stage. + gpu_assign_thr (int): The upper bound of the number of GT for GPU + assign. When the number of gt is above this threshold, will assign + on CPU device. Negative values mean not assign on CPU. + iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps + Calculator. + """ + + def __init__( + self, + pos_iou_thr: float, + neg_iou_thr: Union[float, tuple], + min_pos_iou: float = .0, + gt_max_assign_all: bool = True, + ignore_iof_thr: float = -1, + ignore_wrt_candidates: bool = True, + match_low_quality: bool = True, + gpu_assign_thr: int = -1, + iou_calculator: Union[ConfigDict, dict] = dict(type='BboxOverlaps2D') + ) -> None: + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to approxs. + + This method assigns a gt bbox to each group of approxs (bboxes); + each group of approxs is represented by a base approx (bbox) and + will be assigned -1 or a semi-positive number. + background_label (-1) means negative sample, + semi-positive number is the index (0-based) of assigned gt. + The assignment is done in the following steps, and the order matters. + + 1. assign every bbox to background_label (-1) + 2. use the max IoU of each group of approxs to assign + 3. assign proposals whose iou with all gts < neg_iou_thr to background + 4. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 5. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). ``approxs`` means the + group of approxs aligned with ``priors``, has shape + (n, num_approxs, 4). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. 
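+ + Example: + >>> # Illustrative sketch (not from upstream); assumes mmdet's + >>> # registries are initialised so the default 'BboxOverlaps2D' + >>> # iou_calculator can be built. + >>> import torch + >>> from mmengine.structures import InstanceData + >>> self = ApproxMaxIoUAssigner(pos_iou_thr=0.5, neg_iou_thr=0.5) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10]]) + >>> pred_instances.approxs = torch.Tensor([[[0, 0, 10, 10]]]) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]]) + >>> gt_instances.labels = torch.LongTensor([0]) + >>> assign_result = self.assign(pred_instances, gt_instances) + >>> # the single prior is assigned to gt 1 (IoU 0.9 >= pos_iou_thr) 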
+ """ + squares = pred_instances.priors + approxs = pred_instances.approxs + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + gt_bboxes_ignore = None if gt_instances_ignore is None else \ + gt_instances_ignore.get('bboxes', None) + approxs_per_octave = approxs.size(1) + + num_squares = squares.size(0) + num_gts = gt_bboxes.size(0) + + if num_squares == 0 or num_gts == 0: + # No predictions and/or truth, return empty assignment + overlaps = approxs.new(num_gts, num_squares) + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + return assign_result + + # re-organize anchors by approxs_per_octave x num_squares + approxs = torch.transpose(approxs, 0, 1).contiguous().view(-1, 4) + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + num_gts > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = approxs.device + approxs = approxs.cpu() + gt_bboxes = gt_bboxes.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + if gt_labels is not None: + gt_labels = gt_labels.cpu() + all_overlaps = self.iou_calculator(approxs, gt_bboxes) + + overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares, + num_gts).max(dim=0) + overlaps = torch.transpose(overlaps, 0, 1) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and squares.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.iou_calculator( + squares, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.iou_calculator( + gt_bboxes_ignore, squares, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/assign_result.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/assign_result.py new file mode 100644 index 0000000000000000000000000000000000000000..56ca2c3c18fee94cc4a039b769e42521bd14907d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/assign_result.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +from mmdet.utils import util_mixins + + +class AssignResult(util_mixins.NiceRepr): + """Stores assignments between predicted and truth boxes. + + Attributes: + num_gts (int): the number of truth boxes considered when computing this + assignment + gt_inds (Tensor): for each predicted box indicates the 1-based + index of the assigned truth box. 0 means unassigned and -1 means + ignore. + max_overlaps (Tensor): the iou between the predicted box and its + assigned truth box. + labels (Tensor): If specified, for each predicted box + indicates the category label of the assigned truth box. + + Example: + >>> # An assign result between 4 predicted boxes and 9 true boxes + >>> # where only two boxes were assigned. 
+ >>> num_gts = 9 + >>> max_overlaps = torch.FloatTensor([0, .5, .9, 0]) + >>> gt_inds = torch.LongTensor([-1, 1, 2, 0]) + >>> labels = torch.LongTensor([0, 3, 4, 0]) + >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + <AssignResult(num_gts=9, gt_inds.shape=(4,), max_overlaps.shape=(4,), + labels.shape=(4,))> + >>> # Force addition of gt labels (when adding gt as proposals) + >>> new_labels = torch.LongTensor([3, 4, 5]) + >>> self.add_gt_(new_labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + <AssignResult(num_gts=9, gt_inds.shape=(7,), max_overlaps.shape=(7,), + labels.shape=(7,))> + """ + + def __init__(self, num_gts: int, gt_inds: Tensor, max_overlaps: Tensor, + labels: Tensor) -> None: + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + # Interface for possible user-defined properties + self._extra_properties = {} + + @property + def num_preds(self): + """int: the number of predictions in this assignment""" + return len(self.gt_inds) + + def set_extra_property(self, key, value): + """Set user-defined new property.""" + assert key not in self.info + self._extra_properties[key] = value + + def get_extra_property(self, key): + """Get user-defined property.""" + return self._extra_properties.get(key, None) + + @property + def info(self): + """dict: a dictionary of info about the object""" + basic_info = { + 'num_gts': self.num_gts, + 'num_preds': self.num_preds, + 'gt_inds': self.gt_inds, + 'max_overlaps': self.max_overlaps, + 'labels': self.labels, + } + basic_info.update(self._extra_properties) + return basic_info + + def __nice__(self): + """str: a "nice" summary string describing this assign result""" + parts = [] + parts.append(f'num_gts={self.num_gts!r}') + if self.gt_inds is None: + parts.append(f'gt_inds={self.gt_inds!r}') + else: + parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}') + if self.max_overlaps is None: + parts.append(f'max_overlaps={self.max_overlaps!r}') + else: + parts.append('max_overlaps.shape=' + f'{tuple(self.max_overlaps.shape)!r}') + if self.labels is None: + parts.append(f'labels={self.labels!r}') + else: + parts.append(f'labels.shape={tuple(self.labels.shape)!r}') + return ', '.join(parts) + + @classmethod + def random(cls, **kwargs): + """Create random AssignResult for tests or debugging. + + Args: + num_preds (int): number of predicted boxes + num_gts (int): number of true boxes + p_ignore (float): probability of a predicted box assigned to an + ignored truth + p_assigned (float): probability of a predicted box being + assigned + p_use_label (float | bool): with labels or not + rng (None | int | numpy.random.RandomState): seed or state + + Returns: + :obj:`AssignResult`: Randomly generated assign results. 
+ + Example: + >>> from mmdet.models.task_modules.assigners.assign_result import * # NOQA + >>> self = AssignResult.random() + >>> print(self.info) + """ + from ..samplers.sampling_result import ensure_rng + rng = ensure_rng(kwargs.get('rng', None)) + + num_gts = kwargs.get('num_gts', None) + num_preds = kwargs.get('num_preds', None) + p_ignore = kwargs.get('p_ignore', 0.3) + p_assigned = kwargs.get('p_assigned', 0.7) + num_classes = kwargs.get('num_classes', 3) + + if num_gts is None: + num_gts = rng.randint(0, 8) + if num_preds is None: + num_preds = rng.randint(0, 16) + + if num_gts == 0: + max_overlaps = torch.zeros(num_preds, dtype=torch.float32) + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + labels = torch.zeros(num_preds, dtype=torch.int64) + + else: + import numpy as np + + # Create an overlap for each predicted box + max_overlaps = torch.from_numpy(rng.rand(num_preds)) + + # Construct gt_inds for each predicted box + is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned) + # constraint on the maximum number of assignments + n_assigned = min(num_preds, min(num_gts, is_assigned.sum())) + + assigned_idxs = np.where(is_assigned)[0] + rng.shuffle(assigned_idxs) + assigned_idxs = assigned_idxs[0:n_assigned] + assigned_idxs.sort() + + is_assigned[:] = 0 + is_assigned[assigned_idxs] = True + + is_ignore = torch.from_numpy( + rng.rand(num_preds) < p_ignore) & is_assigned + + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + + true_idxs = np.arange(num_gts) + rng.shuffle(true_idxs) + true_idxs = torch.from_numpy(true_idxs) + gt_inds[is_assigned] = true_idxs[:n_assigned].long() + + gt_inds = torch.from_numpy( + rng.randint(1, num_gts + 1, size=num_preds)) + gt_inds[is_ignore] = -1 + gt_inds[~is_assigned] = 0 + max_overlaps[~is_assigned] = 0 + + if num_classes == 0: + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = torch.from_numpy( + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + rng.randint(0, num_classes, size=num_preds)) + labels[~is_assigned] = 0 + + self = cls(num_gts, gt_inds, max_overlaps, labels) + return self + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. + + Args: + gt_labels (torch.Tensor): Labels of gt boxes + """ + self_inds = torch.arange( + 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) + self.gt_inds = torch.cat([self_inds, self.gt_inds]) + + self.max_overlaps = torch.cat( + [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps]) + + self.labels = torch.cat([gt_labels, self.labels]) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/atss_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/atss_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..2796b990c5ae4c56bcf314e1342671d950232ae6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/atss_assigner.py @@ -0,0 +1,254 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def bbox_center_distance(bboxes: Tensor, priors: Tensor) -> Tensor: + """Compute the center distance between bboxes and priors. + + Args: + bboxes (Tensor): Shape (n, 4) for bboxes, "xyxy" format. 
+ priors (Tensor): Shape (n, 4) for priors, "xyxy" format. + + Returns: + Tensor: Center distances between bboxes and priors. + """ + bbox_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 + bbox_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 + bbox_points = torch.stack((bbox_cx, bbox_cy), dim=1) + + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + priors_points = torch.stack((priors_cx, priors_cy), dim=1) + + distances = (priors_points[:, None, :] - + bbox_points[None, :, :]).pow(2).sum(-1).sqrt() + + return distances + + +@TASK_UTILS.register_module() +class ATSSAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each prior. + + Each proposal will be assigned `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + If ``alpha`` is not None, it means that the dynamic cost + ATSSAssigner is adopted, which is currently only used in the DDOD. + + Args: + topk (int): number of priors selected in each level + alpha (float, optional): param of cost rate for each proposal only + in DDOD. Defaults to None. + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. Defaults to ``dict(type='BboxOverlaps2D')`` + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. Defaults to -1. + """ + + def __init__(self, + topk: int, + alpha: Optional[float] = None, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D'), + ignore_iof_thr: float = -1) -> None: + self.topk = topk + self.alpha = alpha + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.ignore_iof_thr = ignore_iof_thr + + # https://github.com/sfzhang15/ATSS/blob/master/atss_core/modeling/rpn/atss/loss.py + def assign( + self, + pred_instances: InstanceData, + num_level_priors: List[int], + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None + ) -> AssignResult: + """Assign gt to priors. + + The assignment is done in the following steps: + + 1. compute iou between all prior (prior of all pyramid levels) and gt + 2. compute center distance between all prior and gt + 3. on each pyramid level, for each gt, select k priors whose centers + are closest to the gt center, so we select k*l priors in total as + candidates for each gt + 4. get the corresponding iou for these candidates, and compute the + mean and std, set mean + std as the iou threshold + 5. select the candidates whose iou is greater than or equal to + the threshold as positive + 6. limit the positive samples' centers to lie inside the gt + + If ``alpha`` is not None, and ``cls_scores`` and `bbox_preds` + are not None, the overlaps calculation in the first step + will also include dynamic cost, which is currently only used in + the DDOD. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors, points, or bboxes predicted by the model, + shape (n, 4). + num_level_priors (List): Number of bboxes in each level + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. 
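+ + Example: + >>> # Illustrative smoke test (not from upstream); assumes mmdet's + >>> # registries are initialised. + >>> import torch + >>> from mmengine.structures import InstanceData + >>> self = ATSSAssigner(topk=9) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10], + ... [10, 10, 20, 20]]) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 10]]) + >>> gt_instances.labels = torch.LongTensor([0]) + >>> # both priors sit on a single pyramid level here + >>> assign_result = self.assign(pred_instances, [2], gt_instances) 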
+ """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + if gt_instances_ignore is not None: + gt_bboxes_ignore = gt_instances_ignore.bboxes + else: + gt_bboxes_ignore = None + + INF = 100000000 + priors = priors[:, :4] + num_gt, num_priors = gt_bboxes.size(0), priors.size(0) + + message = 'Invalid alpha parameter because cls_scores or ' \ + 'bbox_preds are None. If you want to use the ' \ + 'cost-based ATSSAssigner, please set cls_scores, ' \ + 'bbox_preds and self.alpha at the same time. ' + + # compute iou between all bbox and gt + if self.alpha is None: + # ATSSAssigner + overlaps = self.iou_calculator(priors, gt_bboxes) + if ('scores' in pred_instances or 'bboxes' in pred_instances): + warnings.warn(message) + + else: + # Dynamic cost ATSSAssigner in DDOD + assert ('scores' in pred_instances + and 'bboxes' in pred_instances), message + cls_scores = pred_instances.scores + bbox_preds = pred_instances.bboxes + + # compute cls cost for bbox and GT + cls_cost = torch.sigmoid(cls_scores[:, gt_labels]) + + # compute iou between all bbox and gt + overlaps = self.iou_calculator(bbox_preds, gt_bboxes) + + # make sure that we are in element-wise multiplication + assert cls_cost.shape == overlaps.shape + + # overlaps is actually a cost matrix + overlaps = cls_cost**(1 - self.alpha) * overlaps**self.alpha + + # assign 0 by default + assigned_gt_inds = overlaps.new_full((num_priors, ), + 0, + dtype=torch.long) + + if num_gt == 0 or num_priors == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_priors, )) + if num_gt == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = overlaps.new_full((num_priors, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + # compute center distance between all bbox and gt + distances = bbox_center_distance(gt_bboxes, priors) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0): + ignore_overlaps = self.iou_calculator( + priors, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + ignore_idxs = ignore_max_overlaps > self.ignore_iof_thr + distances[ignore_idxs, :] = INF + assigned_gt_inds[ignore_idxs] = -1 + + # Selecting candidates based on the center distance + candidate_idxs = [] + start_idx = 0 + for level, priors_per_level in enumerate(num_level_priors): + # on each pyramid level, for each gt, + # select k bbox whose center are closest to the gt center + end_idx = start_idx + priors_per_level + distances_per_level = distances[start_idx:end_idx, :] + selectable_k = min(self.topk, priors_per_level) + _, topk_idxs_per_level = distances_per_level.topk( + selectable_k, dim=0, largest=False) + candidate_idxs.append(topk_idxs_per_level + start_idx) + start_idx = end_idx + candidate_idxs = torch.cat(candidate_idxs, dim=0) + + # get corresponding iou for the these candidates, and compute the + # mean and std, set mean + std as the iou threshold + candidate_overlaps = overlaps[candidate_idxs, torch.arange(num_gt)] + overlaps_mean_per_gt = candidate_overlaps.mean(0) + overlaps_std_per_gt = candidate_overlaps.std(0) + overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt + + is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :] + + # limit the positive sample's center in gt + for gt_idx in range(num_gt): + candidate_idxs[:, gt_idx] += gt_idx * 
num_priors + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + ep_priors_cx = priors_cx.view(1, -1).expand( + num_gt, num_priors).contiguous().view(-1) + ep_priors_cy = priors_cy.view(1, -1).expand( + num_gt, num_priors).contiguous().view(-1) + candidate_idxs = candidate_idxs.view(-1) + + # calculate the left, top, right, bottom distance between positive + # prior center and gt side + l_ = ep_priors_cx[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 0] + t_ = ep_priors_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - ep_priors_cx[candidate_idxs].view(-1, num_gt) + b_ = gt_bboxes[:, 3] - ep_priors_cy[candidate_idxs].view(-1, num_gt) + is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01 + + is_pos = is_pos & is_in_gts + + # if an anchor box is assigned to multiple gts, + # the one with the highest IoU will be selected. + overlaps_inf = torch.full_like(overlaps, + -INF).t().contiguous().view(-1) + index = candidate_idxs.view(-1)[is_pos.view(-1)] + overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index] + overlaps_inf = overlaps_inf.view(num_gt, -1).t() + + max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1) + assigned_gt_inds[ + max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1 + + assigned_labels = assigned_gt_inds.new_full((num_priors, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/base_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/base_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..b12280ad746c7557008313dd936a62a99e8c78d5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/base_assigner.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Optional + +from mmengine.structures import InstanceData + + +class BaseAssigner(metaclass=ABCMeta): + """Base assigner that assigns boxes to ground truth boxes.""" + + @abstractmethod + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs): + """Assign boxes to either a ground truth box or a negative sample.""" diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/center_region_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/center_region_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..11c8055c67cdf46c1ae0f877e88192db33795581 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/center_region_assigner.py @@ -0,0 +1,366 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def scale_boxes(bboxes: Tensor, scale: float) -> Tensor: + """Expand an array of boxes by a given scale. 
+ + Args: + bboxes (Tensor): Shape (m, 4) + scale (float): The scale factor of bboxes + + Returns: + Tensor: Shape (m, 4). Scaled bboxes + """ + assert bboxes.size(1) == 4 + w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 + h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 + x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 + y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 + + w_half *= scale + h_half *= scale + + boxes_scaled = torch.zeros_like(bboxes) + boxes_scaled[:, 0] = x_c - w_half + boxes_scaled[:, 2] = x_c + w_half + boxes_scaled[:, 1] = y_c - h_half + boxes_scaled[:, 3] = y_c + h_half + return boxes_scaled + + +def is_located_in(points: Tensor, bboxes: Tensor) -> Tensor: + """Are points located in bboxes. + + Args: + points (Tensor): Points, shape: (m, 2). + bboxes (Tensor): Bounding boxes, shape: (n, 4). + + Return: + Tensor: Flags indicating if points are located in bboxes, + shape: (m, n). + """ + assert points.size(1) == 2 + assert bboxes.size(1) == 4 + return (points[:, 0].unsqueeze(1) > bboxes[:, 0].unsqueeze(0)) & \ + (points[:, 0].unsqueeze(1) < bboxes[:, 2].unsqueeze(0)) & \ + (points[:, 1].unsqueeze(1) > bboxes[:, 1].unsqueeze(0)) & \ + (points[:, 1].unsqueeze(1) < bboxes[:, 3].unsqueeze(0)) + + +def bboxes_area(bboxes: Tensor) -> Tensor: + """Compute the area of an array of bboxes. + + Args: + bboxes (Tensor): The coordinates of bboxes. Shape: (m, 4) + + Returns: + Tensor: Area of the bboxes. Shape: (m, ) + """ + assert bboxes.size(1) == 4 + w = (bboxes[:, 2] - bboxes[:, 0]) + h = (bboxes[:, 3] - bboxes[:, 1]) + areas = w * h + return areas + + +@TASK_UTILS.register_module() +class CenterRegionAssigner(BaseAssigner): + """Assign pixels at the center region of a bbox as positive. + + Each proposal will be assigned `-1`, `0`, or a positive integer + indicating the ground truth index. + - -1: negative samples + - semi-positive numbers: positive sample, index (0-based) of assigned gt + + Args: + pos_scale (float): Threshold within which pixels are + labelled as positive. + neg_scale (float): Threshold beyond which pixels are + labelled as negative. + min_pos_iof (float): Minimum iof of a pixel with a gt to be + labelled as positive. Default: 1e-2 + ignore_gt_scale (float): Threshold within which the pixels + are ignored when the gt is labelled as shadowed. Default: 0.5 + foreground_dominate (bool): If True, the bbox will be assigned as + positive when a gt's kernel region overlaps with another's shadowed + (ignored) region, otherwise it is set as ignored. Defaults to False. + iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps + Calculator. + """ + + def __init__( + self, + pos_scale: float, + neg_scale: float, + min_pos_iof: float = 1e-2, + ignore_gt_scale: float = 0.5, + foreground_dominate: bool = False, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D') + ) -> None: + self.pos_scale = pos_scale + self.neg_scale = neg_scale + self.min_pos_iof = min_pos_iof + self.ignore_gt_scale = ignore_gt_scale + self.foreground_dominate = foreground_dominate + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def get_gt_priorities(self, gt_bboxes: Tensor) -> Tensor: + """Get gt priorities according to their areas. + + Smaller gts have higher priority. + + Args: + gt_bboxes (Tensor): Ground truth boxes, shape (k, 4). + + Returns: + Tensor: The priority of gts so that gts with larger priority are + more likely to be assigned. Shape (k, ) + """ + gt_areas = bboxes_area(gt_bboxes) + # Rank all gt bbox areas. 
Smaller objects have higher priority + _, sort_idx = gt_areas.sort(descending=True) + sort_idx = sort_idx.argsort() + return sort_idx + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. + + This method assigns gts to every prior (proposal/anchor), each prior + will be assigned with -1, or a semi-positive number. -1 means + negative sample, semi-positive number is the index (0-based) of + assigned gt. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assigned result. Note that shadowed_labels + of shape (N, 2) is also added as an `assign_result` attribute. + `shadowed_labels` is a tensor composed of N pairs of + [anchor_ind, class_label], where N is the number of anchors that + lie in the outer region of a gt, anchor_ind is the shadowed anchor + index and class_label is the shadowed class label. + + Example: + >>> from mmengine.structures import InstanceData + >>> self = CenterRegionAssigner(0.2, 0.2) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10], + ... [10, 10, 20, 20]]) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 10]]) + >>> gt_instances.labels = torch.Tensor([0]) + >>> assign_result = self.assign(pred_instances, gt_instances) + >>> expected_gt_inds = torch.LongTensor([1, 0]) + >>> assert torch.all(assign_result.gt_inds == expected_gt_inds) + """ + # There are in total 5 steps in the pixel assignment + # 1. Find core (the center region, say inner 0.2) + # and shadow (the relatively outer part, say inner 0.2-0.5) + # regions of every gt. + # 2. Find all prior bboxes that lie in gt_core and gt_shadow regions + # 3. Assign prior bboxes in gt_core with a one-hot id of the gt in + # the image. + # 3.1. For overlapping objects, the prior bboxes in gt_core are + # assigned with the object with the smallest area + # 4. Assign prior bboxes with class label according to their gt ids. + # 4.1. Assign -1 to prior bboxes lying in shadowed gts + # 4.2. Assign positive prior boxes with the corresponding label + # 5. Find pixels lying in the shadow of an object and assign them with + # background label, but set the loss weight of its corresponding + # gt to zero. + + # TODO not extract bboxes in assign. + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + + assert priors.size(1) == 4, 'priors must have size of 4' + # 1. Find core positive and shadow region of every gt + gt_core = scale_boxes(gt_bboxes, self.pos_scale) + gt_shadow = scale_boxes(gt_bboxes, self.neg_scale) + + # 2. 
Find prior bboxes that lie in gt_core and gt_shadow regions + prior_centers = (priors[:, 2:4] + priors[:, 0:2]) / 2 + # The center points lie within the gt boxes + is_prior_in_gt = is_located_in(prior_centers, gt_bboxes) + # Only calculate prior and gt_core IoF. This enables small prior bboxes + # to match large gts + prior_and_gt_core_overlaps = self.iou_calculator( + priors, gt_core, mode='iof') + # The center point of effective priors should be within the gt box + is_prior_in_gt_core = is_prior_in_gt & ( + prior_and_gt_core_overlaps > self.min_pos_iof) # shape (n, k) + + is_prior_in_gt_shadow = ( + self.iou_calculator(priors, gt_shadow, mode='iof') > + self.min_pos_iof) + # Rule out center effective positive pixels + is_prior_in_gt_shadow &= (~is_prior_in_gt_core) + + num_gts, num_priors = gt_bboxes.size(0), priors.size(0) + if num_gts == 0 or num_priors == 0: + # If no gts exist, assign all pixels to negative + assigned_gt_ids = \ + is_prior_in_gt_core.new_zeros((num_priors,), + dtype=torch.long) + pixels_in_gt_shadow = assigned_gt_ids.new_empty((0, 2)) + else: + # Step 3: assign a one-hot gt id to each pixel, and smaller objects + # have high priority to assign the pixel. + sort_idx = self.get_gt_priorities(gt_bboxes) + assigned_gt_ids, pixels_in_gt_shadow = \ + self.assign_one_hot_gt_indices(is_prior_in_gt_core, + is_prior_in_gt_shadow, + gt_priority=sort_idx) + + if (gt_instances_ignore is not None + and gt_instances_ignore.bboxes.numel() > 0): + # Ignore priors whose centers lie in the scaled ignored gt bboxes + gt_bboxes_ignore = gt_instances_ignore.bboxes + gt_bboxes_ignore = scale_boxes( + gt_bboxes_ignore, scale=self.ignore_gt_scale) + is_prior_in_ignored_gts = is_located_in(prior_centers, + gt_bboxes_ignore) + is_prior_in_ignored_gts = is_prior_in_ignored_gts.any(dim=1) + assigned_gt_ids[is_prior_in_ignored_gts] = -1 + + # 4. Assign prior bboxes with class label according to its gt id. + # Default assigned label is the background (-1) + assigned_labels = assigned_gt_ids.new_full((num_priors, ), -1) + pos_inds = torch.nonzero(assigned_gt_ids > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_ids[pos_inds] - + 1] + # 5. Find pixels lying in the shadow of an object + shadowed_pixel_labels = pixels_in_gt_shadow.clone() + if pixels_in_gt_shadow.numel() > 0: + pixel_idx, gt_idx =\ + pixels_in_gt_shadow[:, 0], pixels_in_gt_shadow[:, 1] + assert (assigned_gt_ids[pixel_idx] != gt_idx).all(), \ + 'Some pixels are dually assigned to ignore and gt!' + shadowed_pixel_labels[:, 1] = gt_labels[gt_idx - 1] + override = ( + assigned_labels[pixel_idx] == shadowed_pixel_labels[:, 1]) + if self.foreground_dominate: + # When a pixel is both positive and shadowed, set it as pos + shadowed_pixel_labels = shadowed_pixel_labels[~override] + else: + # When a pixel is both pos and shadowed, set it as shadowed + assigned_labels[pixel_idx[override]] = -1 + assigned_gt_ids[pixel_idx[override]] = 0 + + assign_result = AssignResult( + num_gts, assigned_gt_ids, None, labels=assigned_labels) + # Add shadowed_labels as assign_result property. Shape: (num_shadow, 2) + assign_result.set_extra_property('shadowed_labels', + shadowed_pixel_labels) + return assign_result + + def assign_one_hot_gt_indices( + self, + is_prior_in_gt_core: Tensor, + is_prior_in_gt_shadow: Tensor, + gt_priority: Optional[Tensor] = None) -> Tuple[Tensor, Tensor]: + """Assign only one gt index to each prior box. + + Gts with large gt_priority are more likely to be assigned. 
+ + Args: + is_prior_in_gt_core (Tensor): Bool tensor indicating the prior + center is in the core area of a gt (e.g. 0-0.2). + Shape: (num_prior, num_gt). + is_prior_in_gt_shadow (Tensor): Bool tensor indicating the prior + center is in the shadowed area of a gt (e.g. 0.2-0.5). + Shape: (num_prior, num_gt). + gt_priority (Tensor): Priorities of gts. The gt with a higher + priority is more likely to be assigned to the bbox when the + bbox match with multiple gts. Shape: (num_gt, ). + + Returns: + tuple: Returns (assigned_gt_inds, shadowed_gt_inds). + + - assigned_gt_inds: The assigned gt index of each prior bbox \ + (i.e. index from 1 to num_gts). Shape: (num_prior, ). + - shadowed_gt_inds: shadowed gt indices. It is a tensor of \ + shape (num_ignore, 2) with the first column being the shadowed \ + prior bbox indices and the second column the shadowed gt \ + indices (1-based). + """ + num_bboxes, num_gts = is_prior_in_gt_core.shape + + if gt_priority is None: + gt_priority = torch.arange( + num_gts, device=is_prior_in_gt_core.device) + assert gt_priority.size(0) == num_gts + # The bigger the gt_priority, the more preferable it is to be assigned + # The assigned inds are by default 0 (background) + assigned_gt_inds = is_prior_in_gt_core.new_zeros((num_bboxes, ), + dtype=torch.long) + # Shadowed bboxes are assigned to be background. But the corresponding + # label is ignored during loss calculation, which is done through + # shadowed_gt_inds + shadowed_gt_inds = torch.nonzero(is_prior_in_gt_shadow, as_tuple=False) + if is_prior_in_gt_core.sum() == 0: # No gt match + shadowed_gt_inds[:, 1] += 1 # 1-based. For consistency issue + return assigned_gt_inds, shadowed_gt_inds + + # The priority of each prior box and gt pair. If one prior box is + # matched to multiple gts, only the pair with the highest priority + # is saved + pair_priority = is_prior_in_gt_core.new_full((num_bboxes, num_gts), + -1, + dtype=torch.long) + + # Each bbox could match with multiple gts. + # The following code deals with this situation + # Matched bboxes (to any gt). Shape: (num_pos_anchor, ) + inds_of_match = torch.any(is_prior_in_gt_core, dim=1) + # The matched gt index of each positive bbox. Length >= num_pos_anchor, + # since one bbox could match multiple gts + matched_bbox_gt_inds = torch.nonzero( + is_prior_in_gt_core, as_tuple=False)[:, 1] + # Assign priority to each bbox-gt pair. + pair_priority[is_prior_in_gt_core] = gt_priority[matched_bbox_gt_inds] + _, argmax_priority = pair_priority[inds_of_match].max(dim=1) + assigned_gt_inds[inds_of_match] = argmax_priority + 1 # 1-based + # Zero-out the assigned anchor box to filter the shadowed gt indices + is_prior_in_gt_core[inds_of_match, argmax_priority] = 0 + # Concat the shadowed indices due to overlapping with regions outside + # the effective scale. shape: (total_num_ignore, 2) + shadowed_gt_inds = torch.cat( + (shadowed_gt_inds, + torch.nonzero(is_prior_in_gt_core, as_tuple=False)), + dim=0) + # Change `is_prior_in_gt_core` back to keep arguments intact. 
+ is_prior_in_gt_core[inds_of_match, argmax_priority] = 1 + # 1-based shadowed gt indices, to be consistent with `assigned_gt_inds` + if shadowed_gt_inds.numel() > 0: + shadowed_gt_inds[:, 1] += 1 + return assigned_gt_inds, shadowed_gt_inds diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..3fc7af39b22cd6dc00248e330547176787c23963 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/dynamic_soft_label_assigner.py @@ -0,0 +1,227 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +INF = 100000000 +EPS = 1.0e-7 + + +def center_of_mass(masks: Tensor, eps: float = 1e-7) -> Tensor: + """Compute the masks center of mass. + + Args: + masks (Tensor): Mask tensor, has shape (num_masks, H, W). + eps (float): A small number to avoid the normalizer being zero. + Defaults to 1e-7. + Returns: + Tensor: The masks center of mass. Has shape (num_masks, 2). + """ + n, h, w = masks.shape + grid_h = torch.arange(h, device=masks.device)[:, None] + grid_w = torch.arange(w, device=masks.device) + normalizer = masks.sum(dim=(1, 2)).float().clamp(min=eps) + center_y = (masks * grid_h).sum(dim=(1, 2)) / normalizer + center_x = (masks * grid_w).sum(dim=(1, 2)) / normalizer + center = torch.cat([center_x[:, None], center_y[:, None]], dim=1) + return center + + +@TASK_UTILS.register_module() +class DynamicSoftLabelAssigner(BaseAssigner): + """Computes matching between predictions and ground truth with dynamic soft + label assignment. + + Args: + soft_center_radius (float): Radius of the soft center prior. + Defaults to 3.0. + topk (int): Select top-k predictions to calculate dynamic k + best matches for each gt. Defaults to 13. + iou_weight (float): The scale factor of iou cost. Defaults to 3.0. + iou_calculator (ConfigType): Config of overlaps Calculator. + Defaults to dict(type='BboxOverlaps2D'). + """ + + def __init__( + self, + soft_center_radius: float = 3.0, + topk: int = 13, + iou_weight: float = 3.0, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D') + ) -> None: + self.soft_center_radius = soft_center_radius + self.topk = topk + self.iou_weight = iou_weight + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to priors. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). 
+ gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + Returns: + :obj:`AssignResult`: The assigned result. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + priors = pred_instances.priors + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + if num_gt == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + prior_center = priors[:, :2] + if isinstance(gt_bboxes, BaseBoxes): + is_in_gts = gt_bboxes.find_inside_points(prior_center) + else: + # Tensor boxes will be treated as horizontal boxes by default + lt_ = prior_center[:, None] - gt_bboxes[:, :2] + rb_ = gt_bboxes[:, 2:] - prior_center[:, None] + + deltas = torch.cat([lt_, rb_], dim=-1) + is_in_gts = deltas.min(dim=-1).values > 0 + + valid_mask = is_in_gts.sum(dim=1) > 0 + + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + num_valid = valid_decoded_bbox.size(0) + + if num_valid == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + if hasattr(gt_instances, 'masks'): + gt_center = center_of_mass(gt_instances.masks, eps=EPS) + elif isinstance(gt_bboxes, BaseBoxes): + gt_center = gt_bboxes.centers + else: + # Tensor boxes will be treated as horizontal boxes by default + gt_center = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2.0 + valid_prior = priors[valid_mask] + strides = valid_prior[:, 2] + distance = (valid_prior[:, None, :2] - gt_center[None, :, :] + ).pow(2).sum(-1).sqrt() / strides[:, None] + soft_center_prior = torch.pow(10, distance - self.soft_center_radius) + + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + iou_cost = -torch.log(pairwise_ious + EPS) * self.iou_weight + + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1) + + soft_label = gt_onehot_label * pairwise_ious[..., None] + scale_factor = soft_label - valid_pred_scores.sigmoid() + soft_cls_cost = F.binary_cross_entropy_with_logits( + valid_pred_scores, soft_label, + reduction='none') * scale_factor.abs().pow(2.0) + soft_cls_cost = soft_cls_cost.sum(dim=-1) + + cost_matrix = soft_cls_cost + iou_cost + soft_center_prior + + matched_pred_ious, matched_gt_inds = self.dynamic_k_matching( + cost_matrix, pairwise_ious, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = 
assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_ious + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets. Same as SimOTA. + + Args: + cost (Tensor): Cost matrix. + pairwise_ious (Tensor): Pairwise iou matrix. + num_gt (int): Number of gt. + valid_mask (Tensor): Mask for valid bboxes. + + Returns: + tuple: matched ious and gt indexes. + """ + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(1)[fg_mask_inboxes] + return matched_pred_ious, matched_gt_inds diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/grid_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/grid_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..d8935d2df2937f90c71599e5b45ed9a3dff8cd7e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/grid_assigner.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class GridAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple[float, float]): IoU threshold for negative + bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + Defaults to 0. + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + iou_calculator (:obj:`ConfigDict` or dict): Config of overlaps + Calculator. 
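The heart of `dynamic_k_matching` above is the rule that each gt's k equals the truncated sum of its top-k candidate IoUs. A toy trace with invented numbers, assuming 5 candidate priors and 2 gts:

```python
import torch

pairwise_ious = torch.tensor([[0.60, 0.10],
                              [0.60, 0.20],
                              [0.40, 0.15],
                              [0.30, 0.05],
                              [0.20, 0.10]])  # (num_priors, num_gts)
candidate_topk = min(13, pairwise_ious.size(0))  # only 5 priors here
topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0)
# each gt's k is the truncated sum of its candidate IoUs, at least 1
dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1)
print(dynamic_ks)  # tensor([2, 1], dtype=torch.int32)
# gt 0 (IoU sum 2.1) keeps its 2 lowest-cost candidates,
# gt 1 (IoU sum 0.6) keeps only its single lowest-cost candidate.
```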
+ """ + + def __init__( + self, + pos_iou_thr: float, + neg_iou_thr: Union[float, Tuple[float, float]], + min_pos_iou: float = .0, + gt_max_assign_all: bool = True, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D') + ) -> None: + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. The process is very much like the max iou + assigner, except that positive samples are constrained within the cell + that the gt boxes fell in. + + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, 0, or a positive number. -1 means don't care, + 0 means negative sample, positive number is the index (1-based) of + assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every bbox to -1 + 2. assign proposals whose iou with all gts <= neg_iou_thr to 0 + 3. for each bbox within a cell, if the iou with its nearest gt > + pos_iou_thr and the center of that gt falls inside the cell, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals within the cell the + gt bbox falls in to itself. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + priors = pred_instances.priors + responsible_flags = pred_instances.responsible_flags + + num_gts, num_priors = gt_bboxes.size(0), priors.size(0) + + # compute iou between all gt and priors + overlaps = self.iou_calculator(gt_bboxes, priors) + + # 1. assign -1 by default + assigned_gt_inds = overlaps.new_full((num_priors, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_priors == 0: + # No ground truth or priors, return empty assignment + max_overlaps = overlaps.new_zeros((num_priors, )) + if num_gts == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = overlaps.new_full((num_priors, ), + -1, + dtype=torch.long) + return AssignResult( + num_gts, + assigned_gt_inds, + max_overlaps, + labels=assigned_labels) + + # 2. 
assign negative: below + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + # shape of max_overlaps == argmax_overlaps == num_priors + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + if isinstance(self.neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps <= self.neg_iou_thr)] = 0 + elif isinstance(self.neg_iou_thr, (tuple, list)): + assert len(self.neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps > self.neg_iou_thr[0]) + & (max_overlaps <= self.neg_iou_thr[1])] = 0 + + # 3. assign positive: falls into responsible cell and above + # positive IOU threshold, the order matters. + # the prior condition of comparison is to filter out all + # unrelated anchors, i.e. not responsible_flags + overlaps[:, ~responsible_flags.type(torch.bool)] = -1. + + # calculate max_overlaps again, but this time we only consider IOUs + # for anchors responsible for prediction + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + # shape of gt_max_overlaps == gt_argmax_overlaps == num_gts + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) + + pos_inds = (max_overlaps > self.pos_iou_thr) & responsible_flags.type( + torch.bool) + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + # 4. assign positive to max overlapped anchors within responsible cell + for i in range(num_gts): + if gt_max_overlaps[i] > self.min_pos_iou: + if self.gt_max_assign_all: + max_iou_inds = (overlaps[i, :] == gt_max_overlaps[i]) & \ + responsible_flags.type(torch.bool) + assigned_gt_inds[max_iou_inds] = i + 1 + elif responsible_flags[gt_argmax_overlaps[i]]: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + # assign labels of positive anchors + assigned_labels = assigned_gt_inds.new_full((num_priors, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + + return AssignResult( + num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/hungarian_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/hungarian_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..a6745a36cdc713c74f801f62dae0d8fe3d03828f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/hungarian_assigner.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import torch +from mmengine import ConfigDict +from mmengine.structures import InstanceData +from scipy.optimize import linear_sum_assignment +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class HungarianAssigner(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of some components. + For DETR the costs are weighted sum of classification cost, regression L1 + cost and regression iou cost. The targets don't include the no_object, so + generally there are more predictions than targets. After the one-to-one + matching, the un-matched are treated as backgrounds. 
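Returning to the `GridAssigner.assign` step 2 completed just above: with a tuple `neg_iou_thr`, only priors whose best IoU falls inside the band are marked background. A minimal sketch, assuming an invented band of (0.1, 0.3]:

```python
import torch

max_overlaps = torch.tensor([0.05, 0.20, 0.30, 0.45])
assigned = torch.full((4,), -1, dtype=torch.long)
lo, hi = 0.1, 0.3
# mirrors the tuple branch above: strictly above lo, at most hi
assigned[(max_overlaps > lo) & (max_overlaps <= hi)] = 0
print(assigned)  # tensor([-1,  0,  0, -1]); the rest stay "don't care"
```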
Thus each query + prediction will be assigned with `0` or a positive integer indicating the + ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + match_costs (:obj:`ConfigDict` or dict or \ + List[Union[:obj:`ConfigDict`, dict]]): Match cost configs. + """ + + def __init__( + self, match_costs: Union[List[Union[dict, ConfigDict]], dict, + ConfigDict] + ) -> None: + + if isinstance(match_costs, dict): + match_costs = [match_costs] + elif isinstance(match_costs, list): + assert len(match_costs) > 0, \ + 'match_costs must not be an empty list.' + + self.match_costs = [ + TASK_UTILS.build(match_cost) for match_cost in match_costs + ] + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> AssignResult: + """Computes one-to-one matching based on the weighted costs. + + This method assigns each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. It may include ``masks``, with shape + (n, h, w) or (n, l). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + ``labels``, with shape (k, ) and ``masks``, with shape + (k, h, w) or (k, l). + img_meta (dict): Image information. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert isinstance(gt_instances.labels, Tensor) + num_gts, num_preds = len(gt_instances), len(pred_instances) + gt_labels = gt_instances.labels + device = gt_labels.device + + # 1. assign -1 by default + assigned_gt_inds = torch.full((num_preds, ), + -1, + dtype=torch.long, + device=device) + assigned_labels = torch.full((num_preds, ), + -1, + dtype=torch.long, + device=device) + + if num_gts == 0 or num_preds == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) + + # 2. compute weighted cost + cost_list = [] + for match_cost in self.match_costs: + cost = match_cost( + pred_instances=pred_instances, + gt_instances=gt_instances, + img_meta=img_meta) + cost_list.append(cost) + cost = torch.stack(cost_list).sum(dim=0) + + # 3.
do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to(device) + matched_col_inds = torch.from_numpy(matched_col_inds).to(device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/iou2d_calculator.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/iou2d_calculator.py new file mode 100644 index 0000000000000000000000000000000000000000..b6daa94feb46ac2f188df41c7be59ffdc3905e58 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/iou2d_calculator.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps, get_box_tensor + + +def cast_tensor_type(x, scale=1., dtype=None): + if dtype == 'fp16': + # scale is for preventing overflows + x = (x / scale).half() + return x + + +@TASK_UTILS.register_module() +class BboxOverlaps2D: + """2D Overlaps (e.g. IoUs, GIoUs) Calculator.""" + + def __init__(self, scale=1., dtype=None): + self.scale = scale + self.dtype = dtype + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. + + Args: + bboxes1 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4) + in <x1, y1, x2, y2> format, or shape (m, 5) in <x1, y1, x2, y2, score> format. + bboxes2 (Tensor or :obj:`BaseBoxes`): bboxes have shape (m, 4) + in <x1, y1, x2, y2> format, shape (m, 5) in <x1, y1, x2, y2, score> format, or be empty. If ``is_aligned`` is ``True``, + then m and n must be equal. + mode (str): "iou" (intersection over union), "iof" (intersection + over foreground), or "giou" (generalized intersection over + union). + is_aligned (bool, optional): If True, then m and n must be equal. + Default False.
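To make step 3 of `HungarianAssigner` above concrete, here is a toy run of `scipy.optimize.linear_sum_assignment` on a made-up 4-predictions-by-3-gts cost matrix:

```python
import torch
from scipy.optimize import linear_sum_assignment

cost = torch.tensor([[4., 1., 3.],
                     [2., 0., 5.],
                     [3., 2., 2.],
                     [0., 2., 4.]])  # (num_preds, num_gts)
rows, cols = linear_sum_assignment(cost.numpy())
print(rows, cols)  # [1 2 3] [1 2 0]: minimal total cost 0 + 2 + 0 = 2
# As in step 4 above, prediction rows[i] gets gt index cols[i] + 1,
# and the unmatched prediction 0 is assigned 0 (background).
```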
+ + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + bboxes1 = get_box_tensor(bboxes1) + bboxes2 = get_box_tensor(bboxes2) + assert bboxes1.size(-1) in [0, 4, 5] + assert bboxes2.size(-1) in [0, 4, 5] + if bboxes2.size(-1) == 5: + bboxes2 = bboxes2[..., :4] + if bboxes1.size(-1) == 5: + bboxes1 = bboxes1[..., :4] + + if self.dtype == 'fp16': + # change tensor type to save cpu and cuda memory and keep speed + bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype) + bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype) + overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned) + if not overlaps.is_cuda and overlaps.dtype == torch.float16: + # cast the CPU result back to float32 + overlaps = overlaps.float() + return overlaps + + return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned) + + def __repr__(self): + """str: a string describing the module""" + repr_str = self.__class__.__name__ + f'(' \ + f'scale={self.scale}, dtype={self.dtype})' + return repr_str + + +@TASK_UTILS.register_module() +class BboxOverlaps2D_GLIP(BboxOverlaps2D): + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + TO_REMOVE = 1 + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + TO_REMOVE) * ( + bboxes1[:, 3] - bboxes1[:, 1] + TO_REMOVE) + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + TO_REMOVE) * ( + bboxes2[:, 3] - bboxes2[:, 1] + TO_REMOVE) + + lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [N,M,2] + rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt + TO_REMOVE).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + iou = inter / (area1[:, None] + area2 - inter) + return iou diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/match_cost.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/match_cost.py new file mode 100644 index 0000000000000000000000000000000000000000..5fc62f01f29138cba31ef2b41254f497351fe0d0 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/match_cost.py @@ -0,0 +1,525 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import Optional, Union + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcywh + + +class BaseMatchCost: + """Base match cost class. + + Args: + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, weight: Union[float, int] = 1.) -> None: + self.weight = weight + + @abstractmethod + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + img_meta (dict, optional): Image information. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts).
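A by-hand check of the two calculators above on one pair of xyxy boxes. The helper below is a sketch that mirrors the GLIP `__call__` with `TO_REMOVE` exposed as a parameter: 0 reproduces the standard corner-based IoU, 1 the inclusive-pixel GLIP variant.

```python
import torch

b1 = torch.tensor([[0., 0., 10., 10.]])
b2 = torch.tensor([[5., 5., 15., 15.]])

def iou(b1, b2, to_remove=0.0):
    # pairwise intersection corners, broadcast to (N, M, 2)
    lt = torch.max(b1[:, None, :2], b2[:, :2])
    rb = torch.min(b1[:, None, 2:], b2[:, 2:])
    wh = (rb - lt + to_remove).clamp(min=0)
    inter = wh[..., 0] * wh[..., 1]
    area1 = (b1[:, 2] - b1[:, 0] + to_remove) * (b1[:, 3] - b1[:, 1] + to_remove)
    area2 = (b2[:, 2] - b2[:, 0] + to_remove) * (b2[:, 3] - b2[:, 1] + to_remove)
    return inter / (area1[:, None] + area2 - inter)

print(iou(b1, b2))       # tensor([[0.1429]]): 25 / (100 + 100 - 25)
print(iou(b1, b2, 1.0))  # tensor([[0.1748]]): 36 / (121 + 121 - 36)
```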
+ """ + pass + + +@TASK_UTILS.register_module() +class BBoxL1Cost(BaseMatchCost): + """BBoxL1Cost. + + Note: ``bboxes`` in ``InstanceData`` passed in is of format 'xyxy' + and its coordinates are unnormalized. + + Args: + box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN. + Defaults to 'xyxy'. + weight (Union[float, int]): Cost weight. Defaults to 1. + + Examples: + >>> from mmdet.models.task_modules.assigners. + ... match_costs.match_cost import BBoxL1Cost + >>> import torch + >>> self = BBoxL1Cost() + >>> bbox_pred = torch.rand(1, 4) + >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(bbox_pred, gt_bboxes, factor) + tensor([[1.6172, 1.6422]]) + """ + + def __init__(self, + box_format: str = 'xyxy', + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + assert box_format in ['xyxy', 'xywh'] + self.box_format = box_format + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): ``bboxes`` inside is + predicted boxes with unnormalized coordinate + (x, y, x, y). + gt_instances (:obj:`InstanceData`): ``bboxes`` inside is gt + bboxes with unnormalized coordinate (x, y, x, y). + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_bboxes = pred_instances.bboxes + gt_bboxes = gt_instances.bboxes + + # convert box format + if self.box_format == 'xywh': + gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) + pred_bboxes = bbox_xyxy_to_cxcywh(pred_bboxes) + + # normalized + img_h, img_w = img_meta['img_shape'] + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + gt_bboxes = gt_bboxes / factor + pred_bboxes = pred_bboxes / factor + + bbox_cost = torch.cdist(pred_bboxes, gt_bboxes, p=1) + return bbox_cost * self.weight + + +@TASK_UTILS.register_module() +class IoUCost(BaseMatchCost): + """IoUCost. + + Note: ``bboxes`` in ``InstanceData`` passed in is of format 'xyxy' + and its coordinates are unnormalized. + + Args: + iou_mode (str): iou mode such as 'iou', 'giou'. Defaults to 'giou'. + weight (Union[float, int]): Cost weight. Defaults to 1. + + Examples: + >>> from mmdet.models.task_modules.assigners. + ... match_costs.match_cost import IoUCost + >>> import torch + >>> self = IoUCost() + >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) + >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> self(bboxes, gt_bboxes) + tensor([[-0.1250, 0.1667], + [ 0.1667, -0.5000]]) + """ + + def __init__(self, iou_mode: str = 'giou', weight: Union[float, int] = 1.): + super().__init__(weight=weight) + self.iou_mode = iou_mode + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs): + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): ``bboxes`` inside is + predicted boxes with unnormalized coordinate + (x, y, x, y). + gt_instances (:obj:`InstanceData`): ``bboxes`` inside is gt + bboxes with unnormalized coordinate (x, y, x, y). + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). 
+ """ + pred_bboxes = pred_instances.bboxes + gt_bboxes = gt_instances.bboxes + + # avoid fp16 overflow + if pred_bboxes.dtype == torch.float16: + fp16 = True + pred_bboxes = pred_bboxes.to(torch.float32) + else: + fp16 = False + + overlaps = bbox_overlaps( + pred_bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) + + if fp16: + overlaps = overlaps.to(torch.float16) + + # The 1 is a constant that doesn't change the matching, so omitted. + iou_cost = -overlaps + return iou_cost * self.weight + + +@TASK_UTILS.register_module() +class ClassificationCost(BaseMatchCost): + """ClsSoftmaxCost. + + Args: + weight (Union[float, int]): Cost weight. Defaults to 1. + + Examples: + >>> from mmdet.models.task_modules.assigners. + ... match_costs.match_cost import ClassificationCost + >>> import torch + >>> self = ClassificationCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3430, -0.3525, -0.3045], + [-0.3077, -0.2931, -0.3992], + [-0.3664, -0.3455, -0.2881], + [-0.3343, -0.2701, -0.3956]]) + """ + + def __init__(self, weight: Union[float, int] = 1) -> None: + super().__init__(weight=weight) + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): ``scores`` inside is + predicted classification logits, of shape + (num_queries, num_class). + gt_instances (:obj:`InstanceData`): ``labels`` inside should have + shape (num_gt, ). + img_meta (Optional[dict]): _description_. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_scores = pred_instances.scores + gt_labels = gt_instances.labels + + pred_scores = pred_scores.softmax(-1) + cls_cost = -pred_scores[:, gt_labels] + + return cls_cost * self.weight + + +@TASK_UTILS.register_module() +class FocalLossCost(BaseMatchCost): + """FocalLossCost. + + Args: + alpha (Union[float, int]): focal_loss alpha. Defaults to 0.25. + gamma (Union[float, int]): focal_loss gamma. Defaults to 2. + eps (float): Defaults to 1e-12. + binary_input (bool): Whether the input is binary. Currently, + binary_input = True is for masks input, binary_input = False + is for label input. Defaults to False. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + alpha: Union[float, int] = 0.25, + gamma: Union[float, int] = 2, + eps: float = 1e-12, + binary_input: bool = False, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.alpha = alpha + self.gamma = gamma + self.eps = eps + self.binary_input = binary_input + + def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_queries, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] + return cls_cost * self.weight + + def _mask_focal_loss_cost(self, cls_pred, gt_labels) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits. 
+ in shape (num_queries, d1, ..., dn), dtype=torch.float32. + gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), + dtype=torch.long. Labels should be binary. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_queries, num_gt). + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \ + torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) + return cls_cost / n * self.weight + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + if self.binary_input: + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + return self._mask_focal_loss_cost(pred_masks, gt_masks) + else: + pred_scores = pred_instances.scores + gt_labels = gt_instances.labels + return self._focal_loss_cost(pred_scores, gt_labels) + + +@TASK_UTILS.register_module() +class BinaryFocalLossCost(FocalLossCost): + + def _focal_loss_cost(self, cls_pred: Tensor, gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_queries, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \ + torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) + return cls_cost * self.weight + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + # gt_instances.text_token_mask is a repeated tensor of the same length + # of instances. Only gt_instances.text_token_mask[0] is useful + text_token_mask = torch.nonzero( + gt_instances.text_token_mask[0]).squeeze(-1) + pred_scores = pred_instances.scores[:, text_token_mask] + gt_labels = gt_instances.positive_maps[:, text_token_mask] + return self._focal_loss_cost(pred_scores, gt_labels) + + +@TASK_UTILS.register_module() +class DiceCost(BaseMatchCost): + """Cost of mask assignments based on dice losses. + + Args: + pred_act (bool): Whether to apply sigmoid to mask_pred. + Defaults to False. + eps (float): Defaults to 1e-3. 
+ naive_dice (bool): If True, use the naive dice loss + in which the power of the number in the denominator is + the first power. If False, use the second power that + is adopted by K-Net and SOLO. Defaults to True. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + pred_act: bool = False, + eps: float = 1e-3, + naive_dice: bool = True, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.pred_act = pred_act + self.eps = eps + self.naive_dice = naive_dice + + def _binary_mask_dice_loss(self, mask_preds: Tensor, + gt_masks: Tensor) -> Tensor: + """ + Args: + mask_preds (Tensor): Mask prediction in shape (num_queries, *). + gt_masks (Tensor): Ground truth in shape (num_gt, *) + store 0 or 1, 0 for negative class and 1 for + positive class. + + Returns: + Tensor: Dice cost matrix in shape (num_queries, num_gt). + """ + mask_preds = mask_preds.flatten(1) + gt_masks = gt_masks.flatten(1).float() + numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks) + if self.naive_dice: + denominator = mask_preds.sum(-1)[:, None] + \ + gt_masks.sum(-1)[None, :] + else: + denominator = mask_preds.pow(2).sum(1)[:, None] + \ + gt_masks.pow(2).sum(1)[None, :] + loss = 1 - (numerator + self.eps) / (denominator + self.eps) + return loss + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``mask``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + + if self.pred_act: + pred_masks = pred_masks.sigmoid() + dice_cost = self._binary_mask_dice_loss(pred_masks, gt_masks) + return dice_cost * self.weight + + +@TASK_UTILS.register_module() +class CrossEntropyLossCost(BaseMatchCost): + """CrossEntropyLossCost. + + Args: + use_sigmoid (bool): Whether the prediction uses sigmoid + of softmax. Defaults to True. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + use_sigmoid: bool = True, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred: Tensor, + gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): The prediction with shape (num_queries, 1, *) or + (num_queries, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + + Returns: + Tensor: Cross entropy cost matrix in shape (num_queries, num_gt). + """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits( + cls_pred, torch.ones_like(cls_pred), reduction='none') + neg = F.binary_cross_entropy_with_logits( + cls_pred, torch.zeros_like(cls_pred), reduction='none') + cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \ + torch.einsum('nc,mc->nm', neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: Optional[dict] = None, + **kwargs) -> Tensor: + """Compute match cost. 
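The `'nc,mc->nm'` einsum used by `_mask_focal_loss_cost`, `BinaryFocalLossCost`, and `_binary_cross_entropy` above is a pairwise expansion: because the targets are binary, the loss of prediction n against gt m splits into a positive part weighted by `gt` and a negative part weighted by `1 - gt`. A self-contained check against a direct loop (shapes and values are made up):

```python
import torch
import torch.nn.functional as F

n_pred, n_gt, n_pix = 2, 3, 4
logits = torch.randn(n_pred, n_pix)
gt = torch.randint(0, 2, (n_gt, n_pix)).float()

pos = F.binary_cross_entropy_with_logits(
    logits, torch.ones_like(logits), reduction='none')   # loss vs all-ones
neg = F.binary_cross_entropy_with_logits(
    logits, torch.zeros_like(logits), reduction='none')  # loss vs all-zeros
fast = (torch.einsum('nc,mc->nm', pos, gt) +
        torch.einsum('nc,mc->nm', neg, 1 - gt)) / n_pix

# direct O(n*m) reference: mean BCE of prediction i against gt j
slow = torch.stack([
    F.binary_cross_entropy_with_logits(
        logits[i].expand(n_gt, -1), gt, reduction='none').mean(-1)
    for i in range(n_pred)
])
assert torch.allclose(fast, slow)
```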
+ + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``scores`` or ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``labels`` or ``masks``. + img_meta (Optional[dict]): Image information. Defaults to None. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(pred_masks, gt_masks) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/max_iou_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/max_iou_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..71da54429ae0526bf52277bc3b1d24630acceaed --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/max_iou_assigner.py @@ -0,0 +1,325 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from typing import Optional, Union + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def _perm_box(bboxes, + iou_calculator, + iou_thr=0.97, + perm_range=0.01, + counter=0, + max_iter=5): + """Compute the permuted bboxes. + + Args: + bboxes (Tensor): Shape (n, 4) for <x1, y1, x2, y2>, "xyxy" format. + iou_calculator (obj): Overlaps Calculator. + iou_thr (float): The permuted bboxes should have IoU > iou_thr. + perm_range (float): The scale of permutation. + counter (int): Counter of permutation iteration. + max_iter (int): The max iterations of permutation. + Returns: + Tensor: The permuted bboxes. + """ + ori_bboxes = copy.deepcopy(bboxes) + is_valid = True + N = bboxes.size(0) + perm_factor = bboxes.new_empty(N, 4).uniform_(1 - perm_range, + 1 + perm_range) + bboxes *= perm_factor + new_wh = bboxes[:, 2:] - bboxes[:, :2] + if (new_wh <= 0).any(): + is_valid = False + iou = iou_calculator(ori_bboxes.unique(dim=0), bboxes) + if (iou < iou_thr).any(): + is_valid = False + if not is_valid and counter < max_iter: + return _perm_box( + ori_bboxes, + iou_calculator, + perm_range=max(perm_range - counter * 0.001, 1e-3), + counter=counter + 1) + return bboxes + + +def perm_repeat_bboxes(bboxes, iou_calculator=None, perm_repeat_cfg=None): + """Permute the repeated bboxes. + + Args: + bboxes (Tensor): Shape (n, 4) for <x1, y1, x2, y2>, "xyxy" format. + iou_calculator (obj): Overlaps Calculator. + perm_repeat_cfg (Dict): Config of permutation. + Returns: + Tensor: Bboxes after permuting the repeated bboxes. + """ + assert isinstance(bboxes, torch.Tensor) + if iou_calculator is None: + import torchvision + iou_calculator = torchvision.ops.box_iou + bboxes = copy.deepcopy(bboxes) + unique_bboxes = bboxes.unique(dim=0) + iou_thr = perm_repeat_cfg.get('iou_thr', 0.97) + perm_range = perm_repeat_cfg.get('perm_range', 0.01) + for box in unique_bboxes: + inds = (bboxes == box).sum(-1).float() == 4 + if inds.float().sum().item() == 1: + continue + bboxes[inds] = _perm_box( + bboxes[inds], + iou_calculator, + iou_thr=iou_thr, + perm_range=perm_range, + counter=0) + return bboxes + + +@TASK_UTILS.register_module() +class MaxIoUAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposal will be assigned with `-1`, or a semi-positive integer + indicating the ground truth index.
+ + - -1: negative sample, no assigned gt + - semi-positive integer: positive sample, index (0-based) of assigned gt + + Args: + pos_iou_thr (float): IoU threshold for positive bboxes. + neg_iou_thr (float or tuple): IoU threshold for negative bboxes. + min_pos_iou (float): Minimum iou for a bbox to be considered as a + positive bbox. Positive samples can have smaller IoU than + pos_iou_thr due to the 4th step (assign max IoU sample to each gt). + `min_pos_iou` is set to avoid assigning bboxes that have extremely + small iou with GT as positive samples. It brings about 0.3 mAP + improvements in 1x schedule but does not affect the performance of + 3x schedule. More comparisons can be found in + `PR #7464 <https://github.com/open-mmlab/mmdetection/pull/7464>`_. + gt_max_assign_all (bool): Whether to assign all bboxes with the same + highest overlap with some gt to that gt. + ignore_iof_thr (float): IoF threshold for ignoring bboxes (if + `gt_bboxes_ignore` is specified). Negative values mean not + ignoring any bboxes. + ignore_wrt_candidates (bool): Whether to compute the iof between + `bboxes` and `gt_bboxes_ignore`, or the contrary. + match_low_quality (bool): Whether to allow low quality matches. This is + usually allowed for RPN and single stage detectors, but not allowed + in the second stage. Details are demonstrated in Step 4. + gpu_assign_thr (int): The upper bound of the number of GT for GPU + assign. When the number of gt is above this threshold, will assign + on CPU device. Negative values mean not assign on CPU. + iou_calculator (dict): Config of overlaps Calculator. + perm_repeat_gt_cfg (dict): Config of permute repeated gt bboxes. + """ + + def __init__(self, + pos_iou_thr: float, + neg_iou_thr: Union[float, tuple], + min_pos_iou: float = .0, + gt_max_assign_all: bool = True, + ignore_iof_thr: float = -1, + ignore_wrt_candidates: bool = True, + match_low_quality: bool = True, + gpu_assign_thr: float = -1, + iou_calculator: dict = dict(type='BboxOverlaps2D'), + perm_repeat_gt_cfg=None): + self.pos_iou_thr = pos_iou_thr + self.neg_iou_thr = neg_iou_thr + self.min_pos_iou = min_pos_iou + self.gt_max_assign_all = gt_max_assign_all + self.ignore_iof_thr = ignore_iof_thr + self.ignore_wrt_candidates = ignore_wrt_candidates + self.gpu_assign_thr = gpu_assign_thr + self.match_low_quality = match_low_quality + self.iou_calculator = TASK_UTILS.build(iou_calculator) + self.perm_repeat_gt_cfg = perm_repeat_gt_cfg + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. + + This method assigns a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, or a semi-positive number. -1 means negative + sample, semi-positive number is the index (0-based) of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. assign every bbox to the background + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each bbox, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals (may be more than + one) to itself + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places.
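As an aside on the `perm_repeat_bboxes` and `_perm_box` helpers defined further up: exactly-duplicated gt boxes (which do occur in crowd datasets) are found by elementwise coordinate comparison, then jittered so that max-IoU assignment can tell them apart. A tiny sketch of the detection step, with toy numbers:

```python
import torch

bboxes = torch.tensor([[0., 0., 10., 10.],
                       [0., 0., 10., 10.],   # exact duplicate
                       [5., 5., 20., 20.]])
box = torch.tensor([0., 0., 10., 10.])
inds = (bboxes == box).sum(-1).float() == 4  # all four coords equal
print(inds)  # tensor([ True,  True, False])
# _perm_box then rescales those rows by factors drawn from
# [1 - perm_range, 1 + perm_range] (default 0.01) and retries until the
# jittered boxes still overlap the originals with IoU > 0.97.
```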
+ gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + + Example: + >>> from mmengine.structures import InstanceData + >>> self = MaxIoUAssigner(0.5, 0.5) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = torch.Tensor([[0, 0, 10, 10], + ... [10, 10, 20, 20]]) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[0, 0, 10, 9]]) + >>> gt_instances.labels = torch.Tensor([0]) + >>> assign_result = self.assign(pred_instances, gt_instances) + >>> expected_gt_inds = torch.LongTensor([1, 0]) + >>> assert torch.all(assign_result.gt_inds == expected_gt_inds) + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + if gt_instances_ignore is not None: + gt_bboxes_ignore = gt_instances_ignore.bboxes + else: + gt_bboxes_ignore = None + + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + gt_bboxes.shape[0] > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = priors.device + priors = priors.cpu() + gt_bboxes = gt_bboxes.cpu() + gt_labels = gt_labels.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + + if self.perm_repeat_gt_cfg is not None and priors.numel() > 0: + gt_bboxes_unique = perm_repeat_bboxes(gt_bboxes, + self.iou_calculator, + self.perm_repeat_gt_cfg) + else: + gt_bboxes_unique = gt_bboxes + overlaps = self.iou_calculator(gt_bboxes_unique, priors) + + if (self.ignore_iof_thr > 0 and gt_bboxes_ignore is not None + and gt_bboxes_ignore.numel() > 0 and priors.numel() > 0): + if self.ignore_wrt_candidates: + ignore_overlaps = self.iou_calculator( + priors, gt_bboxes_ignore, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=1) + else: + ignore_overlaps = self.iou_calculator( + gt_bboxes_ignore, priors, mode='iof') + ignore_max_overlaps, _ = ignore_overlaps.max(dim=0) + overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1 + + assign_result = self.assign_wrt_overlaps(overlaps, gt_labels) + if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result + + def assign_wrt_overlaps(self, overlaps: Tensor, + gt_labels: Tensor) -> AssignResult: + """Assign w.r.t. the overlaps of priors with gts. + + Args: + overlaps (Tensor): Overlaps between k gt_bboxes and n bboxes, + shape(k, n). + gt_labels (Tensor): Labels of k gt_bboxes, shape (k, ). + + Returns: + :obj:`AssignResult`: The assign result. + """ + num_gts, num_bboxes = overlaps.size(0), overlaps.size(1) + + # 1. 
assign -1 by default + assigned_gt_inds = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = overlaps.new_zeros((num_bboxes, )) + assigned_labels = overlaps.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0: + # No truth, assign everything to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=1) + + # 2. assign negative: below + # the negative inds are set to be 0 + if isinstance(self.neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps < self.neg_iou_thr)] = 0 + elif isinstance(self.neg_iou_thr, tuple): + assert len(self.neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps >= self.neg_iou_thr[0]) + & (max_overlaps < self.neg_iou_thr[1])] = 0 + + # 3. assign positive: above positive IoU threshold + pos_inds = max_overlaps >= self.pos_iou_thr + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + if self.match_low_quality: + # Low-quality matching will overwrite the assigned_gt_inds assigned + # in Step 3. Thus, the assigned gt might not be the best one for + # prediction. + # For example, if bbox A has 0.9 and 0.8 iou with GT bbox 1 & 2, + # bbox 1 will be assigned as the best target for bbox A in step 3. + # However, if GT bbox 2's gt_argmax_overlaps = A, bbox A's + # assigned_gt_inds will be overwritten to be bbox 2. + # This might be the reason that it is not used in ROI Heads. + for i in range(num_gts): + if gt_max_overlaps[i] >= self.min_pos_iou: + if self.gt_max_assign_all: + max_iou_inds = overlaps[i, :] == gt_max_overlaps[i] + assigned_gt_inds[max_iou_inds] = i + 1 + else: + assigned_gt_inds[gt_argmax_overlaps[i]] = i + 1 + + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/multi_instance_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/multi_instance_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba32afe856b3c2ad03ed89562d080f15b6ccf30 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/multi_instance_assigner.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .max_iou_assigner import MaxIoUAssigner + + +@TASK_UTILS.register_module() +class MultiInstanceAssigner(MaxIoUAssigner): + """Assign a corresponding gt bbox or background to each proposal bbox. If + we need to use a proposal box to generate multiple predict boxes, + `MultiInstanceAssigner` can assign multiple gt to each proposal box. 
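A toy trace of `assign_wrt_overlaps` above (assumed settings: pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.1, gt_max_assign_all=True), showing how the low-quality step can overwrite a better match, exactly as the inline comments warn:

```python
import torch

overlaps = torch.tensor([[0.90, 0.40, 0.15],    # gt 1 vs the 3 priors
                         [0.80, 0.20, 0.05]])   # gt 2 vs the 3 priors
assigned = torch.full((3,), -1, dtype=torch.long)
max_ov, argmax = overlaps.max(dim=0)                 # [0.90, 0.40, 0.15]
assigned[(max_ov >= 0) & (max_ov < 0.3)] = 0         # prior 2 -> background
pos = max_ov >= 0.5
assigned[pos] = argmax[pos] + 1                      # prior 0 -> gt 1
# step 4: each gt claims its own best prior; gt 2's best prior is also
# prior 0 (0.80 >= min_pos_iou), so it overwrites the better match
gt_max, gt_argmax = overlaps.max(dim=1)
for i in range(2):
    if gt_max[i] >= 0.1:
        assigned[overlaps[i, :] == gt_max[i]] = i + 1
print(assigned)  # tensor([ 2, -1,  0])
```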
+ + Args: + num_instance (int): How many bboxes are predicted by each proposal box. + """ + + def __init__(self, num_instance: int = 2, **kwargs): + super().__init__(**kwargs) + self.num_instance = num_instance + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to bboxes. + + This method assign gt bboxes to every bbox (proposal/anchor), each bbox + is assigned a set of gts, and the number of gts in this set is defined + by `self.num_instance`. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + # Set the FG label to 1 and add ignored annotations + gt_labels = gt_instances.labels + 1 + if gt_instances_ignore is not None: + gt_bboxes_ignore = gt_instances_ignore.bboxes + if hasattr(gt_instances_ignore, 'labels'): + gt_labels_ignore = gt_instances_ignore.labels + else: + gt_labels_ignore = torch.ones_like(gt_bboxes_ignore)[:, 0] * -1 + else: + gt_bboxes_ignore = None + gt_labels_ignore = None + + assign_on_cpu = True if (self.gpu_assign_thr > 0) and ( + gt_bboxes.shape[0] > self.gpu_assign_thr) else False + # compute overlap and assign gt on CPU when number of GT is large + if assign_on_cpu: + device = priors.device + priors = priors.cpu() + gt_bboxes = gt_bboxes.cpu() + gt_labels = gt_labels.cpu() + if gt_bboxes_ignore is not None: + gt_bboxes_ignore = gt_bboxes_ignore.cpu() + gt_labels_ignore = gt_labels_ignore.cpu() + + if gt_bboxes_ignore is not None: + all_bboxes = torch.cat([gt_bboxes, gt_bboxes_ignore], dim=0) + all_labels = torch.cat([gt_labels, gt_labels_ignore], dim=0) + else: + all_bboxes = gt_bboxes + all_labels = gt_labels + all_priors = torch.cat([priors, all_bboxes], dim=0) + + overlaps_normal = self.iou_calculator( + all_priors, all_bboxes, mode='iou') + overlaps_ignore = self.iou_calculator( + all_priors, all_bboxes, mode='iof') + gt_ignore_mask = all_labels.eq(-1).repeat(all_priors.shape[0], 1) + overlaps_normal = overlaps_normal * ~gt_ignore_mask + overlaps_ignore = overlaps_ignore * gt_ignore_mask + + overlaps_normal, overlaps_normal_indices = overlaps_normal.sort( + descending=True, dim=1) + overlaps_ignore, overlaps_ignore_indices = overlaps_ignore.sort( + descending=True, dim=1) + + # select the roi with the higher score + max_overlaps_normal = overlaps_normal[:, :self.num_instance].flatten() + gt_assignment_normal = overlaps_normal_indices[:, :self. + num_instance].flatten() + max_overlaps_ignore = overlaps_ignore[:, :self.num_instance].flatten() + gt_assignment_ignore = overlaps_ignore_indices[:, :self. 
+ num_instance].flatten() + + # ignore or not + ignore_assign_mask = (max_overlaps_normal < self.pos_iou_thr) * ( + max_overlaps_ignore > max_overlaps_normal) + overlaps = (max_overlaps_normal * ~ignore_assign_mask) + ( + max_overlaps_ignore * ignore_assign_mask) + gt_assignment = (gt_assignment_normal * ~ignore_assign_mask) + ( + gt_assignment_ignore * ignore_assign_mask) + + assigned_labels = all_labels[gt_assignment] + fg_mask = (overlaps >= self.pos_iou_thr) * (assigned_labels != -1) + bg_mask = (overlaps < self.neg_iou_thr) * (overlaps >= 0) + assigned_labels[fg_mask] = 1 + assigned_labels[bg_mask] = 0 + + overlaps = overlaps.reshape(-1, self.num_instance) + gt_assignment = gt_assignment.reshape(-1, self.num_instance) + assigned_labels = assigned_labels.reshape(-1, self.num_instance) + + assign_result = AssignResult( + num_gts=all_bboxes.size(0), + gt_inds=gt_assignment, + max_overlaps=overlaps, + labels=assigned_labels) + + if assign_on_cpu: + assign_result.gt_inds = assign_result.gt_inds.to(device) + assign_result.max_overlaps = assign_result.max_overlaps.to(device) + if assign_result.labels is not None: + assign_result.labels = assign_result.labels.to(device) + return assign_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/point_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/point_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..4da60a490b0022ac76c46db8a34f814bc9da8e2e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/point_assigner.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class PointAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each point. + + Each proposals will be assigned with `0`, or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + """ + + def __init__(self, scale: int = 4, pos_num: int = 3) -> None: + self.scale = scale + self.pos_num = pos_num + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to points. + + This method assign a gt bbox to every points set, each points set + will be assigned with the background_label (-1), or a label number. + -1 is background, and semi-positive number is the index (0-based) of + assigned gt. + The assignment is done in following steps, the order matters. + + 1. assign every points to the background_label (-1) + 2. A point is assigned to some gt bbox if + (i) the point is within the k closest points to the gt bbox + (ii) the distance between this point and the gt is smaller than + other gt bboxes + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + + + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. 
It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + Returns: + :obj:`AssignResult`: The assign result. + """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + # points to be assigned, shape(n, 3) while last + # dimension stands for (x, y, stride). + points = pred_instances.priors + + num_points = points.shape[0] + num_gts = gt_bboxes.shape[0] + + if num_gts == 0 or num_points == 0: + # If no truth assign everything to the background + assigned_gt_inds = points.new_full((num_points, ), + 0, + dtype=torch.long) + assigned_labels = points.new_full((num_points, ), + -1, + dtype=torch.long) + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) + + points_xy = points[:, :2] + points_stride = points[:, 2] + points_lvl = torch.log2( + points_stride).int() # [3...,4...,5...,6...,7...] + lvl_min, lvl_max = points_lvl.min(), points_lvl.max() + + # assign gt box + gt_bboxes_xy = (gt_bboxes[:, :2] + gt_bboxes[:, 2:]) / 2 + gt_bboxes_wh = (gt_bboxes[:, 2:] - gt_bboxes[:, :2]).clamp(min=1e-6) + scale = self.scale + gt_bboxes_lvl = ((torch.log2(gt_bboxes_wh[:, 0] / scale) + + torch.log2(gt_bboxes_wh[:, 1] / scale)) / 2).int() + gt_bboxes_lvl = torch.clamp(gt_bboxes_lvl, min=lvl_min, max=lvl_max) + + # stores the assigned gt index of each point + assigned_gt_inds = points.new_zeros((num_points, ), dtype=torch.long) + # stores the assigned gt dist (to this point) of each point + assigned_gt_dist = points.new_full((num_points, ), float('inf')) + points_range = torch.arange(points.shape[0]) + + for idx in range(num_gts): + gt_lvl = gt_bboxes_lvl[idx] + # get the index of points in this level + lvl_idx = gt_lvl == points_lvl + points_index = points_range[lvl_idx] + # get the points in this level + lvl_points = points_xy[lvl_idx, :] + # get the center point of gt + gt_point = gt_bboxes_xy[[idx], :] + # get width and height of gt + gt_wh = gt_bboxes_wh[[idx], :] + # compute the distance between gt center and + # all points in this level + points_gt_dist = ((lvl_points - gt_point) / gt_wh).norm(dim=1) + # find the nearest k points to gt center in this level + min_dist, min_dist_index = torch.topk( + points_gt_dist, self.pos_num, largest=False) + # the index of nearest k points to gt center in this level + min_dist_points_index = points_index[min_dist_index] + # The less_than_recorded_index stores the index + # of min_dist that is less then the assigned_gt_dist. Where + # assigned_gt_dist stores the dist from previous assigned gt + # (if exist) to each point. + less_than_recorded_index = min_dist < assigned_gt_dist[ + min_dist_points_index] + # The min_dist_points_index stores the index of points satisfy: + # (1) it is k nearest to current gt center in this level. + # (2) it is closer to current gt center than other gt center. 
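The level-mapping line earlier in `assign` above (`gt_bboxes_lvl = ...`) can be checked by hand. This sketch assumes `scale=4` and FPN strides 8 through 128, so `points_lvl` spans 3 to 7 as the inline comment notes:

```python
import torch

gt_wh = torch.tensor([[64., 64.]])
scale = 4
# mean of per-side log2(size / scale), truncated to int
lvl = ((torch.log2(gt_wh[:, 0] / scale) +
        torch.log2(gt_wh[:, 1] / scale)) / 2).int()
print(lvl)  # tensor([4], dtype=torch.int32): log2(16) = 4 on both sides,
            # so a 64x64 gt is matched against stride-16 points (before
            # the clamp to [lvl_min, lvl_max])
```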
+ min_dist_points_index = min_dist_points_index[ + less_than_recorded_index] + # assign the result + assigned_gt_inds[min_dist_points_index] = idx + 1 + assigned_gt_dist[min_dist_points_index] = min_dist[ + less_than_recorded_index] + + assigned_labels = assigned_gt_inds.new_full((num_points, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/region_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/region_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..df549143086c1195efaf12a2f3e81259da0e6c97 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/region_assigner.py @@ -0,0 +1,239 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from ..prior_generators import anchor_inside_flags +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +def calc_region( + bbox: Tensor, + ratio: float, + stride: int, + featmap_size: Optional[Tuple[int, int]] = None) -> Tuple[Tensor]: + """Calculate region of the box defined by the ratio, the ratio is from the + center of the box to every edge.""" + # project bbox on the feature + f_bbox = bbox / stride + x1 = torch.round((1 - ratio) * f_bbox[0] + ratio * f_bbox[2]) + y1 = torch.round((1 - ratio) * f_bbox[1] + ratio * f_bbox[3]) + x2 = torch.round(ratio * f_bbox[0] + (1 - ratio) * f_bbox[2]) + y2 = torch.round(ratio * f_bbox[1] + (1 - ratio) * f_bbox[3]) + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) + + +def anchor_ctr_inside_region_flags(anchors: Tensor, stride: int, + region: Tuple[Tensor]) -> Tensor: + """Get the flag indicate whether anchor centers are inside regions.""" + x1, y1, x2, y2 = region + f_anchors = anchors / stride + x = (f_anchors[:, 0] + f_anchors[:, 2]) * 0.5 + y = (f_anchors[:, 1] + f_anchors[:, 3]) * 0.5 + flags = (x >= x1) & (x <= x2) & (y >= y1) & (y <= y2) + return flags + + +@TASK_UTILS.register_module() +class RegionAssigner(BaseAssigner): + """Assign a corresponding gt bbox or background to each bbox. + + Each proposals will be assigned with `-1`, `0`, or a positive integer + indicating the ground truth index. + + - -1: don't care + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + center_ratio (float): ratio of the region in the center of the bbox to + define positive sample. + ignore_ratio (float): ratio of the region to define ignore samples. 
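A worked example of `calc_region` above, using the positive-region ratio `r1 = (1 - center_ratio) / 2 = 0.4` that `assign` derives below (the bbox, stride, and default center_ratio=0.2 are assumed here; the featmap clamp is omitted):

```python
import torch

bbox = torch.tensor([0., 0., 64., 64.])
stride = 8
r1 = 0.4
f = bbox / stride                               # [0, 0, 8, 8] on the feature map
x1 = torch.round((1 - r1) * f[0] + r1 * f[2])   # round(3.2) = 3.0
x2 = torch.round(r1 * f[0] + (1 - r1) * f[2])   # round(4.8) = 5.0
print(x1.item(), x2.item())  # 3.0 5.0: anchors whose centers fall in
                             # cells [3, 5] (same for y) are positive
```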
+ """ + + def __init__(self, + center_ratio: float = 0.2, + ignore_ratio: float = 0.5) -> None: + self.center_ratio = center_ratio + self.ignore_ratio = ignore_ratio + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + img_meta: dict, + featmap_sizes: List[Tuple[int, int]], + num_level_anchors: List[int], + anchor_scale: int, + anchor_strides: List[int], + gt_instances_ignore: Optional[InstanceData] = None, + allowed_border: int = 0) -> AssignResult: + """Assign gt to anchors. + + This method assign a gt bbox to every bbox (proposal/anchor), each bbox + will be assigned with -1, 0, or a positive number. -1 means don't care, + 0 means negative sample, positive number is the index (1-based) of + assigned gt. + + The assignment is done in following steps, and the order matters. + + 1. Assign every anchor to 0 (negative) + 2. (For each gt_bboxes) Compute ignore flags based on ignore_region + then assign -1 to anchors w.r.t. ignore flags + 3. (For each gt_bboxes) Compute pos flags based on center_region then + assign gt_bboxes to anchors w.r.t. pos flags + 4. (For each gt_bboxes) Compute ignore flags based on adjacent anchor + level then assign -1 to anchors w.r.t. ignore flags + 5. Assign anchor outside of image to -1 + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + img_meta (dict): Meta info of image. + featmap_sizes (list[tuple[int, int]]): Feature map size each level. + num_level_anchors (list[int]): The number of anchors in each level. + anchor_scale (int): Scale of the anchor. + anchor_strides (list[int]): Stride of the anchor. + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + allowed_border (int, optional): The border to allow the valid + anchor. Defaults to 0. + + Returns: + :obj:`AssignResult`: The assign result. 
+ """ + if gt_instances_ignore is not None: + raise NotImplementedError + + num_gts = len(gt_instances) + num_bboxes = len(pred_instances) + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + flat_anchors = pred_instances.priors + flat_valid_flags = pred_instances.valid_flags + mlvl_anchors = torch.split(flat_anchors, num_level_anchors) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = gt_bboxes.new_zeros((num_bboxes, )) + assigned_gt_inds = gt_bboxes.new_zeros((num_bboxes, ), + dtype=torch.long) + assigned_labels = gt_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gts=num_gts, + gt_inds=assigned_gt_inds, + max_overlaps=max_overlaps, + labels=assigned_labels) + + num_lvls = len(mlvl_anchors) + r1 = (1 - self.center_ratio) / 2 + r2 = (1 - self.ignore_ratio) / 2 + + scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + min_anchor_size = scale.new_full( + (1, ), float(anchor_scale * anchor_strides[0])) + target_lvls = torch.floor( + torch.log2(scale) - torch.log2(min_anchor_size) + 0.5) + target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long() + + # 1. assign 0 (negative) by default + mlvl_assigned_gt_inds = [] + mlvl_ignore_flags = [] + for lvl in range(num_lvls): + assigned_gt_inds = gt_bboxes.new_full((num_level_anchors[lvl], ), + 0, + dtype=torch.long) + ignore_flags = torch.zeros_like(assigned_gt_inds) + mlvl_assigned_gt_inds.append(assigned_gt_inds) + mlvl_ignore_flags.append(ignore_flags) + + for gt_id in range(num_gts): + lvl = target_lvls[gt_id].item() + featmap_size = featmap_sizes[lvl] + stride = anchor_strides[lvl] + anchors = mlvl_anchors[lvl] + gt_bbox = gt_bboxes[gt_id, :4] + + # Compute regions + ignore_region = calc_region(gt_bbox, r2, stride, featmap_size) + ctr_region = calc_region(gt_bbox, r1, stride, featmap_size) + + # 2. Assign -1 to ignore flags + ignore_flags = anchor_ctr_inside_region_flags( + anchors, stride, ignore_region) + mlvl_assigned_gt_inds[lvl][ignore_flags] = -1 + + # 3. Assign gt_bboxes to pos flags + pos_flags = anchor_ctr_inside_region_flags(anchors, stride, + ctr_region) + mlvl_assigned_gt_inds[lvl][pos_flags] = gt_id + 1 + + # 4. Assign -1 to ignore adjacent lvl + if lvl > 0: + d_lvl = lvl - 1 + d_anchors = mlvl_anchors[d_lvl] + d_featmap_size = featmap_sizes[d_lvl] + d_stride = anchor_strides[d_lvl] + d_ignore_region = calc_region(gt_bbox, r2, d_stride, + d_featmap_size) + ignore_flags = anchor_ctr_inside_region_flags( + d_anchors, d_stride, d_ignore_region) + mlvl_ignore_flags[d_lvl][ignore_flags] = 1 + if lvl < num_lvls - 1: + u_lvl = lvl + 1 + u_anchors = mlvl_anchors[u_lvl] + u_featmap_size = featmap_sizes[u_lvl] + u_stride = anchor_strides[u_lvl] + u_ignore_region = calc_region(gt_bbox, r2, u_stride, + u_featmap_size) + ignore_flags = anchor_ctr_inside_region_flags( + u_anchors, u_stride, u_ignore_region) + mlvl_ignore_flags[u_lvl][ignore_flags] = 1 + + # 4. (cont.) Assign -1 to ignore adjacent lvl + for lvl in range(num_lvls): + ignore_flags = mlvl_ignore_flags[lvl] + mlvl_assigned_gt_inds[lvl][ignore_flags == 1] = -1 + + # 5. 
Assign -1 to anchor outside of image + flat_assigned_gt_inds = torch.cat(mlvl_assigned_gt_inds) + assert (flat_assigned_gt_inds.shape[0] == flat_anchors.shape[0] == + flat_valid_flags.shape[0]) + inside_flags = anchor_inside_flags(flat_anchors, flat_valid_flags, + img_meta['img_shape'], + allowed_border) + outside_flags = ~inside_flags + flat_assigned_gt_inds[outside_flags] = -1 + + assigned_labels = torch.zeros_like(flat_assigned_gt_inds) + pos_flags = flat_assigned_gt_inds > 0 + assigned_labels[pos_flags] = gt_labels[flat_assigned_gt_inds[pos_flags] + - 1] + + return AssignResult( + num_gts=num_gts, + gt_inds=flat_assigned_gt_inds, + max_overlaps=None, + labels=assigned_labels) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/sim_ota_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/sim_ota_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..d54a8b91d132d9bf661267de666bfed7e915a65a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/sim_ota_assigner.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +INF = 100000.0 +EPS = 1.0e-7 + + +@TASK_UTILS.register_module() +class SimOTAAssigner(BaseAssigner): + """Computes matching between predictions and ground truth. + + Args: + center_radius (float): Ground truth center size + to judge whether a prior is in center. Defaults to 2.5. + candidate_topk (int): The candidate top-k which used to + get top-k ious to calculate dynamic-k. Defaults to 10. + iou_weight (float): The scale factor for regression + iou cost. Defaults to 3.0. + cls_weight (float): The scale factor for classification + cost. Defaults to 1.0. + iou_calculator (ConfigType): Config of overlaps Calculator. + Defaults to dict(type='BboxOverlaps2D'). + """ + + def __init__(self, + center_radius: float = 2.5, + candidate_topk: int = 10, + iou_weight: float = 3.0, + cls_weight: float = 1.0, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D')): + self.center_radius = center_radius + self.candidate_topk = candidate_topk + self.iou_weight = iou_weight + self.cls_weight = cls_weight + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + **kwargs) -> AssignResult: + """Assign gt to priors using SimOTA. + + Args: + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + gt_instances_ignore (:obj:`InstanceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + Returns: + obj:`AssignResult`: The assigned result. 
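
The heart of this assigner is the dynamic-k rule implemented in `dynamic_k_matching` further down in the file. A toy standalone sketch (not part of the patch; the IoU matrix is made up) of how each gt derives its own k from its top IoUs and then claims its cheapest priors:

    import torch

    pairwise_ious = torch.tensor([[0.1, 0.7],
                                  [0.6, 0.2],
                                  [0.5, 0.4],
                                  [0.2, 0.1],
                                  [0.8, 0.3],
                                  [0.3, 0.6]])          # 6 priors x 2 gts
    cost = -torch.log(pairwise_ious + 1e-7)             # stand-in for cls + iou cost

    topk_ious, _ = torch.topk(pairwise_ious, 3, dim=0)  # candidate_topk = 3
    # each gt takes k = clamp(truncated sum of its top-k IoUs, min=1) positives
    dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1)   # [1, 1]

    matching = torch.zeros_like(cost, dtype=torch.uint8)
    for gt_idx in range(cost.size(1)):
        _, pos_idx = torch.topk(cost[:, gt_idx], k=int(dynamic_ks[gt_idx]),
                                largest=False)
        matching[pos_idx, gt_idx] = 1
    print(matching.nonzero())  # prior 0 -> gt 1, prior 4 -> gt 0
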
+ """ + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + num_gt = gt_bboxes.size(0) + + decoded_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + priors = pred_instances.priors + num_bboxes = decoded_bboxes.size(0) + + # assign 0 by default + assigned_gt_inds = decoded_bboxes.new_full((num_bboxes, ), + 0, + dtype=torch.long) + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( + priors, gt_bboxes) + valid_decoded_bbox = decoded_bboxes[valid_mask] + valid_pred_scores = pred_scores[valid_mask] + num_valid = valid_decoded_bbox.size(0) + if num_valid == 0: + # No valid bboxes, return empty assignment + max_overlaps = decoded_bboxes.new_zeros((num_bboxes, )) + assigned_labels = decoded_bboxes.new_full((num_bboxes, ), + -1, + dtype=torch.long) + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + pairwise_ious = self.iou_calculator(valid_decoded_bbox, gt_bboxes) + iou_cost = -torch.log(pairwise_ious + EPS) + + gt_onehot_label = ( + F.one_hot(gt_labels.to(torch.int64), + pred_scores.shape[-1]).float().unsqueeze(0).repeat( + num_valid, 1, 1)) + + valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1) + # disable AMP autocast and calculate BCE with FP32 to avoid overflow + with torch.cuda.amp.autocast(enabled=False): + cls_cost = ( + F.binary_cross_entropy( + valid_pred_scores.to(dtype=torch.float32), + gt_onehot_label, + reduction='none', + ).sum(-1).to(dtype=valid_pred_scores.dtype)) + + cost_matrix = ( + cls_cost * self.cls_weight + iou_cost * self.iou_weight + + (~is_in_boxes_and_center) * INF) + + matched_pred_ious, matched_gt_inds = \ + self.dynamic_k_matching( + cost_matrix, pairwise_ious, num_gt, valid_mask) + + # convert to AssignResult format + assigned_gt_inds[valid_mask] = matched_gt_inds + 1 + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + assigned_labels[valid_mask] = gt_labels[matched_gt_inds].long() + max_overlaps = assigned_gt_inds.new_full((num_bboxes, ), + -INF, + dtype=torch.float32) + max_overlaps[valid_mask] = matched_pred_ious + return AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + + def get_in_gt_and_in_center_info( + self, priors: Tensor, gt_bboxes: Tensor) -> Tuple[Tensor, Tensor]: + """Get the information of which prior is in gt bboxes and gt center + priors.""" + num_gt = gt_bboxes.size(0) + + repeated_x = priors[:, 0].unsqueeze(1).repeat(1, num_gt) + repeated_y = priors[:, 1].unsqueeze(1).repeat(1, num_gt) + repeated_stride_x = priors[:, 2].unsqueeze(1).repeat(1, num_gt) + repeated_stride_y = priors[:, 3].unsqueeze(1).repeat(1, num_gt) + + # is prior centers in gt bboxes, shape: [n_prior, n_gt] + l_ = repeated_x - gt_bboxes[:, 0] + t_ = repeated_y - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - repeated_x + b_ = gt_bboxes[:, 3] - repeated_y + + deltas = torch.stack([l_, t_, r_, b_], dim=1) + is_in_gts = deltas.min(dim=1).values > 0 + is_in_gts_all = is_in_gts.sum(dim=1) > 0 + + # is prior centers in gt centers + gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 + gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 + ct_box_l = gt_cxs - self.center_radius * repeated_stride_x + ct_box_t = 
gt_cys - self.center_radius * repeated_stride_y + ct_box_r = gt_cxs + self.center_radius * repeated_stride_x + ct_box_b = gt_cys + self.center_radius * repeated_stride_y + + cl_ = repeated_x - ct_box_l + ct_ = repeated_y - ct_box_t + cr_ = ct_box_r - repeated_x + cb_ = ct_box_b - repeated_y + + ct_deltas = torch.stack([cl_, ct_, cr_, cb_], dim=1) + is_in_cts = ct_deltas.min(dim=1).values > 0 + is_in_cts_all = is_in_cts.sum(dim=1) > 0 + + # in boxes or in centers, shape: [num_priors] + is_in_gts_or_centers = is_in_gts_all | is_in_cts_all + + # both in boxes and centers, shape: [num_fg, num_gt] + is_in_boxes_and_centers = ( + is_in_gts[is_in_gts_or_centers, :] + & is_in_cts[is_in_gts_or_centers, :]) + return is_in_gts_or_centers, is_in_boxes_and_centers + + def dynamic_k_matching(self, cost: Tensor, pairwise_ious: Tensor, + num_gt: int, + valid_mask: Tensor) -> Tuple[Tensor, Tensor]: + """Use IoU and matching cost to calculate the dynamic top-k positive + targets.""" + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + # select candidate topk ious for dynamic-k calculation + candidate_topk = min(self.candidate_topk, pairwise_ious.size(0)) + topk_ious, _ = torch.topk(pairwise_ious, candidate_topk, dim=0) + # calculate dynamic k for each gt + dynamic_ks = torch.clamp(topk_ious.sum(0).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[:, gt_idx][pos_idx] = 1 + + del topk_ious, dynamic_ks, pos_idx + + prior_match_gt_mask = matching_matrix.sum(1) > 1 + if prior_match_gt_mask.sum() > 0: + cost_min, cost_argmin = torch.min( + cost[prior_match_gt_mask, :], dim=1) + matching_matrix[prior_match_gt_mask, :] *= 0 + matching_matrix[prior_match_gt_mask, cost_argmin] = 1 + # get foreground mask inside box and center prior + fg_mask_inboxes = matching_matrix.sum(1) > 0 + valid_mask[valid_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1) + matched_pred_ious = (matching_matrix * + pairwise_ious).sum(1)[fg_mask_inboxes] + return matched_pred_ious, matched_gt_inds diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/task_aligned_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/task_aligned_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..220ea8485933ab3243f6c1e205dbf1b973df08d7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/task_aligned_assigner.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +INF = 100000000 + + +@TASK_UTILS.register_module() +class TaskAlignedAssigner(BaseAssigner): + """Task aligned assigner used in the paper: + `TOOD: Task-aligned One-stage Object Detection. + `_. + + Assign a corresponding gt bbox or background to each predicted bbox. + Each bbox will be assigned with `0` or a positive integer + indicating the ground truth index. + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + topk (int): number of bbox selected in each level + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. 
Defaults to ``dict(type='BboxOverlaps2D')`` + """ + + def __init__(self, + topk: int, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D')): + assert topk >= 1 + self.topk = topk + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign(self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None, + alpha: int = 1, + beta: int = 6) -> AssignResult: + """Assign gt to bboxes. + + The assignment is done in following steps + + 1. compute alignment metric between all bbox (bbox of all pyramid + levels) and gt + 2. select top-k bbox as candidates for each gt + 3. limit the positive sample's center in gt (because the anchor-free + detector only can predict positive distance) + + + Args: + pred_instances (:obj:`InstaceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors, points, or bboxes predicted by the model, + shape(n, 4). + gt_instances (:obj:`InstaceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstaceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + alpha (int): Hyper-parameters related to alignment_metrics. + Defaults to 1. + beta (int): Hyper-parameters related to alignment_metrics. + Defaults to 6. + + Returns: + :obj:`TaskAlignedAssignResult`: The assign result. + """ + priors = pred_instances.priors + decode_bboxes = pred_instances.bboxes + pred_scores = pred_instances.scores + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + + priors = priors[:, :4] + num_gt, num_bboxes = gt_bboxes.size(0), priors.size(0) + # compute alignment metric between all bbox and gt + overlaps = self.iou_calculator(decode_bboxes, gt_bboxes).detach() + bbox_scores = pred_scores[:, gt_labels].detach() + # assign 0 by default + assigned_gt_inds = priors.new_full((num_bboxes, ), 0, dtype=torch.long) + assign_metrics = priors.new_zeros((num_bboxes, )) + + if num_gt == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + max_overlaps = priors.new_zeros((num_bboxes, )) + if num_gt == 0: + # No gt boxes, assign everything to background + assigned_gt_inds[:] = 0 + assigned_labels = priors.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assign_result = AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + assign_result.assign_metrics = assign_metrics + return assign_result + + # select top-k bboxes as candidates for each gt + alignment_metrics = bbox_scores**alpha * overlaps**beta + topk = min(self.topk, alignment_metrics.size(0)) + _, candidate_idxs = alignment_metrics.topk(topk, dim=0, largest=True) + candidate_metrics = alignment_metrics[candidate_idxs, + torch.arange(num_gt)] + is_pos = candidate_metrics > 0 + + # limit the positive sample's center in gt + priors_cx = (priors[:, 0] + priors[:, 2]) / 2.0 + priors_cy = (priors[:, 1] + priors[:, 3]) / 2.0 + for gt_idx in range(num_gt): + candidate_idxs[:, gt_idx] += gt_idx * num_bboxes + ep_priors_cx = priors_cx.view(1, -1).expand( + num_gt, num_bboxes).contiguous().view(-1) + ep_priors_cy = priors_cy.view(1, -1).expand( + num_gt, num_bboxes).contiguous().view(-1) + candidate_idxs = candidate_idxs.view(-1) + + # calculate the left, top, right, bottom distance between positive + # bbox center and gt side + l_ = ep_priors_cx[candidate_idxs].view(-1, num_gt) - 
gt_bboxes[:, 0] + t_ = ep_priors_cy[candidate_idxs].view(-1, num_gt) - gt_bboxes[:, 1] + r_ = gt_bboxes[:, 2] - ep_priors_cx[candidate_idxs].view(-1, num_gt) + b_ = gt_bboxes[:, 3] - ep_priors_cy[candidate_idxs].view(-1, num_gt) + is_in_gts = torch.stack([l_, t_, r_, b_], dim=1).min(dim=1)[0] > 0.01 + is_pos = is_pos & is_in_gts + + # if an anchor box is assigned to multiple gts, + # the one with the highest iou will be selected. + overlaps_inf = torch.full_like(overlaps, + -INF).t().contiguous().view(-1) + index = candidate_idxs.view(-1)[is_pos.view(-1)] + overlaps_inf[index] = overlaps.t().contiguous().view(-1)[index] + overlaps_inf = overlaps_inf.view(num_gt, -1).t() + + max_overlaps, argmax_overlaps = overlaps_inf.max(dim=1) + assigned_gt_inds[ + max_overlaps != -INF] = argmax_overlaps[max_overlaps != -INF] + 1 + assign_metrics[max_overlaps != -INF] = alignment_metrics[ + max_overlaps != -INF, argmax_overlaps[max_overlaps != -INF]] + + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + assign_result = AssignResult( + num_gt, assigned_gt_inds, max_overlaps, labels=assigned_labels) + assign_result.assign_metrics = assign_metrics + return assign_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..e48f092ac1ae99eadfdf7502b591b57c782e6354 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/topk_hungarian_assigner.py @@ -0,0 +1,182 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.structures import BaseDataElement +from scipy.optimize import linear_sum_assignment + +from mmdet.registry import TASK_UTILS +from .assign_result import AssignResult +from .task_aligned_assigner import TaskAlignedAssigner + + +@TASK_UTILS.register_module() +class TopkHungarianAssigner(TaskAlignedAssigner): + """Computes 1-to-k matching between ground truth and predictions. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of some components. + For DETR the costs are weighted sum of classification cost, regression L1 + cost and regression iou cost. The targets don't include the no_object, so + generally there are more predictions than targets. After the 1-to-k + gt-pred matching, the un-matched are treated as backgrounds. Thus each + query prediction will be assigned with `0` or a positive integer + indicating the ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + cls_cost (dict): Classification cost configuration. + reg_cost (dict): Regression L1 cost configuration. + iou_cost (dict): Regression iou cost configuration. 
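
Both the TaskAlignedAssigner above and the TopkHungarianAssigner defined here rank predictions by the task-alignment metric t = s**alpha * u**beta. A small standalone sketch (not part of the patch; scores and IoUs are made up) showing why a well-localised box can beat a higher-scoring one under the defaults alpha=1, beta=6:

    import torch

    pred_scores = torch.tensor([[0.9, 0.1],
                                [0.3, 0.8]])   # (num_preds, num_classes)
    overlaps = torch.tensor([[0.5],
                             [0.9]])           # IoU of each pred with one gt
    gt_labels = torch.tensor([0])              # the gt is class 0

    bbox_scores = pred_scores[:, gt_labels]    # score on the gt class
    alignment = bbox_scores**1 * overlaps**6   # s**alpha * u**beta
    print(alignment.squeeze(1))  # tensor([0.0141, 0.1594]) -> pred 1 wins
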
+ """ + + def __init__(self, + *args, + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + **kwargs): + super(TopkHungarianAssigner, self).__init__(*args, **kwargs) + + self.cls_cost = TASK_UTILS.build(cls_cost) + self.reg_cost = TASK_UTILS.build(reg_cost) + self.iou_cost = TASK_UTILS.build(iou_cost) + + def assign(self, + pred_scores, + decode_bboxes, + gt_bboxes, + gt_labels, + img_meta, + alpha=1, + beta=6, + **kwargs): + """Computes 1-to-k gt-pred matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. Assign every prediction to -1. + 2. Compute the weighted costs, each cost has shape (num_pred, num_gt). + 3. Update topk to be min(topk, int(num_pred / num_gt)), then repeat + costs topk times to shape: (num_pred, num_gt * topk), so that each + gt will match topk predictions. + 3. Do Hungarian matching on CPU based on the costs. + 4. Assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + 5. Calculate alignment metrics and overlaps of each matched pred-gt + pair. + + Args: + pred_scores (Tensor): Predicted normalized classification + scores for one image, has shape (num_dense_queries, + cls_out_channels). + decode_bboxes (Tensor): Predicted unnormalized bbox coordinates + for one image, has shape (num_dense_queries, 4) with the + last dimension arranged as (x1, y1, x2, y2). + gt_bboxes (Tensor): Unnormalized ground truth + bboxes for one image, has shape (num_gt, 4) with the + last dimension arranged as (x1, y1, x2, y2). + NOTE: num_gt is dynamic for each image. + gt_labels (Tensor): Ground truth classification + index for the image, has shape (num_gt,). + NOTE: num_gt is dynamic for each image. + img_meta (dict): Meta information for one image. + alpha (int): Hyper-parameters related to alignment_metrics. + Defaults to 1. + beta (int): Hyper-parameters related to alignment_metrics. + Defaults to 6. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + pred_scores = pred_scores.detach() + decode_bboxes = decode_bboxes.detach() + temp_overlaps = self.iou_calculator(decode_bboxes, gt_bboxes).detach() + bbox_scores = pred_scores[:, gt_labels].detach() + alignment_metrics = bbox_scores**alpha * temp_overlaps**beta + + pred_instances = BaseDataElement() + gt_instances = BaseDataElement() + + pred_instances.bboxes = decode_bboxes + gt_instances.bboxes = gt_bboxes + + pred_instances.scores = pred_scores + gt_instances.labels = gt_labels + + reg_cost = self.reg_cost(pred_instances, gt_instances, img_meta) + iou_cost = self.iou_cost(pred_instances, gt_instances, img_meta) + cls_cost = self.cls_cost(pred_instances, gt_instances, img_meta) + all_cost = cls_cost + reg_cost + iou_cost + + num_gt, num_bboxes = gt_bboxes.size(0), pred_scores.size(0) + if num_gt > 0: + # assign 0 by default + assigned_gt_inds = pred_scores.new_full((num_bboxes, ), + 0, + dtype=torch.long) + select_cost = all_cost + + topk = min(self.topk, int(len(select_cost) / num_gt)) + + # Repeat the ground truth `topk` times to perform 1-to-k gt-pred + # matching. 
For example, if `num_pred` = 900, `num_gt` = 3, then + # there are only 3 gt-pred pairs in sum for 1-1 matching. + # However, for 1-k gt-pred matching, if `topk` = 4, then each + # gt is assigned 4 unique predictions, so there would be 12 + # gt-pred pairs in sum. + repeat_select_cost = select_cost[..., + None].repeat(1, 1, topk).view( + select_cost.size(0), -1) + # anchor index and gt index + matched_row_inds, matched_col_inds = linear_sum_assignment( + repeat_select_cost.detach().cpu().numpy()) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + pred_scores.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + pred_scores.device) + + match_gt_ids = matched_col_inds // topk + candidate_idxs = matched_row_inds + + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + + if candidate_idxs.numel() > 0: + assigned_labels[candidate_idxs] = gt_labels[match_gt_ids] + else: + assigned_labels = None + + assigned_gt_inds[candidate_idxs] = match_gt_ids + 1 + + overlaps = self.iou_calculator( + decode_bboxes[candidate_idxs], + gt_bboxes[match_gt_ids], + is_aligned=True).detach() + + temp_pos_alignment_metrics = alignment_metrics[candidate_idxs] + pos_alignment_metrics = torch.gather(temp_pos_alignment_metrics, 1, + match_gt_ids[:, + None]).view(-1) + assign_result = AssignResult( + num_gt, assigned_gt_inds, overlaps, labels=assigned_labels) + + assign_result.assign_metrics = pos_alignment_metrics + return assign_result + else: + + assigned_gt_inds = pred_scores.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + assigned_labels = pred_scores.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + assigned_gt_inds[:] = 0 + return AssignResult( + 0, assigned_gt_inds, None, labels=assigned_labels) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/assigners/uniform_assigner.py b/head_extractor/build/lib/mmdet/models/task_modules/assigners/uniform_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..9a83bfd0b46a3690dce9cf0adf2c1e676f304d06 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/assigners/uniform_assigner.py @@ -0,0 +1,173 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_xyxy_to_cxcywh +from mmdet.utils import ConfigType +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + + +@TASK_UTILS.register_module() +class UniformAssigner(BaseAssigner): + """Uniform Matching between the priors and gt boxes, which can achieve + balance in positive priors, and gt_bboxes_ignore was not considered for + now. + + Args: + pos_ignore_thr (float): the threshold to ignore positive priors + neg_ignore_thr (float): the threshold to ignore negative priors + match_times(int): Number of positive priors for each gt box. + Defaults to 4. + iou_calculator (:obj:`ConfigDict` or dict): Config dict for iou + calculator. 
Defaults to ``dict(type='BboxOverlaps2D')`` + """ + + def __init__(self, + pos_ignore_thr: float, + neg_ignore_thr: float, + match_times: int = 4, + iou_calculator: ConfigType = dict(type='BboxOverlaps2D')): + self.match_times = match_times + self.pos_ignore_thr = pos_ignore_thr + self.neg_ignore_thr = neg_ignore_thr + self.iou_calculator = TASK_UTILS.build(iou_calculator) + + def assign( + self, + pred_instances: InstanceData, + gt_instances: InstanceData, + gt_instances_ignore: Optional[InstanceData] = None + ) -> AssignResult: + """Assign gt to priors. + + The assignment is done in following steps + + 1. assign -1 by default + 2. compute the L1 cost between boxes. Note that we use priors and + predict boxes both + 3. compute the ignore indexes use gt_bboxes and predict boxes + 4. compute the ignore indexes of positive sample use priors and + predict boxes + + + Args: + pred_instances (:obj:`InstaceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be priors, points, or bboxes predicted by the model, + shape(n, 4). + gt_instances (:obj:`InstaceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + gt_instances_ignore (:obj:`InstaceData`, optional): Instances + to be ignored during training. It includes ``bboxes`` + attribute data that is ignored during training and testing. + Defaults to None. + + Returns: + :obj:`AssignResult`: The assign result. + """ + + gt_bboxes = gt_instances.bboxes + gt_labels = gt_instances.labels + priors = pred_instances.priors + bbox_pred = pred_instances.decoder_priors + + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + 0, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + assign_result = AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + assign_result.set_extra_property( + 'pos_idx', bbox_pred.new_empty(0, dtype=torch.bool)) + assign_result.set_extra_property('pos_predicted_boxes', + bbox_pred.new_empty((0, 4))) + assign_result.set_extra_property('target_boxes', + bbox_pred.new_empty((0, 4))) + return assign_result + + # 2. Compute the L1 cost between boxes + # Note that we use priors and predict boxes both + cost_bbox = torch.cdist( + bbox_xyxy_to_cxcywh(bbox_pred), + bbox_xyxy_to_cxcywh(gt_bboxes), + p=1) + cost_bbox_priors = torch.cdist( + bbox_xyxy_to_cxcywh(priors), bbox_xyxy_to_cxcywh(gt_bboxes), p=1) + + # We found that topk function has different results in cpu and + # cuda mode. In order to ensure consistency with the source code, + # we also use cpu mode. + # TODO: Check whether the performance of cpu and cuda are the same. 
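
The top-k selection that follows is the whole "uniform" matching rule: every gt takes its `match_times` nearest boxes by L1 distance in (cx, cy, w, h) space, so positives are spread evenly across ground truths regardless of IoU. A standalone sketch (not part of the patch; the boxes are made up):

    import torch

    def xyxy_to_cxcywh(b):
        return torch.cat([(b[:, :2] + b[:, 2:]) / 2, b[:, 2:] - b[:, :2]], dim=1)

    preds = torch.tensor([[0., 0., 10., 10.],
                          [1., 1., 11., 11.],
                          [40., 40., 60., 60.],
                          [42., 41., 58., 61.]])
    gts = torch.tensor([[0., 0., 12., 12.],
                        [41., 41., 59., 59.]])

    cost = torch.cdist(xyxy_to_cxcywh(preds), xyxy_to_cxcywh(gts), p=1)
    index = torch.topk(cost, k=2, dim=0, largest=False)[1]   # match_times = 2
    print(index)  # gt 0 takes preds {1, 0}; gt 1 takes preds {2, 3}
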
+ C = cost_bbox.cpu() + C1 = cost_bbox_priors.cpu() + + # self.match_times x n + index = torch.topk( + C, # c=b,n,x c[i]=n,x + k=self.match_times, + dim=0, + largest=False)[1] + + # self.match_times x n + index1 = torch.topk(C1, k=self.match_times, dim=0, largest=False)[1] + # (self.match_times*2) x n + indexes = torch.cat((index, index1), + dim=1).reshape(-1).to(bbox_pred.device) + + pred_overlaps = self.iou_calculator(bbox_pred, gt_bboxes) + anchor_overlaps = self.iou_calculator(priors, gt_bboxes) + pred_max_overlaps, _ = pred_overlaps.max(dim=1) + anchor_max_overlaps, _ = anchor_overlaps.max(dim=0) + + # 3. Compute the ignore indexes use gt_bboxes and predict boxes + ignore_idx = pred_max_overlaps > self.neg_ignore_thr + assigned_gt_inds[ignore_idx] = -1 + + # 4. Compute the ignore indexes of positive sample use priors + # and predict boxes + pos_gt_index = torch.arange( + 0, C1.size(1), + device=bbox_pred.device).repeat(self.match_times * 2) + pos_ious = anchor_overlaps[indexes, pos_gt_index] + pos_ignore_idx = pos_ious < self.pos_ignore_thr + + pos_gt_index_with_ignore = pos_gt_index + 1 + pos_gt_index_with_ignore[pos_ignore_idx] = -1 + assigned_gt_inds[indexes] = pos_gt_index_with_ignore + + if gt_labels is not None: + assigned_labels = assigned_gt_inds.new_full((num_bboxes, ), -1) + pos_inds = torch.nonzero( + assigned_gt_inds > 0, as_tuple=False).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[ + assigned_gt_inds[pos_inds] - 1] + else: + assigned_labels = None + + assign_result = AssignResult( + num_gts, + assigned_gt_inds, + anchor_max_overlaps, + labels=assigned_labels) + assign_result.set_extra_property('pos_idx', ~pos_ignore_idx) + assign_result.set_extra_property('pos_predicted_boxes', + bbox_pred[indexes]) + assign_result.set_extra_property('target_boxes', + gt_bboxes[pos_gt_index]) + return assign_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/builder.py b/head_extractor/build/lib/mmdet/models/task_modules/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..6736049fef688e0d663d6195c79ec9688dc4c5d7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/builder.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
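
The builder module that follows keeps only backwards-compatible shims; every helper forwards to the same registry. A hedged usage sketch (not part of the patch; the config values are illustrative, while `TASK_UTILS` and `SimOTAAssigner` are defined elsewhere in this patch):

    from mmdet.registry import TASK_UTILS

    assigner_cfg = dict(type='SimOTAAssigner', center_radius=2.5)

    # old style, via the shims below (emits a deprecation warning):
    #   assigner = build_assigner(assigner_cfg)
    # new style: the registry resolves `type` to the registered class
    assigner = TASK_UTILS.build(assigner_cfg)
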
+import warnings
+
+from mmdet.registry import TASK_UTILS
+
+PRIOR_GENERATORS = TASK_UTILS
+ANCHOR_GENERATORS = TASK_UTILS
+BBOX_ASSIGNERS = TASK_UTILS
+BBOX_SAMPLERS = TASK_UTILS
+BBOX_CODERS = TASK_UTILS
+MATCH_COSTS = TASK_UTILS
+IOU_CALCULATORS = TASK_UTILS
+
+
+def build_bbox_coder(cfg, **default_args):
+    """Builder of box coder."""
+    warnings.warn('``build_bbox_coder`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_iou_calculator(cfg, default_args=None):
+    """Builder of IoU calculator."""
+    warnings.warn(
+        '``build_iou_calculator`` would be deprecated soon, please use '
+        '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_match_cost(cfg, default_args=None):
+    """Builder of matching cost."""
+    warnings.warn('``build_match_cost`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_assigner(cfg, **default_args):
+    """Builder of box assigner."""
+    warnings.warn('``build_assigner`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_sampler(cfg, **default_args):
+    """Builder of box sampler."""
+    warnings.warn('``build_sampler`` would be deprecated soon, please use '
+                  '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_prior_generator(cfg, default_args=None):
+    warnings.warn(
+        '``build_prior_generator`` would be deprecated soon, please use '
+        '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
+
+
+def build_anchor_generator(cfg, default_args=None):
+    warnings.warn(
+        '``build_anchor_generator`` would be deprecated soon, please use '
+        '``mmdet.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/__init__.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97c3982140021958dabdd03f8040519f946250ff
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_bbox_coder import BaseBBoxCoder +from .bucketing_bbox_coder import BucketingBBoxCoder +from .delta_xywh_bbox_coder import (DeltaXYWHBBoxCoder, + DeltaXYWHBBoxCoderForGLIP) +from .distance_point_bbox_coder import DistancePointBBoxCoder +from .legacy_delta_xywh_bbox_coder import LegacyDeltaXYWHBBoxCoder +from .pseudo_bbox_coder import PseudoBBoxCoder +from .tblr_bbox_coder import TBLRBBoxCoder +from .yolo_bbox_coder import YOLOBBoxCoder + +__all__ = [ + 'BaseBBoxCoder', 'PseudoBBoxCoder', 'DeltaXYWHBBoxCoder', + 'LegacyDeltaXYWHBBoxCoder', 'TBLRBBoxCoder', 'YOLOBBoxCoder', + 'BucketingBBoxCoder', 'DistancePointBBoxCoder', 'DeltaXYWHBBoxCoderForGLIP' +] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/base_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/base_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..806d2651869e02173578c9eb331758743a068dd9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/base_bbox_coder.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + + +class BaseBBoxCoder(metaclass=ABCMeta): + """Base bounding box coder. + + Args: + use_box_type (bool): Whether to warp decoded boxes with the + box type data structure. Defaults to False. + """ + + # The size of the last of dimension of the encoded tensor. + encode_size = 4 + + def __init__(self, use_box_type: bool = False, **kwargs): + self.use_box_type = use_box_type + + @abstractmethod + def encode(self, bboxes, gt_bboxes): + """Encode deltas between bboxes and ground truth boxes.""" + + @abstractmethod + def decode(self, bboxes, bboxes_pred): + """Decode the predicted bboxes according to prediction and base + boxes.""" diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/bucketing_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/bucketing_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..4044e1cd91d619521606f3c03032a40a9fc27130 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/bucketing_bbox_coder.py @@ -0,0 +1,366 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import (BaseBoxes, HorizontalBoxes, bbox_rescale, + get_box_tensor) +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class BucketingBBoxCoder(BaseBBoxCoder): + """Bucketing BBox Coder for Side-Aware Boundary Localization (SABL). + + Boundary Localization with Bucketing and Bucketing Guided Rescoring + are implemented here. + + Please refer to https://arxiv.org/abs/1912.04260 for more details. + + Args: + num_buckets (int): Number of buckets. + scale_factor (int): Scale factor of proposals to generate buckets. + offset_topk (int): Topk buckets are used to generate + bucket fine regression targets. Defaults to 2. + offset_upperbound (float): Offset upperbound to generate + bucket fine regression targets. + To avoid too large offset displacements. Defaults to 1.0. + cls_ignore_neighbor (bool): Ignore second nearest bucket or Not. + Defaults to True. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. 
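
The `BaseBBoxCoder` interface introduced above fixes the contract that BucketingBBoxCoder here and the other coders implement: `encode` turns (boxes, gt boxes) into regression targets and `decode` inverts predictions back into boxes. A trivial standalone sketch (not part of the patch; the import path assumes this build tree):

    import torch
    from mmdet.models.task_modules.coders import BaseBBoxCoder

    class IdentityCoder(BaseBBoxCoder):
        # trivially satisfies the abstract encode/decode pair
        encode_size = 4

        def encode(self, bboxes, gt_bboxes):
            # regression target is the gt box itself
            return gt_bboxes

        def decode(self, bboxes, bboxes_pred):
            # predictions are already absolute (x1, y1, x2, y2) boxes
            return bboxes_pred

    coder = IdentityCoder()
    gt = torch.tensor([[0., 0., 10., 10.]])
    assert torch.equal(coder.decode(gt, coder.encode(gt, gt)), gt)
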
+ """ + + def __init__(self, + num_buckets: int, + scale_factor: int, + offset_topk: int = 2, + offset_upperbound: float = 1.0, + cls_ignore_neighbor: bool = True, + clip_border: bool = True, + **kwargs) -> None: + super().__init__(**kwargs) + self.num_buckets = num_buckets + self.scale_factor = scale_factor + self.offset_topk = offset_topk + self.offset_upperbound = offset_upperbound + self.cls_ignore_neighbor = cls_ignore_neighbor + self.clip_border = clip_border + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tuple[Tensor]: + """Get bucketing estimation and fine regression targets during + training. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): target of the + transformation, e.g., ground truth boxes. + + Returns: + encoded_bboxes(tuple[Tensor]): bucketing estimation + and fine regression targets and weights + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2bucket(bboxes, gt_bboxes, self.num_buckets, + self.scale_factor, self.offset_topk, + self.offset_upperbound, + self.cls_ignore_neighbor) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Tuple[int]] = None + ) -> Tuple[Union[Tensor, BaseBoxes], Tensor]: + """Apply transformation `pred_bboxes` to `boxes`. + Args: + boxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. + pred_bboxes (torch.Tensor): Predictions for bucketing estimation + and fine regression + max_shape (tuple[int], optional): Maximum shape of boxes. + Defaults to None. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. + """ + bboxes = get_box_tensor(bboxes) + assert len(pred_bboxes) == 2 + cls_preds, offset_preds = pred_bboxes + assert cls_preds.size(0) == bboxes.size(0) and offset_preds.size( + 0) == bboxes.size(0) + bboxes, loc_confidence = bucket2bbox(bboxes, cls_preds, offset_preds, + self.num_buckets, + self.scale_factor, max_shape, + self.clip_border) + if self.use_box_type: + bboxes = HorizontalBoxes(bboxes, clone=False) + return bboxes, loc_confidence + + +def generat_buckets(proposals: Tensor, + num_buckets: int, + scale_factor: float = 1.0) -> Tuple[Tensor]: + """Generate buckets w.r.t bucket number and scale factor of proposals. + + Args: + proposals (Tensor): Shape (n, 4) + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + + Returns: + tuple[Tensor]: (bucket_w, bucket_h, l_buckets, r_buckets, + t_buckets, d_buckets) + + - bucket_w: Width of buckets on x-axis. Shape (n, ). + - bucket_h: Height of buckets on y-axis. Shape (n, ). + - l_buckets: Left buckets. Shape (n, ceil(side_num/2)). + - r_buckets: Right buckets. Shape (n, ceil(side_num/2)). + - t_buckets: Top buckets. Shape (n, ceil(side_num/2)). + - d_buckets: Down buckets. Shape (n, ceil(side_num/2)). 
+ """ + proposals = bbox_rescale(proposals, scale_factor) + + # number of buckets in each side + side_num = int(np.ceil(num_buckets / 2.0)) + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + px1 = proposals[..., 0] + py1 = proposals[..., 1] + px2 = proposals[..., 2] + py2 = proposals[..., 3] + + bucket_w = pw / num_buckets + bucket_h = ph / num_buckets + + # left buckets + l_buckets = px1[:, None] + (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_w[:, None] + # right buckets + r_buckets = px2[:, None] - (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_w[:, None] + # top buckets + t_buckets = py1[:, None] + (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_h[:, None] + # down buckets + d_buckets = py2[:, None] - (0.5 + torch.arange( + 0, side_num).to(proposals).float())[None, :] * bucket_h[:, None] + return bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, d_buckets + + +def bbox2bucket(proposals: Tensor, + gt: Tensor, + num_buckets: int, + scale_factor: float, + offset_topk: int = 2, + offset_upperbound: float = 1.0, + cls_ignore_neighbor: bool = True) -> Tuple[Tensor]: + """Generate buckets estimation and fine regression targets. + + Args: + proposals (Tensor): Shape (n, 4) + gt (Tensor): Shape (n, 4) + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + offset_topk (int): Topk buckets are used to generate + bucket fine regression targets. Defaults to 2. + offset_upperbound (float): Offset allowance to generate + bucket fine regression targets. + To avoid too large offset displacements. Defaults to 1.0. + cls_ignore_neighbor (bool): Ignore second nearest bucket or Not. + Defaults to True. + + Returns: + tuple[Tensor]: (offsets, offsets_weights, bucket_labels, cls_weights). + + - offsets: Fine regression targets. \ + Shape (n, num_buckets*2). + - offsets_weights: Fine regression weights. \ + Shape (n, num_buckets*2). + - bucket_labels: Bucketing estimation labels. \ + Shape (n, num_buckets*2). + - cls_weights: Bucketing estimation weights. \ + Shape (n, num_buckets*2). 
+ """ + assert proposals.size() == gt.size() + + # generate buckets + proposals = proposals.float() + gt = gt.float() + (bucket_w, bucket_h, l_buckets, r_buckets, t_buckets, + d_buckets) = generat_buckets(proposals, num_buckets, scale_factor) + + gx1 = gt[..., 0] + gy1 = gt[..., 1] + gx2 = gt[..., 2] + gy2 = gt[..., 3] + + # generate offset targets and weights + # offsets from buckets to gts + l_offsets = (l_buckets - gx1[:, None]) / bucket_w[:, None] + r_offsets = (r_buckets - gx2[:, None]) / bucket_w[:, None] + t_offsets = (t_buckets - gy1[:, None]) / bucket_h[:, None] + d_offsets = (d_buckets - gy2[:, None]) / bucket_h[:, None] + + # select top-k nearest buckets + l_topk, l_label = l_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + r_topk, r_label = r_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + t_topk, t_label = t_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + d_topk, d_label = d_offsets.abs().topk( + offset_topk, dim=1, largest=False, sorted=True) + + offset_l_weights = l_offsets.new_zeros(l_offsets.size()) + offset_r_weights = r_offsets.new_zeros(r_offsets.size()) + offset_t_weights = t_offsets.new_zeros(t_offsets.size()) + offset_d_weights = d_offsets.new_zeros(d_offsets.size()) + inds = torch.arange(0, proposals.size(0)).to(proposals).long() + + # generate offset weights of top-k nearest buckets + for k in range(offset_topk): + if k >= 1: + offset_l_weights[inds, l_label[:, + k]] = (l_topk[:, k] < + offset_upperbound).float() + offset_r_weights[inds, r_label[:, + k]] = (r_topk[:, k] < + offset_upperbound).float() + offset_t_weights[inds, t_label[:, + k]] = (t_topk[:, k] < + offset_upperbound).float() + offset_d_weights[inds, d_label[:, + k]] = (d_topk[:, k] < + offset_upperbound).float() + else: + offset_l_weights[inds, l_label[:, k]] = 1.0 + offset_r_weights[inds, r_label[:, k]] = 1.0 + offset_t_weights[inds, t_label[:, k]] = 1.0 + offset_d_weights[inds, d_label[:, k]] = 1.0 + + offsets = torch.cat([l_offsets, r_offsets, t_offsets, d_offsets], dim=-1) + offsets_weights = torch.cat([ + offset_l_weights, offset_r_weights, offset_t_weights, offset_d_weights + ], + dim=-1) + + # generate bucket labels and weight + side_num = int(np.ceil(num_buckets / 2.0)) + labels = torch.stack( + [l_label[:, 0], r_label[:, 0], t_label[:, 0], d_label[:, 0]], dim=-1) + + batch_size = labels.size(0) + bucket_labels = F.one_hot(labels.view(-1), side_num).view(batch_size, + -1).float() + bucket_cls_l_weights = (l_offsets.abs() < 1).float() + bucket_cls_r_weights = (r_offsets.abs() < 1).float() + bucket_cls_t_weights = (t_offsets.abs() < 1).float() + bucket_cls_d_weights = (d_offsets.abs() < 1).float() + bucket_cls_weights = torch.cat([ + bucket_cls_l_weights, bucket_cls_r_weights, bucket_cls_t_weights, + bucket_cls_d_weights + ], + dim=-1) + # ignore second nearest buckets for cls if necessary + if cls_ignore_neighbor: + bucket_cls_weights = (~((bucket_cls_weights == 1) & + (bucket_labels == 0))).float() + else: + bucket_cls_weights[:] = 1.0 + return offsets, offsets_weights, bucket_labels, bucket_cls_weights + + +def bucket2bbox(proposals: Tensor, + cls_preds: Tensor, + offset_preds: Tensor, + num_buckets: int, + scale_factor: float = 1.0, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + clip_border: bool = True) -> Tuple[Tensor]: + """Apply bucketing estimation (cls preds) and fine regression (offset + preds) to generate det bboxes. + + Args: + proposals (Tensor): Boxes to be transformed. 
Shape (n, 4) + cls_preds (Tensor): bucketing estimation. Shape (n, num_buckets*2). + offset_preds (Tensor): fine regression. Shape (n, num_buckets*2). + num_buckets (int): Number of buckets. + scale_factor (float): Scale factor to rescale proposals. + max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W) + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + + Returns: + tuple[Tensor]: (bboxes, loc_confidence). + + - bboxes: predicted bboxes. Shape (n, 4) + - loc_confidence: localization confidence of predicted bboxes. + Shape (n,). + """ + + side_num = int(np.ceil(num_buckets / 2.0)) + cls_preds = cls_preds.view(-1, side_num) + offset_preds = offset_preds.view(-1, side_num) + + scores = F.softmax(cls_preds, dim=1) + score_topk, score_label = scores.topk(2, dim=1, largest=True, sorted=True) + + rescaled_proposals = bbox_rescale(proposals, scale_factor) + + pw = rescaled_proposals[..., 2] - rescaled_proposals[..., 0] + ph = rescaled_proposals[..., 3] - rescaled_proposals[..., 1] + px1 = rescaled_proposals[..., 0] + py1 = rescaled_proposals[..., 1] + px2 = rescaled_proposals[..., 2] + py2 = rescaled_proposals[..., 3] + + bucket_w = pw / num_buckets + bucket_h = ph / num_buckets + + score_inds_l = score_label[0::4, 0] + score_inds_r = score_label[1::4, 0] + score_inds_t = score_label[2::4, 0] + score_inds_d = score_label[3::4, 0] + l_buckets = px1 + (0.5 + score_inds_l.float()) * bucket_w + r_buckets = px2 - (0.5 + score_inds_r.float()) * bucket_w + t_buckets = py1 + (0.5 + score_inds_t.float()) * bucket_h + d_buckets = py2 - (0.5 + score_inds_d.float()) * bucket_h + + offsets = offset_preds.view(-1, 4, side_num) + inds = torch.arange(proposals.size(0)).to(proposals).long() + l_offsets = offsets[:, 0, :][inds, score_inds_l] + r_offsets = offsets[:, 1, :][inds, score_inds_r] + t_offsets = offsets[:, 2, :][inds, score_inds_t] + d_offsets = offsets[:, 3, :][inds, score_inds_d] + + x1 = l_buckets - l_offsets * bucket_w + x2 = r_buckets - r_offsets * bucket_w + y1 = t_buckets - t_offsets * bucket_h + y2 = d_buckets - d_offsets * bucket_h + + if clip_border and max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1] - 1) + y1 = y1.clamp(min=0, max=max_shape[0] - 1) + x2 = x2.clamp(min=0, max=max_shape[1] - 1) + y2 = y2.clamp(min=0, max=max_shape[0] - 1) + bboxes = torch.cat([x1[:, None], y1[:, None], x2[:, None], y2[:, None]], + dim=-1) + + # bucketing guided rescoring + loc_confidence = score_topk[:, 0] + top2_neighbor_inds = (score_label[:, 0] - score_label[:, 1]).abs() == 1 + loc_confidence += score_topk[:, 1] * top2_neighbor_inds.float() + loc_confidence = loc_confidence.view(-1, 4).mean(dim=1) + + return bboxes, loc_confidence diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..c2b60b5ee791e05ce4f5f8d8e1876f7f61e964ed --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/delta_xywh_bbox_coder.py @@ -0,0 +1,579 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings +from typing import Optional, Sequence, Union + +import numpy as np +import torch +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor +from .base_bbox_coder import BaseBBoxCoder + + +@TASK_UTILS.register_module() +class DeltaXYWHBBoxCoder(BaseBBoxCoder): + """Delta XYWH BBox coder. + + Following the practice in `R-CNN `_, + this coder encodes bbox (x1, y1, x2, y2) into delta (dx, dy, dw, dh) and + decodes delta (dx, dy, dw, dh) back to original bbox (x1, y1, x2, y2). + + Args: + target_means (Sequence[float]): Denormalizing means of target for + delta coordinates + target_stds (Sequence[float]): Denormalizing standard deviation of + target for delta coordinates + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + add_ctr_clamp (bool): Whether to add center clamp, when added, the + predicted box is clamped is its center is too far away from + the original anchor's center. Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + """ + + def __init__(self, + target_means: Sequence[float] = (0., 0., 0., 0.), + target_stds: Sequence[float] = (1., 1., 1., 1.), + clip_border: bool = True, + add_ctr_clamp: bool = False, + ctr_clamp: int = 32, + **kwargs) -> None: + super().__init__(**kwargs) + self.means = target_means + self.stds = target_stds + self.clip_border = clip_border + self.add_ctr_clamp = add_ctr_clamp + self.ctr_clamp = ctr_clamp + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the + transformation, e.g., ground-truth boxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: Optional[float] = 16 / 1000 + ) -> Union[Tensor, BaseBoxes]: + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape + (B, N, 4) or (N, 4) + pred_bboxes (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. 
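
Since `encode` (via bbox2delta) and `decode` (via delta2bbox) are exact inverses whenever no clipping or ratio clamping kicks in, a quick roundtrip check is a useful sanity test. A standalone sketch (not part of the patch; the import path assumes this build tree, the stds are the common Faster R-CNN values):

    import torch
    from mmdet.models.task_modules.coders import DeltaXYWHBBoxCoder

    coder = DeltaXYWHBBoxCoder(target_means=(0., 0., 0., 0.),
                               target_stds=(0.1, 0.1, 0.2, 0.2))
    proposals = torch.tensor([[10., 10., 50., 50.]])
    gts = torch.tensor([[12., 8., 48., 60.]])

    deltas = coder.encode(proposals, gts)           # normalised (dx, dy, dw, dh)
    decoded = coder.decode(proposals, deltas)       # max_shape=None -> no clipping
    print(torch.allclose(decoded, gts, atol=1e-4))  # True
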
+ """ + bboxes = get_box_tensor(bboxes) + assert pred_bboxes.size(0) == bboxes.size(0) + if pred_bboxes.ndim == 3: + assert pred_bboxes.size(1) == bboxes.size(1) + + if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export(): + # single image decode + decoded_bboxes = delta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, wh_ratio_clip, + self.clip_border, self.add_ctr_clamp, + self.ctr_clamp) + else: + if pred_bboxes.ndim == 3 and not torch.onnx.is_in_onnx_export(): + warnings.warn( + 'DeprecationWarning: onnx_delta2bbox is deprecated ' + 'in the case of batch decoding and non-ONNX, ' + 'please use “delta2bbox” instead. In order to improve ' + 'the decoding speed, the batch function will no ' + 'longer be supported. ') + decoded_bboxes = onnx_delta2bbox(bboxes, pred_bboxes, self.means, + self.stds, max_shape, + wh_ratio_clip, self.clip_border, + self.add_ctr_clamp, + self.ctr_clamp) + + if self.use_box_type: + assert decoded_bboxes.size(-1) == 4, \ + ('Cannot warp decoded boxes with box type when decoded boxes' + 'have shape of (N, num_classes * 4)') + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes + + +@TASK_UTILS.register_module() +class DeltaXYWHBBoxCoderForGLIP(DeltaXYWHBBoxCoder): + """This is designed specifically for the GLIP algorithm. + + In order to completely match the official performance, we need to perform + special calculations in the encoding and decoding processes, such as + additional +1 and -1 calculations. However, this is not a user-friendly + design. + """ + + def encode(self, bboxes: Union[Tensor, BaseBoxes], + gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get box regression transformation deltas that can be used to + transform the ``bboxes`` into the ``gt_bboxes``. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes, + e.g., object proposals. + gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the + transformation, e.g., ground-truth boxes. + + Returns: + torch.Tensor: Box transformation deltas + """ + bboxes = get_box_tensor(bboxes) + gt_bboxes = get_box_tensor(gt_bboxes) + assert bboxes.size(0) == gt_bboxes.size(0) + assert bboxes.size(-1) == gt_bboxes.size(-1) == 4 + encoded_bboxes = bbox2delta(bboxes, gt_bboxes, self.means, self.stds) + return encoded_bboxes + + def decode( + self, + bboxes: Union[Tensor, BaseBoxes], + pred_bboxes: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: Optional[float] = 16 / 1000 + ) -> Union[Tensor, BaseBoxes]: + """Apply transformation `pred_bboxes` to `boxes`. + + Args: + bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape + (B, N, 4) or (N, 4) + pred_bboxes (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + wh_ratio_clip (float, optional): The allowed ratio between + width and height. + + Returns: + Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes. 
+ """ + bboxes = get_box_tensor(bboxes) + assert pred_bboxes.size(0) == bboxes.size(0) + if pred_bboxes.ndim == 3: + assert pred_bboxes.size(1) == bboxes.size(1) + + if pred_bboxes.ndim == 2 and not torch.onnx.is_in_onnx_export(): + # single image decode + decoded_bboxes = delta2bbox_glip(bboxes, pred_bboxes, self.means, + self.stds, max_shape, + wh_ratio_clip, self.clip_border, + self.add_ctr_clamp, + self.ctr_clamp) + else: + raise NotImplementedError() + + if self.use_box_type: + assert decoded_bboxes.size(-1) == 4, \ + ('Cannot warp decoded boxes with box type when decoded boxes' + 'have shape of (N, num_classes * 4)') + decoded_bboxes = HorizontalBoxes(decoded_bboxes) + return decoded_bboxes + + +def bbox2delta( + proposals: Tensor, + gt: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.) +) -> Tensor: + """Compute deltas of proposals w.r.t. gt. + + We usually compute the deltas of x, y, w, h of proposals w.r.t ground + truth bboxes to get regression target. + This is the inverse function of :func:`delta2bbox`. + + Args: + proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) + gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + + Returns: + Tensor: deltas with shape (N, 4), where columns represent dx, dy, + dw, dh. + """ + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + ph = proposals[..., 3] - proposals[..., 1] + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + gh = gt[..., 3] - gt[..., 1] + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def delta2bbox(rois: Tensor, + deltas: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.), + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: float = 16 / 1000, + clip_border: bool = True, + add_ctr_clamp: bool = False, + ctr_clamp: int = 32) -> Tensor: + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4). + deltas (Tensor): Encoded offsets relative to each roi. + Has shape (N, num_classes * 4) or (N, 4). Note + N = num_base_anchors * W * H, when rois is a grid of + anchors. Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates. + Default (0., 0., 0., 0.). + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates. Default (1., 1., 1., 1.). + max_shape (tuple[int, int]): Maximum bounds for boxes, specifies + (H, W). Default None. + wh_ratio_clip (float): Maximum aspect ratio for boxes. Default + 16 / 1000. + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Default True. 
+ add_ctr_clamp (bool): Whether to add center clamp. When set to True, + the center of the prediction bounding box will be clamped to + avoid being too far away from the center of the anchor. + Only used by YOLOF. Default False. + ctr_clamp (int): the maximum pixel shift to clamp. Only used by YOLOF. + Default 32. + + Returns: + Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4 + represent tl_x, tl_y, br_x, br_y. + + References: + .. [1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3)) + tensor([[0.0000, 0.0000, 1.0000, 1.0000], + [0.1409, 0.1409, 2.8591, 2.8591], + [0.0000, 0.3161, 4.1945, 0.6839], + [5.0000, 5.0000, 5.0000, 5.0000]]) + """ + num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4 + if num_bboxes == 0: + return deltas + + deltas = deltas.reshape(-1, 4) + + means = deltas.new_tensor(means).view(1, -1) + stds = deltas.new_tensor(stds).view(1, -1) + denorm_deltas = deltas * stds + means + + dxy = denorm_deltas[:, :2] + dwh = denorm_deltas[:, 2:] + + # Compute width/height of each roi + rois_ = rois.repeat(1, num_classes).reshape(-1, 4) + pxy = ((rois_[:, :2] + rois_[:, 2:]) * 0.5) + pwh = (rois_[:, 2:] - rois_[:, :2]) + + dxy_wh = pwh * dxy + + max_ratio = np.abs(np.log(wh_ratio_clip)) + if add_ctr_clamp: + dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp) + dwh = torch.clamp(dwh, max=max_ratio) + else: + dwh = dwh.clamp(min=-max_ratio, max=max_ratio) + + gxy = pxy + dxy_wh + gwh = pwh * dwh.exp() + x1y1 = gxy - (gwh * 0.5) + x2y2 = gxy + (gwh * 0.5) + bboxes = torch.cat([x1y1, x2y2], dim=-1) + if clip_border and max_shape is not None: + bboxes[..., 0::2].clamp_(min=0, max=max_shape[1]) + bboxes[..., 1::2].clamp_(min=0, max=max_shape[0]) + bboxes = bboxes.reshape(num_bboxes, -1) + return bboxes + + +def onnx_delta2bbox(rois: Tensor, + deltas: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.), + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: float = 16 / 1000, + clip_border: Optional[bool] = True, + add_ctr_clamp: bool = False, + ctr_clamp: int = 32) -> Tensor: + """Apply deltas to shift/scale base boxes. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of :func:`bbox2delta`. + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4) or (B, N, 4) + deltas (Tensor): Encoded offsets with respect to each roi. + Has shape (B, N, num_classes * 4) or (B, N, 4) or + (N, num_classes * 4) or (N, 4). Note N = num_anchors * W * H + when rois is a grid of anchors.Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates. + Default (0., 0., 0., 0.). + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates. Default (1., 1., 1., 1.). + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If rois shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. Default None. + wh_ratio_clip (float): Maximum aspect ratio for boxes. 
+            Default 16 / 1000.
+        clip_border (bool, optional): Whether to clip the objects outside the
+            border of the image. Default True.
+        add_ctr_clamp (bool): Whether to add center clamp. When added, the
+            predicted box is clamped if its center is too far away from
+            the original anchor's center. Only used by YOLOF. Default False.
+        ctr_clamp (int): The maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+
+    Returns:
+        Tensor: Boxes with shape (B, N, num_classes * 4) or (B, N, 4) or
+            (N, num_classes * 4) or (N, 4), where 4 represent
+            tl_x, tl_y, br_x, br_y.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Example:
+        >>> rois = torch.Tensor([[ 0., 0., 1., 1.],
+        >>>                      [ 0., 0., 1., 1.],
+        >>>                      [ 0., 0., 1., 1.],
+        >>>                      [ 5., 5., 5., 5.]])
+        >>> deltas = torch.Tensor([[ 0., 0., 0., 0.],
+        >>>                        [ 1., 1., 1., 1.],
+        >>>                        [ 0., 0., 2., -1.],
+        >>>                        [ 0.7, -1.9, -0.5, 0.3]])
+        >>> delta2bbox(rois, deltas, max_shape=(32, 32, 3))
+        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
+                [0.1409, 0.1409, 2.8591, 2.8591],
+                [0.0000, 0.3161, 4.1945, 0.6839],
+                [5.0000, 5.0000, 5.0000, 5.0000]])
+    """
+    means = deltas.new_tensor(means).view(1,
+                                          -1).repeat(1,
+                                                     deltas.size(-1) // 4)
+    stds = deltas.new_tensor(stds).view(1, -1).repeat(1, deltas.size(-1) // 4)
+    denorm_deltas = deltas * stds + means
+    dx = denorm_deltas[..., 0::4]
+    dy = denorm_deltas[..., 1::4]
+    dw = denorm_deltas[..., 2::4]
+    dh = denorm_deltas[..., 3::4]
+
+    x1, y1 = rois[..., 0], rois[..., 1]
+    x2, y2 = rois[..., 2], rois[..., 3]
+    # Compute center of each roi
+    px = ((x1 + x2) * 0.5).unsqueeze(-1).expand_as(dx)
+    py = ((y1 + y2) * 0.5).unsqueeze(-1).expand_as(dy)
+    # Compute width/height of each roi
+    pw = (x2 - x1).unsqueeze(-1).expand_as(dw)
+    ph = (y2 - y1).unsqueeze(-1).expand_as(dh)
+
+    dx_width = pw * dx
+    dy_height = ph * dy
+
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dx_width = torch.clamp(dx_width, max=ctr_clamp, min=-ctr_clamp)
+        dy_height = torch.clamp(dy_height, max=ctr_clamp, min=-ctr_clamp)
+        dw = torch.clamp(dw, max=max_ratio)
+        dh = torch.clamp(dh, max=max_ratio)
+    else:
+        dw = dw.clamp(min=-max_ratio, max=max_ratio)
+        dh = dh.clamp(min=-max_ratio, max=max_ratio)
+    # Use exp(network energy) to enlarge/shrink each roi
+    gw = pw * dw.exp()
+    gh = ph * dh.exp()
+    # Use network energy to shift the center of each roi
+    gx = px + dx_width
+    gy = py + dy_height
+    # Convert center-xy/width/height to top-left, bottom-right
+    x1 = gx - gw * 0.5
+    y1 = gy - gh * 0.5
+    x2 = gx + gw * 0.5
+    y2 = gy + gh * 0.5
+
+    bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size())
+
+    if clip_border and max_shape is not None:
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            from mmdet.core.export import dynamic_clip_for_onnx
+            x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape)
+            bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view(deltas.size())
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = x1.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(x1)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+
+        min_xy = x1.new_tensor(0)
+        max_xy = torch.cat(
+            [max_shape] * (deltas.size(-1) // 2),
+            dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
+
+
+def delta2bbox_glip(rois: Tensor,
+                    deltas: Tensor,
+                    means: Sequence[float] = (0., 0., 0., 0.),
+                    stds: Sequence[float] = (1., 1., 1., 1.),
+                    max_shape: Optional[Union[Sequence[int], Tensor,
+                                              Sequence[Sequence[int]]]] = None,
+                    wh_ratio_clip: float = 16 / 1000,
+                    clip_border: bool = True,
+                    add_ctr_clamp: bool = False,
+                    ctr_clamp: int = 32) -> Tensor:
+    """Apply deltas to shift/scale base boxes.
+
+    Typically the rois are anchor or proposed bounding boxes and the deltas
+    are network outputs used to shift/scale those boxes.
+    This is the inverse function of :func:`bbox2delta`.
+
+    Args:
+        rois (Tensor): Boxes to be transformed. Has shape (N, 4).
+        deltas (Tensor): Encoded offsets relative to each roi.
+            Has shape (N, num_classes * 4) or (N, 4). Note
+            N = num_base_anchors * W * H, when rois is a grid of
+            anchors. Offset encoding follows [1]_.
+        means (Sequence[float]): Denormalizing means for delta coordinates.
+            Default (0., 0., 0., 0.).
+        stds (Sequence[float]): Denormalizing standard deviation for delta
+            coordinates. Default (1., 1., 1., 1.).
+        max_shape (tuple[int, int]): Maximum bounds for boxes, specifies
+            (H, W). Default None.
+        wh_ratio_clip (float): Maximum aspect ratio for boxes. Default
+            16 / 1000.
+        clip_border (bool, optional): Whether to clip the objects outside the
+            border of the image. Default True.
+        add_ctr_clamp (bool): Whether to add center clamp. When set to True,
+            the center of the prediction bounding box will be clamped to
+            avoid being too far away from the center of the anchor.
+            Only used by YOLOF. Default False.
+        ctr_clamp (int): The maximum pixel shift to clamp. Only used by YOLOF.
+            Default 32.
+
+    Returns:
+        Tensor: Boxes with shape (N, num_classes * 4) or (N, 4), where 4
+            represent tl_x, tl_y, br_x, br_y.
+    """
+    num_bboxes, num_classes = deltas.size(0), deltas.size(1) // 4
+    if num_bboxes == 0:
+        return deltas
+
+    deltas = deltas.reshape(-1, 4)
+
+    means = deltas.new_tensor(means).view(1, -1)
+    stds = deltas.new_tensor(stds).view(1, -1)
+    denorm_deltas = deltas * stds + means
+
+    dxy = denorm_deltas[:, :2]
+    dwh = denorm_deltas[:, 2:]
+
+    # Compute center xy and width/height of each roi
+    rois_ = rois.repeat(1, num_classes).reshape(-1, 4)
+    pxy = ((rois_[:, :2] + rois_[:, 2:] - 1) * 0.5)  # Note: GLIP -1 offset
+    pwh = (rois_[:, 2:] - rois_[:, :2])
+
+    dxy_wh = pwh * dxy
+
+    max_ratio = np.abs(np.log(wh_ratio_clip))
+    if add_ctr_clamp:
+        dxy_wh = torch.clamp(dxy_wh, max=ctr_clamp, min=-ctr_clamp)
+        dwh = torch.clamp(dwh, max=max_ratio)
+    else:
+        dwh = dwh.clamp(min=-max_ratio, max=max_ratio)
+
+    gxy = pxy + dxy_wh
+    gwh = pwh * dwh.exp()
+
+    x1y1 = gxy - (gwh - 1) * 0.5  # Note: GLIP -1 offset
+    x2y2 = gxy + (gwh - 1) * 0.5  # Note: GLIP -1 offset
+
+    bboxes = torch.cat([x1y1, x2y2], dim=-1)
+
+    if clip_border and max_shape is not None:
+        bboxes[..., 0::2].clamp_(min=0, max=max_shape[1] - 1)  # Note
+        bboxes[..., 1::2].clamp_(min=0, max=max_shape[0] - 1)  # Note
+    bboxes = bboxes.reshape(num_bboxes, -1)
+    return bboxes
diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/distance_point_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/distance_point_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab26bf4b96c48df689da3722c23aa65e646348db
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/distance_point_bbox_coder.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import (BaseBoxes, HorizontalBoxes, bbox2distance,
+                                   distance2bbox, get_box_tensor)
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class DistancePointBBoxCoder(BaseBBoxCoder):
+    """Distance Point BBox coder.
+
+    This coder encodes gt bboxes (x1, y1, x2, y2) into (left, top, right,
+    bottom) distances from a point and decodes them back to the original.
+
+    Args:
+        clip_border (bool, optional): Whether to clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self, clip_border: Optional[bool] = True, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.clip_border = clip_border
+
+    def encode(self,
+               points: Tensor,
+               gt_bboxes: Union[Tensor, BaseBoxes],
+               max_dis: Optional[float] = None,
+               eps: float = 0.1) -> Tensor:
+        """Encode bounding box to distances.
+
+        Args:
+            points (Tensor): Shape (N, 2). The format is [x, y].
+            gt_bboxes (Tensor or :obj:`BaseBoxes`): Shape (N, 4). The format
+                is "xyxy".
+            max_dis (float): Upper bound of the distance. Default None.
+            eps (float): A small value to ensure target < max_dis instead
+                of <=. Default 0.1.
+
+        Returns:
+            Tensor: Box transformation deltas. The shape is (N, 4).
+        """
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert points.size(0) == gt_bboxes.size(0)
+        assert points.size(-1) == 2
+        assert gt_bboxes.size(-1) == 4
+        return bbox2distance(points, gt_bboxes, max_dis, eps)
+
+    def decode(
+        self,
+        points: Tensor,
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None
+    ) -> Union[Tensor, BaseBoxes]:
+        """Decode distance prediction to bounding box.
+
+        Args:
+            points (Tensor): Shape (B, N, 2) or (N, 2).
+            pred_bboxes (Tensor): Distance from the given point to 4
+                boundaries (left, top, right, bottom). Shape (B, N, 4)
+                or (N, 4).
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+                Sequence[int]], optional): Maximum bounds for boxes, specifies
+                (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+                the max_shape should be a Sequence[Sequence[int]],
+                and the length of max_shape should also be B.
+                Default None.
+
+        Returns:
+            Union[Tensor, :obj:`BaseBoxes`]: Boxes with shape (N, 4) or
+                (B, N, 4).
+        """
+        assert points.size(0) == pred_bboxes.size(0)
+        assert points.size(-1) == 2
+        assert pred_bboxes.size(-1) == 4
+        if self.clip_border is False:
+            max_shape = None
+        bboxes = distance2bbox(points, pred_bboxes, max_shape)
+
+        if self.use_box_type:
+            bboxes = HorizontalBoxes(bboxes)
+        return bboxes
diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eb1bedb3fbe19433c8bdb37f80891efa2cb72fc
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/legacy_delta_xywh_bbox_coder.py
@@ -0,0 +1,235 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class LegacyDeltaXYWHBBoxCoder(BaseBBoxCoder):
+    """Legacy Delta XYWH BBox coder used in MMDet V1.x.
+
+    Following the practice in R-CNN [1]_, this coder encodes bbox (x1, y1, x2,
+    y2) into delta (dx, dy, dw, dh) and decodes delta (dx, dy, dw, dh)
+    back to original bbox (x1, y1, x2, y2).
+
+    Note:
+        The main difference between :class:`LegacyDeltaXYWHBBoxCoder` and
+        :class:`DeltaXYWHBBoxCoder` is whether ``+ 1`` is used during width
+        and height calculation. We suggest only using this coder when testing
+        with MMDet V1.x models.
+
+    References:
+        .. [1] https://arxiv.org/abs/1311.2524
+
+    Args:
+        target_means (Sequence[float]): Denormalizing means of target for
+            delta coordinates.
+        target_stds (Sequence[float]): Denormalizing standard deviation of
+            target for delta coordinates.
+    """
+
+    def __init__(self,
+                 target_means: Sequence[float] = (0., 0., 0., 0.),
+                 target_stds: Sequence[float] = (1., 1., 1., 1.),
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.means = target_means
+        self.stds = target_stds
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes,
+                e.g., object proposals.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the
+                transformation, e.g., ground-truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas.
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = legacy_bbox2delta(bboxes, gt_bboxes, self.means,
+                                           self.stds)
+        return encoded_bboxes
+
+    def decode(
+        self,
+        bboxes: Union[Tensor, BaseBoxes],
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None,
+        wh_ratio_clip: Optional[float] = 16 / 1000
+    ) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `bboxes`.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes.
+            pred_bboxes (torch.Tensor): Encoded offsets with shape
+                (N, 4) or (N, 4 * num_classes).
+            max_shape (tuple[int], optional): Maximum shape of boxes.
+                Defaults to None.
+            wh_ratio_clip (float, optional): The allowed ratio between
+                width and height.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        assert pred_bboxes.size(0) == bboxes.size(0)
+        decoded_bboxes = legacy_delta2bbox(bboxes, pred_bboxes, self.means,
+                                           self.stds, max_shape,
+                                           wh_ratio_clip)
+
+        if self.use_box_type:
+            assert decoded_bboxes.size(-1) == 4, \
+                ('Cannot wrap decoded boxes with box type when decoded boxes '
+                 'have shape of (N, num_classes * 4)')
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
+
+
+def legacy_bbox2delta(
+    proposals: Tensor,
+    gt: Tensor,
+    means: Sequence[float] = (0., 0., 0., 0.),
+    stds: Sequence[float] = (1., 1., 1., 1.)
+) -> Tensor:
+    """Compute deltas of proposals w.r.t. gt in the MMDet V1.x manner.
+
+    We usually compute the deltas of x, y, w, h of proposals w.r.t ground
+    truth bboxes to get regression target.
+ This is the inverse function of `delta2bbox()` + + Args: + proposals (Tensor): Boxes to be transformed, shape (N, ..., 4) + gt (Tensor): Gt bboxes to be used as base, shape (N, ..., 4) + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + + Returns: + Tensor: deltas with shape (N, 4), where columns represent dx, dy, + dw, dh. + """ + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + 1.0 + ph = proposals[..., 3] - proposals[..., 1] + 1.0 + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + 1.0 + gh = gt[..., 3] - gt[..., 1] + 1.0 + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def legacy_delta2bbox(rois: Tensor, + deltas: Tensor, + means: Sequence[float] = (0., 0., 0., 0.), + stds: Sequence[float] = (1., 1., 1., 1.), + max_shape: Optional[ + Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None, + wh_ratio_clip: float = 16 / 1000) -> Tensor: + """Apply deltas to shift/scale base boxes in the MMDet V1.x manner. + + Typically the rois are anchor or proposed bounding boxes and the deltas are + network outputs used to shift/scale those boxes. + This is the inverse function of `bbox2delta()` + + Args: + rois (Tensor): Boxes to be transformed. Has shape (N, 4) + deltas (Tensor): Encoded offsets with respect to each roi. + Has shape (N, 4 * num_classes). Note N = num_anchors * W * H when + rois is a grid of anchors. Offset encoding follows [1]_. + means (Sequence[float]): Denormalizing means for delta coordinates + stds (Sequence[float]): Denormalizing standard deviation for delta + coordinates + max_shape (tuple[int, int]): Maximum bounds for boxes. specifies (H, W) + wh_ratio_clip (float): Maximum aspect ratio for boxes. + + Returns: + Tensor: Boxes with shape (N, 4), where columns represent + tl_x, tl_y, br_x, br_y. + + References: + .. 
[1] https://arxiv.org/abs/1311.2524 + + Example: + >>> rois = torch.Tensor([[ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 0., 0., 1., 1.], + >>> [ 5., 5., 5., 5.]]) + >>> deltas = torch.Tensor([[ 0., 0., 0., 0.], + >>> [ 1., 1., 1., 1.], + >>> [ 0., 0., 2., -1.], + >>> [ 0.7, -1.9, -0.5, 0.3]]) + >>> legacy_delta2bbox(rois, deltas, max_shape=(32, 32)) + tensor([[0.0000, 0.0000, 1.5000, 1.5000], + [0.0000, 0.0000, 5.2183, 5.2183], + [0.0000, 0.1321, 7.8891, 0.8679], + [5.3967, 2.4251, 6.0033, 3.7749]]) + """ + means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4) + stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4) + denorm_deltas = deltas * stds + means + dx = denorm_deltas[:, 0::4] + dy = denorm_deltas[:, 1::4] + dw = denorm_deltas[:, 2::4] + dh = denorm_deltas[:, 3::4] + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + # Compute center of each roi + px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx) + py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy) + # Compute width/height of each roi + pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw) + ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh) + # Use exp(network energy) to enlarge/shrink each roi + gw = pw * dw.exp() + gh = ph * dh.exp() + # Use network energy to shift the center of each roi + gx = px + pw * dx + gy = py + ph * dy + # Convert center-xy/width/height to top-left, bottom-right + + # The true legacy box coder should +- 0.5 here. + # However, current implementation improves the performance when testing + # the models trained in MMDetection 1.X (~0.5 bbox AP, 0.2 mask AP) + x1 = gx - gw * 0.5 + y1 = gy - gh * 0.5 + x2 = gx + gw * 0.5 + y2 = gy + gh * 0.5 + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1] - 1) + y1 = y1.clamp(min=0, max=max_shape[0] - 1) + x2 = x2.clamp(min=0, max=max_shape[1] - 1) + y2 = y2.clamp(min=0, max=max_shape[0] - 1) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas) + return bboxes diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/pseudo_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/pseudo_bbox_coder.py new file mode 100644 index 0000000000000000000000000000000000000000..9ee74311f6d12bde49d0c678edb60540a8c95c8b --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/pseudo_bbox_coder.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Union
+
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class PseudoBBoxCoder(BaseBBoxCoder):
+    """Pseudo bounding box coder."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def encode(self, bboxes: Tensor, gt_bboxes: Union[Tensor,
+                                                      BaseBoxes]) -> Tensor:
+        """torch.Tensor: return the given ``gt_bboxes``"""
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        return gt_bboxes
+
+    def decode(self, bboxes: Tensor, pred_bboxes: Union[Tensor,
+                                                        BaseBoxes]) -> Tensor:
+        """torch.Tensor: return the given ``pred_bboxes``"""
+        if self.use_box_type:
+            pred_bboxes = HorizontalBoxes(pred_bboxes)
+        return pred_bboxes
diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/tblr_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/tblr_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..74b388f7bad6ebc1911cee5b0b7d73bbd04de17a
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/tblr_bbox_coder.py
@@ -0,0 +1,228 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional, Sequence, Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class TBLRBBoxCoder(BaseBBoxCoder):
+    """TBLR BBox coder.
+
+    Following the practice in `FSAF <https://arxiv.org/abs/1903.00621>`_,
+    this coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left,
+    right) and decodes them back to the original.
+
+    Args:
+        normalizer (list | float): Normalization factor that the coordinates
+            are divided by when coding. If it is a list, it should have a
+            length of 4, indicating the normalization factors in tblr dims.
+            Otherwise it is a unified float factor for all dims. Default: 4.0
+        clip_border (bool, optional): Whether to clip the objects outside the
+            border of the image. Defaults to True.
+    """
+
+    def __init__(self,
+                 normalizer: Union[Sequence[float], float] = 4.0,
+                 clip_border: bool = True,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.normalizer = normalizer
+        self.clip_border = clip_border
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes`` in the (top, bottom,
+        left, right) order.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes,
+                e.g., object proposals.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the
+                transformation, e.g., ground truth boxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas.
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        encoded_bboxes = bboxes2tblr(
+            bboxes, gt_bboxes, normalizer=self.normalizer)
+        return encoded_bboxes
+
+    def decode(
+        self,
+        bboxes: Union[Tensor, BaseBoxes],
+        pred_bboxes: Tensor,
+        max_shape: Optional[Union[Sequence[int], Tensor,
+                                  Sequence[Sequence[int]]]] = None
+    ) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `bboxes`.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes. Shape
+                (B, N, 4) or (N, 4).
+            pred_bboxes (torch.Tensor): Encoded boxes with shape
+                (B, N, 4) or (N, 4).
+            max_shape (Sequence[int] or torch.Tensor or Sequence[
+                Sequence[int]], optional): Maximum bounds for boxes, specifies
+                (H, W, C) or (H, W). If bboxes shape is (B, N, 4), then
+                the max_shape should be a Sequence[Sequence[int]]
+                and the length of max_shape should also be B.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        decoded_bboxes = tblr2bboxes(
+            bboxes,
+            pred_bboxes,
+            normalizer=self.normalizer,
+            max_shape=max_shape,
+            clip_border=self.clip_border)
+
+        if self.use_box_type:
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
+
+
+def bboxes2tblr(priors: Tensor,
+                gts: Tensor,
+                normalizer: Union[Sequence[float], float] = 4.0,
+                normalize_by_wh: bool = True) -> Tensor:
+    """Encode ground truth boxes to tblr coordinate.
+
+    It first converts the gt coordinates to the tblr format,
+    (top, bottom, left, right), relative to prior box centers.
+    The tblr coordinate may be normalized by the side length of prior bboxes
+    if `normalize_by_wh` is specified as True, and it is then normalized by
+    the `normalizer` factor.
+
+    Args:
+        priors (Tensor): Prior boxes in point form.
+            Shape: (num_proposals, 4).
+        gts (Tensor): Coords of ground truth for each prior in point form.
+            Shape: (num_proposals, 4).
+        normalizer (Sequence[float] | float): Normalization parameter of
+            encoded boxes. If it is a list, it has to have length = 4.
+            Default: 4.0
+        normalize_by_wh (bool): Whether to normalize tblr coordinate by the
+            side length (wh) of prior bboxes.
+
+    Return:
+        encoded boxes (Tensor): Shape (num_proposals, 4).
+    """
+
+    # dist b/t match center and prior's center
+    if not isinstance(normalizer, float):
+        normalizer = torch.tensor(normalizer, device=priors.device)
+        assert len(normalizer) == 4, 'Normalizer must have length = 4'
+    assert priors.size(0) == gts.size(0)
+    prior_centers = (priors[:, 0:2] + priors[:, 2:4]) / 2
+    xmin, ymin, xmax, ymax = gts.split(1, dim=1)
+    top = prior_centers[:, 1].unsqueeze(1) - ymin
+    bottom = ymax - prior_centers[:, 1].unsqueeze(1)
+    left = prior_centers[:, 0].unsqueeze(1) - xmin
+    right = xmax - prior_centers[:, 0].unsqueeze(1)
+    loc = torch.cat((top, bottom, left, right), dim=1)
+    if normalize_by_wh:
+        # Normalize tblr by anchor width and height
+        wh = priors[:, 2:4] - priors[:, 0:2]
+        w, h = torch.split(wh, 1, dim=1)
+        loc[:, :2] /= h  # tb is normalized by h
+        loc[:, 2:] /= w  # lr is normalized by w
+    # Normalize tblr by the given normalization factor
+    return loc / normalizer
+
+
+def tblr2bboxes(priors: Tensor,
+                tblr: Tensor,
+                normalizer: Union[Sequence[float], float] = 4.0,
+                normalize_by_wh: bool = True,
+                max_shape: Optional[Union[Sequence[int], Tensor,
+                                          Sequence[Sequence[int]]]] = None,
+                clip_border: bool = True) -> Tensor:
+    """Decode tblr outputs to prediction boxes.
+
+    The process includes 3 steps: 1) De-normalize tblr coordinates by
+    multiplying them with `normalizer`; 2) De-normalize tblr coordinates by
+    the prior bbox width and height if `normalize_by_wh` is `True`;
+    3) Convert tblr (top, bottom, left, right) pairs relative to the center
+    of priors back to (xmin, ymin, xmax, ymax) coordinates.
+
+    Args:
+        priors (Tensor): Prior boxes in point form (x0, y0, x1, y1).
+            Shape: (N, 4) or (B, N, 4).
+        tblr (Tensor): Coords of network output in tblr form.
+            Shape: (N, 4) or (B, N, 4).
+        normalizer (Sequence[float] | float): Normalization parameter of
+            encoded boxes. If it is a list, it represents the per-dim
+            normalization factors in tblr order; if it is a float, it is
+            the unified normalization factor for all dims. Default: 4.0
+        normalize_by_wh (bool): Whether the tblr coordinates have been
+            normalized by the side length (wh) of prior bboxes.
+        max_shape (Sequence[int] or torch.Tensor or Sequence[
+            Sequence[int]], optional): Maximum bounds for boxes, specifies
+            (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+            the max_shape should be a Sequence[Sequence[int]]
+            and the length of max_shape should also be B.
+        clip_border (bool, optional): Whether to clip the objects outside the
+            border of the image. Defaults to True.
+
+    Return:
+        decoded boxes (Tensor): Boxes with shape (N, 4) or (B, N, 4).
+    """
+    if not isinstance(normalizer, float):
+        normalizer = torch.tensor(normalizer, device=priors.device)
+        assert len(normalizer) == 4, 'Normalizer must have length = 4'
+    assert priors.size(0) == tblr.size(0)
+    if priors.ndim == 3:
+        assert priors.size(1) == tblr.size(1)
+
+    loc_decode = tblr * normalizer
+    prior_centers = (priors[..., 0:2] + priors[..., 2:4]) / 2
+    if normalize_by_wh:
+        wh = priors[..., 2:4] - priors[..., 0:2]
+        w, h = torch.split(wh, 1, dim=-1)
+        # In-place operations on slices would fail when exporting to ONNX
+        th = h * loc_decode[..., :2]  # tb
+        tw = w * loc_decode[..., 2:]  # lr
+        loc_decode = torch.cat([th, tw], dim=-1)
+    # loc_decode.split(1, dim=-1) cannot be exported to ONNX
+    top, bottom, left, right = loc_decode.split((1, 1, 1, 1), dim=-1)
+    xmin = prior_centers[..., 0].unsqueeze(-1) - left
+    xmax = prior_centers[..., 0].unsqueeze(-1) + right
+    ymin = prior_centers[..., 1].unsqueeze(-1) - top
+    ymax = prior_centers[..., 1].unsqueeze(-1) + bottom
+
+    bboxes = torch.cat((xmin, ymin, xmax, ymax), dim=-1)
+
+    if clip_border and max_shape is not None:
+        # clip bboxes with dynamic `min` and `max` for onnx
+        if torch.onnx.is_in_onnx_export():
+            from mmdet.core.export import dynamic_clip_for_onnx
+            xmin, ymin, xmax, ymax = dynamic_clip_for_onnx(
+                xmin, ymin, xmax, ymax, max_shape)
+            bboxes = torch.cat([xmin, ymin, xmax, ymax], dim=-1)
+            return bboxes
+        if not isinstance(max_shape, torch.Tensor):
+            max_shape = priors.new_tensor(max_shape)
+        max_shape = max_shape[..., :2].type_as(priors)
+        if max_shape.ndim == 2:
+            assert bboxes.ndim == 3
+            assert max_shape.size(0) == bboxes.size(0)
+
+        min_xy = priors.new_tensor(0)
+        max_xy = torch.cat([max_shape, max_shape],
+                           dim=-1).flip(-1).unsqueeze(-2)
+        bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+        bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+    return bboxes
diff --git a/head_extractor/build/lib/mmdet/models/task_modules/coders/yolo_bbox_coder.py b/head_extractor/build/lib/mmdet/models/task_modules/coders/yolo_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e1c766789bec844ff359e225435bc3b2f5dd736
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/task_modules/coders/yolo_bbox_coder.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import torch
+from torch import Tensor
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import BaseBoxes, HorizontalBoxes, get_box_tensor
+from .base_bbox_coder import BaseBBoxCoder
+
+
+@TASK_UTILS.register_module()
+class YOLOBBoxCoder(BaseBBoxCoder):
+    """YOLO BBox coder.
+
+    Following `YOLO <https://arxiv.org/abs/1506.02640>`_, this coder divides
+    the image into grids and encodes bbox (x1, y1, x2, y2) into
+    (cx, cy, dw, dh). cx, cy in [0., 1.] denote the relative center position
+    w.r.t. the center of bboxes. dw, dh are the same as
+    :obj:`DeltaXYWHBBoxCoder`.
+
+    Args:
+        eps (float): Min value of cx, cy when encoding.
+    """
+
+    def __init__(self, eps: float = 1e-6, **kwargs):
+        super().__init__(**kwargs)
+        self.eps = eps
+
+    def encode(self, bboxes: Union[Tensor, BaseBoxes],
+               gt_bboxes: Union[Tensor, BaseBoxes],
+               stride: Union[Tensor, int]) -> Tensor:
+        """Get box regression transformation deltas that can be used to
+        transform the ``bboxes`` into the ``gt_bboxes``.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Source boxes,
+                e.g., anchors.
+            gt_bboxes (torch.Tensor or :obj:`BaseBoxes`): Target of the
+                transformation, e.g., ground-truth boxes.
+            stride (torch.Tensor | int): Stride of bboxes.
+
+        Returns:
+            torch.Tensor: Box transformation deltas.
+        """
+        bboxes = get_box_tensor(bboxes)
+        gt_bboxes = get_box_tensor(gt_bboxes)
+        assert bboxes.size(0) == gt_bboxes.size(0)
+        assert bboxes.size(-1) == gt_bboxes.size(-1) == 4
+        x_center_gt = (gt_bboxes[..., 0] + gt_bboxes[..., 2]) * 0.5
+        y_center_gt = (gt_bboxes[..., 1] + gt_bboxes[..., 3]) * 0.5
+        w_gt = gt_bboxes[..., 2] - gt_bboxes[..., 0]
+        h_gt = gt_bboxes[..., 3] - gt_bboxes[..., 1]
+        x_center = (bboxes[..., 0] + bboxes[..., 2]) * 0.5
+        y_center = (bboxes[..., 1] + bboxes[..., 3]) * 0.5
+        w = bboxes[..., 2] - bboxes[..., 0]
+        h = bboxes[..., 3] - bboxes[..., 1]
+        w_target = torch.log((w_gt / w).clamp(min=self.eps))
+        h_target = torch.log((h_gt / h).clamp(min=self.eps))
+        x_center_target = ((x_center_gt - x_center) / stride + 0.5).clamp(
+            self.eps, 1 - self.eps)
+        y_center_target = ((y_center_gt - y_center) / stride + 0.5).clamp(
+            self.eps, 1 - self.eps)
+        encoded_bboxes = torch.stack(
+            [x_center_target, y_center_target, w_target, h_target], dim=-1)
+        return encoded_bboxes
+
+    def decode(self, bboxes: Union[Tensor, BaseBoxes], pred_bboxes: Tensor,
+               stride: Union[Tensor, int]) -> Union[Tensor, BaseBoxes]:
+        """Apply transformation `pred_bboxes` to `bboxes`.
+
+        Args:
+            bboxes (torch.Tensor or :obj:`BaseBoxes`): Basic boxes,
+                e.g. anchors.
+            pred_bboxes (torch.Tensor): Encoded boxes with shape (..., 4).
+            stride (torch.Tensor | int): Strides of bboxes.
+
+        Returns:
+            Union[torch.Tensor, :obj:`BaseBoxes`]: Decoded boxes.
+        """
+        bboxes = get_box_tensor(bboxes)
+        assert pred_bboxes.size(-1) == bboxes.size(-1) == 4
+        xy_centers = (bboxes[..., :2] + bboxes[..., 2:]) * 0.5 + (
+            pred_bboxes[..., :2] - 0.5) * stride
+        whs = (bboxes[..., 2:] -
+               bboxes[..., :2]) * 0.5 * pred_bboxes[..., 2:].exp()
+        decoded_bboxes = torch.stack(
+            (xy_centers[..., 0] - whs[..., 0], xy_centers[..., 1] -
+             whs[..., 1], xy_centers[..., 0] + whs[..., 0],
+             xy_centers[..., 1] + whs[..., 1]),
+            dim=-1)
+
+        if self.use_box_type:
+            decoded_bboxes = HorizontalBoxes(decoded_bboxes)
+        return decoded_bboxes
diff --git a/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/__init__.py b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7795e98ca77bb5ffc77ff1da848130717d8f85a6
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .anchor_generator import (AnchorGenerator, LegacyAnchorGenerator, + SSDAnchorGenerator, YOLOAnchorGenerator) +from .point_generator import MlvlPointGenerator, PointGenerator +from .utils import anchor_inside_flags, calc_region + +__all__ = [ + 'AnchorGenerator', 'LegacyAnchorGenerator', 'anchor_inside_flags', + 'PointGenerator', 'calc_region', 'YOLOAnchorGenerator', + 'MlvlPointGenerator', 'SSDAnchorGenerator' +] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/anchor_generator.py b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..2757697ce2283ec8b46ba89325e63fad0be4a7e8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/anchor_generator.py @@ -0,0 +1,848 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from mmengine.utils import is_tuple_of +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import HorizontalBoxes + +DeviceType = Union[str, torch.device] + + +@TASK_UTILS.register_module() +class AnchorGenerator: + """Standard anchor generator for 2D anchor-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int], Optional): Anchor scales for anchors + in a single level. It cannot be set at the same time + if `octave_base_scale` and `scales_per_octave` are set. + base_sizes (list[int], Optional): The basic sizes + of anchors in multiple levels. + If None is given, strides will be used as base_sizes. + (If strides are non square, the shortest stride is taken.) + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int, Optional): The base scale of octave. + scales_per_octave (int, Optional): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float]], Optional): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. If a list of tuple of + float is given, they will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0 in V2.0. + use_box_type (bool): Whether to warp anchors with the box type data + structure. Defaults to False. + + Examples: + >>> from mmdet.models.task_modules. + ... 
prior_generators import AnchorGenerator + >>> self = AnchorGenerator([16], [1.], [1.], [9]) + >>> all_anchors = self.grid_priors([(2, 2)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]])] + >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18]) + >>> all_anchors = self.grid_priors([(2, 2), (1, 1)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]]), \ + tensor([[-9., -9., 9., 9.]])] + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + ratios: List[float], + scales: Optional[List[int]] = None, + base_sizes: Optional[List[int]] = None, + scale_major: bool = True, + octave_base_scale: Optional[int] = None, + scales_per_octave: Optional[int] = None, + centers: Optional[List[Tuple[float, float]]] = None, + center_offset: float = 0., + use_box_type: bool = False) -> None: + # check center and center_offset + if center_offset != 0: + assert centers is None, 'center cannot be set when center_offset' \ + f'!=0, {centers} is given.' + if not (0 <= center_offset <= 1): + raise ValueError('center_offset should be in range [0, 1], ' + f'{center_offset} is given.') + if centers is not None: + assert len(centers) == len(strides), \ + 'The number of strides should be the same as centers, got ' \ + f'{strides} and {centers}' + + # calculate base sizes of anchors + self.strides = [_pair(stride) for stride in strides] + self.base_sizes = [min(stride) for stride in self.strides + ] if base_sizes is None else base_sizes + assert len(self.base_sizes) == len(self.strides), \ + 'The number of strides should be the same as base sizes, got ' \ + f'{self.strides} and {self.base_sizes}' + + # calculate scales of anchors + assert ((octave_base_scale is not None + and scales_per_octave is not None) ^ (scales is not None)), \ + 'scales and octave_base_scale with scales_per_octave cannot' \ + ' be set at the same time' + if scales is not None: + self.scales = torch.Tensor(scales) + elif octave_base_scale is not None and scales_per_octave is not None: + octave_scales = np.array( + [2**(i / scales_per_octave) for i in range(scales_per_octave)]) + scales = octave_scales * octave_base_scale + self.scales = torch.Tensor(scales) + else: + raise ValueError('Either scales or octave_base_scale with ' + 'scales_per_octave should be set') + + self.octave_base_scale = octave_base_scale + self.scales_per_octave = scales_per_octave + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.centers = centers + self.center_offset = center_offset + self.base_anchors = self.gen_base_anchors() + self.use_box_type = use_box_type + + @property + def num_base_anchors(self) -> List[int]: + """list[int]: total number of base anchors in a feature grid""" + return self.num_base_priors + + @property + def num_base_priors(self) -> List[int]: + """list[int]: The number of priors (anchors) at a point + on the feature grid""" + return [base_anchors.size(0) for base_anchors in self.base_anchors] + + @property + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + def gen_base_anchors(self) -> List[Tensor]: + """Generate base anchors. 
+ + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors( + base_size, + scales=self.scales, + ratios=self.ratios, + center=center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, + base_size: Union[int, float], + scales: Tensor, + ratios: Tensor, + center: Optional[Tuple[float]] = None) \ + -> Tensor: + """Generate base anchors of a single level. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between the height + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * w + y_center = self.center_offset * h + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws, + y_center + 0.5 * hs + ] + base_anchors = torch.stack(base_anchors, dim=-1) + + return base_anchors + + def _meshgrid(self, + x: Tensor, + y: Tensor, + row_major: bool = True) -> Tuple[Tensor]: + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool): Whether to return y grids first. + Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + # use shape instead of len to keep tracing while exporting to onnx + xx = x.repeat(y.shape[0]) + yy = y.view(-1, 1).repeat(1, x.shape[0]).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_priors(self, + featmap_sizes: List[Tuple], + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + dtype (:obj:`torch.dtype`): Dtype of priors. + Defaults to torch.float32. + device (str | torch.device): The device where the anchors + will be put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. 
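+
+        Example:
+            A small sketch mirroring the class-level example (single level,
+            one base anchor, so a 2x2 feature map yields four anchors):
+
+            >>> generator = AnchorGenerator([16], [1.], [1.], [9])
+            >>> anchors = generator.grid_priors([(2, 2)], device='cpu')
+            >>> anchors[0].shape
+            torch.Size([4, 4])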
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_priors( + featmap_sizes[i], level_idx=i, dtype=dtype, device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_priors(self, + featmap_size: Tuple[int, int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int, int]): Size of the feature maps. + level_idx (int): The index of corresponding feature map level. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. + """ + + base_anchors = self.base_anchors[level_idx].to(device).to(dtype) + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + # First create Range with the default dtype, than convert to + # target `dtype` for onnx exporting. + shift_x = torch.arange(0, feat_w, device=device).to(dtype) * stride_w + shift_y = torch.arange(0, feat_h, device=device).to(dtype) * stride_h + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + if self.use_box_type: + all_anchors = HorizontalBoxes(all_anchors) + return all_anchors + + def sparse_priors(self, + prior_idxs: Tensor, + featmap_size: Tuple[int, int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate sparse anchors according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int, int]): feature map size arrange as (h, w). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (str | torch.device): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 4), N should be equal to + the length of ``prior_idxs``. + """ + + height, width = featmap_size + num_base_anchors = self.num_base_anchors[level_idx] + base_anchor_id = prior_idxs % num_base_anchors + x = (prior_idxs // + num_base_anchors) % width * self.strides[level_idx][0] + y = (prior_idxs // width // + num_base_anchors) % height * self.strides[level_idx][1] + priors = torch.stack([x, y, x, y], 1).to(dtype).to(device) + \ + self.base_anchors[level_idx][base_anchor_id, :].to(device) + + return priors + + def grid_anchors(self, + featmap_sizes: List[Tuple], + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + device (str | torch.device): Device where the anchors will be + put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. 
\ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. + """ + warnings.warn('``grid_anchors`` would be deprecated soon. ' + 'Please use ``grid_priors`` ') + + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_anchors( + self.base_anchors[i].to(device), + featmap_sizes[i], + self.strides[i], + device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_anchors(self, + base_anchors: Tensor, + featmap_size: Tuple[int, int], + stride: Tuple[int, int] = (16, 16), + device: DeviceType = 'cuda') -> Tensor: + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_anchors``. + + Args: + base_anchors (torch.Tensor): The base anchors of a feature grid. + featmap_size (tuple[int]): Size of the feature maps. + stride (tuple[int, int]): Stride of the feature map in order + (w, h). Defaults to (16, 16). + device (str | torch.device): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. + """ + + warnings.warn( + '``single_level_grid_anchors`` would be deprecated soon. ' + 'Please use ``single_level_grid_priors`` ') + + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + feat_h, feat_w = featmap_size + shift_x = torch.arange(0, feat_w, device=device) * stride[0] + shift_y = torch.arange(0, feat_h, device=device) * stride[1] + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, + featmap_sizes: List[Tuple[int, int]], + pad_shape: Tuple, + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate valid flags of anchors in multiple feature levels. + + Args: + featmap_sizes (list(tuple[int, int])): List of feature map sizes in + multiple feature levels. + pad_shape (tuple): The padded shape of the image. + device (str | torch.device): Device where the anchors will be + put on. + + Return: + list(torch.Tensor): Valid flags of anchors in multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + self.num_base_anchors[i], + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size: Tuple[int, int], + valid_size: Tuple[int, int], + num_base_anchors: int, + device: DeviceType = 'cuda') -> Tensor: + """Generate the valid flags of anchor in a single feature map. 
+ + Args: + featmap_size (tuple[int]): The size of feature maps, arrange + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + num_base_anchors (int): The number of base anchors. + device (str | torch.device): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand(valid.size(0), + num_base_anchors).contiguous().view(-1) + return valid + + def __repr__(self) -> str: + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}octave_base_scale=' + repr_str += f'{self.octave_base_scale},\n' + repr_str += f'{indent_str}scales_per_octave=' + repr_str += f'{self.scales_per_octave},\n' + repr_str += f'{indent_str}num_levels={self.num_levels}\n' + repr_str += f'{indent_str}centers={self.centers},\n' + repr_str += f'{indent_str}center_offset={self.center_offset})' + return repr_str + + +@TASK_UTILS.register_module() +class SSDAnchorGenerator(AnchorGenerator): + """Anchor generator for SSD. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + min_sizes (list[float]): The list of minimum anchor sizes on each + level. + max_sizes (list[float]): The list of maximum anchor sizes on each + level. + basesize_ratio_range (tuple(float)): Ratio range of anchors. Being + used when not setting min_sizes and max_sizes. + input_size (int): Size of feature map, 300 for SSD300, 512 for + SSD512. Being used when not setting min_sizes and max_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. It is always set to be False in SSD. + use_box_type (bool): Whether to warp anchors with the box type data + structure. Defaults to False. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + ratios: List[float], + min_sizes: Optional[List[float]] = None, + max_sizes: Optional[List[float]] = None, + basesize_ratio_range: Tuple[float] = (0.15, 0.9), + input_size: int = 300, + scale_major: bool = True, + use_box_type: bool = False) -> None: + assert len(strides) == len(ratios) + assert not (min_sizes is None) ^ (max_sizes is None) + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2., stride[1] / 2.) 
+ for stride in self.strides] + + if min_sizes is None and max_sizes is None: + # use hard code to generate SSD anchors + self.input_size = input_size + assert is_tuple_of(basesize_ratio_range, float) + self.basesize_ratio_range = basesize_ratio_range + # calculate anchor ratios and sizes + min_ratio, max_ratio = basesize_ratio_range + min_ratio = int(min_ratio * 100) + max_ratio = int(max_ratio * 100) + step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2)) + min_sizes = [] + max_sizes = [] + for ratio in range(int(min_ratio), int(max_ratio) + 1, step): + min_sizes.append(int(self.input_size * ratio / 100)) + max_sizes.append(int(self.input_size * (ratio + step) / 100)) + if self.input_size == 300: + if basesize_ratio_range[0] == 0.15: # SSD300 COCO + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + elif basesize_ratio_range[0] == 0.2: # SSD300 VOC + min_sizes.insert(0, int(self.input_size * 10 / 100)) + max_sizes.insert(0, int(self.input_size * 20 / 100)) + else: + raise ValueError( + 'basesize_ratio_range[0] should be either 0.15' + 'or 0.2 when input_size is 300, got ' + f'{basesize_ratio_range[0]}.') + elif self.input_size == 512: + if basesize_ratio_range[0] == 0.1: # SSD512 COCO + min_sizes.insert(0, int(self.input_size * 4 / 100)) + max_sizes.insert(0, int(self.input_size * 10 / 100)) + elif basesize_ratio_range[0] == 0.15: # SSD512 VOC + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + else: + raise ValueError( + 'When not setting min_sizes and max_sizes,' + 'basesize_ratio_range[0] should be either 0.1' + 'or 0.15 when input_size is 512, got' + f' {basesize_ratio_range[0]}.') + else: + raise ValueError( + 'Only support 300 or 512 in SSDAnchorGenerator when ' + 'not setting min_sizes and max_sizes, ' + f'got {self.input_size}.') + + assert len(min_sizes) == len(max_sizes) == len(strides) + + anchor_ratios = [] + anchor_scales = [] + for k in range(len(self.strides)): + scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])] + anchor_ratio = [1.] + for r in ratios[k]: + anchor_ratio += [1 / r, r] # 4 or 6 ratio + anchor_ratios.append(torch.Tensor(anchor_ratio)) + anchor_scales.append(torch.Tensor(scales)) + + self.base_sizes = min_sizes + self.scales = anchor_scales + self.ratios = anchor_ratios + self.scale_major = scale_major + self.center_offset = 0 + self.base_anchors = self.gen_base_anchors() + self.use_box_type = use_box_type + + def gen_base_anchors(self) -> List[Tensor]: + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
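+
+        Note:
+            With the SSD setting ``scale_major=False``, the
+            ``index_select`` below reorders the rows so that the two square
+            anchors (ratio 1 at both scales) come first, followed by the
+            rectangular ones, giving the familiar 4 or 6 anchors per
+            location. For example, with 3 ratios and 2 scales the selected
+            row indices are ``[0, 3, 1, 2]``.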
+ """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + base_anchors = self.gen_single_level_base_anchors( + base_size, + scales=self.scales[i], + ratios=self.ratios[i], + center=self.centers[i]) + indices = list(range(len(self.ratios[i]))) + indices.insert(1, len(indices)) + base_anchors = torch.index_select(base_anchors, 0, + torch.LongTensor(indices)) + multi_level_base_anchors.append(base_anchors) + return multi_level_base_anchors + + def __repr__(self) -> str: + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}input_size={self.input_size},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}num_levels={self.num_levels},\n' + repr_str += f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}basesize_ratio_range=' + repr_str += f'{self.basesize_ratio_range})' + return repr_str + + +@TASK_UTILS.register_module() +class LegacyAnchorGenerator(AnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + Note: + Difference to the V2.0 anchor generator: + + 1. The center offset of V1.x anchors are set to be 0.5 rather than 0. + 2. The width/height are minused by 1 when calculating the anchors' \ + centers and corners to meet the V1.x coordinate system. + 3. The anchors' corners are quantized. + + Args: + strides (list[int] | list[tuple[int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int]): The basic sizes of anchors in multiple levels. + If None is given, strides will be used to generate base_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. It a list of float + is given, this list will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0.5 in V2.0 but it should be 0.5 + in v1.x models. + use_box_type (bool): Whether to warp anchors with the box type data + structure. Defaults to False. + + Examples: + >>> from mmdet.models.task_modules. + ... 
prior_generators import LegacyAnchorGenerator + >>> self = LegacyAnchorGenerator( + >>> [16], [1.], [1.], [9], center_offset=0.5) + >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu') + >>> print(all_anchors) + [tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]])] + """ + + def gen_single_level_base_anchors(self, + base_size: Union[int, float], + scales: Tensor, + ratios: Tensor, + center: Optional[Tuple[float]] = None) \ + -> Tensor: + """Generate base anchors of a single level. + + Note: + The width/height of anchors are minused by 1 when calculating \ + the centers and corners to meet the V1.x coordinate system. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between the height. + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature map. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * (w - 1) + y_center = self.center_offset * (h - 1) + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * (ws - 1), y_center - 0.5 * (hs - 1), + x_center + 0.5 * (ws - 1), y_center + 0.5 * (hs - 1) + ] + base_anchors = torch.stack(base_anchors, dim=-1).round() + + return base_anchors + + +@TASK_UTILS.register_module() +class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator` + can be found in `LegacyAnchorGenerator`. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + ratios: List[float], + basesize_ratio_range: Tuple[float], + input_size: int = 300, + scale_major: bool = True, + use_box_type: bool = False) -> None: + super(LegacySSDAnchorGenerator, self).__init__( + strides=strides, + ratios=ratios, + basesize_ratio_range=basesize_ratio_range, + input_size=input_size, + scale_major=scale_major, + use_box_type=use_box_type) + self.centers = [((stride - 1) / 2., (stride - 1) / 2.) + for stride in strides] + self.base_anchors = self.gen_base_anchors() + + +@TASK_UTILS.register_module() +class YOLOAnchorGenerator(AnchorGenerator): + """Anchor generator for YOLO. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + base_sizes (list[list[tuple[int, int]]]): The basic sizes + of anchors in multiple levels. + """ + + def __init__(self, + strides: Union[List[int], List[Tuple[int, int]]], + base_sizes: List[List[Tuple[int, int]]], + use_box_type: bool = False) -> None: + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2., stride[1] / 2.) 
+ for stride in self.strides] + self.base_sizes = [] + num_anchor_per_level = len(base_sizes[0]) + for base_sizes_per_level in base_sizes: + assert num_anchor_per_level == len(base_sizes_per_level) + self.base_sizes.append( + [_pair(base_size) for base_size in base_sizes_per_level]) + self.base_anchors = self.gen_base_anchors() + self.use_box_type = use_box_type + + @property + def num_levels(self) -> int: + """int: number of feature levels that the generator will be applied""" + return len(self.base_sizes) + + def gen_base_anchors(self) -> List[Tensor]: + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_sizes_per_level in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors(base_sizes_per_level, + center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, + base_sizes_per_level: List[Tuple[int]], + center: Optional[Tuple[float]] = None) \ + -> Tensor: + """Generate base anchors of a single level. + + Args: + base_sizes_per_level (list[tuple[int]]): Basic sizes of + anchors. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + x_center, y_center = center + base_anchors = [] + for base_size in base_sizes_per_level: + w, h = base_size + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchor = torch.Tensor([ + x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w, + y_center + 0.5 * h + ]) + base_anchors.append(base_anchor) + base_anchors = torch.stack(base_anchors, dim=0) + + return base_anchors diff --git a/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/point_generator.py b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/point_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..c87ad656c61cb251bfdfcbd23b1cc5263c68bf5f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/point_generator.py @@ -0,0 +1,321 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import numpy as np +import torch +from torch import Tensor +from torch.nn.modules.utils import _pair + +from mmdet.registry import TASK_UTILS + +DeviceType = Union[str, torch.device] + + +@TASK_UTILS.register_module() +class PointGenerator: + + def _meshgrid(self, + x: Tensor, + y: Tensor, + row_major: bool = True) -> Tuple[Tensor, Tensor]: + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool): Whether to return y grids first. + Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_points(self, + featmap_size: Tuple[int, int], + stride=16, + device: DeviceType = 'cuda') -> Tensor: + """Generate grid points of a single level. + + Args: + featmap_size (tuple[int, int]): Size of the feature maps. + stride (int): The stride of corresponding feature map. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. 
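+
+        Note:
+            For instance, ``featmap_size=(2, 2)`` with ``stride=16`` yields
+            four rows ``(0, 0, 16)``, ``(16, 0, 16)``, ``(0, 16, 16)`` and
+            ``(16, 16, 16)``, i.e. one ``(x, y, stride)`` triple per point.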
+
+        Returns:
+            torch.Tensor: Grid points in a feature map.
+        """
+        feat_h, feat_w = featmap_size
+        shift_x = torch.arange(0., feat_w, device=device) * stride
+        shift_y = torch.arange(0., feat_h, device=device) * stride
+        shift_xx, shift_yy = self._meshgrid(shift_x, shift_y)
+        stride = shift_x.new_full((shift_xx.shape[0], ), stride)
+        shifts = torch.stack([shift_xx, shift_yy, stride], dim=-1)
+        all_points = shifts.to(device)
+        return all_points
+
+    def valid_flags(self,
+                    featmap_size: Tuple[int, int],
+                    valid_size: Tuple[int, int],
+                    device: DeviceType = 'cuda') -> Tensor:
+        """Generate valid flags of anchors in a feature map.
+
+        Args:
+            featmap_size (tuple[int, int]): The size of the feature map,
+                arranged as (h, w).
+            valid_size (tuple[int, int]): The valid size of the feature map,
+                arranged as (h, w).
+            device (str | torch.device): Device where the flags will be
+                put on.
+
+        Return:
+            torch.Tensor: Valid flags of anchors in a level.
+        """
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device)
+        valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device)
+        valid_x[:valid_w] = 1
+        valid_y[:valid_h] = 1
+        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+        valid = valid_xx & valid_yy
+        return valid
+
+
+@TASK_UTILS.register_module()
+class MlvlPointGenerator:
+    """Standard points generator for multi-level (Mlvl) feature maps in 2D
+    points-based detectors.
+
+    Args:
+        strides (list[int] | list[tuple[int, int]]): Strides of anchors
+            in multiple feature levels in order (w, h).
+        offset (float): The offset of points, the value is normalized with
+            corresponding stride. Defaults to 0.5.
+    """
+
+    def __init__(self,
+                 strides: Union[List[int], List[Tuple[int, int]]],
+                 offset: float = 0.5) -> None:
+        self.strides = [_pair(stride) for stride in strides]
+        self.offset = offset
+
+    @property
+    def num_levels(self) -> int:
+        """int: number of feature levels that the generator will be applied to"""
+        return len(self.strides)
+
+    @property
+    def num_base_priors(self) -> List[int]:
+        """list[int]: The number of priors (points) at a point
+        on the feature grid"""
+        return [1 for _ in range(len(self.strides))]
+
+    def _meshgrid(self,
+                  x: Tensor,
+                  y: Tensor,
+                  row_major: bool = True) -> Tuple[Tensor, Tensor]:
+        yy, xx = torch.meshgrid(y, x)
+        if row_major:
+            # Warning: .flatten() would cause an error during ONNX export,
+            # so reshape has to be used here.
+            return xx.reshape(-1), yy.reshape(-1)
+        else:
+            return yy.reshape(-1), xx.reshape(-1)
+
+    def grid_priors(self,
+                    featmap_sizes: List[Tuple],
+                    dtype: torch.dtype = torch.float32,
+                    device: DeviceType = 'cuda',
+                    with_stride: bool = False) -> List[Tensor]:
+        """Generate grid points of multiple feature levels.
+
+        Args:
+            featmap_sizes (list[tuple]): List of feature map sizes in
+                multiple feature levels, each size arranged as (h, w).
+            dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32.
+            device (str | torch.device): The device where the anchors will be
+                put on.
+            with_stride (bool): Whether to concatenate the stride to
+                the last dimension of points.
+
+        Return:
+            list[torch.Tensor]: Points of multiple feature levels.
+ The sizes of each tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + + assert self.num_levels == len(featmap_sizes) + multi_level_priors = [] + for i in range(self.num_levels): + priors = self.single_level_grid_priors( + featmap_sizes[i], + level_idx=i, + dtype=dtype, + device=device, + with_stride=with_stride) + multi_level_priors.append(priors) + return multi_level_priors + + def single_level_grid_priors(self, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda', + with_stride: bool = False) -> Tensor: + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + dtype (:obj:`dtype`): Dtype of priors. Defaults to torch.float32. + device (str | torch.device): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0, feat_w, device=device) + + self.offset) * stride_w + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_x = shift_x.to(dtype) + + shift_y = (torch.arange(0, feat_h, device=device) + + self.offset) * stride_h + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_y = shift_y.to(dtype) + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + # use `shape[0]` instead of `len(shift_xx)` for ONNX export + stride_w = shift_xx.new_full((shift_xx.shape[0], ), + stride_w).to(dtype) + stride_h = shift_xx.new_full((shift_yy.shape[0], ), + stride_h).to(dtype) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], + dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, + featmap_sizes: List[Tuple[int, int]], + pad_shape: Tuple[int], + device: DeviceType = 'cuda') -> List[Tensor]: + """Generate valid flags of points of multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str | torch.device): The device where the anchors will be + put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. 
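+
+        Note:
+            The valid region per level follows from ``pad_shape``: e.g.
+            with ``pad_shape=(100, 60)``, ``strides=[(16, 16)]`` and
+            ``featmap_sizes=[(7, 5)]``, only ``ceil(60 / 16) = 4`` of the
+            5 columns are flagged valid, while all 7 rows are
+            (``ceil(100 / 16) = 7``).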
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size: Tuple[int, int], + valid_size: Tuple[int, int], + device: DeviceType = 'cuda') -> Tensor: + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str | torch.device): The device where the flags will be + put on. Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + def sparse_priors(self, + prior_idxs: Tensor, + featmap_size: Tuple[int], + level_idx: int, + dtype: torch.dtype = torch.float32, + device: DeviceType = 'cuda') -> Tensor: + """Generate sparse points according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int]): feature map size arrange as (w, h). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points. Defaults to + ``torch.float32``. + device (str | torch.device): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 2), N should be equal to + the length of ``prior_idxs``. And last dimension + 2 represent (coord_x, coord_y). + """ + height, width = featmap_size + x = (prior_idxs % width + self.offset) * self.strides[level_idx][0] + y = ((prior_idxs // width) % height + + self.offset) * self.strides[level_idx][1] + prioris = torch.stack([x, y], 1).to(dtype) + prioris = prioris.to(device) + return prioris diff --git a/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/utils.py b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3aa2dfd49669ba931d20ad9482cb841698cceb8a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/prior_generators/utils.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +from torch import Tensor + +from mmdet.structures.bbox import BaseBoxes + + +def anchor_inside_flags(flat_anchors: Tensor, + valid_flags: Tensor, + img_shape: Tuple[int], + allowed_border: int = 0) -> Tensor: + """Check whether the anchors are inside the border. + + Args: + flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4). + valid_flags (torch.Tensor): An existing valid flags of anchors. + img_shape (tuple(int)): Shape of current image. + allowed_border (int): The border to allow the valid anchor. 
+ Defaults to 0. + + Returns: + torch.Tensor: Flags indicating whether the anchors are inside a \ + valid range. + """ + img_h, img_w = img_shape[:2] + if allowed_border >= 0: + if isinstance(flat_anchors, BaseBoxes): + inside_flags = valid_flags & \ + flat_anchors.is_inside([img_h, img_w], + all_inside=True, + allowed_border=allowed_border) + else: + inside_flags = valid_flags & \ + (flat_anchors[:, 0] >= -allowed_border) & \ + (flat_anchors[:, 1] >= -allowed_border) & \ + (flat_anchors[:, 2] < img_w + allowed_border) & \ + (flat_anchors[:, 3] < img_h + allowed_border) + else: + inside_flags = valid_flags + return inside_flags + + +def calc_region(bbox: Tensor, + ratio: float, + featmap_size: Optional[Tuple] = None) -> Tuple[int]: + """Calculate a proportional bbox region. + + The bbox center are fixed and the new h' and w' is h * ratio and w * ratio. + + Args: + bbox (Tensor): Bboxes to calculate regions, shape (n, 4). + ratio (float): Ratio of the output region. + featmap_size (tuple, Optional): Feature map size in (height, width) + order used for clipping the boundary. Defaults to None. + + Returns: + tuple: x1, y1, x2, y2 + """ + x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long() + y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long() + x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long() + y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long() + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/__init__.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3782eb898cf8acace63b4f16204cae6c07eb6e30 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_sampler import BaseSampler +from .combined_sampler import CombinedSampler +from .instance_balanced_pos_sampler import InstanceBalancedPosSampler +from .iou_balanced_neg_sampler import IoUBalancedNegSampler +from .mask_pseudo_sampler import MaskPseudoSampler +from .mask_sampling_result import MaskSamplingResult +from .multi_instance_random_sampler import MultiInsRandomSampler +from .multi_instance_sampling_result import MultiInstanceSamplingResult +from .ohem_sampler import OHEMSampler +from .pseudo_sampler import PseudoSampler +from .random_sampler import RandomSampler +from .sampling_result import SamplingResult +from .score_hlr_sampler import ScoreHLRSampler + +__all__ = [ + 'BaseSampler', 'PseudoSampler', 'RandomSampler', + 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', + 'OHEMSampler', 'SamplingResult', 'ScoreHLRSampler', 'MaskPseudoSampler', + 'MaskSamplingResult', 'MultiInstanceSamplingResult', + 'MultiInsRandomSampler' +] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/base_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/base_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..be8a9a5ee3ec4e70b19aeea21b7998cf2b131d59 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/base_sampler.py @@ -0,0 +1,136 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
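+# A note on the design: concrete samplers only implement ``_sample_pos`` and
+# ``_sample_neg``; the shared ``sample`` method of ``BaseSampler`` handles
+# adding gt boxes as proposals, balancing the positive/negative quota and
+# packing everything into a ``SamplingResult``.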
+from abc import ABCMeta, abstractmethod + +import torch +from mmengine.structures import InstanceData + +from mmdet.structures.bbox import BaseBoxes, cat_boxes +from ..assigners import AssignResult +from .sampling_result import SamplingResult + + +class BaseSampler(metaclass=ABCMeta): + """Base class of samplers. + + Args: + num (int): Number of samples + pos_fraction (float): Fraction of positive samples + neg_pos_up (int): Upper bound number of negative and + positive samples. Defaults to -1. + add_gt_as_proposals (bool): Whether to add ground truth + boxes as proposals. Defaults to True. + """ + + def __init__(self, + num: int, + pos_fraction: float, + neg_pos_ub: int = -1, + add_gt_as_proposals: bool = True, + **kwargs) -> None: + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + self.pos_sampler = self + self.neg_sampler = self + + @abstractmethod + def _sample_pos(self, assign_result: AssignResult, num_expected: int, + **kwargs): + """Sample positive samples.""" + pass + + @abstractmethod + def _sample_neg(self, assign_result: AssignResult, num_expected: int, + **kwargs): + """Sample negative samples.""" + pass + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, **kwargs) -> SamplingResult: + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + + Returns: + :obj:`SamplingResult`: Sampling result. + + Example: + >>> from mmengine.structures import InstanceData + >>> from mmdet.models.task_modules.samplers import RandomSampler, + >>> from mmdet.models.task_modules.assigners import AssignResult + >>> from mmdet.models.task_modules.samplers. + ... sampling_result import ensure_rng, random_boxes + >>> rng = ensure_rng(None) + >>> assign_result = AssignResult.random(rng=rng) + >>> pred_instances = InstanceData() + >>> pred_instances.priors = random_boxes(assign_result.num_preds, + ... rng=rng) + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = random_boxes(assign_result.num_gts, + ... rng=rng) + >>> gt_instances.labels = torch.randint( + ... 0, 5, (assign_result.num_gts,), dtype=torch.long) + >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1, + >>> add_gt_as_proposals=False) + >>> self = self.sample(assign_result, pred_instances, gt_instances) + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + if len(priors.shape) < 2: + priors = priors[None, :] + + gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + # When `gt_bboxes` and `priors` are all box type, convert + # `gt_bboxes` type to `priors` type. 
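+            # The gt boxes are prepended to the priors so that they can be
+            # drawn as positive samples; ``gt_flags`` records which entries
+            # came from the ground truth so they can be told apart later
+            # (e.g. as ``pos_is_gt`` in the sampling result).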
+ if (isinstance(gt_bboxes, BaseBoxes) + and isinstance(priors, BaseBoxes)): + gt_bboxes_ = gt_bboxes.convert_to(type(priors)) + else: + gt_bboxes_ = gt_bboxes + priors = cat_boxes([gt_bboxes_, priors], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = priors.new_ones(gt_bboxes_.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=priors, **kwargs) + # We found that sampled indices have duplicated items occasionally. + # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=priors, **kwargs) + neg_inds = neg_inds.unique() + + sampling_result = SamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_bboxes, + assign_result=assign_result, + gt_flags=gt_flags) + return sampling_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/combined_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/combined_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..8e0560e372efffe865fa32028d823280a8bd5d87 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/combined_sampler.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.registry import TASK_UTILS +from .base_sampler import BaseSampler + + +@TASK_UTILS.register_module() +class CombinedSampler(BaseSampler): + """A sampler that combines positive sampler and negative sampler.""" + + def __init__(self, pos_sampler, neg_sampler, **kwargs): + super(CombinedSampler, self).__init__(**kwargs) + self.pos_sampler = TASK_UTILS.build(pos_sampler, default_args=kwargs) + self.neg_sampler = TASK_UTILS.build(neg_sampler, default_args=kwargs) + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..e48d8e9158e8dabf0bb4072b8e421de9b6410d00 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/instance_balanced_pos_sampler.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.registry import TASK_UTILS +from .random_sampler import RandomSampler + + +@TASK_UTILS.register_module() +class InstanceBalancedPosSampler(RandomSampler): + """Instance balanced sampler that samples equal number of positive samples + for each instance.""" + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Sample positive boxes. + + Args: + assign_result (:obj:`AssignResult`): The assigned results of boxes. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. 
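+
+        Note:
+            For example, with ``num_expected=4`` and two gt instances,
+            ``num_per_gt = int(round(4 / 2.) + 1) = 3``, so at most 3
+            indices are drawn per instance and the result is then topped
+            up or trimmed to exactly 4.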
+ """ + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + unique_gt_inds = assign_result.gt_inds[pos_inds].unique() + num_gts = len(unique_gt_inds) + num_per_gt = int(round(num_expected / float(num_gts)) + 1) + sampled_inds = [] + for i in unique_gt_inds: + inds = torch.nonzero( + assign_result.gt_inds == i.item(), as_tuple=False) + if inds.numel() != 0: + inds = inds.squeeze(1) + else: + continue + if len(inds) > num_per_gt: + inds = self.random_choice(inds, num_per_gt) + sampled_inds.append(inds) + sampled_inds = torch.cat(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array( + list(set(pos_inds.cpu()) - set(sampled_inds.cpu()))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + extra_inds = torch.from_numpy(extra_inds).to( + assign_result.gt_inds.device).long() + sampled_inds = torch.cat([sampled_inds, extra_inds]) + elif len(sampled_inds) > num_expected: + sampled_inds = self.random_choice(sampled_inds, num_expected) + return sampled_inds diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..dc1f46413c99d115f31ef190b4fb198b588a156e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/iou_balanced_neg_sampler.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.registry import TASK_UTILS +from .random_sampler import RandomSampler + + +@TASK_UTILS.register_module() +class IoUBalancedNegSampler(RandomSampler): + """IoU Balanced Sampling. + + arXiv: https://arxiv.org/pdf/1904.02701.pdf (CVPR 2019) + + Sampling proposals according to their IoU. `floor_fraction` of needed RoIs + are sampled from proposals whose IoU are lower than `floor_thr` randomly. + The others are sampled from proposals whose IoU are higher than + `floor_thr`. These proposals are sampled from some bins evenly, which are + split by `num_bins` via IoU evenly. + + Args: + num (int): number of proposals. + pos_fraction (float): fraction of positive proposals. + floor_thr (float): threshold (minimum) IoU for IoU balanced sampling, + set to -1 if all using IoU balanced sampling. + floor_fraction (float): sampling fraction of proposals under floor_thr. + num_bins (int): number of bins in IoU balanced sampling. + """ + + def __init__(self, + num, + pos_fraction, + floor_thr=-1, + floor_fraction=0, + num_bins=3, + **kwargs): + super(IoUBalancedNegSampler, self).__init__(num, pos_fraction, + **kwargs) + assert floor_thr >= 0 or floor_thr == -1 + assert 0 <= floor_fraction <= 1 + assert num_bins >= 1 + + self.floor_thr = floor_thr + self.floor_fraction = floor_fraction + self.num_bins = num_bins + + def sample_via_interval(self, max_overlaps, full_set, num_expected): + """Sample according to the iou interval. + + Args: + max_overlaps (torch.Tensor): IoU between bounding boxes and ground + truth boxes. 
+ full_set (set(int)): A full set of indices of boxes。 + num_expected (int): Number of expected samples。 + + Returns: + np.ndarray: Indices of samples + """ + max_iou = max_overlaps.max() + iou_interval = (max_iou - self.floor_thr) / self.num_bins + per_num_expected = int(num_expected / self.num_bins) + + sampled_inds = [] + for i in range(self.num_bins): + start_iou = self.floor_thr + i * iou_interval + end_iou = self.floor_thr + (i + 1) * iou_interval + tmp_set = set( + np.where( + np.logical_and(max_overlaps >= start_iou, + max_overlaps < end_iou))[0]) + tmp_inds = list(tmp_set & full_set) + if len(tmp_inds) > per_num_expected: + tmp_sampled_set = self.random_choice(tmp_inds, + per_num_expected) + else: + tmp_sampled_set = np.array(tmp_inds, dtype=np.int64) + sampled_inds.append(tmp_sampled_set) + + sampled_inds = np.concatenate(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(full_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate([sampled_inds, extra_inds]) + + return sampled_inds + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Sample negative boxes. + + Args: + assign_result (:obj:`AssignResult`): The assigned results of boxes. + num_expected (int): The number of expected negative samples + + Returns: + Tensor or ndarray: sampled indices. + """ + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + max_overlaps = assign_result.max_overlaps.cpu().numpy() + # balance sampling for negative samples + neg_set = set(neg_inds.cpu().numpy()) + + if self.floor_thr > 0: + floor_set = set( + np.where( + np.logical_and(max_overlaps >= 0, + max_overlaps < self.floor_thr))[0]) + iou_sampling_set = set( + np.where(max_overlaps >= self.floor_thr)[0]) + elif self.floor_thr == 0: + floor_set = set(np.where(max_overlaps == 0)[0]) + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + else: + floor_set = set() + iou_sampling_set = set( + np.where(max_overlaps > self.floor_thr)[0]) + # for sampling interval calculation + self.floor_thr = 0 + + floor_neg_inds = list(floor_set & neg_set) + iou_sampling_neg_inds = list(iou_sampling_set & neg_set) + num_expected_iou_sampling = int(num_expected * + (1 - self.floor_fraction)) + if len(iou_sampling_neg_inds) > num_expected_iou_sampling: + if self.num_bins >= 2: + iou_sampled_inds = self.sample_via_interval( + max_overlaps, set(iou_sampling_neg_inds), + num_expected_iou_sampling) + else: + iou_sampled_inds = self.random_choice( + iou_sampling_neg_inds, num_expected_iou_sampling) + else: + iou_sampled_inds = np.array( + iou_sampling_neg_inds, dtype=np.int64) + num_expected_floor = num_expected - len(iou_sampled_inds) + if len(floor_neg_inds) > num_expected_floor: + sampled_floor_inds = self.random_choice( + floor_neg_inds, num_expected_floor) + else: + sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int64) + sampled_inds = np.concatenate( + (sampled_floor_inds, iou_sampled_inds)) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(neg_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = self.random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate((sampled_inds, extra_inds)) + sampled_inds = 
torch.from_numpy(sampled_inds).long().to( + assign_result.gt_inds.device) + return sampled_inds diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..307dd5d15c962b97dc60b899e60170d0bfed90a7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/mask_pseudo_sampler.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""copy from +https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py.""" + +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .base_sampler import BaseSampler +from .mask_sampling_result import MaskSamplingResult + + +@TASK_UTILS.register_module() +class MaskPseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, *args, **kwargs): + """Directly returns the positive and negative indices of samples. + + Args: + assign_result (:obj:`AssignResult`): Mask assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``scores`` and ``masks`` predicted + by the model. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``labels`` and ``masks`` + attributes. + + Returns: + :obj:`SamplingResult`: sampler results + """ + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + gt_flags = pred_masks.new_zeros(pred_masks.shape[0], dtype=torch.uint8) + sampling_result = MaskSamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + masks=pred_masks, + gt_masks=gt_masks, + assign_result=assign_result, + gt_flags=gt_flags, + avg_factor_with_neg=False) + return sampling_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/mask_sampling_result.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/mask_sampling_result.py new file mode 100644 index 0000000000000000000000000000000000000000..adaa62e8a0af28bb004a34b961f672ec03988d2c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/mask_sampling_result.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
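+# ``MaskSamplingResult`` mirrors ``SamplingResult`` but stores sampled masks
+# instead of boxes; when constructed with ``avg_factor_with_neg=False`` (as
+# ``MaskPseudoSampler`` does) ``avg_factor`` counts only the positives.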
+"""copy from +https://github.com/ZwwWayne/K-Net/blob/main/knet/det/mask_pseudo_sampler.py.""" + +import torch +from torch import Tensor + +from ..assigners import AssignResult +from .sampling_result import SamplingResult + + +class MaskSamplingResult(SamplingResult): + """Mask sampling result.""" + + def __init__(self, + pos_inds: Tensor, + neg_inds: Tensor, + masks: Tensor, + gt_masks: Tensor, + assign_result: AssignResult, + gt_flags: Tensor, + avg_factor_with_neg: bool = True) -> None: + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.num_pos = max(pos_inds.numel(), 1) + self.num_neg = max(neg_inds.numel(), 1) + self.avg_factor = self.num_pos + self.num_neg \ + if avg_factor_with_neg else self.num_pos + + self.pos_masks = masks[pos_inds] + self.neg_masks = masks[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_masks.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_masks.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_masks = torch.empty_like(gt_masks) + else: + self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] + + @property + def masks(self) -> Tensor: + """torch.Tensor: concatenated positive and negative masks.""" + return torch.cat([self.pos_masks, self.neg_masks]) + + def __nice__(self) -> str: + data = self.info.copy() + data['pos_masks'] = data.pop('pos_masks').shape + data['neg_masks'] = data.pop('neg_masks').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self) -> dict: + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_masks': self.pos_masks, + 'neg_masks': self.neg_masks, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + } diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..8b74054e3a11ed6025e98e90bd0addb131a1dc02 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/multi_instance_random_sampler.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmengine.structures import InstanceData +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .multi_instance_sampling_result import MultiInstanceSamplingResult +from .random_sampler import RandomSampler + + +@TASK_UTILS.register_module() +class MultiInsRandomSampler(RandomSampler): + """Random sampler for multi instance. + + Note: + Multi-instance means to predict multiple detection boxes with + one proposal box. `AssignResult` may assign multiple gt boxes + to each proposal box, in this case `RandomSampler` should be + replaced by `MultiInsRandomSampler` + """ + + def _sample_pos(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some positive samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. 
+ """ + pos_inds = torch.nonzero( + assign_result.labels[:, 0] > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some negative samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. + """ + neg_inds = torch.nonzero( + assign_result.labels[:, 0] == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + return self.random_choice(neg_inds, num_expected) + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, + **kwargs) -> MultiInstanceSamplingResult: + """Sample positive and negative bboxes. + + Args: + assign_result (:obj:`AssignResult`): Assigning results from + MultiInstanceAssigner. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + + Returns: + :obj:`MultiInstanceSamplingResult`: Sampling result. + """ + + assert 'batch_gt_instances_ignore' in kwargs, \ + 'batch_gt_instances_ignore is necessary for MultiInsRandomSampler' + + gt_bboxes = gt_instances.bboxes + ignore_bboxes = kwargs['batch_gt_instances_ignore'].bboxes + gt_and_ignore_bboxes = torch.cat([gt_bboxes, ignore_bboxes], dim=0) + priors = pred_instances.priors + if len(priors.shape) < 2: + priors = priors[None, :] + priors = priors[:, :4] + + gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8) + priors = torch.cat([priors, gt_and_ignore_bboxes], dim=0) + gt_ones = priors.new_ones( + gt_and_ignore_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_flags, gt_ones]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos(assign_result, + num_expected_pos) + # We found that sampled indices have duplicated items occasionally. 
+ # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg(assign_result, + num_expected_neg) + neg_inds = neg_inds.unique() + + sampling_result = MultiInstanceSamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_and_ignore_bboxes=gt_and_ignore_bboxes, + assign_result=assign_result, + gt_flags=gt_flags) + return sampling_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py new file mode 100644 index 0000000000000000000000000000000000000000..438a0aa91c0cc8904f6d8bba7139408dd99b98cf --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/multi_instance_sampling_result.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import Tensor + +from ..assigners import AssignResult +from .sampling_result import SamplingResult + + +class MultiInstanceSamplingResult(SamplingResult): + """Bbox sampling result. Further encapsulation of SamplingResult. Three + attributes neg_assigned_gt_inds, neg_gt_labels, and neg_gt_bboxes have been + added for SamplingResult. + + Args: + pos_inds (Tensor): Indices of positive samples. + neg_inds (Tensor): Indices of negative samples. + priors (Tensor): The priors can be anchors or points, + or the bboxes predicted by the previous stage. + gt_and_ignore_bboxes (Tensor): Ground truth and ignore bboxes. + assign_result (:obj:`AssignResult`): Assigning results. + gt_flags (Tensor): The Ground truth flags. + avg_factor_with_neg (bool): If True, ``avg_factor`` equal to + the number of total priors; Otherwise, it is the number of + positive priors. Defaults to True. + """ + + def __init__(self, + pos_inds: Tensor, + neg_inds: Tensor, + priors: Tensor, + gt_and_ignore_bboxes: Tensor, + assign_result: AssignResult, + gt_flags: Tensor, + avg_factor_with_neg: bool = True) -> None: + self.neg_assigned_gt_inds = assign_result.gt_inds[neg_inds] + self.neg_gt_labels = assign_result.labels[neg_inds] + + if gt_and_ignore_bboxes.numel() == 0: + self.neg_gt_bboxes = torch.empty_like(gt_and_ignore_bboxes).view( + -1, 4) + else: + if len(gt_and_ignore_bboxes.shape) < 2: + gt_and_ignore_bboxes = gt_and_ignore_bboxes.view(-1, 4) + self.neg_gt_bboxes = gt_and_ignore_bboxes[ + self.neg_assigned_gt_inds.long(), :] + + # To resist the minus 1 operation in `SamplingResult.init()`. + assign_result.gt_inds += 1 + super().__init__( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_and_ignore_bboxes, + assign_result=assign_result, + gt_flags=gt_flags, + avg_factor_with_neg=avg_factor_with_neg) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/ohem_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/ohem_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..f478a448cde00d64caeba1d0ba613d2497a7fb12 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/ohem_sampler.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
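+# The gist of OHEM: score every candidate box with the current bbox head,
+# compute its classification loss without reduction, and keep the
+# ``num_expected`` candidates with the highest loss (see ``hard_mining``
+# below).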
+import torch
+
+from mmdet.registry import TASK_UTILS
+from mmdet.structures.bbox import bbox2roi
+from .base_sampler import BaseSampler
+
+
+@TASK_UTILS.register_module()
+class OHEMSampler(BaseSampler):
+    r"""Online Hard Example Mining Sampler described in `Training Region-based
+    Object Detectors with Online Hard Example Mining
+    <https://arxiv.org/abs/1604.03540>`_.
+    """
+
+    def __init__(self,
+                 num,
+                 pos_fraction,
+                 context,
+                 neg_pos_ub=-1,
+                 add_gt_as_proposals=True,
+                 loss_key='loss_cls',
+                 **kwargs):
+        super(OHEMSampler, self).__init__(num, pos_fraction, neg_pos_ub,
+                                          add_gt_as_proposals)
+        self.context = context
+        if not hasattr(self.context, 'num_stages'):
+            self.bbox_head = self.context.bbox_head
+        else:
+            self.bbox_head = self.context.bbox_head[self.context.current_stage]
+
+        self.loss_key = loss_key
+
+    def hard_mining(self, inds, num_expected, bboxes, labels, feats):
+        with torch.no_grad():
+            rois = bbox2roi([bboxes])
+            if not hasattr(self.context, 'num_stages'):
+                bbox_results = self.context._bbox_forward(feats, rois)
+            else:
+                bbox_results = self.context._bbox_forward(
+                    self.context.current_stage, feats, rois)
+            cls_score = bbox_results['cls_score']
+            loss = self.bbox_head.loss(
+                cls_score=cls_score,
+                bbox_pred=None,
+                rois=rois,
+                labels=labels,
+                label_weights=cls_score.new_ones(cls_score.size(0)),
+                bbox_targets=None,
+                bbox_weights=None,
+                reduction_override='none')[self.loss_key]
+            _, topk_loss_inds = loss.topk(num_expected)
+        return inds[topk_loss_inds]
+
+    def _sample_pos(self,
+                    assign_result,
+                    num_expected,
+                    bboxes=None,
+                    feats=None,
+                    **kwargs):
+        """Sample positive boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            num_expected (int): Number of expected positive samples
+            bboxes (torch.Tensor, optional): Boxes. Defaults to None.
+            feats (list[torch.Tensor], optional): Multi-level features.
+                Defaults to None.
+
+        Returns:
+            torch.Tensor: Indices of positive samples
+        """
+        # Sample some hard positive samples
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.hard_mining(pos_inds, num_expected, bboxes[pos_inds],
+                                    assign_result.labels[pos_inds], feats)
+
+    def _sample_neg(self,
+                    assign_result,
+                    num_expected,
+                    bboxes=None,
+                    feats=None,
+                    **kwargs):
+        """Sample negative boxes.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Assigned results
+            num_expected (int): Number of expected negative samples
+            bboxes (torch.Tensor, optional): Boxes. Defaults to None.
+            feats (list[torch.Tensor], optional): Multi-level features.
+                Defaults to None.
+ + Returns: + torch.Tensor: Indices of negative samples + """ + # Sample some hard negative samples + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + else: + neg_labels = assign_result.labels.new_empty( + neg_inds.size(0)).fill_(self.bbox_head.num_classes) + return self.hard_mining(neg_inds, num_expected, bboxes[neg_inds], + neg_labels, feats) diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/pseudo_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/pseudo_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..a8186cc3364516f34abe1c293017db6e2042d92a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/pseudo_sampler.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +@TASK_UTILS.register_module() +class PseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, *args, **kwargs): + """Directly returns the positive and negative indices of samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. It includes ``priors``, and the priors can + be anchors, points, or bboxes predicted by the model, + shape(n, 4). + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes`` and ``labels`` + attributes. + + Returns: + :obj:`SamplingResult`: sampler results + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + + gt_flags = priors.new_zeros(priors.shape[0], dtype=torch.uint8) + sampling_result = SamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_bboxes, + assign_result=assign_result, + gt_flags=gt_flags, + avg_factor_with_neg=False) + return sampling_result diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/random_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/random_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..fa03665fc36cc6a0084431324b16727b2dc8993e --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/random_sampler.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from ..assigners import AssignResult +from .base_sampler import BaseSampler + + +@TASK_UTILS.register_module() +class RandomSampler(BaseSampler): + """Random sampler. 
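+
+    Note:
+        An optional ``rng`` keyword argument is accepted and stored via
+        ``ensure_rng`` (see ``__init__`` below); note, however, that
+        ``random_choice`` draws indices with ``torch.randperm``, so seeding
+        ``rng`` alone does not make the torch-side sampling deterministic.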
+
+    Args:
+        num (int): Number of samples
+        pos_fraction (float): Fraction of positive samples
+        neg_pos_ub (int): Upper bound number of negative and
+            positive samples. Defaults to -1.
+        add_gt_as_proposals (bool): Whether to add ground truth
+            boxes as proposals. Defaults to True.
+    """
+
+    def __init__(self,
+                 num: int,
+                 pos_fraction: float,
+                 neg_pos_ub: int = -1,
+                 add_gt_as_proposals: bool = True,
+                 **kwargs):
+        from .sampling_result import ensure_rng
+        super().__init__(
+            num=num,
+            pos_fraction=pos_fraction,
+            neg_pos_ub=neg_pos_ub,
+            add_gt_as_proposals=add_gt_as_proposals)
+        self.rng = ensure_rng(kwargs.get('rng', None))
+
+    def random_choice(self, gallery: Union[Tensor, ndarray, list],
+                      num: int) -> Union[Tensor, ndarray]:
+        """Randomly select some elements from the gallery.
+
+        If `gallery` is a Tensor, the returned indices will be a Tensor;
+        If `gallery` is a ndarray or list, the returned indices will be a
+        ndarray.
+
+        Args:
+            gallery (Tensor | ndarray | list): indices pool.
+            num (int): expected sample num.
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        assert len(gallery) >= num
+
+        is_tensor = isinstance(gallery, torch.Tensor)
+        if not is_tensor:
+            if torch.cuda.is_available():
+                device = torch.cuda.current_device()
+            else:
+                device = 'cpu'
+            gallery = torch.tensor(gallery, dtype=torch.long, device=device)
+        # This is a temporary fix. We can revert the following code
+        # when PyTorch fixes the abnormal return of torch.randperm.
+        # See: https://github.com/open-mmlab/mmdetection/pull/5014
+        perm = torch.randperm(gallery.numel())[:num].to(device=gallery.device)
+        rand_inds = gallery[perm]
+        if not is_tensor:
+            rand_inds = rand_inds.cpu().numpy()
+        return rand_inds
+
+    def _sample_pos(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some positive samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            num_expected (int): The number of expected positive samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
+        if pos_inds.numel() != 0:
+            pos_inds = pos_inds.squeeze(1)
+        if pos_inds.numel() <= num_expected:
+            return pos_inds
+        else:
+            return self.random_choice(pos_inds, num_expected)
+
+    def _sample_neg(self, assign_result: AssignResult, num_expected: int,
+                    **kwargs) -> Union[Tensor, ndarray]:
+        """Randomly sample some negative samples.
+
+        Args:
+            assign_result (:obj:`AssignResult`): Bbox assigning results.
+            num_expected (int): The number of expected negative samples
+
+        Returns:
+            Tensor or ndarray: sampled indices.
+        """
+        neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
+        if neg_inds.numel() != 0:
+            neg_inds = neg_inds.squeeze(1)
+        if len(neg_inds) <= num_expected:
+            return neg_inds
+        else:
+            return self.random_choice(neg_inds, num_expected)
diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/sampling_result.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/sampling_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb510ee68f24b8c444b6ed447016bfc785b825c2
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/sampling_result.py
@@ -0,0 +1,240 @@
+# Copyright (c) OpenMMLab. All rights reserved.
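+# This module provides ``random_boxes`` (a small stand-in for
+# ``kwimage.Boxes.random``) and ``SamplingResult``, the container that the
+# samplers in this package return from ``sample()``.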
+import warnings + +import numpy as np +import torch +from torch import Tensor + +from mmdet.structures.bbox import BaseBoxes, cat_boxes +from mmdet.utils import util_mixins +from mmdet.utils.util_random import ensure_rng +from ..assigners import AssignResult + + +def random_boxes(num=1, scale=1, rng=None): + """Simple version of ``kwimage.Boxes.random`` + + Returns: + Tensor: shape (n, 4) in x1, y1, x2, y2 format. + + References: + https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390 + + Example: + >>> num = 3 + >>> scale = 512 + >>> rng = 0 + >>> boxes = random_boxes(num, scale, rng) + >>> print(boxes) + tensor([[280.9925, 278.9802, 308.6148, 366.1769], + [216.9113, 330.6978, 224.0446, 456.5878], + [405.3632, 196.3221, 493.3953, 270.7942]]) + """ + rng = ensure_rng(rng) + + tlbr = rng.rand(num, 4).astype(np.float32) + + tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2]) + tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3]) + br_x = np.maximum(tlbr[:, 0], tlbr[:, 2]) + br_y = np.maximum(tlbr[:, 1], tlbr[:, 3]) + + tlbr[:, 0] = tl_x * scale + tlbr[:, 1] = tl_y * scale + tlbr[:, 2] = br_x * scale + tlbr[:, 3] = br_y * scale + + boxes = torch.from_numpy(tlbr) + return boxes + + +class SamplingResult(util_mixins.NiceRepr): + """Bbox sampling result. + + Args: + pos_inds (Tensor): Indices of positive samples. + neg_inds (Tensor): Indices of negative samples. + priors (Tensor): The priors can be anchors or points, + or the bboxes predicted by the previous stage. + gt_bboxes (Tensor): Ground truth of bboxes. + assign_result (:obj:`AssignResult`): Assigning results. + gt_flags (Tensor): The Ground truth flags. + avg_factor_with_neg (bool): If True, ``avg_factor`` equal to + the number of total priors; Otherwise, it is the number of + positive priors. Defaults to True. 
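+
+    Note:
+        ``num_pos`` and ``num_neg`` are clamped to at least 1 in
+        ``__init__``, so the resulting ``avg_factor`` is never zero.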
+ + Example: + >>> # xdoctest: +IGNORE_WANT + >>> from mmdet.models.task_modules.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random(rng=10) + >>> print(f'self = {self}') + self = + """ + + def __init__(self, + pos_inds: Tensor, + neg_inds: Tensor, + priors: Tensor, + gt_bboxes: Tensor, + assign_result: AssignResult, + gt_flags: Tensor, + avg_factor_with_neg: bool = True) -> None: + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.num_pos = max(pos_inds.numel(), 1) + self.num_neg = max(neg_inds.numel(), 1) + self.avg_factor_with_neg = avg_factor_with_neg + self.avg_factor = self.num_pos + self.num_neg \ + if avg_factor_with_neg else self.num_pos + self.pos_priors = priors[pos_inds] + self.neg_priors = priors[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + self.pos_gt_labels = assign_result.labels[pos_inds] + box_dim = gt_bboxes.box_dim if isinstance(gt_bboxes, BaseBoxes) else 4 + if gt_bboxes.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = gt_bboxes.view(-1, box_dim) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, box_dim) + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long()] + + @property + def priors(self): + """torch.Tensor: concatenated positive and negative priors""" + return cat_boxes([self.pos_priors, self.neg_priors]) + + @property + def bboxes(self): + """torch.Tensor: concatenated positive and negative boxes""" + warnings.warn('DeprecationWarning: bboxes is deprecated, ' + 'please use "priors" instead') + return self.priors + + @property + def pos_bboxes(self): + warnings.warn('DeprecationWarning: pos_bboxes is deprecated, ' + 'please use "pos_priors" instead') + return self.pos_priors + + @property + def neg_bboxes(self): + warnings.warn('DeprecationWarning: neg_bboxes is deprecated, ' + 'please use "neg_priors" instead') + return self.neg_priors + + def to(self, device): + """Change the device of the data inplace. + + Example: + >>> self = SamplingResult.random() + >>> print(f'self = {self.to(None)}') + >>> # xdoctest: +REQUIRES(--gpu) + >>> print(f'self = {self.to(0)}') + """ + _dict = self.__dict__ + for key, value in _dict.items(): + if isinstance(value, (torch.Tensor, BaseBoxes)): + _dict[key] = value.to(device) + return self + + def __nice__(self): + data = self.info.copy() + data['pos_priors'] = data.pop('pos_priors').shape + data['neg_priors'] = data.pop('neg_priors').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_priors': self.pos_priors, + 'neg_priors': self.neg_priors, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + 'num_pos': self.num_pos, + 'num_neg': self.num_neg, + 'avg_factor': self.avg_factor + } + + @classmethod + def random(cls, rng=None, **kwargs): + """ + Args: + rng (None | int | numpy.random.RandomState): seed or state. + kwargs (keyword arguments): + - num_preds: Number of predicted boxes. + - num_gts: Number of true boxes. + - p_ignore (float): Probability of a predicted box assigned to + an ignored truth. + - p_assigned (float): probability of a predicted box not being + assigned. 
+ + Returns: + :obj:`SamplingResult`: Randomly generated sampling result. + + Example: + >>> from mmdet.models.task_modules.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random() + >>> print(self.__dict__) + """ + from mmengine.structures import InstanceData + + from mmdet.models.task_modules.assigners import AssignResult + from mmdet.models.task_modules.samplers import RandomSampler + rng = ensure_rng(rng) + + # make probabilistic? + num = 32 + pos_fraction = 0.5 + neg_pos_ub = -1 + + assign_result = AssignResult.random(rng=rng, **kwargs) + + # Note we could just compute an assignment + priors = random_boxes(assign_result.num_preds, rng=rng) + gt_bboxes = random_boxes(assign_result.num_gts, rng=rng) + gt_labels = torch.randint( + 0, 5, (assign_result.num_gts, ), dtype=torch.long) + + pred_instances = InstanceData() + pred_instances.priors = priors + + gt_instances = InstanceData() + gt_instances.bboxes = gt_bboxes + gt_instances.labels = gt_labels + + add_gt_as_proposals = True + + sampler = RandomSampler( + num, + pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals, + rng=rng) + self = sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + return self diff --git a/head_extractor/build/lib/mmdet/models/task_modules/samplers/score_hlr_sampler.py b/head_extractor/build/lib/mmdet/models/task_modules/samplers/score_hlr_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..0227585b92329625d053f1e9f8c161fd02af8aef --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/samplers/score_hlr_sampler.py @@ -0,0 +1,290 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmcv.ops import nms_match +from mmengine.structures import InstanceData +from numpy import ndarray +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox2roi +from ..assigners import AssignResult +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +@TASK_UTILS.register_module() +class ScoreHLRSampler(BaseSampler): + r"""Importance-based Sample Reweighting (ISR_N), described in `Prime Sample + Attention in Object Detection `_. + + Score hierarchical local rank (HLR) differentiates with RandomSampler in + negative part. It firstly computes Score-HLR in a two-step way, + then linearly maps score hlr to the loss weights. + + Args: + num (int): Total number of sampled RoIs. + pos_fraction (float): Fraction of positive samples. + context (:obj:`BaseRoIHead`): RoI head that the sampler belongs to. + neg_pos_ub (int): Upper bound of the ratio of num negative to num + positive, -1 means no upper bound. Defaults to -1. + add_gt_as_proposals (bool): Whether to add ground truth as proposals. + Defaults to True. + k (float): Power of the non-linear mapping. Defaults to 0.5 + bias (float): Shift of the non-linear mapping. Defaults to 0. + score_thr (float): Minimum score that a negative sample is to be + considered as valid bbox. Defaults to 0.05. + iou_thr (float): IoU threshold for NMS match. Defaults to 0.5. 
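+
+    Note:
+        The label weight of each kept negative is computed as
+        ``(bias + (1 - bias) * w) ** k``, where ``w`` is linearly mapped
+        from the Score-HLR rank, and is then rescaled so the total
+        selected loss stays unchanged (see ``_sample_neg``).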
+ """ + + def __init__(self, + num: int, + pos_fraction: float, + context, + neg_pos_ub: int = -1, + add_gt_as_proposals: bool = True, + k: float = 0.5, + bias: float = 0, + score_thr: float = 0.05, + iou_thr: float = 0.5, + **kwargs) -> None: + super().__init__( + num=num, + pos_fraction=pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals) + self.k = k + self.bias = bias + self.score_thr = score_thr + self.iou_thr = iou_thr + self.context = context + # context of cascade detectors is a list, so distinguish them here. + if not hasattr(context, 'num_stages'): + self.bbox_roi_extractor = context.bbox_roi_extractor + self.bbox_head = context.bbox_head + self.with_shared_head = context.with_shared_head + if self.with_shared_head: + self.shared_head = context.shared_head + else: + self.bbox_roi_extractor = context.bbox_roi_extractor[ + context.current_stage] + self.bbox_head = context.bbox_head[context.current_stage] + + @staticmethod + def random_choice(gallery: Union[Tensor, ndarray, list], + num: int) -> Union[Tensor, ndarray]: + """Randomly select some elements from the gallery. + + If `gallery` is a Tensor, the returned indices will be a Tensor; + If `gallery` is a ndarray or list, the returned indices will be a + ndarray. + + Args: + gallery (Tensor or ndarray or list): indices pool. + num (int): expected sample num. + + Returns: + Tensor or ndarray: sampled indices. + """ + assert len(gallery) >= num + + is_tensor = isinstance(gallery, torch.Tensor) + if not is_tensor: + if torch.cuda.is_available(): + device = torch.cuda.current_device() + else: + device = 'cpu' + gallery = torch.tensor(gallery, dtype=torch.long, device=device) + perm = torch.randperm(gallery.numel(), device=gallery.device)[:num] + rand_inds = gallery[perm] + if not is_tensor: + rand_inds = rand_inds.cpu().numpy() + return rand_inds + + def _sample_pos(self, assign_result: AssignResult, num_expected: int, + **kwargs) -> Union[Tensor, ndarray]: + """Randomly sample some positive samples. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + num_expected (int): The number of expected positive samples + + Returns: + Tensor or ndarray: sampled indices. + """ + pos_inds = torch.nonzero(assign_result.gt_inds > 0).flatten() + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result: AssignResult, num_expected: int, + bboxes: Tensor, feats: Tensor, + **kwargs) -> Union[Tensor, ndarray]: + """Sample negative samples. + + Score-HLR sampler is done in the following steps: + 1. Take the maximum positive score prediction of each negative samples + as s_i. + 2. Filter out negative samples whose s_i <= score_thr, the left samples + are called valid samples. + 3. Use NMS-Match to divide valid samples into different groups, + samples in the same group will greatly overlap with each other + 4. Rank the matched samples in two-steps to get Score-HLR. + (1) In the same group, rank samples with their scores. + (2) In the same score rank across different groups, + rank samples with their scores again. + 5. Linearly map Score-HLR to the final label weights. + + Args: + assign_result (:obj:`AssignResult`): result of assigner. + num_expected (int): Expected number of samples. + bboxes (Tensor): bbox to be sampled. + feats (Tensor): Features come from FPN. + + Returns: + Tensor or ndarray: sampled indices. 
+ """ + neg_inds = torch.nonzero(assign_result.gt_inds == 0).flatten() + num_neg = neg_inds.size(0) + if num_neg == 0: + return neg_inds, None + with torch.no_grad(): + neg_bboxes = bboxes[neg_inds] + neg_rois = bbox2roi([neg_bboxes]) + bbox_result = self.context._bbox_forward(feats, neg_rois) + cls_score, bbox_pred = bbox_result['cls_score'], bbox_result[ + 'bbox_pred'] + + ori_loss = self.bbox_head.loss( + cls_score=cls_score, + bbox_pred=None, + rois=None, + labels=neg_inds.new_full((num_neg, ), + self.bbox_head.num_classes), + label_weights=cls_score.new_ones(num_neg), + bbox_targets=None, + bbox_weights=None, + reduction_override='none')['loss_cls'] + + # filter out samples with the max score lower than score_thr + max_score, argmax_score = cls_score.softmax(-1)[:, :-1].max(-1) + valid_inds = (max_score > self.score_thr).nonzero().view(-1) + invalid_inds = (max_score <= self.score_thr).nonzero().view(-1) + num_valid = valid_inds.size(0) + num_invalid = invalid_inds.size(0) + + num_expected = min(num_neg, num_expected) + num_hlr = min(num_valid, num_expected) + num_rand = num_expected - num_hlr + if num_valid > 0: + valid_rois = neg_rois[valid_inds] + valid_max_score = max_score[valid_inds] + valid_argmax_score = argmax_score[valid_inds] + valid_bbox_pred = bbox_pred[valid_inds] + + # valid_bbox_pred shape: [num_valid, #num_classes, 4] + valid_bbox_pred = valid_bbox_pred.view( + valid_bbox_pred.size(0), -1, 4) + selected_bbox_pred = valid_bbox_pred[range(num_valid), + valid_argmax_score] + pred_bboxes = self.bbox_head.bbox_coder.decode( + valid_rois[:, 1:], selected_bbox_pred) + pred_bboxes_with_score = torch.cat( + [pred_bboxes, valid_max_score[:, None]], -1) + group = nms_match(pred_bboxes_with_score, self.iou_thr) + + # imp: importance + imp = cls_score.new_zeros(num_valid) + for g in group: + g_score = valid_max_score[g] + # g_score has already sorted + rank = g_score.new_tensor(range(g_score.size(0))) + imp[g] = num_valid - rank + g_score + _, imp_rank_inds = imp.sort(descending=True) + _, imp_rank = imp_rank_inds.sort() + hlr_inds = imp_rank_inds[:num_expected] + + if num_rand > 0: + rand_inds = torch.randperm(num_invalid)[:num_rand] + select_inds = torch.cat( + [valid_inds[hlr_inds], invalid_inds[rand_inds]]) + else: + select_inds = valid_inds[hlr_inds] + + neg_label_weights = cls_score.new_ones(num_expected) + + up_bound = max(num_expected, num_valid) + imp_weights = (up_bound - + imp_rank[hlr_inds].float()) / up_bound + neg_label_weights[:num_hlr] = imp_weights + neg_label_weights[num_hlr:] = imp_weights.min() + neg_label_weights = (self.bias + + (1 - self.bias) * neg_label_weights).pow( + self.k) + ori_selected_loss = ori_loss[select_inds] + new_loss = ori_selected_loss * neg_label_weights + norm_ratio = ori_selected_loss.sum() / new_loss.sum() + neg_label_weights *= norm_ratio + else: + neg_label_weights = cls_score.new_ones(num_expected) + select_inds = torch.randperm(num_neg)[:num_expected] + + return neg_inds[select_inds], neg_label_weights + + def sample(self, assign_result: AssignResult, pred_instances: InstanceData, + gt_instances: InstanceData, **kwargs) -> SamplingResult: + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Assigning results. + pred_instances (:obj:`InstanceData`): Instances of model + predictions. 
It includes ``priors``, and the priors can + be anchors or points, or the bboxes predicted by the + previous stage, has shape (n, 4). The bboxes predicted by + the current model or stage will be named ``bboxes``, + ``labels``, and ``scores``, the same as the ``InstanceData`` + in other places. + gt_instances (:obj:`InstanceData`): Ground truth of instance + annotations. It usually includes ``bboxes``, with shape (k, 4), + and ``labels``, with shape (k, ). + + Returns: + :obj:`SamplingResult`: Sampling result. + """ + gt_bboxes = gt_instances.bboxes + priors = pred_instances.priors + gt_labels = gt_instances.labels + + gt_flags = priors.new_zeros((priors.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + priors = torch.cat([gt_bboxes, priors], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = priors.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=priors, **kwargs) + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds, neg_label_weights = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=priors, **kwargs) + + sampling_result = SamplingResult( + pos_inds=pos_inds, + neg_inds=neg_inds, + priors=priors, + gt_bboxes=gt_bboxes, + assign_result=assign_result, + gt_flags=gt_flags) + return sampling_result, neg_label_weights diff --git a/head_extractor/build/lib/mmdet/models/task_modules/tracking/__init__.py b/head_extractor/build/lib/mmdet/models/task_modules/tracking/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..57a86d739d586e47e007d26de4542d6bdeced755 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/tracking/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .aflink import AppearanceFreeLink +from .camera_motion_compensation import CameraMotionCompensation +from .interpolation import InterpolateTracklets +from .kalman_filter import KalmanFilter +from .similarity import embed_similarity + +__all__ = [ + 'KalmanFilter', 'InterpolateTracklets', 'embed_similarity', + 'AppearanceFreeLink', 'CameraMotionCompensation' +] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/tracking/aflink.py b/head_extractor/build/lib/mmdet/models/task_modules/tracking/aflink.py new file mode 100644 index 0000000000000000000000000000000000000000..52461067e372b30bbd28325ead00f5381c546326 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/tracking/aflink.py @@ -0,0 +1,281 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import defaultdict +from typing import Tuple + +import numpy as np +import torch +from mmengine.model import BaseModule +from mmengine.runner.checkpoint import load_checkpoint +from scipy.optimize import linear_sum_assignment +from torch import Tensor, nn + +from mmdet.registry import TASK_UTILS + +INFINITY = 1e5 + + +class TemporalBlock(BaseModule): + """The temporal block of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. + out_channel (int): the dimension of the output channels. 
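+        kernel_size (tuple): the kernel size of the convolution layer.
+            Defaults to (7, 1).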
+ """ + + def __init__(self, + in_channel: int, + out_channel: int, + kernel_size: tuple = (7, 1)): + super(TemporalBlock, self).__init__() + self.conv = nn.Conv2d(in_channel, out_channel, kernel_size, bias=False) + self.relu = nn.ReLU(inplace=True) + self.bnf = nn.BatchNorm1d(out_channel) + self.bnx = nn.BatchNorm1d(out_channel) + self.bny = nn.BatchNorm1d(out_channel) + + def bn(self, x: Tensor) -> Tensor: + x[:, :, :, 0] = self.bnf(x[:, :, :, 0]) + x[:, :, :, 1] = self.bnx(x[:, :, :, 1]) + x[:, :, :, 2] = self.bny(x[:, :, :, 2]) + return x + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class FusionBlock(BaseModule): + """The fusion block of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. + out_channel (int): the dimension of the output channels. + """ + + def __init__(self, in_channel: int, out_channel: int): + super(FusionBlock, self).__init__() + self.conv = nn.Conv2d(in_channel, out_channel, (1, 3), bias=False) + self.bn = nn.BatchNorm2d(out_channel) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x: Tensor) -> Tensor: + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class Classifier(BaseModule): + """The classifier of AFLink model. + + Args: + in_channel (int): the dimension of the input channels. + """ + + def __init__(self, in_channel: int, out_channel: int): + super(Classifier, self).__init__() + self.fc1 = nn.Linear(in_channel * 2, in_channel // 2) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Linear(in_channel // 2, out_channel) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + x = torch.cat((x1, x2), dim=1) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + return x + + +class AFLinkModel(BaseModule): + """Appearance-Free Link Model.""" + + def __init__(self, + temporal_module_channels: list = [1, 32, 64, 128, 256], + fusion_module_channels: list = [256, 256], + classifier_channels: list = [256, 2]): + super(AFLinkModel, self).__init__() + self.TemporalModule_1 = nn.Sequential(*[ + TemporalBlock(temporal_module_channels[i], + temporal_module_channels[i + 1]) + for i in range(len(temporal_module_channels) - 1) + ]) + + self.TemporalModule_2 = nn.Sequential(*[ + TemporalBlock(temporal_module_channels[i], + temporal_module_channels[i + 1]) + for i in range(len(temporal_module_channels) - 1) + ]) + + self.FusionBlock_1 = FusionBlock(*fusion_module_channels) + self.FusionBlock_2 = FusionBlock(*fusion_module_channels) + + self.pooling = nn.AdaptiveAvgPool2d((1, 1)) + self.classifier = Classifier(*classifier_channels) + + def forward(self, x1: Tensor, x2: Tensor) -> Tensor: + assert not self.training, 'Only testing is supported for AFLink.' + x1 = x1[:, :, :, :3] + x2 = x2[:, :, :, :3] + x1 = self.TemporalModule_1(x1) # [B,1,30,3] -> [B,256,6,3] + x2 = self.TemporalModule_2(x2) + x1 = self.FusionBlock_1(x1) + x2 = self.FusionBlock_2(x2) + x1 = self.pooling(x1).squeeze(-1).squeeze(-1) + x2 = self.pooling(x2).squeeze(-1).squeeze(-1) + y = self.classifier(x1, x2) + y = torch.softmax(y, dim=1)[0, 1] + return y + + +@TASK_UTILS.register_module() +class AppearanceFreeLink(BaseModule): + """Appearance-Free Link method. + + This method is proposed in + "StrongSORT: Make DeepSORT Great Again" + `StrongSORT`_. + + Args: + checkpoint (str): Checkpoint path. + temporal_threshold (tuple, optional): The temporal constraint + for tracklets association. Defaults to (0, 30). 
+ spatial_threshold (int, optional): The spatial constraint for + tracklets association. Defaults to 75. + confidence_threshold (float, optional): The minimum confidence + threshold for tracklets association. Defaults to 0.95. + """ + + def __init__(self, + checkpoint: str, + temporal_threshold: tuple = (0, 30), + spatial_threshold: int = 75, + confidence_threshold: float = 0.95): + super(AppearanceFreeLink, self).__init__() + self.temporal_threshold = temporal_threshold + self.spatial_threshold = spatial_threshold + self.confidence_threshold = confidence_threshold + + self.model = AFLinkModel() + if checkpoint: + load_checkpoint(self.model, checkpoint) + if torch.cuda.is_available(): + self.model.cuda() + self.model.eval() + + self.device = next(self.model.parameters()).device + self.fn_l2 = lambda x, y: np.sqrt(x**2 + y**2) + + def data_transform(self, + track1: np.ndarray, + track2: np.ndarray, + length: int = 30) -> Tuple[np.ndarray]: + """Data Transformation. This is used to standardize the length of + tracks to a unified length. Then perform min-max normalization to the + motion embeddings. + + Args: + track1 (ndarray): the first track with shape (N,C). + track2 (ndarray): the second track with shape (M,C). + length (int): the unified length of tracks. Defaults to 30. + + Returns: + Tuple[ndarray]: the transformed track1 and track2. + """ + # fill or cut track1 + length_1 = track1.shape[0] + track1 = track1[-length:] if length_1 >= length else \ + np.pad(track1, ((length - length_1, 0), (0, 0))) + + # fill or cut track1 + length_2 = track2.shape[0] + track2 = track2[:length] if length_2 >= length else \ + np.pad(track2, ((0, length - length_2), (0, 0))) + + # min-max normalization + min_ = np.concatenate((track1, track2), axis=0).min(axis=0) + max_ = np.concatenate((track1, track2), axis=0).max(axis=0) + subtractor = (max_ + min_) / 2 + divisor = (max_ - min_) / 2 + 1e-5 + track1 = (track1 - subtractor) / divisor + track2 = (track2 - subtractor) / divisor + + return track1, track2 + + def forward(self, pred_tracks: np.ndarray) -> np.ndarray: + """Forward function. + + pred_tracks (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + + Returns: + ndarray: The linked tracks with shape (N, 7). 
Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score) + """ + # sort tracks by the frame id + pred_tracks = pred_tracks[np.argsort(pred_tracks[:, 0])] + + # gather tracks information + id2info = defaultdict(list) + for row in pred_tracks: + frame_id, track_id, x1, y1, x2, y2 = row[:6] + id2info[track_id].append([frame_id, x1, y1, x2 - x1, y2 - y1]) + id2info = {k: np.array(v) for k, v in id2info.items()} + num_track = len(id2info) + track_ids = np.array(list(id2info)) + cost_matrix = np.full((num_track, num_track), INFINITY) + + # compute the cost matrix + for i, id_i in enumerate(track_ids): + for j, id_j in enumerate(track_ids): + if id_i == id_j: + continue + info_i, info_j = id2info[id_i], id2info[id_j] + frame_i, box_i = info_i[-1][0], info_i[-1][1:3] + frame_j, box_j = info_j[0][0], info_j[0][1:3] + # temporal constraint + if not self.temporal_threshold[0] <= \ + frame_j - frame_i <= self.temporal_threshold[1]: + continue + # spatial constraint + if self.fn_l2(box_i[0] - box_j[0], box_i[1] - box_j[1]) \ + > self.spatial_threshold: + continue + # confidence constraint + track_i, track_j = self.data_transform(info_i, info_j) + + # numpy to torch + track_i = torch.tensor( + track_i, dtype=torch.float).to(self.device) + track_j = torch.tensor( + track_j, dtype=torch.float).to(self.device) + track_i = track_i.unsqueeze(0).unsqueeze(0) + track_j = track_j.unsqueeze(0).unsqueeze(0) + + confidence = self.model(track_i, + track_j).detach().cpu().numpy() + if confidence >= self.confidence_threshold: + cost_matrix[i, j] = 1 - confidence + + # linear assignment + indices = linear_sum_assignment(cost_matrix) + _id2id = dict() # the temporary assignment results + id2id = dict() # the final assignment results + for i, j in zip(indices[0], indices[1]): + if cost_matrix[i, j] < INFINITY: + _id2id[i] = j + for k, v in _id2id.items(): + if k in id2id: + id2id[v] = id2id[k] + else: + id2id[v] = k + + # link + for k, v in id2id.items(): + pred_tracks[pred_tracks[:, 1] == k, 1] = v + + # deduplicate + _, index = np.unique(pred_tracks[:, :2], return_index=True, axis=0) + + return pred_tracks[index] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/tracking/camera_motion_compensation.py b/head_extractor/build/lib/mmdet/models/task_modules/tracking/camera_motion_compensation.py new file mode 100644 index 0000000000000000000000000000000000000000..1a6298494fd1c24e0e7bba457dd50864725f98c8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/tracking/camera_motion_compensation.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np +import torch +from torch import Tensor + +from mmdet.registry import TASK_UTILS +from mmdet.structures.bbox import bbox_cxcyah_to_xyxy, bbox_xyxy_to_cxcyah + + +@TASK_UTILS.register_module() +class CameraMotionCompensation: + """Camera motion compensation. + + Args: + warp_mode (str): Warp mode in opencv. + Defaults to 'cv2.MOTION_EUCLIDEAN'. + num_iters (int): Number of the iterations. Defaults to 50. + stop_eps (float): Terminate threshold. Defaults to 0.001. 
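+
+    Note:
+        ``warp_mode`` is resolved with ``eval()``, so it must name a valid
+        OpenCV motion model, e.g. ``'cv2.MOTION_EUCLIDEAN'``.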
+ """ + + def __init__(self, + warp_mode: str = 'cv2.MOTION_EUCLIDEAN', + num_iters: int = 50, + stop_eps: float = 0.001): + self.warp_mode = eval(warp_mode) + self.num_iters = num_iters + self.stop_eps = stop_eps + + def get_warp_matrix(self, img: np.ndarray, ref_img: np.ndarray) -> Tensor: + """Calculate warping matrix between two images.""" + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + ref_img = cv2.cvtColor(ref_img, cv2.COLOR_BGR2GRAY) + + warp_matrix = np.eye(2, 3, dtype=np.float32) + criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, + self.num_iters, self.stop_eps) + cc, warp_matrix = cv2.findTransformECC(img, ref_img, warp_matrix, + self.warp_mode, criteria, None, + 1) + warp_matrix = torch.from_numpy(warp_matrix) + return warp_matrix + + def warp_bboxes(self, bboxes: Tensor, warp_matrix: Tensor) -> Tensor: + """Warp bounding boxes according to the warping matrix.""" + tl, br = bboxes[:, :2], bboxes[:, 2:] + tl = torch.cat((tl, torch.ones(tl.shape[0], 1).to(bboxes.device)), + dim=1) + br = torch.cat((br, torch.ones(tl.shape[0], 1).to(bboxes.device)), + dim=1) + trans_tl = torch.mm(warp_matrix, tl.t()).t() + trans_br = torch.mm(warp_matrix, br.t()).t() + trans_bboxes = torch.cat((trans_tl, trans_br), dim=1) + return trans_bboxes.to(bboxes.device) + + def warp_means(self, means: np.ndarray, warp_matrix: Tensor) -> np.ndarray: + """Warp track.mean according to the warping matrix.""" + cxcyah = torch.from_numpy(means[:, :4]).float() + xyxy = bbox_cxcyah_to_xyxy(cxcyah) + warped_xyxy = self.warp_bboxes(xyxy, warp_matrix) + warped_cxcyah = bbox_xyxy_to_cxcyah(warped_xyxy).numpy() + means[:, :4] = warped_cxcyah + return means + + def track(self, img: Tensor, ref_img: Tensor, tracks: dict, + num_samples: int, frame_id: int, metainfo: dict) -> dict: + """Tracking forward.""" + img = img.squeeze(0).cpu().numpy().transpose((1, 2, 0)) + ref_img = ref_img.squeeze(0).cpu().numpy().transpose((1, 2, 0)) + warp_matrix = self.get_warp_matrix(img, ref_img) + + # rescale the warp_matrix due to the `resize` in pipeline + scale_factor_h, scale_factor_w = metainfo['scale_factor'] + warp_matrix[0, 2] = warp_matrix[0, 2] / scale_factor_w + warp_matrix[1, 2] = warp_matrix[1, 2] / scale_factor_h + + bboxes = [] + num_bboxes = [] + means = [] + for k, v in tracks.items(): + if int(v['frame_ids'][-1]) < frame_id - 1: + _num = 1 + else: + _num = min(num_samples, len(v.bboxes)) + num_bboxes.append(_num) + bboxes.extend(v.bboxes[-_num:]) + if len(v.mean) > 0: + means.append(v.mean) + bboxes = torch.cat(bboxes, dim=0) + warped_bboxes = self.warp_bboxes(bboxes, warp_matrix.to(bboxes.device)) + + warped_bboxes = torch.split(warped_bboxes, num_bboxes) + for b, (k, v) in zip(warped_bboxes, tracks.items()): + _num = b.shape[0] + b = torch.split(b, [1] * _num) + tracks[k].bboxes[-_num:] = b + + if means: + means = np.asarray(means) + warped_means = self.warp_means(means, warp_matrix) + for m, (k, v) in zip(warped_means, tracks.items()): + tracks[k].mean = m + + return tracks diff --git a/head_extractor/build/lib/mmdet/models/task_modules/tracking/interpolation.py b/head_extractor/build/lib/mmdet/models/task_modules/tracking/interpolation.py new file mode 100644 index 0000000000000000000000000000000000000000..fb6a25af4f253e3ec6b9781831ff43c6bafe50e1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/tracking/interpolation.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np
+
+try:
+    from sklearn.gaussian_process import GaussianProcessRegressor as GPR
+    from sklearn.gaussian_process.kernels import RBF
+    HAS_SCIKIT_LEARN = True
+except ImportError:
+    HAS_SCIKIT_LEARN = False
+
+from mmdet.registry import TASK_UTILS
+
+
+@TASK_UTILS.register_module()
+class InterpolateTracklets:
+    """Interpolate tracks to make tracks more complete.
+
+    Args:
+        min_num_frames (int, optional): The minimum length of a track that
+            will be interpolated. Defaults to 5.
+        max_num_frames (int, optional): The maximum disconnected length in
+            a track. Defaults to 20.
+        use_gsi (bool, optional): Whether to use the GSI (Gaussian-smoothed
+            interpolation) method. Defaults to False.
+        smooth_tau (int, optional): smoothing parameter in GSI. Defaults to 10.
+    """
+
+    def __init__(self,
+                 min_num_frames: int = 5,
+                 max_num_frames: int = 20,
+                 use_gsi: bool = False,
+                 smooth_tau: int = 10):
+        if not HAS_SCIKIT_LEARN:
+            raise RuntimeError('scikit-learn is not installed, '
+                               'please install it by: pip install '
+                               'scikit-learn')
+        self.min_num_frames = min_num_frames
+        self.max_num_frames = max_num_frames
+        self.use_gsi = use_gsi
+        self.smooth_tau = smooth_tau
+
+    def _interpolate_track(self,
+                           track: np.ndarray,
+                           track_id: int,
+                           max_num_frames: int = 20) -> np.ndarray:
+        """Interpolate a track linearly to make the track more complete.
+
+        This function is proposed in
+        "ByteTrack: Multi-Object Tracking by Associating Every Detection Box."
+        `ByteTrack <https://arxiv.org/abs/2110.06864>`_.
+
+        Args:
+            track (ndarray): With shape (N, 7). Each row denotes
+                (frame_id, track_id, x1, y1, x2, y2, score).
+            track_id (int): The id of the track to be interpolated.
+            max_num_frames (int, optional): The maximum disconnected length in
+                the track. Defaults to 20.
+
+        Returns:
+            ndarray: The interpolated track with shape (N, 7). Each row
+                denotes (frame_id, track_id, x1, y1, x2, y2, score)
+        """
+        assert (track[:, 1] == track_id).all(), \
+            'The track id should not change when interpolating a track.'
+
+        frame_ids = track[:, 0]
+        interpolated_track = np.zeros((0, 7))
+        # perform interpolation for the disconnected frames in the track.
+        for i in np.where(np.diff(frame_ids) > 1)[0]:
+            left_frame_id = frame_ids[i]
+            right_frame_id = frame_ids[i + 1]
+            num_disconnected_frames = int(right_frame_id - left_frame_id)
+
+            if 1 < num_disconnected_frames < max_num_frames:
+                left_bbox = track[i, 2:6]
+                right_bbox = track[i + 1, 2:6]
+
+                # perform interpolation for two adjacent tracklets.
+                for j in range(1, num_disconnected_frames):
+                    cur_bbox = j / (num_disconnected_frames) * (
+                        right_bbox - left_bbox) + left_bbox
+                    cur_result = np.ones((7, ))
+                    cur_result[0] = j + left_frame_id
+                    cur_result[1] = track_id
+                    cur_result[2:6] = cur_bbox
+
+                    interpolated_track = np.concatenate(
+                        (interpolated_track, cur_result[None]), axis=0)
+
+        interpolated_track = np.concatenate((track, interpolated_track),
+                                            axis=0)
+        return interpolated_track
+
+    def gaussian_smoothed_interpolation(self,
+                                        track: np.ndarray,
+                                        smooth_tau: int = 10) -> np.ndarray:
+        """Gaussian-Smoothed Interpolation.
+
+        This function is proposed in
+        "StrongSORT: Make DeepSORT Great Again"
+        `StrongSORT <https://arxiv.org/abs/2202.13514>`_.
+
+        Args:
+            track (ndarray): With shape (N, 7). Each row denotes
+                (frame_id, track_id, x1, y1, x2, y2, score).
+            smooth_tau (int, optional): smoothing parameter in GSI.
+                Defaults to 10.
+
+        Returns:
+            ndarray: The interpolated tracks with shape (N, 7).
Each row + denotes (frame_id, track_id, x1, y1, x2, y2, score) + """ + len_scale = np.clip(smooth_tau * np.log(smooth_tau**3 / len(track)), + smooth_tau**-1, smooth_tau**2) + gpr = GPR(RBF(len_scale, 'fixed')) + t = track[:, 0].reshape(-1, 1) + x1 = track[:, 2].reshape(-1, 1) + y1 = track[:, 3].reshape(-1, 1) + x2 = track[:, 4].reshape(-1, 1) + y2 = track[:, 5].reshape(-1, 1) + gpr.fit(t, x1) + x1_gpr = gpr.predict(t) + gpr.fit(t, y1) + y1_gpr = gpr.predict(t) + gpr.fit(t, x2) + x2_gpr = gpr.predict(t) + gpr.fit(t, y2) + y2_gpr = gpr.predict(t) + gsi_track = [[ + t[i, 0], track[i, 1], x1_gpr[i], y1_gpr[i], x2_gpr[i], y2_gpr[i], + track[i, 6] + ] for i in range(len(t))] + return np.array(gsi_track) + + def forward(self, pred_tracks: np.ndarray) -> np.ndarray: + """Forward function. + + pred_tracks (ndarray): With shape (N, 7). Each row denotes + (frame_id, track_id, x1, y1, x2, y2, score). + + Returns: + ndarray: The interpolated tracks with shape (N, 7). Each row + denotes (frame_id, track_id, x1, y1, x2, y2, score). + """ + max_track_id = int(np.max(pred_tracks[:, 1])) + min_track_id = int(np.min(pred_tracks[:, 1])) + + # perform interpolation for each track + interpolated_tracks = [] + for track_id in range(min_track_id, max_track_id + 1): + inds = pred_tracks[:, 1] == track_id + track = pred_tracks[inds] + num_frames = len(track) + if num_frames <= 2: + continue + + if num_frames > self.min_num_frames: + interpolated_track = self._interpolate_track( + track, track_id, self.max_num_frames) + else: + interpolated_track = track + + if self.use_gsi: + interpolated_track = self.gaussian_smoothed_interpolation( + interpolated_track, self.smooth_tau) + + interpolated_tracks.append(interpolated_track) + + interpolated_tracks = np.concatenate(interpolated_tracks) + return interpolated_tracks[interpolated_tracks[:, 0].argsort()] diff --git a/head_extractor/build/lib/mmdet/models/task_modules/tracking/kalman_filter.py b/head_extractor/build/lib/mmdet/models/task_modules/tracking/kalman_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ae1416af69bce17fd20dd5231eba2f12f7ed64 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/tracking/kalman_filter.py @@ -0,0 +1,267 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import numpy as np +import torch + +try: + import scipy.linalg + HAS_SCIPY = True +except ImportError: + HAS_SCIPY = False + +from mmdet.registry import TASK_UTILS + + +@TASK_UTILS.register_module() +class KalmanFilter: + """A simple Kalman filter for tracking bounding boxes in image space. + + The implementation is referred to https://github.com/nwojke/deep_sort. + + Args: + center_only (bool): If True, distance computation is done with + respect to the bounding box center position only. + Defaults to False. + use_nsa (bool): Whether to use the NSA (Noise Scale Adaptive) Kalman + Filter, which adaptively modulates the noise scale according to + the quality of detections. More details in + https://arxiv.org/abs/2202.11983. Defaults to False. 
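+
+    Example:
+        >>> # A minimal usage sketch; measurements follow the
+        >>> # (x, y, a, h) format described in ``initiate``.
+        >>> kf = KalmanFilter()
+        >>> mean, covariance = kf.initiate(np.array([100., 50., 0.5, 80.]))
+        >>> mean, covariance = kf.predict(mean, covariance)
+        >>> mean, covariance = kf.update(mean, covariance,
+        ...                              np.array([102., 51., 0.5, 81.]))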
+ """ + chi2inv95 = { + 1: 3.8415, + 2: 5.9915, + 3: 7.8147, + 4: 9.4877, + 5: 11.070, + 6: 12.592, + 7: 14.067, + 8: 15.507, + 9: 16.919 + } + + def __init__(self, center_only: bool = False, use_nsa: bool = False): + if not HAS_SCIPY: + raise RuntimeError('sscikit-learn is not installed,\ + please install it by: pip install scikit-learn') + self.center_only = center_only + if self.center_only: + self.gating_threshold = self.chi2inv95[2] + else: + self.gating_threshold = self.chi2inv95[4] + + self.use_nsa = use_nsa + ndim, dt = 4, 1. + + # Create Kalman filter model matrices. + self._motion_mat = np.eye(2 * ndim, 2 * ndim) + for i in range(ndim): + self._motion_mat[i, ndim + i] = dt + self._update_mat = np.eye(ndim, 2 * ndim) + + # Motion and observation uncertainty are chosen relative to the current + # state estimate. These weights control the amount of uncertainty in + # the model. This is a bit hacky. + self._std_weight_position = 1. / 20 + self._std_weight_velocity = 1. / 160 + + def initiate(self, measurement: np.array) -> Tuple[np.array, np.array]: + """Create track from unassociated measurement. + + Args: + measurement (ndarray): Bounding box coordinates (x, y, a, h) with + center position (x, y), aspect ratio a, and height h. + + Returns: + (ndarray, ndarray): Returns the mean vector (8 dimensional) and + covariance matrix (8x8 dimensional) of the new track. + Unobserved velocities are initialized to 0 mean. + """ + mean_pos = measurement + mean_vel = np.zeros_like(mean_pos) + mean = np.r_[mean_pos, mean_vel] + + std = [ + 2 * self._std_weight_position * measurement[3], + 2 * self._std_weight_position * measurement[3], 1e-2, + 2 * self._std_weight_position * measurement[3], + 10 * self._std_weight_velocity * measurement[3], + 10 * self._std_weight_velocity * measurement[3], 1e-5, + 10 * self._std_weight_velocity * measurement[3] + ] + covariance = np.diag(np.square(std)) + return mean, covariance + + def predict(self, mean: np.array, + covariance: np.array) -> Tuple[np.array, np.array]: + """Run Kalman filter prediction step. + + Args: + mean (ndarray): The 8 dimensional mean vector of the object + state at the previous time step. + + covariance (ndarray): The 8x8 dimensional covariance matrix + of the object state at the previous time step. + + Returns: + (ndarray, ndarray): Returns the mean vector and covariance + matrix of the predicted state. Unobserved velocities are + initialized to 0 mean. + """ + std_pos = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], 1e-2, + self._std_weight_position * mean[3] + ] + std_vel = [ + self._std_weight_velocity * mean[3], + self._std_weight_velocity * mean[3], 1e-5, + self._std_weight_velocity * mean[3] + ] + motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) + + mean = np.dot(self._motion_mat, mean) + covariance = np.linalg.multi_dot( + (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov + + return mean, covariance + + def project(self, + mean: np.array, + covariance: np.array, + bbox_score: float = 0.) -> Tuple[np.array, np.array]: + """Project state distribution to measurement space. + + Args: + mean (ndarray): The state's mean vector (8 dimensional array). + covariance (ndarray): The state's covariance matrix (8x8 + dimensional). + bbox_score (float): The confidence score of the bbox. + Defaults to 0. + + Returns: + (ndarray, ndarray): Returns the projected mean and covariance + matrix of the given state estimate. 
+ """ + std = [ + self._std_weight_position * mean[3], + self._std_weight_position * mean[3], 1e-1, + self._std_weight_position * mean[3] + ] + + if self.use_nsa: + std = [(1 - bbox_score) * x for x in std] + + innovation_cov = np.diag(np.square(std)) + + mean = np.dot(self._update_mat, mean) + covariance = np.linalg.multi_dot( + (self._update_mat, covariance, self._update_mat.T)) + return mean, covariance + innovation_cov + + def update(self, + mean: np.array, + covariance: np.array, + measurement: np.array, + bbox_score: float = 0.) -> Tuple[np.array, np.array]: + """Run Kalman filter correction step. + + Args: + mean (ndarray): The predicted state's mean vector (8 dimensional). + covariance (ndarray): The state's covariance matrix (8x8 + dimensional). + measurement (ndarray): The 4 dimensional measurement vector + (x, y, a, h), where (x, y) is the center position, a the + aspect ratio, and h the height of the bounding box. + bbox_score (float): The confidence score of the bbox. + Defaults to 0. + + Returns: + (ndarray, ndarray): Returns the measurement-corrected state + distribution. + """ + projected_mean, projected_cov = \ + self.project(mean, covariance, bbox_score) + + chol_factor, lower = scipy.linalg.cho_factor( + projected_cov, lower=True, check_finite=False) + kalman_gain = scipy.linalg.cho_solve((chol_factor, lower), + np.dot(covariance, + self._update_mat.T).T, + check_finite=False).T + innovation = measurement - projected_mean + + new_mean = mean + np.dot(innovation, kalman_gain.T) + new_covariance = covariance - np.linalg.multi_dot( + (kalman_gain, projected_cov, kalman_gain.T)) + return new_mean, new_covariance + + def gating_distance(self, + mean: np.array, + covariance: np.array, + measurements: np.array, + only_position: bool = False) -> np.array: + """Compute gating distance between state distribution and measurements. + + A suitable distance threshold can be obtained from `chi2inv95`. If + `only_position` is False, the chi-square distribution has 4 degrees of + freedom, otherwise 2. + + Args: + mean (ndarray): Mean vector over the state distribution (8 + dimensional). + covariance (ndarray): Covariance of the state distribution (8x8 + dimensional). + measurements (ndarray): An Nx4 dimensional matrix of N + measurements, each in format (x, y, a, h) where (x, y) is the + bounding box center position, a the aspect ratio, and h the + height. + only_position (bool, optional): If True, distance computation is + done with respect to the bounding box center position only. + Defaults to False. + + Returns: + ndarray: Returns an array of length N, where the i-th element + contains the squared Mahalanobis distance between + (mean, covariance) and `measurements[i]`. + """ + mean, covariance = self.project(mean, covariance) + if only_position: + mean, covariance = mean[:2], covariance[:2, :2] + measurements = measurements[:, :2] + + cholesky_factor = np.linalg.cholesky(covariance) + d = measurements - mean + z = scipy.linalg.solve_triangular( + cholesky_factor, + d.T, + lower=True, + check_finite=False, + overwrite_b=True) + squared_maha = np.sum(z * z, axis=0) + return squared_maha + + def track(self, tracks: dict, + bboxes: torch.Tensor) -> Tuple[dict, np.array]: + """Track forward. + + Args: + tracks (dict[int:dict]): Track buffer. + bboxes (Tensor): Detected bounding boxes. + + Returns: + (dict[int:dict], ndarray): Updated tracks and bboxes. 
+ """ + costs = [] + for id, track in tracks.items(): + track.mean, track.covariance = self.predict( + track.mean, track.covariance) + gating_distance = self.gating_distance(track.mean, + track.covariance, + bboxes.cpu().numpy(), + self.center_only) + costs.append(gating_distance) + + costs = np.stack(costs, 0) + costs[costs > self.gating_threshold] = np.nan + return tracks, costs diff --git a/head_extractor/build/lib/mmdet/models/task_modules/tracking/similarity.py b/head_extractor/build/lib/mmdet/models/task_modules/tracking/similarity.py new file mode 100644 index 0000000000000000000000000000000000000000..730e43b86214ae92ffdcab8ae39e6f9261075caa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/task_modules/tracking/similarity.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F +from torch import Tensor + + +def embed_similarity(key_embeds: Tensor, + ref_embeds: Tensor, + method: str = 'dot_product', + temperature: int = -1) -> Tensor: + """Calculate feature similarity from embeddings. + + Args: + key_embeds (Tensor): Shape (N1, C). + ref_embeds (Tensor): Shape (N2, C). + method (str, optional): Method to calculate the similarity, + options are 'dot_product' and 'cosine'. Defaults to + 'dot_product'. + temperature (int, optional): Softmax temperature. Defaults to -1. + + Returns: + Tensor: Similarity matrix of shape (N1, N2). + """ + assert method in ['dot_product', 'cosine'] + + if method == 'cosine': + key_embeds = F.normalize(key_embeds, p=2, dim=1) + ref_embeds = F.normalize(ref_embeds, p=2, dim=1) + + similarity = torch.mm(key_embeds, ref_embeds.T) + + if temperature > 0: + similarity /= float(temperature) + return similarity diff --git a/head_extractor/build/lib/mmdet/models/test_time_augs/__init__.py b/head_extractor/build/lib/mmdet/models/test_time_augs/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5e4926efb011b45b3ab7d3d303fb2d105aaa192 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/test_time_augs/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .det_tta import DetTTAModel +from .merge_augs import (merge_aug_bboxes, merge_aug_masks, + merge_aug_proposals, merge_aug_results, + merge_aug_scores) + +__all__ = [ + 'merge_aug_bboxes', 'merge_aug_masks', 'merge_aug_proposals', + 'merge_aug_scores', 'merge_aug_results', 'DetTTAModel' +] diff --git a/head_extractor/build/lib/mmdet/models/test_time_augs/det_tta.py b/head_extractor/build/lib/mmdet/models/test_time_augs/det_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..95f91db9e1250358db0e1a572cf4c37cc7fe6e6f --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/test_time_augs/det_tta.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from mmcv.ops import batched_nms +from mmengine.model import BaseTTAModel +from mmengine.registry import MODELS +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox_flip + + +@MODELS.register_module() +class DetTTAModel(BaseTTAModel): + """Merge augmented detection results, only bboxes corresponding score under + flipping and multi-scale resizing can be processed now. 
+
+    Examples:
+        >>> tta_model = dict(
+        >>>     type='DetTTAModel',
+        >>>     tta_cfg=dict(nms=dict(
+        >>>         type='nms',
+        >>>         iou_threshold=0.5),
+        >>>         max_per_img=100))
+        >>>
+        >>> tta_pipeline = [
+        >>>     dict(type='LoadImageFromFile',
+        >>>          backend_args=None),
+        >>>     dict(
+        >>>         type='TestTimeAug',
+        >>>         transforms=[[
+        >>>             dict(type='Resize',
+        >>>                  scale=(1333, 800),
+        >>>                  keep_ratio=True),
+        >>>         ], [
+        >>>             dict(type='RandomFlip', prob=1.),
+        >>>             dict(type='RandomFlip', prob=0.)
+        >>>         ], [
+        >>>             dict(
+        >>>                 type='PackDetInputs',
+        >>>                 meta_keys=('img_id', 'img_path', 'ori_shape',
+        >>>                            'img_shape', 'scale_factor', 'flip',
+        >>>                            'flip_direction'))
+        >>>         ]])]
+    """
+
+    def __init__(self, tta_cfg=None, **kwargs):
+        super().__init__(**kwargs)
+        self.tta_cfg = tta_cfg
+
+    def merge_aug_bboxes(self, aug_bboxes: List[Tensor],
+                         aug_scores: List[Tensor],
+                         img_metas: List[dict]) -> Tuple[Tensor, Tensor]:
+        """Merge augmented detection bboxes and scores.
+
+        Args:
+            aug_bboxes (list[Tensor]): shape (n, 4*#class)
+            aug_scores (list[Tensor] or None): shape (n, #class)
+        Returns:
+            tuple[Tensor]: ``bboxes`` with shape (n,4), where
+            4 represent (tl_x, tl_y, br_x, br_y)
+            and ``scores`` with shape (n,).
+        """
+        recovered_bboxes = []
+        for bboxes, img_info in zip(aug_bboxes, img_metas):
+            ori_shape = img_info['ori_shape']
+            flip = img_info['flip']
+            flip_direction = img_info['flip_direction']
+            if flip:
+                bboxes = bbox_flip(
+                    bboxes=bboxes,
+                    img_shape=ori_shape,
+                    direction=flip_direction)
+            recovered_bboxes.append(bboxes)
+        bboxes = torch.cat(recovered_bboxes, dim=0)
+        if aug_scores is None:
+            return bboxes
+        else:
+            scores = torch.cat(aug_scores, dim=0)
+            return bboxes, scores
+
+    def merge_preds(self, data_samples_list: List[List[DetDataSample]]):
+        """Merge batch predictions of enhanced data.
+
+        Args:
+            data_samples_list (List[List[DetDataSample]]): List of predictions
+                of all enhanced data. The outer list indicates images, and the
+                inner list corresponds to the different views of one image.
+                Each element of the inner list is a ``DetDataSample``.
+        Returns:
+            List[DetDataSample]: Merged batch prediction.
+        """
+        merged_data_samples = []
+        for data_samples in data_samples_list:
+            merged_data_samples.append(self._merge_single_sample(data_samples))
+        return merged_data_samples
+
+    def _merge_single_sample(
+            self, data_samples: List[DetDataSample]) -> DetDataSample:
+        """Merge predictions which come from the different views of one image
+        into one prediction.
+
+        Args:
+            data_samples (List[DetDataSample]): List of predictions
+                of enhanced data which come from one image.
+        Returns:
+            DetDataSample: Merged prediction.
+        """
+        aug_bboxes = []
+        aug_scores = []
+        aug_labels = []
+        img_metas = []
+        # TODO: support instance segmentation TTA
+        assert data_samples[0].pred_instances.get('masks', None) is None, \
+            'TTA of instance segmentation is not supported now.'
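+        # Gather predictions from every augmented view of this image; the
+        # bboxes are mapped back to the original image scale (undoing any
+        # flip) in ``merge_aug_bboxes`` before NMS fuses them below.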
+ for data_sample in data_samples: + aug_bboxes.append(data_sample.pred_instances.bboxes) + aug_scores.append(data_sample.pred_instances.scores) + aug_labels.append(data_sample.pred_instances.labels) + img_metas.append(data_sample.metainfo) + + merged_bboxes, merged_scores = self.merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas) + merged_labels = torch.cat(aug_labels, dim=0) + + if merged_bboxes.numel() == 0: + return data_samples[0] + + det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores, + merged_labels, self.tta_cfg.nms) + + det_bboxes = det_bboxes[:self.tta_cfg.max_per_img] + det_labels = merged_labels[keep_idxs][:self.tta_cfg.max_per_img] + + results = InstanceData() + _det_bboxes = det_bboxes.clone() + results.bboxes = _det_bboxes[:, :-1] + results.scores = _det_bboxes[:, -1] + results.labels = det_labels + det_results = data_samples[0] + det_results.pred_instances = results + return det_results diff --git a/head_extractor/build/lib/mmdet/models/test_time_augs/merge_augs.py b/head_extractor/build/lib/mmdet/models/test_time_augs/merge_augs.py new file mode 100644 index 0000000000000000000000000000000000000000..5935a8614c39d70253a09a339f51c144661c64fb --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/test_time_augs/merge_augs.py @@ -0,0 +1,219 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from typing import List, Optional, Union + +import numpy as np +import torch +from mmcv.ops import nms +from mmengine.config import ConfigDict +from torch import Tensor + +from mmdet.structures.bbox import bbox_mapping_back + + +# TODO remove this, never be used in mmdet +def merge_aug_proposals(aug_proposals, img_metas, cfg): + """Merge augmented proposals (multiscale, flip, etc.) + + Args: + aug_proposals (list[Tensor]): proposals from different testing + schemes, shape (n, 5). Note that they are not rescaled to the + original image size. + + img_metas (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmdet/datasets/pipelines/formatting.py:Collect`. + + cfg (dict): rpn test config. + + Returns: + Tensor: shape (n, 4), proposals corresponding to original image scale. + """ + + cfg = copy.deepcopy(cfg) + + # deprecate arguments warning + if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg: + warnings.warn( + 'In rpn_proposal or test_cfg, ' + 'nms_thr has been moved to a dict named nms as ' + 'iou_threshold, max_num has been renamed as max_per_img, ' + 'name of original arguments and the way to specify ' + 'iou_threshold of NMS will be deprecated.') + if 'nms' not in cfg: + cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr)) + if 'max_num' in cfg: + if 'max_per_img' in cfg: + assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \ + f'max_per_img at the same time, but get {cfg.max_num} ' \ + f'and {cfg.max_per_img} respectively' \ + f'Please delete max_num which will be deprecated.' + else: + cfg.max_per_img = cfg.max_num + if 'nms_thr' in cfg: + assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \ + f'iou_threshold in nms and ' \ + f'nms_thr at the same time, but get ' \ + f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \ + f' respectively. Please delete the nms_thr ' \ + f'which will be deprecated.' 
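+
+    # With the config normalized, map every set of proposals back to the
+    # original image scale, concatenate them, and reduce the result with NMS.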
+
+    recovered_proposals = []
+    for proposals, img_info in zip(aug_proposals, img_metas):
+        img_shape = img_info['img_shape']
+        scale_factor = img_info['scale_factor']
+        flip = img_info['flip']
+        flip_direction = img_info['flip_direction']
+        _proposals = proposals.clone()
+        _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape,
+                                              scale_factor, flip,
+                                              flip_direction)
+        recovered_proposals.append(_proposals)
+    aug_proposals = torch.cat(recovered_proposals, dim=0)
+    merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(),
+                              aug_proposals[:, -1].contiguous(),
+                              cfg.nms.iou_threshold)
+    scores = merged_proposals[:, 4]
+    _, order = scores.sort(0, descending=True)
+    num = min(cfg.max_per_img, merged_proposals.shape[0])
+    order = order[:num]
+    merged_proposals = merged_proposals[order, :]
+    return merged_proposals
+
+
+# TODO remove this, it is never used in mmdet
+def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg):
+    """Merge augmented detection bboxes and scores.
+
+    Args:
+        aug_bboxes (list[Tensor]): shape (n, 4*#class)
+        aug_scores (list[Tensor] or None): shape (n, #class)
+        img_metas (list[list[dict]]): meta information of each augmented
+            view, including 'img_shape', 'scale_factor', 'flip' and
+            'flip_direction'.
+        rcnn_test_cfg (dict): rcnn test config.
+
+    Returns:
+        tuple: (bboxes, scores)
+    """
+    recovered_bboxes = []
+    for bboxes, img_info in zip(aug_bboxes, img_metas):
+        img_shape = img_info[0]['img_shape']
+        scale_factor = img_info[0]['scale_factor']
+        flip = img_info[0]['flip']
+        flip_direction = img_info[0]['flip_direction']
+        bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip,
+                                   flip_direction)
+        recovered_bboxes.append(bboxes)
+    bboxes = torch.stack(recovered_bboxes).mean(dim=0)
+    if aug_scores is None:
+        return bboxes
+    else:
+        scores = torch.stack(aug_scores).mean(dim=0)
+        return bboxes, scores
+
+
+def merge_aug_results(aug_batch_results, aug_batch_img_metas):
+    """Merge augmented detection results. Only bboxes and their
+    corresponding scores under flipping and multi-scale resizing can be
+    processed now.
+
+    Args:
+        aug_batch_results (list[list[obj:`InstanceData`]]):
+            Detection results of multiple images with
+            different augmentations. The outer list indicates the
+            augmentations, and the inner list indicates the batch
+            dimension. Each item usually contains the following keys.
+
+            - scores (Tensor): Classification scores, in shape
+              (num_instances,)
+            - labels (Tensor): Labels of bboxes, in shape
+              (num_instances,).
+            - bboxes (Tensor): In shape (num_instances, 4),
+              the last dimension 4 arrange as (x1, y1, x2, y2).
+        aug_batch_img_metas (list[list[dict]]): The outer list
+            indicates test-time augs (multiscale, flip, etc.)
+            and the inner list indicates
+            images in a batch. Each dict in the list contains
+            information of an image in the batch.
+
+    Returns:
+        batch_results (list[obj:`InstanceData`]): Same as
+            the input `aug_batch_results` except that all bboxes have
+            been mapped back to the original image scale.
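+
+    Example (illustrative; ``res_*`` and ``meta_*`` are hypothetical
+    placeholders for per-view results and meta dicts):
+        >>> # two views (original + horizontal flip) of a 1-image batch
+        >>> batch_results = merge_aug_results(
+        >>>     [[res_orig], [res_flip]],
+        >>>     [[meta_orig], [meta_flip]])
+        >>> len(batch_results)  # one InstanceData per image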
+ """ + num_augs = len(aug_batch_results) + num_imgs = len(aug_batch_results[0]) + + batch_results = [] + aug_batch_results = copy.deepcopy(aug_batch_results) + for img_id in range(num_imgs): + aug_results = [] + for aug_id in range(num_augs): + img_metas = aug_batch_img_metas[aug_id][img_id] + results = aug_batch_results[aug_id][img_id] + + img_shape = img_metas['img_shape'] + scale_factor = img_metas['scale_factor'] + flip = img_metas['flip'] + flip_direction = img_metas['flip_direction'] + bboxes = bbox_mapping_back(results.bboxes, img_shape, scale_factor, + flip, flip_direction) + results.bboxes = bboxes + aug_results.append(results) + merged_aug_results = results.cat(aug_results) + batch_results.append(merged_aug_results) + + return batch_results + + +def merge_aug_scores(aug_scores): + """Merge augmented bbox scores.""" + if isinstance(aug_scores[0], torch.Tensor): + return torch.mean(torch.stack(aug_scores), dim=0) + else: + return np.mean(aug_scores, axis=0) + + +def merge_aug_masks(aug_masks: List[Tensor], + img_metas: dict, + weights: Optional[Union[list, Tensor]] = None) -> Tensor: + """Merge augmented mask prediction. + + Args: + aug_masks (list[Tensor]): each has shape + (n, c, h, w). + img_metas (dict): Image information. + weights (list or Tensor): Weight of each aug_masks, + the length should be n. + + Returns: + Tensor: has shape (n, c, h, w) + """ + recovered_masks = [] + for i, mask in enumerate(aug_masks): + if weights is not None: + assert len(weights) == len(aug_masks) + weight = weights[i] + else: + weight = 1 + flip = img_metas.get('flip', False) + if flip: + flip_direction = img_metas['flip_direction'] + if flip_direction == 'horizontal': + mask = mask[:, :, :, ::-1] + elif flip_direction == 'vertical': + mask = mask[:, :, ::-1, :] + elif flip_direction == 'diagonal': + mask = mask[:, :, :, ::-1] + mask = mask[:, :, ::-1, :] + else: + raise ValueError( + f"Invalid flipping direction '{flip_direction}'") + recovered_masks.append(mask[None, :] * weight) + + merged_masks = torch.cat(recovered_masks, 0).mean(dim=0) + if weights is not None: + merged_masks = merged_masks * len(weights) / sum(weights) + return merged_masks diff --git a/head_extractor/build/lib/mmdet/models/trackers/__init__.py b/head_extractor/build/lib/mmdet/models/trackers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..00284bb7b40dd007c28b6cc9175ac26a52c6c528 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_tracker import BaseTracker +from .byte_tracker import ByteTracker +from .masktrack_rcnn_tracker import MaskTrackRCNNTracker +from .ocsort_tracker import OCSORTTracker +from .quasi_dense_tracker import QuasiDenseTracker +from .sort_tracker import SORTTracker +from .strongsort_tracker import StrongSORTTracker + +__all__ = [ + 'BaseTracker', 'ByteTracker', 'QuasiDenseTracker', 'SORTTracker', + 'StrongSORTTracker', 'OCSORTTracker', 'MaskTrackRCNNTracker' +] diff --git a/head_extractor/build/lib/mmdet/models/trackers/base_tracker.py b/head_extractor/build/lib/mmdet/models/trackers/base_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..0cf188653cd9adda59decd45f65fc4ede63fe3a7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/base_tracker.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod +from typing import List, Optional, Tuple + +import torch +import torch.nn.functional as F +from addict import Dict + + +class BaseTracker(metaclass=ABCMeta): + """Base tracker model. + + Args: + momentums (dict[str:float], optional): Momentums to update the buffers. + The `str` indicates the name of the buffer while the `float` + indicates the momentum. Defaults to None. + num_frames_retain (int, optional). If a track is disappeared more than + `num_frames_retain` frames, it will be deleted in the memo. + Defaults to 10. + """ + + def __init__(self, + momentums: Optional[dict] = None, + num_frames_retain: int = 10) -> None: + super().__init__() + if momentums is not None: + assert isinstance(momentums, dict), 'momentums must be a dict' + self.momentums = momentums + self.num_frames_retain = num_frames_retain + + self.reset() + + def reset(self) -> None: + """Reset the buffer of the tracker.""" + self.num_tracks = 0 + self.tracks = dict() + + @property + def empty(self) -> bool: + """Whether the buffer is empty or not.""" + return False if self.tracks else True + + @property + def ids(self) -> List[dict]: + """All ids in the tracker.""" + return list(self.tracks.keys()) + + @property + def with_reid(self) -> bool: + """bool: whether the framework has a reid model""" + return hasattr(self, 'reid') and self.reid is not None + + def update(self, **kwargs) -> None: + """Update the tracker. + + Args: + kwargs (dict[str: Tensor | int]): The `str` indicates the + name of the input variable. `ids` and `frame_ids` are + obligatory in the keys. + """ + memo_items = [k for k, v in kwargs.items() if v is not None] + rm_items = [k for k in kwargs.keys() if k not in memo_items] + for item in rm_items: + kwargs.pop(item) + if not hasattr(self, 'memo_items'): + self.memo_items = memo_items + else: + assert memo_items == self.memo_items + + assert 'ids' in memo_items + num_objs = len(kwargs['ids']) + id_indice = memo_items.index('ids') + assert 'frame_ids' in memo_items + frame_id = int(kwargs['frame_ids']) + if isinstance(kwargs['frame_ids'], int): + kwargs['frame_ids'] = torch.tensor([kwargs['frame_ids']] * + num_objs) + # cur_frame_id = int(kwargs['frame_ids'][0]) + for k, v in kwargs.items(): + if len(v) != num_objs: + raise ValueError('kwargs value must both equal') + + for obj in zip(*kwargs.values()): + id = int(obj[id_indice]) + if id in self.tracks: + self.update_track(id, obj) + else: + self.init_track(id, obj) + + self.pop_invalid_tracks(frame_id) + + def pop_invalid_tracks(self, frame_id: int) -> None: + """Pop out invalid tracks.""" + invalid_ids = [] + for k, v in self.tracks.items(): + if frame_id - v['frame_ids'][-1] >= self.num_frames_retain: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + def update_track(self, id: int, obj: Tuple[torch.Tensor]): + """Update a track.""" + for k, v in zip(self.memo_items, obj): + v = v[None] + if self.momentums is not None and k in self.momentums: + m = self.momentums[k] + self.tracks[id][k] = (1 - m) * self.tracks[id][k] + m * v + else: + self.tracks[id][k].append(v) + + def init_track(self, id: int, obj: Tuple[torch.Tensor]): + """Initialize a track.""" + self.tracks[id] = Dict() + for k, v in zip(self.memo_items, obj): + v = v[None] + if self.momentums is not None and k in self.momentums: + self.tracks[id][k] = v + else: + self.tracks[id][k] = [v] + + @property + def memo(self) -> dict: + """Return all buffers in the tracker.""" + outs = Dict() + for k in 
self.memo_items: + outs[k] = [] + + for id, objs in self.tracks.items(): + for k, v in objs.items(): + if k not in outs: + continue + if self.momentums is not None and k in self.momentums: + v = v + else: + v = v[-1] + outs[k].append(v) + + for k, v in outs.items(): + outs[k] = torch.cat(v, dim=0) + return outs + + def get(self, + item: str, + ids: Optional[list] = None, + num_samples: Optional[int] = None, + behavior: Optional[str] = None) -> torch.Tensor: + """Get the buffer of a specific item. + + Args: + item (str): The demanded item. + ids (list[int], optional): The demanded ids. Defaults to None. + num_samples (int, optional): Number of samples to calculate the + results. Defaults to None. + behavior (str, optional): Behavior to calculate the results. + Options are `mean` | None. Defaults to None. + + Returns: + Tensor: The results of the demanded item. + """ + if ids is None: + ids = self.ids + + outs = [] + for id in ids: + out = self.tracks[id][item] + if isinstance(out, list): + if num_samples is not None: + out = out[-num_samples:] + out = torch.cat(out, dim=0) + if behavior == 'mean': + out = out.mean(dim=0, keepdim=True) + elif behavior is None: + out = out[None] + else: + raise NotImplementedError() + else: + out = out[-1] + outs.append(out) + return torch.cat(outs, dim=0) + + @abstractmethod + def track(self, *args, **kwargs): + """Tracking forward function.""" + pass + + def crop_imgs(self, + img: torch.Tensor, + meta_info: dict, + bboxes: torch.Tensor, + rescale: bool = False) -> torch.Tensor: + """Crop the images according to some bounding boxes. Typically for re- + identification sub-module. + + Args: + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + meta_info (dict): image information dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + bboxes (Tensor): of shape (N, 4) or (N, 5). + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the scale of the image. Defaults to False. + + Returns: + Tensor: Image tensor of shape (T, C, H, W). 
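+
+        Example (illustrative): with ``self.reid['img_scale'] = (256, 128)``
+        every crop is resized to a common shape, so the N crops can be
+        concatenated along the batch dimension:
+            >>> crops = tracker.crop_imgs(img, metainfo, bboxes, rescale=True)
+            >>> tuple(crops.shape[-2:])  # (256, 128)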
+ """ + h, w = meta_info['img_shape'] + img = img[:, :, :h, :w] + if rescale: + factor_x, factor_y = meta_info['scale_factor'] + bboxes[:, :4] *= torch.tensor( + [factor_x, factor_y, factor_x, factor_y]).to(bboxes.device) + bboxes[:, 0] = torch.clamp(bboxes[:, 0], min=0, max=w - 1) + bboxes[:, 1] = torch.clamp(bboxes[:, 1], min=0, max=h - 1) + bboxes[:, 2] = torch.clamp(bboxes[:, 2], min=1, max=w) + bboxes[:, 3] = torch.clamp(bboxes[:, 3], min=1, max=h) + + crop_imgs = [] + for bbox in bboxes: + x1, y1, x2, y2 = map(int, bbox) + if x2 <= x1: + x2 = x1 + 1 + if y2 <= y1: + y2 = y1 + 1 + crop_img = img[:, :, y1:y2, x1:x2] + if self.reid.get('img_scale', False): + crop_img = F.interpolate( + crop_img, + size=self.reid['img_scale'], + mode='bilinear', + align_corners=False) + crop_imgs.append(crop_img) + + if len(crop_imgs) > 0: + return torch.cat(crop_imgs, dim=0) + elif self.reid.get('img_scale', False): + _h, _w = self.reid['img_scale'] + return img.new_zeros((0, 3, _h, _w)) + else: + return img.new_zeros((0, 3, h, w)) diff --git a/head_extractor/build/lib/mmdet/models/trackers/byte_tracker.py b/head_extractor/build/lib/mmdet/models/trackers/byte_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..11f3adc53c58339f6289cbfa77aed738259fc98c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/byte_tracker.py @@ -0,0 +1,334 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +try: + import lap +except ImportError: + lap = None +import numpy as np +import torch +from mmengine.structures import InstanceData + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import (bbox_cxcyah_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcyah) +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class ByteTracker(BaseTracker): + """Tracker for ByteTrack. + + Args: + motion (dict): Configuration of motion. Defaults to None. + obj_score_thrs (dict): Detection score threshold for matching objects. + - high (float): Threshold of the first matching. Defaults to 0.6. + - low (float): Threshold of the second matching. Defaults to 0.1. + init_track_thr (float): Detection score threshold for initializing a + new tracklet. Defaults to 0.7. + weight_iou_with_det_scores (bool): Whether using detection scores to + weight IOU which is used for matching. Defaults to True. + match_iou_thrs (dict): IOU distance threshold for matching between two + frames. + - high (float): Threshold of the first matching. Defaults to 0.1. + - low (float): Threshold of the second matching. Defaults to 0.5. + - tentative (float): Threshold of the matching for tentative + tracklets. Defaults to 0.3. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. 
+ """ + + def __init__(self, + motion: Optional[dict] = None, + obj_score_thrs: dict = dict(high=0.6, low=0.1), + init_track_thr: float = 0.7, + weight_iou_with_det_scores: bool = True, + match_iou_thrs: dict = dict(high=0.1, low=0.5, tentative=0.3), + num_tentatives: int = 3, + **kwargs): + super().__init__(**kwargs) + + if lap is None: + raise RuntimeError('lap is not installed,\ + please install it by: pip install lap') + if motion is not None: + self.motion = TASK_UTILS.build(motion) + + self.obj_score_thrs = obj_score_thrs + self.init_track_thr = init_track_thr + + self.weight_iou_with_det_scores = weight_iou_with_det_scores + self.match_iou_thrs = match_iou_thrs + + self.num_tentatives = num_tentatives + + @property + def confirmed_ids(self) -> List: + """Confirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if not track.tentative] + return ids + + @property + def unconfirmed_ids(self) -> List: + """Unconfirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if track.tentative] + return ids + + def init_track(self, id: int, obj: Tuple[torch.Tensor]) -> None: + """Initialize a track.""" + super().init_track(id, obj) + if self.tracks[id].frame_ids[-1] == 0: + self.tracks[id].tentative = False + else: + self.tracks[id].tentative = True + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( + bbox) + + def update_track(self, id: int, obj: Tuple[torch.Tensor]) -> None: + """Update a track.""" + super().update_track(id, obj) + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + track_label = self.tracks[id]['labels'][-1] + label_idx = self.memo_items.index('labels') + obj_label = obj[label_idx] + assert obj_label == track_label + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox) + + def pop_invalid_tracks(self, frame_id: int) -> None: + """Pop out invalid tracks.""" + invalid_ids = [] + for k, v in self.tracks.items(): + # case1: disappeared frames >= self.num_frames_retrain + case1 = frame_id - v['frame_ids'][-1] >= self.num_frames_retain + # case2: tentative tracks but not matched in this frame + case2 = v.tentative and v['frame_ids'][-1] != frame_id + if case1 or case2: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + def assign_ids( + self, + ids: List[int], + det_bboxes: torch.Tensor, + det_labels: torch.Tensor, + det_scores: torch.Tensor, + weight_iou_with_det_scores: Optional[bool] = False, + match_iou_thr: Optional[float] = 0.5 + ) -> Tuple[np.ndarray, np.ndarray]: + """Assign ids. + + Args: + ids (list[int]): Tracking ids. + det_bboxes (Tensor): of shape (N, 4) + det_labels (Tensor): of shape (N,) + det_scores (Tensor): of shape (N,) + weight_iou_with_det_scores (bool, optional): Whether using + detection scores to weight IOU which is used for matching. + Defaults to False. + match_iou_thr (float, optional): Matching threshold. + Defaults to 0.5. + + Returns: + tuple(np.ndarray, np.ndarray): The assigning ids. 
+ """ + # get track_bboxes + track_bboxes = np.zeros((0, 4)) + for id in ids: + track_bboxes = np.concatenate( + (track_bboxes, self.tracks[id].mean[:4][None]), axis=0) + track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes) + track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes) + + # compute distance + ious = bbox_overlaps(track_bboxes, det_bboxes) + if weight_iou_with_det_scores: + ious *= det_scores + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in ids + ]).to(det_bboxes.device) + + cate_match = det_labels[None, :] == track_labels[:, None] + # to avoid det and track of different categories are matched + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + # bipartite match + if dists.size > 0: + cost, row, col = lap.lapjv( + dists, extend_cost=True, cost_limit=1 - match_iou_thr) + else: + row = np.zeros(len(ids)).astype(np.int32) - 1 + col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 + return row, col + + def track(self, data_sample: DetDataSample, **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + data_sample (:obj:`DetDataSample`): The data sample. + It includes information such as `pred_instances`. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. + """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.empty or bboxes.size(0) == 0: + valid_inds = scores > self.init_track_thr + scores = scores[valid_inds] + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + num_new_tracks = bboxes.size(0) + ids = torch.arange(self.num_tracks, + self.num_tracks + num_new_tracks).to(labels) + self.num_tracks += num_new_tracks + + else: + # 0. init + ids = torch.full((bboxes.size(0), ), + -1, + dtype=labels.dtype, + device=labels.device) + + # get the detection bboxes for the first association + first_det_inds = scores > self.obj_score_thrs['high'] + first_det_bboxes = bboxes[first_det_inds] + first_det_labels = labels[first_det_inds] + first_det_scores = scores[first_det_inds] + first_det_ids = ids[first_det_inds] + + # get the detection bboxes for the second association + second_det_inds = (~first_det_inds) & ( + scores > self.obj_score_thrs['low']) + second_det_bboxes = bboxes[second_det_inds] + second_det_labels = labels[second_det_inds] + second_det_scores = scores[second_det_inds] + second_det_ids = ids[second_det_inds] + + # 1. use Kalman Filter to predict current location + for id in self.confirmed_ids: + # track is lost in previous frame + if self.tracks[id].frame_ids[-1] != frame_id - 1: + self.tracks[id].mean[7] = 0 + (self.tracks[id].mean, + self.tracks[id].covariance) = self.kf.predict( + self.tracks[id].mean, self.tracks[id].covariance) + + # 2. 
first match + first_match_track_inds, first_match_det_inds = self.assign_ids( + self.confirmed_ids, first_det_bboxes, first_det_labels, + first_det_scores, self.weight_iou_with_det_scores, + self.match_iou_thrs['high']) + # '-1' mean a detection box is not matched with tracklets in + # previous frame + valid = first_match_det_inds > -1 + first_det_ids[valid] = torch.tensor( + self.confirmed_ids)[first_match_det_inds[valid]].to(labels) + + first_match_det_bboxes = first_det_bboxes[valid] + first_match_det_labels = first_det_labels[valid] + first_match_det_scores = first_det_scores[valid] + first_match_det_ids = first_det_ids[valid] + assert (first_match_det_ids > -1).all() + + first_unmatch_det_bboxes = first_det_bboxes[~valid] + first_unmatch_det_labels = first_det_labels[~valid] + first_unmatch_det_scores = first_det_scores[~valid] + first_unmatch_det_ids = first_det_ids[~valid] + assert (first_unmatch_det_ids == -1).all() + + # 3. use unmatched detection bboxes from the first match to match + # the unconfirmed tracks + (tentative_match_track_inds, + tentative_match_det_inds) = self.assign_ids( + self.unconfirmed_ids, first_unmatch_det_bboxes, + first_unmatch_det_labels, first_unmatch_det_scores, + self.weight_iou_with_det_scores, + self.match_iou_thrs['tentative']) + valid = tentative_match_det_inds > -1 + first_unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[ + tentative_match_det_inds[valid]].to(labels) + + # 4. second match for unmatched tracks from the first match + first_unmatch_track_ids = [] + for i, id in enumerate(self.confirmed_ids): + # tracklet is not matched in the first match + case_1 = first_match_track_inds[i] == -1 + # tracklet is not lost in the previous frame + case_2 = self.tracks[id].frame_ids[-1] == frame_id - 1 + if case_1 and case_2: + first_unmatch_track_ids.append(id) + + second_match_track_inds, second_match_det_inds = self.assign_ids( + first_unmatch_track_ids, second_det_bboxes, second_det_labels, + second_det_scores, False, self.match_iou_thrs['low']) + valid = second_match_det_inds > -1 + second_det_ids[valid] = torch.tensor(first_unmatch_track_ids)[ + second_match_det_inds[valid]].to(ids) + + # 5. gather all matched detection bboxes from step 2-4 + # we only keep matched detection bboxes in second match, which + # means the id != -1 + valid = second_det_ids > -1 + bboxes = torch.cat( + (first_match_det_bboxes, first_unmatch_det_bboxes), dim=0) + bboxes = torch.cat((bboxes, second_det_bboxes[valid]), dim=0) + + labels = torch.cat( + (first_match_det_labels, first_unmatch_det_labels), dim=0) + labels = torch.cat((labels, second_det_labels[valid]), dim=0) + + scores = torch.cat( + (first_match_det_scores, first_unmatch_det_scores), dim=0) + scores = torch.cat((scores, second_det_scores[valid]), dim=0) + + ids = torch.cat((first_match_det_ids, first_unmatch_det_ids), + dim=0) + ids = torch.cat((ids, second_det_ids[valid]), dim=0) + + # 6. 
assign new ids + new_track_inds = ids == -1 + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum()).to(labels) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + frame_ids=frame_id) + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/head_extractor/build/lib/mmdet/models/trackers/masktrack_rcnn_tracker.py b/head_extractor/build/lib/mmdet/models/trackers/masktrack_rcnn_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..cc167786b8b412629885a4f134a1bf79f3dfaa93 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/masktrack_rcnn_tracker.py @@ -0,0 +1,189 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox_overlaps +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class MaskTrackRCNNTracker(BaseTracker): + """Tracker for MaskTrack R-CNN. + + Args: + match_weights (dict[str : float]): The Weighting factor when computing + the match score. It contains keys as follows: + + - det_score (float): The coefficient of `det_score` when computing + match score. + - iou (float): The coefficient of `ious` when computing match + score. + - det_label (float): The coefficient of `label_deltas` when + computing match score. + """ + + def __init__(self, + match_weights: dict = dict( + det_score=1.0, iou=2.0, det_label=10.0), + **kwargs): + super().__init__(**kwargs) + self.match_weights = match_weights + + def get_match_score(self, bboxes: Tensor, labels: Tensor, scores: Tensor, + prev_bboxes: Tensor, prev_labels: Tensor, + similarity_logits: Tensor) -> Tensor: + """Get the match score. + + Args: + bboxes (torch.Tensor): of shape (num_current_bboxes, 4) in + [tl_x, tl_y, br_x, br_y] format. Denoting the detection + bboxes of current frame. + labels (torch.Tensor): of shape (num_current_bboxes, ) + scores (torch.Tensor): of shape (num_current_bboxes, ) + prev_bboxes (torch.Tensor): of shape (num_previous_bboxes, 4) in + [tl_x, tl_y, br_x, br_y] format. Denoting the detection bboxes + of previous frame. + prev_labels (torch.Tensor): of shape (num_previous_bboxes, ) + similarity_logits (torch.Tensor): of shape (num_current_bboxes, + num_previous_bboxes + 1). Denoting the similarity logits from + track head. 
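+
+        The returned score is, elementwise (a sketch of the code below):
+        ``log softmax(similarity) + w_det * log(score) + w_iou * iou
+        + w_label * label_delta``, where the weights come from
+        ``self.match_weights`` and a dummy first column stands for
+        "new object".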
+ + Returns: + torch.Tensor: The matching score of shape (num_current_bboxes, + num_previous_bboxes + 1) + """ + similarity_scores = similarity_logits.softmax(dim=1) + + ious = bbox_overlaps(bboxes, prev_bboxes) + iou_dummy = ious.new_zeros(ious.shape[0], 1) + ious = torch.cat((iou_dummy, ious), dim=1) + + label_deltas = (labels.view(-1, 1) == prev_labels).float() + label_deltas_dummy = label_deltas.new_ones(label_deltas.shape[0], 1) + label_deltas = torch.cat((label_deltas_dummy, label_deltas), dim=1) + + match_score = similarity_scores.log() + match_score += self.match_weights['det_score'] * \ + scores.view(-1, 1).log() + match_score += self.match_weights['iou'] * ious + match_score += self.match_weights['det_label'] * label_deltas + + return match_score + + def assign_ids(self, match_scores: Tensor): + num_prev_bboxes = match_scores.shape[1] - 1 + _, match_ids = match_scores.max(dim=1) + + ids = match_ids.new_zeros(match_ids.shape[0]) - 1 + best_match_scores = match_scores.new_zeros(num_prev_bboxes) - 1e6 + for idx, match_id in enumerate(match_ids): + if match_id == 0: + ids[idx] = self.num_tracks + self.num_tracks += 1 + else: + match_score = match_scores[idx, match_id] + # TODO: fix the bug where multiple candidate might match + # with the same previous object. + if match_score > best_match_scores[match_id - 1]: + ids[idx] = self.ids[match_id - 1] + best_match_scores[match_id - 1] = match_score + return ids, best_match_scores + + def track(self, + model: torch.nn.Module, + feats: List[torch.Tensor], + data_sample: DetDataSample, + rescale=True, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): VIS model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + MaskTrackRCNN method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + True. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
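+
+        Note: when ``rescale=True`` the boxes (kept at the original image
+        scale for the output) are first multiplied by ``scale_factor``, so
+        that RoI features are extracted at the scale of ``feats``.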
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + masks = data_sample.pred_instances.masks + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + # create pred_track_instances + pred_track_instances = InstanceData() + + if bboxes.shape[0] == 0: + ids = torch.zeros_like(labels) + pred_track_instances = data_sample.pred_instances.clone() + pred_track_instances.instances_id = ids + return pred_track_instances + + rescaled_bboxes = bboxes.clone() + if rescale: + scale_factor = rescaled_bboxes.new_tensor( + metainfo['scale_factor']).repeat((1, 2)) + rescaled_bboxes = rescaled_bboxes * scale_factor + roi_feats, _ = model.track_head.extract_roi_feats( + feats, [rescaled_bboxes]) + + if self.empty: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long) + self.num_tracks += num_new_tracks + else: + prev_bboxes = self.get('bboxes') + prev_labels = self.get('labels') + prev_roi_feats = self.get('roi_feats') + + similarity_logits = model.track_head.predict( + roi_feats, prev_roi_feats) + match_scores = self.get_match_score(bboxes, labels, scores, + prev_bboxes, prev_labels, + similarity_logits) + ids, _ = self.assign_ids(match_scores) + + valid_inds = ids > -1 + ids = ids[valid_inds] + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + masks = masks[valid_inds] + roi_feats = roi_feats[valid_inds] + + self.update( + ids=ids, + bboxes=bboxes, + labels=labels, + scores=scores, + masks=masks, + roi_feats=roi_feats, + frame_ids=frame_id) + # update pred_track_instances + pred_track_instances.bboxes = bboxes + pred_track_instances.masks = masks + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/head_extractor/build/lib/mmdet/models/trackers/ocsort_tracker.py b/head_extractor/build/lib/mmdet/models/trackers/ocsort_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..4e09990c603aee8ced3bf3a65ceb530142e6e873 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/ocsort_tracker.py @@ -0,0 +1,531 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +try: + import lap +except ImportError: + lap = None +import numpy as np +import torch +from addict import Dict +from mmengine.structures import InstanceData + +from mmdet.registry import MODELS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import (bbox_cxcyah_to_xyxy, bbox_overlaps, + bbox_xyxy_to_cxcyah) +from .sort_tracker import SORTTracker + + +@MODELS.register_module() +class OCSORTTracker(SORTTracker): + """Tracker for OC-SORT. + + Args: + motion (dict): Configuration of motion. Defaults to None. + obj_score_thrs (float): Detection score threshold for matching objects. + Defaults to 0.3. + init_track_thr (float): Detection score threshold for initializing a + new tracklet. Defaults to 0.7. + weight_iou_with_det_scores (bool): Whether using detection scores to + weight IOU which is used for matching. Defaults to True. + match_iou_thr (float): IOU distance threshold for matching between two + frames. Defaults to 0.3. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. 
+        vel_consist_weight (float): Weight of the velocity consistency term
+            in association (the OCM term in the paper).
+        vel_delta_t (int): The time-step difference used to estimate the
+            velocity direction of tracklets.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 motion: Optional[dict] = None,
+                 obj_score_thr: float = 0.3,
+                 init_track_thr: float = 0.7,
+                 weight_iou_with_det_scores: bool = True,
+                 match_iou_thr: float = 0.3,
+                 num_tentatives: int = 3,
+                 vel_consist_weight: float = 0.2,
+                 vel_delta_t: int = 3,
+                 **kwargs):
+        if lap is None:
+            raise RuntimeError('lap is not installed, '
+                               'please install it by: pip install lap')
+        super().__init__(motion=motion, **kwargs)
+        self.obj_score_thr = obj_score_thr
+        self.init_track_thr = init_track_thr
+
+        self.weight_iou_with_det_scores = weight_iou_with_det_scores
+        self.match_iou_thr = match_iou_thr
+        self.vel_consist_weight = vel_consist_weight
+        self.vel_delta_t = vel_delta_t
+
+        self.num_tentatives = num_tentatives
+
+    @property
+    def unconfirmed_ids(self):
+        """Unconfirmed ids in the tracker."""
+        ids = [id for id, track in self.tracks.items() if track.tentative]
+        return ids
+
+    def init_track(self, id: int, obj: Tuple[torch.Tensor]):
+        """Initialize a track."""
+        super().init_track(id, obj)
+        if self.tracks[id].frame_ids[-1] == 0:
+            self.tracks[id].tentative = False
+        else:
+            self.tracks[id].tentative = True
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate(
+            bbox)
+        # track.obs maintains the history of detections associated to this
+        # track
+        self.tracks[id].obs = []
+        bbox_id = self.memo_items.index('bboxes')
+        self.tracks[id].obs.append(obj[bbox_id])
+        # a placeholder to save the mean/covariance before the track is lost;
+        # parameters to save: mean, covariance, measurement
+        self.tracks[id].tracked = True
+        self.tracks[id].saved_attr = Dict()
+        self.tracks[id].velocity = torch.tensor(
+            (-1, -1)).to(obj[bbox_id].device)  # placeholder
+
+    def update_track(self, id: int, obj: Tuple[torch.Tensor]):
+        """Update a track."""
+        super().update_track(id, obj)
+        if self.tracks[id].tentative:
+            if len(self.tracks[id]['bboxes']) >= self.num_tentatives:
+                self.tracks[id].tentative = False
+        bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1])  # size = (1, 4)
+        assert bbox.ndim == 2 and bbox.shape[0] == 1
+        bbox = bbox.squeeze(0).cpu().numpy()
+        self.tracks[id].mean, self.tracks[id].covariance = self.kf.update(
+            self.tracks[id].mean, self.tracks[id].covariance, bbox)
+        self.tracks[id].tracked = True
+        bbox_id = self.memo_items.index('bboxes')
+        self.tracks[id].obs.append(obj[bbox_id])
+
+        bbox1 = self.k_step_observation(self.tracks[id])
+        bbox2 = obj[bbox_id]
+        self.tracks[id].velocity = self.vel_direction(bbox1, bbox2).to(
+            obj[bbox_id].device)
+
+    def vel_direction(self, bbox1: torch.Tensor, bbox2: torch.Tensor):
+        """Estimate the direction vector between two boxes."""
+        if bbox1.sum() < 0 or bbox2.sum() < 0:
+            return torch.tensor((-1, -1))
+        cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0
+        cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0
+        speed = torch.tensor([cy2 - cy1, cx2 - cx1])
+        norm = torch.sqrt((speed[0])**2 + (speed[1])**2) + 1e-6
+        return speed / norm
+
+    def vel_direction_batch(self, bboxes1: torch.Tensor,
+                            bboxes2: torch.Tensor):
+        """Estimate the
direction vector given two batches of boxes.""" + cx1, cy1 = (bboxes1[:, 0] + bboxes1[:, 2]) / 2.0, (bboxes1[:, 1] + + bboxes1[:, 3]) / 2.0 + cx2, cy2 = (bboxes2[:, 0] + bboxes2[:, 2]) / 2.0, (bboxes2[:, 1] + + bboxes2[:, 3]) / 2.0 + speed_diff_y = cy2[None, :] - cy1[:, None] + speed_diff_x = cx2[None, :] - cx1[:, None] + speed = torch.cat((speed_diff_y[..., None], speed_diff_x[..., None]), + dim=-1) + norm = torch.sqrt((speed[:, :, 0])**2 + (speed[:, :, 1])**2) + 1e-6 + speed[:, :, 0] /= norm + speed[:, :, 1] /= norm + return speed + + def k_step_observation(self, track: Dict): + """return the observation k step away before.""" + obs_seqs = track.obs + num_obs = len(obs_seqs) + if num_obs == 0: + return torch.tensor((-1, -1, -1, -1)).to(track.obs[0].device) + elif num_obs > self.vel_delta_t: + if obs_seqs[num_obs - 1 - self.vel_delta_t] is not None: + return obs_seqs[num_obs - 1 - self.vel_delta_t] + else: + return self.last_obs(track) + else: + return self.last_obs(track) + + def ocm_assign_ids(self, + ids: List[int], + det_bboxes: torch.Tensor, + det_labels: torch.Tensor, + det_scores: torch.Tensor, + weight_iou_with_det_scores: Optional[bool] = False, + match_iou_thr: Optional[float] = 0.5): + """Apply Observation-Centric Momentum (OCM) to assign ids. + + OCM adds movement direction consistency into the association cost + matrix. This term requires no additional assumption but from the + same linear motion assumption as the canonical Kalman Filter in SORT. + + Args: + ids (list[int]): Tracking ids. + det_bboxes (Tensor): of shape (N, 4) + det_labels (Tensor): of shape (N,) + det_scores (Tensor): of shape (N,) + weight_iou_with_det_scores (bool, optional): Whether using + detection scores to weight IOU which is used for matching. + Defaults to False. + match_iou_thr (float, optional): Matching threshold. + Defaults to 0.5. + + Returns: + tuple(int): The assigning ids. 
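+
+        Sketch of the OCM cost (matches the code below): the angle between
+        each track's stored velocity direction and the direction from its
+        k-step-old observation to every detection is computed, mapped from
+        [0, pi] to [-0.5, 0.5], masked to tracks with valid history, and
+        added to ``dists`` with weight ``vel_consist_weight``.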
+
+        In short, OC-SORT uses velocity consistency besides IoU for
+        association.
+        """
+        # get track_bboxes
+        track_bboxes = np.zeros((0, 4))
+        for id in ids:
+            track_bboxes = np.concatenate(
+                (track_bboxes, self.tracks[id].mean[:4][None]), axis=0)
+        track_bboxes = torch.from_numpy(track_bboxes).to(det_bboxes)
+        track_bboxes = bbox_cxcyah_to_xyxy(track_bboxes)
+
+        # compute distance
+        ious = bbox_overlaps(track_bboxes, det_bboxes)
+        if weight_iou_with_det_scores:
+            ious *= det_scores
+
+        # support multi-class association
+        track_labels = torch.tensor([
+            self.tracks[id]['labels'][-1] for id in ids
+        ]).to(det_bboxes.device)
+        cate_match = det_labels[None, :] == track_labels[:, None]
+        # avoid matching detections and tracks of different categories
+        cate_cost = (1 - cate_match.int()) * 1e6
+
+        dists = (1 - ious + cate_cost).cpu().numpy()
+
+        if len(ids) > 0 and len(det_bboxes) > 0:
+            track_velocities = torch.stack(
+                [self.tracks[id].velocity for id in ids]).to(det_bboxes.device)
+            k_step_observations = torch.stack([
+                self.k_step_observation(self.tracks[id]) for id in ids
+            ]).to(det_bboxes.device)
+            # valid1: if the track has previous observations to estimate speed
+            # valid2: if the associated observation k steps ago is a detection
+            valid1 = track_velocities.sum(dim=1) != -2
+            valid2 = k_step_observations.sum(dim=1) != -4
+            valid = valid1 & valid2
+
+            vel_to_match = self.vel_direction_batch(k_step_observations,
+                                                    det_bboxes)
+            track_velocities = track_velocities[:, None, :].repeat(
+                1, det_bboxes.shape[0], 1)
+
+            angle_cos = (vel_to_match * track_velocities).sum(dim=-1)
+            angle_cos = torch.clamp(angle_cos, min=-1, max=1)
+            angle = torch.acos(angle_cos)  # [0, pi]
+            norm_angle = (angle - np.pi / 2.) / np.pi  # [-0.5, 0.5]
+            valid_matrix = valid[:, None].int().repeat(1, det_bboxes.shape[0])
+            # set non-valid entries to 0
+            valid_norm_angle = norm_angle * valid_matrix
+
+            dists += valid_norm_angle.cpu().numpy() * self.vel_consist_weight
+
+        # bipartite match
+        if dists.size > 0:
+            cost, row, col = lap.lapjv(
+                dists, extend_cost=True, cost_limit=1 - match_iou_thr)
+        else:
+            row = np.zeros(len(ids)).astype(np.int32) - 1
+            col = np.zeros(len(det_bboxes)).astype(np.int32) - 1
+        return row, col
+
+    def last_obs(self, track: Dict):
+        """Extract the last associated observation."""
+        for bbox in track.obs[::-1]:
+            if bbox is not None:
+                return bbox
+
+    def ocr_assign_ids(self,
+                       track_obs: torch.Tensor,
+                       last_track_labels: torch.Tensor,
+                       det_bboxes: torch.Tensor,
+                       det_labels: torch.Tensor,
+                       det_scores: torch.Tensor,
+                       weight_iou_with_det_scores: Optional[bool] = False,
+                       match_iou_thr: Optional[float] = 0.5):
+        """Association for Observation-Centric Recovery (OCR).
+
+        To recover tracks that have been lost and whose estimated velocity
+        is out of date, we use an IoU-only matching strategy.
+
+        Args:
+            track_obs (Tensor): the last associated detection boxes of the
+                lost tracks.
+            last_track_labels (Tensor): labels of those last associated
+                detections.
+            det_bboxes (Tensor): of shape (N, 4), unmatched detections.
+            det_labels (Tensor): of shape (N,)
+            det_scores (Tensor): of shape (N,)
+            weight_iou_with_det_scores (bool, optional): Whether using
+                detection scores to weight IOU which is used for matching.
+                Defaults to False.
+            match_iou_thr (float, optional): Matching threshold.
+                Defaults to 0.5.
+
+        Returns:
+            tuple(np.ndarray, np.ndarray): The assigned row/col indices.
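+
+        Note: unlike OCM, this step deliberately ignores the Kalman
+        prediction and matches against each lost track's last associated
+        detection (``track_obs``), since a stale velocity estimate makes
+        the prediction unreliable.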
+ """ + # compute distance + ious = bbox_overlaps(track_obs, det_bboxes) + if weight_iou_with_det_scores: + ious *= det_scores + + # support multi-class association + cate_match = det_labels[None, :] == last_track_labels[:, None] + # to avoid det and track of different categories are matched + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + # bipartite match + if dists.size > 0: + cost, row, col = lap.lapjv( + dists, extend_cost=True, cost_limit=1 - match_iou_thr) + else: + row = np.zeros(len(track_obs)).astype(np.int32) - 1 + col = np.zeros(len(det_bboxes)).astype(np.int32) - 1 + return row, col + + def online_smooth(self, track: Dict, obj: torch.Tensor): + """Once a track is recovered from being lost, online smooth its + parameters to fix the error accumulated during being lost. + + NOTE: you can use different virtual trajectory generation + strategies, we adopt the naive linear interpolation as default + """ + last_match_bbox = self.last_obs(track) + new_match_bbox = obj + unmatch_len = 0 + for bbox in track.obs[::-1]: + if bbox is None: + unmatch_len += 1 + else: + break + bbox_shift_per_step = (new_match_bbox - last_match_bbox) / ( + unmatch_len + 1) + track.mean = track.saved_attr.mean + track.covariance = track.saved_attr.covariance + for i in range(unmatch_len): + virtual_bbox = last_match_bbox + (i + 1) * bbox_shift_per_step + virtual_bbox = bbox_xyxy_to_cxcyah(virtual_bbox[None, :]) + virtual_bbox = virtual_bbox.squeeze(0).cpu().numpy() + track.mean, track.covariance = self.kf.update( + track.mean, track.covariance, virtual_bbox) + + def track(self, data_sample: DetDataSample, **kwargs) -> InstanceData: + """Tracking forward function. + NOTE: this implementation is slightly different from the original + OC-SORT implementation (https://github.com/noahcao/OC_SORT)that we + do association between detections and tentative/non-tentative tracks + independently while the original implementation combines them together. + + Args: + data_sample (:obj:`DetDataSample`): The data sample. + It includes information such as `pred_instances`. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. + """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.empty or bboxes.size(0) == 0: + valid_inds = scores > self.init_track_thr + scores = scores[valid_inds] + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + num_new_tracks = bboxes.size(0) + ids = torch.arange(self.num_tracks, + self.num_tracks + num_new_tracks).to(labels) + self.num_tracks += num_new_tracks + else: + # 0. init + ids = torch.full((bboxes.size(0), ), + -1, + dtype=labels.dtype, + device=labels.device) + + # get the detection bboxes for the first association + det_inds = scores > self.obj_score_thr + det_bboxes = bboxes[det_inds] + det_labels = labels[det_inds] + det_scores = scores[det_inds] + det_ids = ids[det_inds] + + # 1. 
predict by Kalman Filter + for id in self.confirmed_ids: + # track is lost in previous frame + if self.tracks[id].frame_ids[-1] != frame_id - 1: + self.tracks[id].mean[7] = 0 + if self.tracks[id].tracked: + self.tracks[id].saved_attr.mean = self.tracks[id].mean + self.tracks[id].saved_attr.covariance = self.tracks[ + id].covariance + (self.tracks[id].mean, + self.tracks[id].covariance) = self.kf.predict( + self.tracks[id].mean, self.tracks[id].covariance) + + # 2. match detections and tracks' predicted locations + match_track_inds, raw_match_det_inds = self.ocm_assign_ids( + self.confirmed_ids, det_bboxes, det_labels, det_scores, + self.weight_iou_with_det_scores, self.match_iou_thr) + # '-1' mean a detection box is not matched with tracklets in + # previous frame + valid = raw_match_det_inds > -1 + det_ids[valid] = torch.tensor( + self.confirmed_ids)[raw_match_det_inds[valid]].to(labels) + + match_det_bboxes = det_bboxes[valid] + match_det_labels = det_labels[valid] + match_det_scores = det_scores[valid] + match_det_ids = det_ids[valid] + assert (match_det_ids > -1).all() + + # unmatched tracks and detections + unmatch_det_bboxes = det_bboxes[~valid] + unmatch_det_labels = det_labels[~valid] + unmatch_det_scores = det_scores[~valid] + unmatch_det_ids = det_ids[~valid] + assert (unmatch_det_ids == -1).all() + + # 3. use unmatched detection bboxes from the first match to match + # the unconfirmed tracks + (tentative_match_track_inds, + tentative_match_det_inds) = self.ocm_assign_ids( + self.unconfirmed_ids, unmatch_det_bboxes, unmatch_det_labels, + unmatch_det_scores, self.weight_iou_with_det_scores, + self.match_iou_thr) + valid = tentative_match_det_inds > -1 + unmatch_det_ids[valid] = torch.tensor(self.unconfirmed_ids)[ + tentative_match_det_inds[valid]].to(labels) + + match_det_bboxes = torch.cat( + (match_det_bboxes, unmatch_det_bboxes[valid]), dim=0) + match_det_labels = torch.cat( + (match_det_labels, unmatch_det_labels[valid]), dim=0) + match_det_scores = torch.cat( + (match_det_scores, unmatch_det_scores[valid]), dim=0) + match_det_ids = torch.cat((match_det_ids, unmatch_det_ids[valid]), + dim=0) + assert (match_det_ids > -1).all() + + unmatch_det_bboxes = unmatch_det_bboxes[~valid] + unmatch_det_labels = unmatch_det_labels[~valid] + unmatch_det_scores = unmatch_det_scores[~valid] + unmatch_det_ids = unmatch_det_ids[~valid] + assert (unmatch_det_ids == -1).all() + + all_track_ids = [id for id, _ in self.tracks.items()] + unmatched_track_inds = torch.tensor( + [ind for ind in all_track_ids if ind not in match_det_ids]) + + if len(unmatched_track_inds) > 0: + # 4. 
still some tracks not associated yet, perform OCR + last_observations = [] + for id in unmatched_track_inds: + last_box = self.last_obs(self.tracks[id.item()]) + last_observations.append(last_box) + last_observations = torch.stack(last_observations) + last_track_labels = torch.tensor([ + self.tracks[id.item()]['labels'][-1] + for id in unmatched_track_inds + ]).to(det_bboxes.device) + + remain_det_ids = torch.full((unmatch_det_bboxes.size(0), ), + -1, + dtype=labels.dtype, + device=labels.device) + + _, ocr_match_det_inds = self.ocr_assign_ids( + last_observations, last_track_labels, unmatch_det_bboxes, + unmatch_det_labels, unmatch_det_scores, + self.weight_iou_with_det_scores, self.match_iou_thr) + + valid = ocr_match_det_inds > -1 + remain_det_ids[valid] = unmatched_track_inds.clone()[ + ocr_match_det_inds[valid]].to(labels) + + ocr_match_det_bboxes = unmatch_det_bboxes[valid] + ocr_match_det_labels = unmatch_det_labels[valid] + ocr_match_det_scores = unmatch_det_scores[valid] + ocr_match_det_ids = remain_det_ids[valid] + assert (ocr_match_det_ids > -1).all() + + ocr_unmatch_det_bboxes = unmatch_det_bboxes[~valid] + ocr_unmatch_det_labels = unmatch_det_labels[~valid] + ocr_unmatch_det_scores = unmatch_det_scores[~valid] + ocr_unmatch_det_ids = remain_det_ids[~valid] + assert (ocr_unmatch_det_ids == -1).all() + + unmatch_det_bboxes = ocr_unmatch_det_bboxes + unmatch_det_labels = ocr_unmatch_det_labels + unmatch_det_scores = ocr_unmatch_det_scores + unmatch_det_ids = ocr_unmatch_det_ids + match_det_bboxes = torch.cat( + (match_det_bboxes, ocr_match_det_bboxes), dim=0) + match_det_labels = torch.cat( + (match_det_labels, ocr_match_det_labels), dim=0) + match_det_scores = torch.cat( + (match_det_scores, ocr_match_det_scores), dim=0) + match_det_ids = torch.cat((match_det_ids, ocr_match_det_ids), + dim=0) + + # 5. summarize the track results + for i in range(len(match_det_ids)): + det_bbox = match_det_bboxes[i] + track_id = match_det_ids[i].item() + if not self.tracks[track_id].tracked: + # the track is lost before this step + self.online_smooth(self.tracks[track_id], det_bbox) + + for track_id in all_track_ids: + if track_id not in match_det_ids: + self.tracks[track_id].tracked = False + self.tracks[track_id].obs.append(None) + + bboxes = torch.cat((match_det_bboxes, unmatch_det_bboxes), dim=0) + labels = torch.cat((match_det_labels, unmatch_det_labels), dim=0) + scores = torch.cat((match_det_scores, unmatch_det_scores), dim=0) + ids = torch.cat((match_det_ids, unmatch_det_ids), dim=0) + # 6. assign new ids + new_track_inds = ids == -1 + + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum()).to(labels) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + labels=labels, + scores=scores, + frame_ids=frame_id) + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + return pred_track_instances diff --git a/head_extractor/build/lib/mmdet/models/trackers/quasi_dense_tracker.py b/head_extractor/build/lib/mmdet/models/trackers/quasi_dense_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..c93c3c4c3bd5c8939e77195f30a7eb2f0314e225 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/quasi_dense_tracker.py @@ -0,0 +1,316 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample +from mmdet.structures.bbox import bbox_overlaps +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class QuasiDenseTracker(BaseTracker): + """Tracker for Quasi-Dense Tracking. + + Args: + init_score_thr (float): The cls_score threshold to + initialize a new tracklet. Defaults to 0.8. + obj_score_thr (float): The cls_score threshold to + update a tracked tracklet. Defaults to 0.5. + match_score_thr (float): The match threshold. Defaults to 0.5. + memo_tracklet_frames (int): The most frames in a tracklet memory. + Defaults to 10. + memo_backdrop_frames (int): The most frames in the backdrops. + Defaults to 1. + memo_momentum (float): The momentum value for embeds updating. + Defaults to 0.8. + nms_conf_thr (float): The nms threshold for confidence. + Defaults to 0.5. + nms_backdrop_iou_thr (float): The nms threshold for backdrop IoU. + Defaults to 0.3. + nms_class_iou_thr (float): The nms threshold for class IoU. + Defaults to 0.7. + with_cats (bool): Whether to track with the same category. + Defaults to True. + match_metric (str): The match metric. Defaults to 'bisoftmax'. + """ + + def __init__(self, + init_score_thr: float = 0.8, + obj_score_thr: float = 0.5, + match_score_thr: float = 0.5, + memo_tracklet_frames: int = 10, + memo_backdrop_frames: int = 1, + memo_momentum: float = 0.8, + nms_conf_thr: float = 0.5, + nms_backdrop_iou_thr: float = 0.3, + nms_class_iou_thr: float = 0.7, + with_cats: bool = True, + match_metric: str = 'bisoftmax', + **kwargs): + super().__init__(**kwargs) + assert 0 <= memo_momentum <= 1.0 + assert memo_tracklet_frames >= 0 + assert memo_backdrop_frames >= 0 + self.init_score_thr = init_score_thr + self.obj_score_thr = obj_score_thr + self.match_score_thr = match_score_thr + self.memo_tracklet_frames = memo_tracklet_frames + self.memo_backdrop_frames = memo_backdrop_frames + self.memo_momentum = memo_momentum + self.nms_conf_thr = nms_conf_thr + self.nms_backdrop_iou_thr = nms_backdrop_iou_thr + self.nms_class_iou_thr = nms_class_iou_thr + self.with_cats = with_cats + assert match_metric in ['bisoftmax', 'softmax', 'cosine'] + self.match_metric = match_metric + + self.num_tracks = 0 + self.tracks = dict() + self.backdrops = [] + + def reset(self): + """Reset the buffer of the tracker.""" + self.num_tracks = 0 + self.tracks = dict() + self.backdrops = [] + + def update(self, ids: Tensor, bboxes: Tensor, embeds: Tensor, + labels: Tensor, scores: Tensor, frame_id: int) -> None: + """Tracking forward function. + + Args: + ids (Tensor): of shape(N, ). + bboxes (Tensor): of shape (N, 5). + embeds (Tensor): of shape (N, 256). + labels (Tensor): of shape (N, ). + scores (Tensor): of shape (N, ). + frame_id (int): The id of current frame, 0-index. 
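+
+        Note: rows with ``id > -1`` update existing tracks (embeddings are
+        blended with momentum ``memo_momentum``) or start new ones; rows
+        with ``id == -1`` that survive the backdrop IoU suppression are
+        stored as backdrops for later matching.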
+ """ + tracklet_inds = ids > -1 + + for id, bbox, embed, label, score in zip(ids[tracklet_inds], + bboxes[tracklet_inds], + embeds[tracklet_inds], + labels[tracklet_inds], + scores[tracklet_inds]): + id = int(id) + # update the tracked ones and initialize new tracks + if id in self.tracks.keys(): + velocity = (bbox - self.tracks[id]['bbox']) / ( + frame_id - self.tracks[id]['last_frame']) + self.tracks[id]['bbox'] = bbox + self.tracks[id]['embed'] = ( + 1 - self.memo_momentum + ) * self.tracks[id]['embed'] + self.memo_momentum * embed + self.tracks[id]['last_frame'] = frame_id + self.tracks[id]['label'] = label + self.tracks[id]['score'] = score + self.tracks[id]['velocity'] = ( + self.tracks[id]['velocity'] * self.tracks[id]['acc_frame'] + + velocity) / ( + self.tracks[id]['acc_frame'] + 1) + self.tracks[id]['acc_frame'] += 1 + else: + self.tracks[id] = dict( + bbox=bbox, + embed=embed, + label=label, + score=score, + last_frame=frame_id, + velocity=torch.zeros_like(bbox), + acc_frame=0) + # backdrop update according to IoU + backdrop_inds = torch.nonzero(ids == -1, as_tuple=False).squeeze(1) + ious = bbox_overlaps(bboxes[backdrop_inds], bboxes) + for i, ind in enumerate(backdrop_inds): + if (ious[i, :ind] > self.nms_backdrop_iou_thr).any(): + backdrop_inds[i] = -1 + backdrop_inds = backdrop_inds[backdrop_inds > -1] + # old backdrops would be removed at first + self.backdrops.insert( + 0, + dict( + bboxes=bboxes[backdrop_inds], + embeds=embeds[backdrop_inds], + labels=labels[backdrop_inds])) + + # pop memo + invalid_ids = [] + for k, v in self.tracks.items(): + if frame_id - v['last_frame'] >= self.memo_tracklet_frames: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + if len(self.backdrops) > self.memo_backdrop_frames: + self.backdrops.pop() + + @property + def memo(self) -> Tuple[Tensor, ...]: + """Get tracks memory.""" + memo_embeds = [] + memo_ids = [] + memo_bboxes = [] + memo_labels = [] + # velocity of tracks + memo_vs = [] + # get tracks + for k, v in self.tracks.items(): + memo_bboxes.append(v['bbox'][None, :]) + memo_embeds.append(v['embed'][None, :]) + memo_ids.append(k) + memo_labels.append(v['label'].view(1, 1)) + memo_vs.append(v['velocity'][None, :]) + memo_ids = torch.tensor(memo_ids, dtype=torch.long).view(1, -1) + # get backdrops + for backdrop in self.backdrops: + backdrop_ids = torch.full((1, backdrop['embeds'].size(0)), + -1, + dtype=torch.long) + backdrop_vs = torch.zeros_like(backdrop['bboxes']) + memo_bboxes.append(backdrop['bboxes']) + memo_embeds.append(backdrop['embeds']) + memo_ids = torch.cat([memo_ids, backdrop_ids], dim=1) + memo_labels.append(backdrop['labels'][:, None]) + memo_vs.append(backdrop_vs) + + memo_bboxes = torch.cat(memo_bboxes, dim=0) + memo_embeds = torch.cat(memo_embeds, dim=0) + memo_labels = torch.cat(memo_labels, dim=0).squeeze(1) + memo_vs = torch.cat(memo_vs, dim=0) + return memo_bboxes, memo_labels, memo_embeds, memo_ids.squeeze( + 0), memo_vs + + def track(self, + model: torch.nn.Module, + img: torch.Tensor, + feats: List[torch.Tensor], + data_sample: TrackDataSample, + rescale=True, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + QDTrack method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. 
+ It includes information such as `pred_instances`. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + True. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. + """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + # create pred_track_instances + pred_track_instances = InstanceData() + + # return zero bboxes if there is no track targets + if bboxes.shape[0] == 0: + ids = torch.zeros_like(labels) + pred_track_instances = data_sample.pred_instances.clone() + pred_track_instances.instances_id = ids + return pred_track_instances + + # get track feats + rescaled_bboxes = bboxes.clone() + if rescale: + scale_factor = rescaled_bboxes.new_tensor( + metainfo['scale_factor']).repeat((1, 2)) + rescaled_bboxes = rescaled_bboxes * scale_factor + track_feats = model.track_head.predict(feats, [rescaled_bboxes]) + # sort according to the object_score + _, inds = scores.sort(descending=True) + bboxes = bboxes[inds] + scores = scores[inds] + labels = labels[inds] + embeds = track_feats[inds, :] + + # duplicate removal for potential backdrops and cross classes + valids = bboxes.new_ones((bboxes.size(0))) + ious = bbox_overlaps(bboxes, bboxes) + for i in range(1, bboxes.size(0)): + thr = self.nms_backdrop_iou_thr if scores[ + i] < self.obj_score_thr else self.nms_class_iou_thr + if (ious[i, :i] > thr).any(): + valids[i] = 0 + valids = valids == 1 + bboxes = bboxes[valids] + scores = scores[valids] + labels = labels[valids] + embeds = embeds[valids, :] + + # init ids container + ids = torch.full((bboxes.size(0), ), -1, dtype=torch.long) + + # match if buffer is not empty + if bboxes.size(0) > 0 and not self.empty: + (memo_bboxes, memo_labels, memo_embeds, memo_ids, + memo_vs) = self.memo + + if self.match_metric == 'bisoftmax': + feats = torch.mm(embeds, memo_embeds.t()) + d2t_scores = feats.softmax(dim=1) + t2d_scores = feats.softmax(dim=0) + match_scores = (d2t_scores + t2d_scores) / 2 + elif self.match_metric == 'softmax': + feats = torch.mm(embeds, memo_embeds.t()) + match_scores = feats.softmax(dim=1) + elif self.match_metric == 'cosine': + match_scores = torch.mm( + F.normalize(embeds, p=2, dim=1), + F.normalize(memo_embeds, p=2, dim=1).t()) + else: + raise NotImplementedError + # track with the same category + if self.with_cats: + cat_same = labels.view(-1, 1) == memo_labels.view(1, -1) + match_scores *= cat_same.float().to(match_scores.device) + # track according to match_scores + for i in range(bboxes.size(0)): + conf, memo_ind = torch.max(match_scores[i, :], dim=0) + id = memo_ids[memo_ind] + if conf > self.match_score_thr: + if id > -1: + # keep bboxes with high object score + # and remove background bboxes + if scores[i] > self.obj_score_thr: + ids[i] = id + match_scores[:i, memo_ind] = 0 + match_scores[i + 1:, memo_ind] = 0 + else: + if conf > self.nms_conf_thr: + ids[i] = -2 + # initialize new tracks + new_inds = (ids == -1) & (scores > self.init_score_thr).cpu() + num_news = new_inds.sum() + ids[new_inds] = torch.arange( + self.num_tracks, self.num_tracks + num_news, dtype=torch.long) + self.num_tracks += num_news + + self.update(ids, bboxes, embeds, labels, scores, frame_id) + tracklet_inds = ids > -1 + # update pred_track_instances + 
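# Only detections bound to a tracklet (id > -1) are reported;
+        # backdrops (id == -1) live only in the backdrop memory, and
+        # detections matched to a backdrop (id == -2) are dropped.
+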
pred_track_instances.bboxes = bboxes[tracklet_inds] + pred_track_instances.labels = labels[tracklet_inds] + pred_track_instances.scores = scores[tracklet_inds] + pred_track_instances.instances_id = ids[tracklet_inds] + + return pred_track_instances diff --git a/head_extractor/build/lib/mmdet/models/trackers/sort_tracker.py b/head_extractor/build/lib/mmdet/models/trackers/sort_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..c4a4fed92702f7d1ea66917a7157fcf5d0773a30 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/sort_tracker.py @@ -0,0 +1,268 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Tuple + +import numpy as np +import torch +from mmengine.structures import InstanceData + +try: + import motmetrics + from motmetrics.lap import linear_sum_assignment +except ImportError: + motmetrics = None +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import DetDataSample +from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcyah +from mmdet.utils import OptConfigType +from ..utils import imrenormalize +from .base_tracker import BaseTracker + + +@MODELS.register_module() +class SORTTracker(BaseTracker): + """Tracker for SORT/DeepSORT. + + Args: + obj_score_thr (float, optional): Threshold to filter the objects. + Defaults to 0.3. + motion (dict): Configuration of motion. Defaults to None. + reid (dict, optional): Configuration for the ReID model. + - num_samples (int, optional): Number of samples to calculate the + feature embeddings of a track. Default to 10. + - image_scale (tuple, optional): Input scale of the ReID model. + Default to (256, 128). + - img_norm_cfg (dict, optional): Configuration to normalize the + input. Default to None. + - match_score_thr (float, optional): Similarity threshold for the + matching process. Default to 2.0. + match_iou_thr (float, optional): Threshold of the IoU matching process. + Defaults to 0.7. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 3. 
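+
+    Example (an illustrative config sketch, not a shipped default; it
+    assumes mmdet's registered ``KalmanFilter`` task util and that
+    ``motmetrics`` is installed):
+        >>> tracker = SORTTracker(
+        ...     motion=dict(type='KalmanFilter'),
+        ...     obj_score_thr=0.3,
+        ...     match_iou_thr=0.7,
+        ...     num_tentatives=3)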
+ """ + + def __init__(self, + motion: Optional[dict] = None, + obj_score_thr: float = 0.3, + reid: dict = dict( + num_samples=10, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=2.0), + match_iou_thr: float = 0.7, + num_tentatives: int = 3, + **kwargs): + if motmetrics is None: + raise RuntimeError('motmetrics is not installed,\ + please install it by: pip install motmetrics') + super().__init__(**kwargs) + if motion is not None: + self.motion = TASK_UTILS.build(motion) + assert self.motion is not None, 'SORT/Deep SORT need KalmanFilter' + self.obj_score_thr = obj_score_thr + self.reid = reid + self.match_iou_thr = match_iou_thr + self.num_tentatives = num_tentatives + + @property + def confirmed_ids(self) -> List: + """Confirmed ids in the tracker.""" + ids = [id for id, track in self.tracks.items() if not track.tentative] + return ids + + def init_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Initialize a track.""" + super().init_track(id, obj) + self.tracks[id].tentative = True + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.initiate( + bbox) + + def update_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Update a track.""" + super().update_track(id, obj) + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox) + + def pop_invalid_tracks(self, frame_id: int) -> None: + """Pop out invalid tracks.""" + invalid_ids = [] + for k, v in self.tracks.items(): + # case1: disappeared frames >= self.num_frames_retrain + case1 = frame_id - v['frame_ids'][-1] >= self.num_frames_retain + # case2: tentative tracks but not matched in this frame + case2 = v.tentative and v['frame_ids'][-1] != frame_id + if case1 or case2: + invalid_ids.append(k) + for invalid_id in invalid_ids: + self.tracks.pop(invalid_id) + + def track(self, + model: torch.nn.Module, + img: Tensor, + data_sample: DetDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + False. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.with_reid: + if self.reid.get('img_norm_cfg', False): + img_norm_cfg = dict( + mean=data_preprocessor['mean'], + std=data_preprocessor['std'], + to_bgr=data_preprocessor['rgb_to_bgr']) + reid_img = imrenormalize(img, img_norm_cfg, + self.reid['img_norm_cfg']) + else: + reid_img = img.clone() + + valid_inds = scores > self.obj_score_thr + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + + if self.empty or bboxes.size(0) == 0: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long).to(bboxes.device) + self.num_tracks += num_new_tracks + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + else: + embeds = crops.new_zeros((0, model.reid.head.out_channels)) + else: + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) + + # motion + self.tracks, costs = self.motion.track(self.tracks, + bbox_xyxy_to_cxcyah(bboxes)) + + active_ids = self.confirmed_ids + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + embeds = model.reid(crops, mode='tensor') + + # reid + if len(active_ids) > 0: + track_embeds = self.get( + 'embeds', + active_ids, + self.reid.get('num_samples', None), + behavior='mean') + reid_dists = torch.cdist(track_embeds, embeds) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, :] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + reid_dists = (reid_dists + cate_cost).cpu().numpy() + + valid_inds = [list(self.ids).index(_) for _ in active_ids] + reid_dists[~np.isfinite(costs[valid_inds, :])] = np.nan + + row, col = linear_sum_assignment(reid_dists) + for r, c in zip(row, col): + dist = reid_dists[r, c] + if not np.isfinite(dist): + continue + if dist <= self.reid['match_score_thr']: + ids[c] = active_ids[r] + + active_ids = [ + id for id in self.ids if id not in ids + and self.tracks[id].frame_ids[-1] == frame_id - 1 + ] + if len(active_ids) > 0: + active_dets = torch.nonzero(ids == -1).squeeze(1) + track_bboxes = self.get('bboxes', active_ids) + ious = bbox_overlaps(track_bboxes, bboxes[active_dets]) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, active_dets] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + row, col = linear_sum_assignment(dists) + for r, c in zip(row, col): + dist = dists[r, c] + if dist < 1 - self.match_iou_thr: + ids[active_dets[c]] = active_ids[r] + + new_track_inds = ids == -1 + ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum(), + dtype=torch.long).to(bboxes.device) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + embeds=embeds if self.with_reid else None, + frame_ids=frame_id) + + # update pred_track_instances 
+ pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/head_extractor/build/lib/mmdet/models/trackers/strongsort_tracker.py b/head_extractor/build/lib/mmdet/models/trackers/strongsort_tracker.py new file mode 100644 index 0000000000000000000000000000000000000000..9d7075701bc3205b9ea30f03790cfa1c42a97822 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/trackers/strongsort_tracker.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import numpy as np +import torch +from mmengine.structures import InstanceData + +try: + import motmetrics + from motmetrics.lap import linear_sum_assignment +except ImportError: + motmetrics = None +from torch import Tensor + +from mmdet.models.utils import imrenormalize +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample +from mmdet.structures.bbox import bbox_overlaps, bbox_xyxy_to_cxcyah +from mmdet.utils import OptConfigType +from .sort_tracker import SORTTracker + + +def cosine_distance(x: Tensor, y: Tensor) -> np.ndarray: + """compute the cosine distance. + + Args: + x (Tensor): embeddings with shape (N,C). + y (Tensor): embeddings with shape (M,C). + + Returns: + ndarray: cosine distance with shape (N,M). + """ + x = x.cpu().numpy() + y = y.cpu().numpy() + x = x / np.linalg.norm(x, axis=1, keepdims=True) + y = y / np.linalg.norm(y, axis=1, keepdims=True) + dists = 1. - np.dot(x, y.T) + return dists + + +@MODELS.register_module() +class StrongSORTTracker(SORTTracker): + """Tracker for StrongSORT. + + Args: + obj_score_thr (float, optional): Threshold to filter the objects. + Defaults to 0.6. + motion (dict): Configuration of motion. Defaults to None. + reid (dict, optional): Configuration for the ReID model. + - num_samples (int, optional): Number of samples to calculate the + feature embeddings of a track. Default to None. + - image_scale (tuple, optional): Input scale of the ReID model. + Default to (256, 128). + - img_norm_cfg (dict, optional): Configuration to normalize the + input. Default to None. + - match_score_thr (float, optional): Similarity threshold for the + matching process. Default to 0.3. + - motion_weight (float, optional): the weight of the motion cost. + Defaults to 0.02. + match_iou_thr (float, optional): Threshold of the IoU matching process. + Defaults to 0.7. + num_tentatives (int, optional): Number of continuous frames to confirm + a track. Defaults to 2. 
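+
+    Example (an illustrative config sketch; ``use_nsa=True`` is an
+    assumption made here so that ``kf.update`` accepts the detection score
+    used in ``update_track``):
+        >>> tracker = StrongSORTTracker(
+        ...     motion=dict(type='KalmanFilter', center_only=False,
+        ...                 use_nsa=True),
+        ...     obj_score_thr=0.6,
+        ...     num_tentatives=2)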
+ """ + + def __init__(self, + motion: Optional[dict] = None, + obj_score_thr: float = 0.6, + reid: dict = dict( + num_samples=None, + img_scale=(256, 128), + img_norm_cfg=None, + match_score_thr=0.3, + motion_weight=0.02), + match_iou_thr: float = 0.7, + num_tentatives: int = 2, + **kwargs): + if motmetrics is None: + raise RuntimeError('motmetrics is not installed,\ + please install it by: pip install motmetrics') + super().__init__(motion, obj_score_thr, reid, match_iou_thr, + num_tentatives, **kwargs) + + def update_track(self, id: int, obj: Tuple[Tensor]) -> None: + """Update a track.""" + for k, v in zip(self.memo_items, obj): + v = v[None] + if self.momentums is not None and k in self.momentums: + m = self.momentums[k] + self.tracks[id][k] = (1 - m) * self.tracks[id][k] + m * v + else: + self.tracks[id][k].append(v) + + if self.tracks[id].tentative: + if len(self.tracks[id]['bboxes']) >= self.num_tentatives: + self.tracks[id].tentative = False + bbox = bbox_xyxy_to_cxcyah(self.tracks[id].bboxes[-1]) # size = (1, 4) + assert bbox.ndim == 2 and bbox.shape[0] == 1 + bbox = bbox.squeeze(0).cpu().numpy() + score = float(self.tracks[id].scores[-1].cpu()) + self.tracks[id].mean, self.tracks[id].covariance = self.kf.update( + self.tracks[id].mean, self.tracks[id].covariance, bbox, score) + + def track(self, + model: torch.nn.Module, + img: Tensor, + data_sample: TrackDataSample, + data_preprocessor: OptConfigType = None, + rescale: bool = False, + **kwargs) -> InstanceData: + """Tracking forward function. + + Args: + model (nn.Module): MOT model. + img (Tensor): of shape (T, C, H, W) encoding input image. + Typically these should be mean centered and std scaled. + The T denotes the number of key images and usually is 1 in + SORT method. + feats (list[Tensor]): Multi level feature maps of `img`. + data_sample (:obj:`TrackDataSample`): The data sample. + It includes information such as `pred_det_instances`. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + rescale (bool, optional): If True, the bounding boxes should be + rescaled to fit the original scale of the image. Defaults to + False. + + Returns: + :obj:`InstanceData`: Tracking results of the input images. + Each InstanceData usually contains ``bboxes``, ``labels``, + ``scores`` and ``instances_id``. 
+ """ + metainfo = data_sample.metainfo + bboxes = data_sample.pred_instances.bboxes + labels = data_sample.pred_instances.labels + scores = data_sample.pred_instances.scores + + frame_id = metainfo.get('frame_id', -1) + if frame_id == 0: + self.reset() + if not hasattr(self, 'kf'): + self.kf = self.motion + + if self.with_reid: + if self.reid.get('img_norm_cfg', False): + img_norm_cfg = dict( + mean=data_preprocessor.get('mean', [0, 0, 0]), + std=data_preprocessor.get('std', [1, 1, 1]), + to_bgr=data_preprocessor.get('rgb_to_bgr', False)) + reid_img = imrenormalize(img, img_norm_cfg, + self.reid['img_norm_cfg']) + else: + reid_img = img.clone() + + valid_inds = scores > self.obj_score_thr + bboxes = bboxes[valid_inds] + labels = labels[valid_inds] + scores = scores[valid_inds] + + if self.empty or bboxes.size(0) == 0: + num_new_tracks = bboxes.size(0) + ids = torch.arange( + self.num_tracks, + self.num_tracks + num_new_tracks, + dtype=torch.long).to(bboxes.device) + self.num_tracks += num_new_tracks + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + if crops.size(0) > 0: + embeds = model.reid(crops, mode='tensor') + else: + embeds = crops.new_zeros((0, model.reid.head.out_channels)) + else: + ids = torch.full((bboxes.size(0), ), -1, + dtype=torch.long).to(bboxes.device) + + # motion + if model.with_cmc: + num_samples = 1 + self.tracks = model.cmc.track(self.last_img, img, self.tracks, + num_samples, frame_id, metainfo) + + self.tracks, motion_dists = self.motion.track( + self.tracks, bbox_xyxy_to_cxcyah(bboxes)) + + active_ids = self.confirmed_ids + if self.with_reid: + crops = self.crop_imgs(reid_img, metainfo, bboxes.clone(), + rescale) + embeds = model.reid(crops, mode='tensor') + + # reid + if len(active_ids) > 0: + track_embeds = self.get( + 'embeds', + active_ids, + self.reid.get('num_samples', None), + behavior='mean') + reid_dists = cosine_distance(track_embeds, embeds) + valid_inds = [list(self.ids).index(_) for _ in active_ids] + reid_dists[~np.isfinite(motion_dists[ + valid_inds, :])] = np.nan + + weight_motion = self.reid.get('motion_weight') + match_dists = (1 - weight_motion) * reid_dists + \ + weight_motion * motion_dists[valid_inds] + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, :] == track_labels[:, None] + cate_cost = ((1 - cate_match.int()) * 1e6).cpu().numpy() + match_dists = match_dists + cate_cost + + row, col = linear_sum_assignment(match_dists) + for r, c in zip(row, col): + dist = match_dists[r, c] + if not np.isfinite(dist): + continue + if dist <= self.reid['match_score_thr']: + ids[c] = active_ids[r] + + active_ids = [ + id for id in self.ids if id not in ids + and self.tracks[id].frame_ids[-1] == frame_id - 1 + ] + if len(active_ids) > 0: + active_dets = torch.nonzero(ids == -1).squeeze(1) + track_bboxes = self.get('bboxes', active_ids) + ious = bbox_overlaps(track_bboxes, bboxes[active_dets]) + + # support multi-class association + track_labels = torch.tensor([ + self.tracks[id]['labels'][-1] for id in active_ids + ]).to(bboxes.device) + cate_match = labels[None, active_dets] == track_labels[:, None] + cate_cost = (1 - cate_match.int()) * 1e6 + + dists = (1 - ious + cate_cost).cpu().numpy() + + row, col = linear_sum_assignment(dists) + for r, c in zip(row, col): + dist = dists[r, c] + if dist < 1 - self.match_iou_thr: + ids[active_dets[c]] = active_ids[r] + + new_track_inds = ids == -1 + 
ids[new_track_inds] = torch.arange( + self.num_tracks, + self.num_tracks + new_track_inds.sum(), + dtype=torch.long).to(bboxes.device) + self.num_tracks += new_track_inds.sum() + + self.update( + ids=ids, + bboxes=bboxes, + scores=scores, + labels=labels, + embeds=embeds if self.with_reid else None, + frame_ids=frame_id) + self.last_img = img + + # update pred_track_instances + pred_track_instances = InstanceData() + pred_track_instances.bboxes = bboxes + pred_track_instances.labels = labels + pred_track_instances.scores = scores + pred_track_instances.instances_id = ids + + return pred_track_instances diff --git a/head_extractor/build/lib/mmdet/models/tracking_heads/__init__.py b/head_extractor/build/lib/mmdet/models/tracking_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bd1f0561cc076f2a603a64eb479cc6de0372a438 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/tracking_heads/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mask2former_track_head import Mask2FormerTrackHead +from .quasi_dense_embed_head import QuasiDenseEmbedHead +from .quasi_dense_track_head import QuasiDenseTrackHead +from .roi_embed_head import RoIEmbedHead +from .roi_track_head import RoITrackHead + +__all__ = [ + 'QuasiDenseEmbedHead', 'QuasiDenseTrackHead', 'Mask2FormerTrackHead', + 'RoIEmbedHead', 'RoITrackHead' +] diff --git a/head_extractor/build/lib/mmdet/models/tracking_heads/mask2former_track_head.py b/head_extractor/build/lib/mmdet/models/tracking_heads/mask2former_track_head.py new file mode 100644 index 0000000000000000000000000000000000000000..0877241bc33fcd1ef8f7ed154d503d9dbd8ab938 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/tracking_heads/mask2former_track_head.py @@ -0,0 +1,729 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from collections import defaultdict +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d +from mmcv.ops import point_sample +from mmengine.model import ModuleList +from mmengine.model.weight_init import caffe2_xavier_init +from mmengine.structures import InstanceData +from torch import Tensor + +from mmdet.models.dense_heads import AnchorFreeHead, MaskFormerHead +from mmdet.models.utils import get_uncertain_point_coords_with_randomness +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackDataSample, TrackSampleList +from mmdet.structures.mask import mask2bbox +from mmdet.utils import (ConfigType, InstanceList, OptConfigType, + OptMultiConfig, reduce_mean) +from ..layers import Mask2FormerTransformerDecoder + + +@MODELS.register_module() +class Mask2FormerTrackHead(MaskFormerHead): + """Implements the Mask2Former head. + + See `Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + in_channels (list[int]): Number of channels in the input feature map. + feat_channels (int): Number of channels for features. + out_channels (int): Number of channels for output. + num_classes (int): Number of VIS classes. + num_queries (int): Number of query in Transformer decoder. + Defaults to 100. + num_transformer_feat_level (int): Number of feats levels. + Defaults to 3. + pixel_decoder (:obj:`ConfigDict` or dict): Config for pixel + decoder. + enforce_decoder_input_project (bool, optional): Whether to add + a layer to change the embed_dim of transformer encoder in + pixel decoder to the embed_dim of transformer decoder. 
+ Defaults to False. + transformer_decoder (:obj:`ConfigDict` or dict): Config for + transformer decoder. + positional_encoding (:obj:`ConfigDict` or dict): Config for + transformer decoder position encoding. + Defaults to `SinePositionalEncoding3D`. + loss_cls (:obj:`ConfigDict` or dict): Config of the classification + loss. Defaults to `CrossEntropyLoss`. + loss_mask (:obj:`ConfigDict` or dict): Config of the mask loss. + Defaults to 'CrossEntropyLoss'. + loss_dice (:obj:`ConfigDict` or dict): Config of the dice loss. + Defaults to 'DiceLoss'. + train_cfg (:obj:`ConfigDict` or dict, optional): Training config of + Mask2Former head. Defaults to None. + test_cfg (:obj:`ConfigDict` or dict, optional): Testing config of + Mask2Former head. Defaults to None. + init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \ + dict], optional): Initialization config dict. Defaults to None. + """ + + def __init__(self, + in_channels: List[int], + feat_channels: int, + out_channels: int, + num_classes: int, + num_frames: int = 2, + num_queries: int = 100, + num_transformer_feat_level: int = 3, + pixel_decoder: ConfigType = ..., + enforce_decoder_input_project: bool = False, + transformer_decoder: ConfigType = ..., + positional_encoding: ConfigType = dict( + num_feats=128, normalize=True), + loss_cls: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * 133 + [0.1]), + loss_mask: ConfigType = dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice: ConfigType = dict( + type='DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + init_cfg: OptMultiConfig = None, + **kwargs) -> None: + super(AnchorFreeHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.num_frames = num_frames + self.num_queries = num_queries + self.num_transformer_feat_level = num_transformer_feat_level + self.num_transformer_feat_level = num_transformer_feat_level + self.num_heads = transformer_decoder.layer_cfg.cross_attn_cfg.num_heads + self.num_transformer_decoder_layers = transformer_decoder.num_layers + assert pixel_decoder.encoder.layer_cfg. 
\ + self_attn_cfg.num_levels == num_transformer_feat_level + pixel_decoder_ = copy.deepcopy(pixel_decoder) + pixel_decoder_.update( + in_channels=in_channels, + feat_channels=feat_channels, + out_channels=out_channels) + self.pixel_decoder = MODELS.build(pixel_decoder_) + self.transformer_decoder = Mask2FormerTransformerDecoder( + **transformer_decoder) + self.decoder_embed_dims = self.transformer_decoder.embed_dims + + self.decoder_input_projs = ModuleList() + # from low resolution to high resolution + for _ in range(num_transformer_feat_level): + if (self.decoder_embed_dims != feat_channels + or enforce_decoder_input_project): + self.decoder_input_projs.append( + Conv2d( + feat_channels, self.decoder_embed_dims, kernel_size=1)) + else: + self.decoder_input_projs.append(nn.Identity()) + self.decoder_positional_encoding = MODELS.build(positional_encoding) + self.query_embed = nn.Embedding(self.num_queries, feat_channels) + self.query_feat = nn.Embedding(self.num_queries, feat_channels) + # from low resolution to high resolution + self.level_embed = nn.Embedding(self.num_transformer_feat_level, + feat_channels) + + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + self.mask_embed = nn.Sequential( + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), + nn.Linear(feat_channels, out_channels)) + + self.test_cfg = test_cfg + self.train_cfg = train_cfg + if train_cfg: + self.assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.sampler = TASK_UTILS.build( + # self.train_cfg.sampler, default_args=dict(context=self)) + self.train_cfg['sampler'], + default_args=dict(context=self)) + self.num_points = self.train_cfg.get('num_points', 12544) + self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) + self.importance_sample_ratio = self.train_cfg.get( + 'importance_sample_ratio', 0.75) + + self.class_weight = loss_cls.class_weight + self.loss_cls = MODELS.build(loss_cls) + self.loss_mask = MODELS.build(loss_mask) + self.loss_dice = MODELS.build(loss_dice) + + def init_weights(self) -> None: + for m in self.decoder_input_projs: + if isinstance(m, Conv2d): + caffe2_xavier_init(m, bias=0) + + self.pixel_decoder.init_weights() + + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def preprocess_gt(self, batch_gt_instances: InstanceList) -> InstanceList: + """Preprocess the ground truth for all images. + + It aims to reorganize the `gt`. For example, in the + `batch_data_sample.gt_instances.mask`, its shape is + `(all_num_gts, h, w)`, but we don't know each gt belongs to which `img` + (assume `num_frames` is 2). So, this func used to reshape the `gt_mask` + to `(num_gts_per_img, num_frames, h, w)`. In addition, we can't + guarantee that the number of instances in these two images is equal, + so `-1` refers to nonexistent instances. + + Args: + batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + ground truth labels of each bbox, with shape (num_gts, ) + and ``masks``, each is ground truth masks of each instances + of an image, shape (num_gts, h, w). + + Returns: + list[obj:`InstanceData`]: each contains the following keys + + - labels (Tensor): Ground truth class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Ground truth mask for a\ + image, with shape (n, t, h, w). 
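+
+        Note:
+            As a schematic example (ids made up): if frame 0 contains
+            instance ids ``[1, 2]`` and frame 1 contains ``[2, 3]``, the
+            merged video-level gt covers ids ``{1, 2, 3}``; instance 1 keeps
+            an all-zero mask and id ``-1`` in frame 1, and instance 3 in
+            frame 0.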
+ """ + final_batch_gt_instances = [] + batch_size = len(batch_gt_instances) // self.num_frames + for batch_idx in range(batch_size): + pair_gt_insatences = batch_gt_instances[batch_idx * + self.num_frames:batch_idx * + self.num_frames + + self.num_frames] + + assert len( + pair_gt_insatences + ) > 1, f'mask2former for vis need multi frames to train, \ + but you only use {len(pair_gt_insatences)} frames' + + _device = pair_gt_insatences[0].labels.device + + for gt_instances in pair_gt_insatences: + gt_instances.masks = gt_instances.masks.to_tensor( + dtype=torch.bool, device=_device) + all_ins_id = torch.cat([ + gt_instances.instances_ids + for gt_instances in pair_gt_insatences + ]) + all_ins_id = all_ins_id.unique().tolist() + map_ins_id = dict() + for i, ins_id in enumerate(all_ins_id): + map_ins_id[ins_id] = i + + num_instances = len(all_ins_id) + mask_shape = [ + num_instances, self.num_frames, + pair_gt_insatences[0].masks.shape[1], + pair_gt_insatences[0].masks.shape[2] + ] + gt_masks_per_video = torch.zeros( + mask_shape, dtype=torch.bool, device=_device) + gt_ids_per_video = torch.full((num_instances, self.num_frames), + -1, + dtype=torch.long, + device=_device) + gt_labels_per_video = torch.full((num_instances, ), + -1, + dtype=torch.long, + device=_device) + + for frame_id in range(self.num_frames): + cur_frame_gts = pair_gt_insatences[frame_id] + ins_ids = cur_frame_gts.instances_ids.tolist() + for i, id in enumerate(ins_ids): + gt_masks_per_video[map_ins_id[id], + frame_id, :, :] = cur_frame_gts.masks[i] + gt_ids_per_video[map_ins_id[id], + frame_id] = cur_frame_gts.instances_ids[i] + gt_labels_per_video[ + map_ins_id[id]] = cur_frame_gts.labels[i] + + tmp_instances = InstanceData( + labels=gt_labels_per_video, + masks=gt_masks_per_video.long(), + instances_id=gt_ids_per_video) + final_batch_gt_instances.append(tmp_instances) + + return final_batch_gt_instances + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData, + img_meta: dict) -> Tuple[Tensor]: + """Compute classification and mask targets for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, num_frames, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + img_meta (dict): Image informtation. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - label_weights (Tensor): Label weights of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, num_frames, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + - pos_inds (Tensor): Sampled positive indices for each \ + image. + - neg_inds (Tensor): Sampled negative indices for each \ + image. + - sampling_result (:obj:`SamplingResult`): Sampling results. 
+ """ + # (num_gts, ) + gt_labels = gt_instances.labels + # (num_gts, num_frames, h, w) + gt_masks = gt_instances.masks + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + + # shape (num_queries, num_points) + mask_points_pred = point_sample(mask_pred, + point_coords.repeat(num_queries, 1, + 1)).flatten(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample(gt_masks.float(), + point_coords.repeat(num_gts, 1, + 1)).flatten(1) + + sampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_points_masks) + sampled_pred_instances = InstanceData( + scores=cls_score, masks=mask_points_pred) + # assign and sample + assign_result = self.assigner.assign( + pred_instances=sampled_pred_instances, + gt_instances=sampled_gt_instances, + img_meta=img_meta) + pred_instances = InstanceData(scores=cls_score, masks=mask_pred) + sampling_result = self.sampler.sample( + assign_result=assign_result, + pred_instances=pred_instances, + gt_instances=gt_instances) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label target + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_labels.new_ones((self.num_queries, )) + + # mask target + mask_targets = gt_masks[sampling_result.pos_assigned_gt_inds] + mask_weights = mask_pred.new_zeros((self.num_queries, )) + mask_weights[pos_inds] = 1.0 + + return (labels, label_weights, mask_targets, mask_weights, pos_inds, + neg_inds, sampling_result) + + def _loss_by_feat_single(self, cls_scores: Tensor, mask_preds: Tensor, + batch_gt_instances: List[InstanceData], + batch_img_metas: List[dict]) -> Tuple[Tensor]: + """Loss function for outputs from a single decoder layer. + + Args: + cls_scores (Tensor): Mask score logits from a single decoder layer + for all images. Shape (batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should include + background. + mask_preds (Tensor): Mask logits for a pixel decoder for all + images. Shape (batch_size, num_queries, num_frames,h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + batch_img_metas (list[dict]): List of image meta information. + + Returns: + tuple[Tensor]: Loss components for outputs from a single \ + decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + mask_preds_list = [mask_preds[i] for i in range(num_imgs)] + (labels_list, label_weights_list, mask_targets_list, mask_weights_list, + avg_factor) = self.get_targets(cls_scores_list, mask_preds_list, + batch_gt_instances, batch_img_metas) + # shape (batch_size, num_queries) + labels = torch.stack(labels_list, dim=0) + # shape (batch_size, num_queries) + label_weights = torch.stack(label_weights_list, dim=0) + # shape (num_total_gts, num_frames, h, w) + mask_targets = torch.cat(mask_targets_list, dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(mask_weights_list, dim=0) + + # classfication loss + # shape (batch_size * num_queries, ) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + label_weights = label_weights.flatten(0, 1) + + class_weight = cls_scores.new_tensor(self.class_weight) + loss_cls = self.loss_cls( + cls_scores, + labels, + label_weights, + avg_factor=class_weight[labels].sum()) + + num_total_masks = reduce_mean(cls_scores.new_tensor([avg_factor])) + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, num_frames, h, w) + # -> (num_total_gts, num_frames, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] == 0: + # zero match + loss_dice = mask_preds.sum() + loss_mask = mask_preds.sum() + return loss_cls, loss_mask, loss_dice + + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.flatten(0, 1).unsqueeze(1), None, self.num_points, + self.oversample_ratio, self.importance_sample_ratio) + # shape (num_total_gts * num_frames, h, w) -> + # (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.flatten(0, 1).unsqueeze(1).float(), + points_coords).squeeze(1) + # shape (num_total_gts * num_frames, num_points) + mask_point_preds = point_sample( + mask_preds.flatten(0, 1).unsqueeze(1), points_coords).squeeze(1) + + # dice loss + loss_dice = self.loss_dice( + mask_point_preds, mask_point_targets, avg_factor=num_total_masks) + + # mask loss + # shape (num_total_gts * num_frames, num_points) -> + # (num_total_gts * num_frames * num_points, ) + mask_point_preds = mask_point_preds.reshape(-1) + # shape (num_total_gts, num_points) -> (num_total_gts * num_points, ) + mask_point_targets = mask_point_targets.reshape(-1) + loss_mask = self.loss_mask( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * self.num_points / self.num_frames) + + return loss_cls, loss_mask, loss_dice + + def _forward_head( + self, decoder_out: Tensor, mask_feature: Tensor, + attn_mask_target_size: Tuple[int, + int]) -> Tuple[Tensor, Tensor, Tensor]: + """Forward for head part which is called after every decoder layer. + + Args: + decoder_out (Tensor): in shape (num_queries, batch_size, c). + mask_feature (Tensor): in shape (batch_size, t, c, h, w). + attn_mask_target_size (tuple[int, int]): target attention + mask size. + + Returns: + tuple: A tuple contain three elements. + + - cls_pred (Tensor): Classification scores in shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should include background. + - mask_pred (Tensor): Mask scores in shape \ + (batch_size, num_queries,h, w). + - attn_mask (Tensor): Attention mask in shape \ + (batch_size * num_heads, num_queries, h, w). 
+ """ + decoder_out = self.transformer_decoder.post_norm(decoder_out) + cls_pred = self.cls_embed(decoder_out) + mask_embed = self.mask_embed(decoder_out) + + # shape (batch_size, num_queries, t, h, w) + mask_pred = torch.einsum('bqc,btchw->bqthw', mask_embed, mask_feature) + b, q, t, _, _ = mask_pred.shape + + attn_mask = F.interpolate( + mask_pred.flatten(0, 1), + attn_mask_target_size, + mode='bilinear', + align_corners=False).view(b, q, t, attn_mask_target_size[0], + attn_mask_target_size[1]) + + # shape (batch_size, num_queries, t, h, w) -> + # (batch_size, num_queries, t*h*w) -> + # (batch_size, num_head, num_queries, t*h*w) -> + # (batch_size*num_head, num_queries, t*h*w) + attn_mask = attn_mask.flatten(2).unsqueeze(1).repeat( + (1, self.num_heads, 1, 1)).flatten(0, 1) + attn_mask = attn_mask.sigmoid() < 0.5 + attn_mask = attn_mask.detach() + + return cls_pred, mask_pred, attn_mask + + def forward( + self, x: List[Tensor], data_samples: TrackDataSample + ) -> Tuple[List[Tensor], List[Tensor]]: + """Forward function. + + Args: + x (list[Tensor]): Multi scale Features from the + upstream network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + tuple[list[Tensor]]: A tuple contains two elements. + + - cls_pred_list (list[Tensor)]: Classification logits \ + for each decoder layer. Each is a 3D-tensor with shape \ + (batch_size, num_queries, cls_out_channels). \ + Note `cls_out_channels` should include background. + - mask_pred_list (list[Tensor]): Mask logits for each \ + decoder layer. Each with shape (batch_size, num_queries, \ + h, w). + """ + mask_features, multi_scale_memorys = self.pixel_decoder(x) + bt, c_m, h_m, w_m = mask_features.shape + batch_size = bt // self.num_frames if self.training else 1 + t = bt // batch_size + mask_features = mask_features.view(batch_size, t, c_m, h_m, w_m) + # multi_scale_memorys (from low resolution to high resolution) + decoder_inputs = [] + decoder_positional_encodings = [] + for i in range(self.num_transformer_feat_level): + decoder_input = self.decoder_input_projs[i](multi_scale_memorys[i]) + decoder_input = decoder_input.flatten(2) + level_embed = self.level_embed.weight[i][None, :, None] + decoder_input = decoder_input + level_embed + _, c, hw = decoder_input.shape + # shape (batch_size*t, c, h, w) -> + # (batch_size, t, c, hw) -> + # (batch_size, t*h*w, c) + decoder_input = decoder_input.view(batch_size, t, c, + hw).permute(0, 1, 3, + 2).flatten(1, 2) + # shape (batch_size, c, h, w) -> (h*w, batch_size, c) + mask = decoder_input.new_zeros( + (batch_size, t) + multi_scale_memorys[i].shape[-2:], + dtype=torch.bool) + decoder_positional_encoding = self.decoder_positional_encoding( + mask) + decoder_positional_encoding = decoder_positional_encoding.flatten( + 3).permute(0, 1, 3, 2).flatten(1, 2) + decoder_inputs.append(decoder_input) + decoder_positional_encodings.append(decoder_positional_encoding) + # shape (num_queries, c) -> (batch_size, num_queries, c) + query_feat = self.query_feat.weight.unsqueeze(0).repeat( + (batch_size, 1, 1)) + query_embed = self.query_embed.weight.unsqueeze(0).repeat( + (batch_size, 1, 1)) + + cls_pred_list = [] + mask_pred_list = [] + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[0].shape[-2:]) + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + for i in range(self.num_transformer_decoder_layers): + level_idx = i % 
self.num_transformer_feat_level + # if a mask is all True(all background), then set it all False. + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + # cross_attn + self_attn + layer = self.transformer_decoder.layers[i] + query_feat = layer( + query=query_feat, + key=decoder_inputs[level_idx], + value=decoder_inputs[level_idx], + query_pos=query_embed, + key_pos=decoder_positional_encodings[level_idx], + cross_attn_mask=attn_mask, + query_key_padding_mask=None, + # here we do not apply masking on padded region + key_padding_mask=None) + cls_pred, mask_pred, attn_mask = self._forward_head( + query_feat, mask_features, multi_scale_memorys[ + (i + 1) % self.num_transformer_feat_level].shape[-2:]) + + cls_pred_list.append(cls_pred) + mask_pred_list.append(mask_pred) + + return cls_pred_list, mask_pred_list + + def loss( + self, + x: Tuple[Tensor], + data_samples: TrackSampleList, + ) -> Dict[str, Tensor]: + """Perform forward propagation and loss calculation of the track head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + batch_img_metas = [] + batch_gt_instances = [] + + for data_sample in data_samples: + video_img_metas = defaultdict(list) + for image_idx in range(len(data_sample)): + batch_gt_instances.append(data_sample[image_idx].gt_instances) + for key, value in data_sample[image_idx].metainfo.items(): + video_img_metas[key].append(value) + batch_img_metas.append(video_img_metas) + + # forward + all_cls_scores, all_mask_preds = self(x, data_samples) + + # preprocess ground truth + batch_gt_instances = self.preprocess_gt(batch_gt_instances) + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, + x: Tuple[Tensor], + data_samples: TrackDataSample, + rescale: bool = True) -> InstanceList: + """Test without augmentation. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + data_samples (List[:obj:`TrackDataSample`]): The Data + Samples. It usually includes information such as `gt_instance`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + list[obj:`InstanceData`]: each contains the following keys + - labels (Tensor): Prediction class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Prediction mask for a\ + image, with shape (n, t, h, w). 
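+
+        Note:
+            Only the output of the last decoder layer is used at test time,
+            and results are taken from the first data sample, so batch size
+            1 is assumed during inference.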
+ """ + + batch_img_metas = [ + data_samples[img_idx].metainfo + for img_idx in range(len(data_samples)) + ] + all_cls_scores, all_mask_preds = self(x, data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + + mask_cls_results = mask_cls_results[0] + # upsample masks + img_shape = batch_img_metas[0]['batch_input_shape'] + mask_pred_results = F.interpolate( + mask_pred_results[0], + size=(img_shape[0], img_shape[1]), + mode='bilinear', + align_corners=False) + + results = self.predict_by_feat(mask_cls_results, mask_pred_results, + batch_img_metas) + return results + + def predict_by_feat(self, + mask_cls_results: List[Tensor], + mask_pred_results: List[Tensor], + batch_img_metas: List[dict], + rescale: bool = True) -> InstanceList: + """Get top-10 predictions. + + Args: + mask_cls_results (Tensor): Mask classification logits,\ + shape (batch_size, num_queries, cls_out_channels). + Note `cls_out_channels` should include background. + mask_pred_results (Tensor): Mask logits, shape \ + (batch_size, num_queries, h, w). + batch_img_metas (list[dict]): List of image meta information. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + list[obj:`InstanceData`]: each contains the following keys + - labels (Tensor): Prediction class indices\ + for an image, with shape (n, ), n is the sum of\ + number of stuff type and number of instance in an image. + - masks (Tensor): Prediction mask for a\ + image, with shape (n, t, h, w). + """ + results = [] + if len(mask_cls_results) > 0: + scores = F.softmax(mask_cls_results, dim=-1)[:, :-1] + labels = torch.arange(self.num_classes).unsqueeze(0).repeat( + self.num_queries, 1).flatten(0, 1).to(scores.device) + # keep top-10 predictions + scores_per_image, topk_indices = scores.flatten(0, 1).topk( + 10, sorted=False) + labels_per_image = labels[topk_indices] + topk_indices = topk_indices // self.num_classes + mask_pred_results = mask_pred_results[topk_indices] + + img_shape = batch_img_metas[0]['img_shape'] + mask_pred_results = \ + mask_pred_results[:, :, :img_shape[0], :img_shape[1]] + if rescale: + # return result in original resolution + ori_height, ori_width = batch_img_metas[0]['ori_shape'][:2] + mask_pred_results = F.interpolate( + mask_pred_results, + size=(ori_height, ori_width), + mode='bilinear', + align_corners=False) + + masks = mask_pred_results > 0. + + # format top-10 predictions + for img_idx in range(len(batch_img_metas)): + pred_track_instances = InstanceData() + + pred_track_instances.masks = masks[:, img_idx] + pred_track_instances.bboxes = mask2bbox(masks[:, img_idx]) + pred_track_instances.labels = labels_per_image + pred_track_instances.scores = scores_per_image + pred_track_instances.instances_id = torch.arange(10) + + results.append(pred_track_instances) + + return results diff --git a/head_extractor/build/lib/mmdet/models/tracking_heads/quasi_dense_embed_head.py b/head_extractor/build/lib/mmdet/models/tracking_heads/quasi_dense_embed_head.py new file mode 100644 index 0000000000000000000000000000000000000000..55e3c05b7aba188608f7dd2fdda54e0759cee03c --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/tracking_heads/quasi_dense_embed_head.py @@ -0,0 +1,347 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.task_modules import SamplingResult
+from mmdet.registry import MODELS
+from ..task_modules.tracking import embed_similarity
+
+
+@MODELS.register_module()
+class QuasiDenseEmbedHead(BaseModule):
+    """The quasi-dense roi embed head.
+
+    Args:
+        embed_channels (int): The input channel of embed features.
+            Defaults to 256.
+        softmax_temp (int): Softmax temperature. Defaults to -1.
+        loss_track (dict): The loss function for tracking. Defaults to
+            MultiPosCrossEntropyLoss.
+        loss_track_aux (dict): The auxiliary loss function for tracking.
+            Defaults to MarginL2Loss.
+        init_cfg (:obj:`ConfigDict` or dict or list[:obj:`ConfigDict` or \
+            dict]): Initialization config dict.
+    """
+
+    def __init__(self,
+                 num_convs: int = 0,
+                 num_fcs: int = 0,
+                 roi_feat_size: int = 7,
+                 in_channels: int = 256,
+                 conv_out_channels: int = 256,
+                 with_avg_pool: bool = False,
+                 fc_out_channels: int = 1024,
+                 conv_cfg: Optional[dict] = None,
+                 norm_cfg: Optional[dict] = None,
+                 embed_channels: int = 256,
+                 softmax_temp: int = -1,
+                 loss_track: Optional[dict] = None,
+                 loss_track_aux: dict = dict(
+                     type='MarginL2Loss',
+                     sample_ratio=3,
+                     margin=0.3,
+                     loss_weight=1.0,
+                     hard_mining=True),
+                 init_cfg: dict = dict(
+                     type='Xavier',
+                     layer='Linear',
+                     distribution='uniform',
+                     bias=0,
+                     override=dict(
+                         type='Normal',
+                         name='fc_embed',
+                         mean=0,
+                         std=0.01,
+                         bias=0))):
+        super(QuasiDenseEmbedHead, self).__init__(init_cfg=init_cfg)
+        self.num_convs = num_convs
+        self.num_fcs = num_fcs
+        self.roi_feat_size = _pair(roi_feat_size)
+        self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1]
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.with_avg_pool = with_avg_pool
+        self.fc_out_channels = fc_out_channels
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+
+        if self.with_avg_pool:
+            self.avg_pool = nn.AvgPool2d(self.roi_feat_size)
+        # add convs and fcs
+        self.convs, self.fcs, self.last_layer_dim = self._add_conv_fc_branch(
+            self.num_convs, self.num_fcs, self.in_channels)
+        self.relu = nn.ReLU(inplace=True)
+
+        if loss_track is None:
+            loss_track = dict(
+                type='MultiPosCrossEntropyLoss', loss_weight=0.25)
+
+        self.fc_embed = nn.Linear(self.last_layer_dim, embed_channels)
+        self.softmax_temp = softmax_temp
+        self.loss_track = MODELS.build(loss_track)
+        if loss_track_aux is not None:
+            self.loss_track_aux = MODELS.build(loss_track_aux)
+        else:
+            self.loss_track_aux = None
+
+    def _add_conv_fc_branch(
+            self, num_branch_convs: int, num_branch_fcs: int,
+            in_channels: int) -> Tuple[nn.ModuleList, nn.ModuleList, int]:
+        """Add shared or separable branch: convs -> avg pool (optional) -> fcs.
+
+        Args:
+            num_branch_convs (int): The number of convolutional layers.
+            num_branch_fcs (int): The number of fully connected layers.
+            in_channels (int): The input channel of roi features.
+
+        Returns:
+            Tuple[nn.ModuleList, nn.ModuleList, int]: The convs, fcs and the
+                last layer dimension.
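+
+        Example (a dimension walk-through under the class defaults; with
+            ``roi_feat_size=7`` and no avg pooling, the flattened conv
+            output feeds the first fc):
+            >>> 256 * 7 * 7  # conv_out_channels * roi_feat_area
+            12544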
+ """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, + self.conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + last_layer_dim = self.conv_out_channels + + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + if not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + + return branch_convs, branch_fcs, last_layer_dim + + def forward(self, x: Tensor) -> Tensor: + """Forward function. + + Args: + x (Tensor): The input features from ROI head. + + Returns: + Tensor: The embedding feature map. + """ + + if self.num_convs > 0: + for conv in self.convs: + x = conv(x) + x = x.flatten(1) + if self.num_fcs > 0: + for fc in self.fcs: + x = self.relu(fc(x)) + x = self.fc_embed(x) + return x + + def get_targets( + self, gt_match_indices: List[Tensor], + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult]) -> Tuple[List, List]: + """Calculate the track targets and track weights for all samples in a + batch according to the sampling_results. + + Args: + gt_match_indices (list(Tensor)): Mapping from gt_instance_ids to + ref_gt_instance_ids of the same tracklet in a pair of images. + key_sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResult]): Assign results of + all reference images in a batch after sampling. + + Returns: + Tuple[list[Tensor]]: Association results. + Containing the following list of Tensors: + + - track_targets (list[Tensor]): The mapping instance ids from + all positive proposals in the key image to all proposals + in the reference image, each tensor in list has + shape (len(key_pos_bboxes), len(ref_bboxes)). + - track_weights (list[Tensor]): Loss weights for all positive + proposals in a batch, each tensor in list has + shape (len(key_pos_bboxes),). + """ + + track_targets = [] + track_weights = [] + for _gt_match_indices, key_res, ref_res in zip(gt_match_indices, + key_sampling_results, + ref_sampling_results): + targets = _gt_match_indices.new_zeros( + (key_res.pos_bboxes.size(0), ref_res.bboxes.size(0)), + dtype=torch.int) + _match_indices = _gt_match_indices[key_res.pos_assigned_gt_inds] + pos2pos = (_match_indices.view( + -1, 1) == ref_res.pos_assigned_gt_inds.view(1, -1)).int() + targets[:, :pos2pos.size(1)] = pos2pos + weights = (targets.sum(dim=1) > 0).float() + track_targets.append(targets) + track_weights.append(weights) + return track_targets, track_weights + + def match( + self, key_embeds: Tensor, ref_embeds: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult] + ) -> Tuple[List[Tensor], List[Tensor]]: + """Calculate the dist matrixes for loss measurement. + + Args: + key_embeds (Tensor): Embeds of positive bboxes in sampling results + of key image. + ref_embeds (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. 
+ ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + + Returns: + Tuple[list[Tensor]]: Calculation results. + Containing the following list of Tensors: + + - dists (list[Tensor]): Dot-product dists between + key_embeds and ref_embeds, each tensor in list has + shape (len(key_pos_bboxes), len(ref_bboxes)). + - cos_dists (list[Tensor]): Cosine dists between + key_embeds and ref_embeds, each tensor in list has + shape (len(key_pos_bboxes), len(ref_bboxes)). + """ + + num_key_rois = [res.pos_bboxes.size(0) for res in key_sampling_results] + key_embeds = torch.split(key_embeds, num_key_rois) + num_ref_rois = [res.bboxes.size(0) for res in ref_sampling_results] + ref_embeds = torch.split(ref_embeds, num_ref_rois) + + dists, cos_dists = [], [] + for key_embed, ref_embed in zip(key_embeds, ref_embeds): + dist = embed_similarity( + key_embed, + ref_embed, + method='dot_product', + temperature=self.softmax_temp) + dists.append(dist) + if self.loss_track_aux is not None: + cos_dist = embed_similarity( + key_embed, ref_embed, method='cosine') + cos_dists.append(cos_dist) + else: + cos_dists.append(None) + return dists, cos_dists + + def loss(self, key_roi_feats: Tensor, ref_roi_feats: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult], + gt_match_indices_list: List[Tensor]) -> dict: + """Calculate the track loss and the auxiliary track loss. + + Args: + key_roi_feats (Tensor): Embeds of positive bboxes in sampling + results of key image. + ref_roi_feats (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + gt_match_indices_list (list(Tensor)): Mapping from gt_instances_ids + to ref_gt_instances_ids of the same tracklet in a pair of + images. + + Returns: + Dict [str: Tensor]: Calculation results. + Containing the following list of Tensors: + + - loss_track (Tensor): Results of loss_track function. + - loss_track_aux (Tensor): Results of loss_track_aux function. + """ + key_track_feats = self(key_roi_feats) + ref_track_feats = self(ref_roi_feats) + + losses = self.loss_by_feat(key_track_feats, ref_track_feats, + key_sampling_results, ref_sampling_results, + gt_match_indices_list) + return losses + + def loss_by_feat(self, key_track_feats: Tensor, ref_track_feats: Tensor, + key_sampling_results: List[SamplingResult], + ref_sampling_results: List[SamplingResult], + gt_match_indices_list: List[Tensor]) -> dict: + """Calculate the track loss and the auxiliary track loss. + + Args: + key_track_feats (Tensor): Embeds of positive bboxes in sampling + results of key image. + ref_track_feats (Tensor): Embeds of all bboxes in sampling results + of the reference image. + key_sampling_results (List[obj:SamplingResults]): Assign results of + all images in a batch after sampling. + ref_sampling_results (List[obj:SamplingResults]): Assign results of + all reference images in a batch after sampling. + gt_match_indices_list (list(Tensor)): Mapping from instances_ids + from key image to reference image of the same tracklet in a + pair of images. + + Returns: + Dict [str: Tensor]: Calculation results. + Containing the following list of Tensors: + + - loss_track (Tensor): Results of loss_track function. 
+ - loss_track_aux (Tensor): Results of loss_track_aux function. + """ + dists, cos_dists = self.match(key_track_feats, ref_track_feats, + key_sampling_results, + ref_sampling_results) + targets, weights = self.get_targets(gt_match_indices_list, + key_sampling_results, + ref_sampling_results) + losses = dict() + + loss_track = 0. + loss_track_aux = 0. + for _dists, _cos_dists, _targets, _weights in zip( + dists, cos_dists, targets, weights): + loss_track += self.loss_track( + _dists, _targets, _weights, avg_factor=_weights.sum()) + if self.loss_track_aux is not None: + loss_track_aux += self.loss_track_aux(_cos_dists, _targets) + losses['loss_track'] = loss_track / len(dists) + + if self.loss_track_aux is not None: + losses['loss_track_aux'] = loss_track_aux / len(dists) + + return losses + + def predict(self, bbox_feats: Tensor) -> Tensor: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + bbox_feats: The extracted roi features. + + Returns: + Tensor: The extracted track features. + """ + track_feats = self(bbox_feats) + return track_feats diff --git a/head_extractor/build/lib/mmdet/models/tracking_heads/quasi_dense_track_head.py b/head_extractor/build/lib/mmdet/models/tracking_heads/quasi_dense_track_head.py new file mode 100644 index 0000000000000000000000000000000000000000..bd078dac827e35c7514330870cf884001985156b --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/tracking_heads/quasi_dense_track_head.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackSampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList + + +@MODELS.register_module() +class QuasiDenseTrackHead(BaseModule): + """The quasi-dense track head.""" + + def __init__(self, + roi_extractor: Optional[dict] = None, + embed_head: Optional[dict] = None, + regress_head: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + init_cfg: Optional[dict] = None, + **kwargs): + super().__init__(init_cfg=init_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if embed_head is not None: + self.init_embed_head(roi_extractor, embed_head) + + if regress_head is not None: + raise NotImplementedError('Regression head is not supported yet.') + + self.init_assigner_sampler() + + def init_embed_head(self, roi_extractor, embed_head) -> None: + """Initialize ``embed_head`` + + Args: + roi_extractor (dict, optional): Configuration of roi extractor. + Defaults to None. + embed_head (dict, optional): Configuration of embed head. Defaults + to None. + """ + self.roi_extractor = MODELS.build(roi_extractor) + self.embed_head = MODELS.build(embed_head) + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.bbox_sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + + @property + def with_track(self) -> bool: + """bool: whether the multi-object tracker has an embed head""" + return hasattr(self, 'embed_head') and self.embed_head is not None + + def extract_roi_feats(self, feats: List[Tensor], + bboxes: List[Tensor]) -> Tensor: + """Extract roi features. 
+ + Args: + feats (list[Tensor]): list of multi-level image features. + bboxes (list[Tensor]): list of bboxes in sampling result. + + Returns: + Tensor: The extracted roi features. + """ + rois = bbox2roi(bboxes) + bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs], + rois) + return bbox_feats + + def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor], + rpn_results_list: InstanceList, + ref_rpn_results_list: InstanceList, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + key_feats (list[Tensor]): list of multi-level image features. + ref_feats (list[Tensor]): list of multi-level ref_img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals of key img. + ref_rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals of ref img. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + assert self.with_track + num_imgs = len(data_samples) + batch_gt_instances = [] + ref_batch_gt_instances = [] + batch_gt_instances_ignore = [] + gt_match_indices_list = [] + for track_data_sample in data_samples: + key_data_sample = track_data_sample.get_key_frames()[0] + ref_data_sample = track_data_sample.get_ref_frames()[0] + batch_gt_instances.append(key_data_sample.gt_instances) + ref_batch_gt_instances.append(ref_data_sample.gt_instances) + if 'ignored_instances' in key_data_sample: + batch_gt_instances_ignore.append( + key_data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + # get gt_match_indices + ins_ids = key_data_sample.gt_instances.instances_ids.tolist() + ref_ins_ids = ref_data_sample.gt_instances.instances_ids.tolist() + match_indices = Tensor([ + ref_ins_ids.index(i) if (i in ref_ins_ids and i > 0) else -1 + for i in ins_ids + ]).to(key_feats[0].device) + gt_match_indices_list.append(match_indices) + + key_sampling_results, ref_sampling_results = [], [] + for i in range(num_imgs): + rpn_results = rpn_results_list[i] + ref_rpn_results = ref_rpn_results_list[i] + # rename ref_rpn_results.bboxes to ref_rpn_results.priors + ref_rpn_results.priors = ref_rpn_results.pop('bboxes') + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in key_feats]) + key_sampling_results.append(sampling_result) + + ref_assign_result = self.bbox_assigner.assign( + ref_rpn_results, ref_batch_gt_instances[i], + batch_gt_instances_ignore[i]) + ref_sampling_result = self.bbox_sampler.sample( + ref_assign_result, + ref_rpn_results, + ref_batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in ref_feats]) + ref_sampling_results.append(ref_sampling_result) + + key_bboxes = [res.pos_bboxes for res in key_sampling_results] + key_roi_feats = self.extract_roi_feats(key_feats, key_bboxes) + ref_bboxes = [res.bboxes for res in ref_sampling_results] + ref_roi_feats = self.extract_roi_feats(ref_feats, ref_bboxes) + + loss_track = self.embed_head.loss(key_roi_feats, ref_roi_feats, + key_sampling_results, + ref_sampling_results, + gt_match_indices_list) + + return loss_track + + def predict(self, feats: List[Tensor], + rescaled_bboxes: List[Tensor]) -> Tensor: + """Perform forward propagation of the tracking head 
and predict
+        tracking results on the features of the upstream network.
+
+        Args:
+            feats (list[Tensor]): Multi level feature maps of `img`.
+            rescaled_bboxes (list[Tensor]): list of rescaled bboxes in sampling
+                result.
+
+        Returns:
+            Tensor: The extracted track features.
+        """
+        bbox_feats = self.extract_roi_feats(feats, rescaled_bboxes)
+        track_feats = self.embed_head.predict(bbox_feats)
+        return track_feats
diff --git a/head_extractor/build/lib/mmdet/models/tracking_heads/roi_embed_head.py b/head_extractor/build/lib/mmdet/models/tracking_heads/roi_embed_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..e18b81fbe52e109e7afb3e6d5e8e6624ef48242f
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/tracking_heads/roi_embed_head.py
@@ -0,0 +1,391 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import defaultdict
+from typing import List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+from torch.nn.modules.utils import _pair
+
+from mmdet.models.losses import accuracy
+from mmdet.models.task_modules import SamplingResult
+from mmdet.models.task_modules.tracking import embed_similarity
+from mmdet.registry import MODELS
+
+
+@MODELS.register_module()
+class RoIEmbedHead(BaseModule):
+    """The roi embed head.
+
+    This module is used in multi-object tracking methods, such as MaskTrack
+    R-CNN.
+
+    Args:
+        num_convs (int): The number of convolutional layers to embed roi
+            features. Defaults to 0.
+        num_fcs (int): The number of fully connected layers to embed roi
+            features. Defaults to 0.
+        roi_feat_size (int|tuple(int)): The spatial size of roi features.
+            Defaults to 7.
+        in_channels (int): The input channel of roi features. Defaults to 256.
+        conv_out_channels (int): The output channel of roi features after
+            forwarding convolutional layers. Defaults to 256.
+        with_avg_pool (bool): Whether to use average pooling before passing roi
+            features into fully connected layers. Defaults to False.
+        fc_out_channels (int): The output channel of roi features after
+            forwarding fully connected layers. Defaults to 1024.
+        conv_cfg (dict): Config dict for convolution layer. Defaults to None,
+            which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer. Defaults to None.
+        loss_match (dict): The loss function. Defaults to
+            dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)
+        init_cfg (dict): Configuration of initialization. Defaults to None.
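+
+    Example:
+        A minimal illustrative sketch (not from the original file); the
+        channel sizes and proposal counts below are arbitrary, and the
+        snippet assumes the mmdet registry is initialized so the default
+        ``loss_match`` can be built:
+
+        >>> import torch
+        >>> head = RoIEmbedHead(num_fcs=2, roi_feat_size=7, in_channels=16,
+        ...                     fc_out_channels=32)
+        >>> x = torch.rand(4, 16, 7, 7)      # 4 key-frame proposals
+        >>> ref_x = torch.rand(6, 16, 7, 7)  # 6 reference-frame proposals
+        >>> x_split, ref_x_split = head(x, ref_x, [4], [6])
+        >>> (x_split[0].shape, ref_x_split[0].shape)
+        (torch.Size([4, 32]), torch.Size([6, 32]))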
+ """ + + def __init__(self, + num_convs: int = 0, + num_fcs: int = 0, + roi_feat_size: int = 7, + in_channels: int = 256, + conv_out_channels: int = 256, + with_avg_pool: bool = False, + fc_out_channels: int = 1024, + conv_cfg: Optional[dict] = None, + norm_cfg: Optional[dict] = None, + loss_match: dict = dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + init_cfg: Optional[dict] = None, + **kwargs): + super(RoIEmbedHead, self).__init__(init_cfg=init_cfg) + self.num_convs = num_convs + self.num_fcs = num_fcs + self.roi_feat_size = _pair(roi_feat_size) + self.roi_feat_area = self.roi_feat_size[0] * self.roi_feat_size[1] + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.with_avg_pool = with_avg_pool + self.fc_out_channels = fc_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.loss_match = MODELS.build(loss_match) + self.fp16_enabled = False + + if self.with_avg_pool: + self.avg_pool = nn.AvgPool2d(self.roi_feat_size) + # add convs and fcs + self.convs, self.fcs, self.last_layer_dim = self._add_conv_fc_branch( + self.num_convs, self.num_fcs, self.in_channels) + self.relu = nn.ReLU(inplace=True) + + def _add_conv_fc_branch( + self, num_branch_convs: int, num_branch_fcs: int, + in_channels: int) -> Tuple[nn.ModuleList, nn.ModuleList, int]: + """Add shared or separable branch. + + convs -> avg pool (optional) -> fcs + """ + last_layer_dim = in_channels + # add branch specific conv layers + branch_convs = nn.ModuleList() + if num_branch_convs > 0: + for i in range(num_branch_convs): + conv_in_channels = ( + last_layer_dim if i == 0 else self.conv_out_channels) + branch_convs.append( + ConvModule( + conv_in_channels, + self.conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + last_layer_dim = self.conv_out_channels + + # add branch specific fc layers + branch_fcs = nn.ModuleList() + if num_branch_fcs > 0: + if not self.with_avg_pool: + last_layer_dim *= self.roi_feat_area + for i in range(num_branch_fcs): + fc_in_channels = ( + last_layer_dim if i == 0 else self.fc_out_channels) + branch_fcs.append( + nn.Linear(fc_in_channels, self.fc_out_channels)) + last_layer_dim = self.fc_out_channels + + return branch_convs, branch_fcs, last_layer_dim + + @property + def custom_activation(self): + return getattr(self.loss_match, 'custom_activation', False) + + def extract_feat(self, x: Tensor, + num_x_per_img: List[int]) -> Tuple[Tensor]: + """Extract feature from the input `x`, and split the output to a list. + + Args: + x (Tensor): of shape [N, C, H, W]. N is the number of proposals. + num_x_per_img (list[int]): The `x` contains proposals of + multi-images. `num_x_per_img` denotes the number of proposals + for each image. + + Returns: + list[Tensor]: Each Tensor denotes the embed features belonging to + an image in a batch. + """ + if self.num_convs > 0: + for conv in self.convs: + x = conv(x) + + if self.num_fcs > 0: + if self.with_avg_pool: + x = self.avg_pool(x) + x = x.flatten(1) + for fc in self.fcs: + x = self.relu(fc(x)) + else: + x = x.flatten(1) + + x_split = torch.split(x, num_x_per_img, dim=0) + return x_split + + def forward( + self, x: Tensor, ref_x: Tensor, num_x_per_img: List[int], + num_x_per_ref_img: List[int] + ) -> Tuple[Tuple[Tensor], Tuple[Tensor]]: + """Computing the similarity scores between `x` and `ref_x`. + + Args: + x (Tensor): of shape [N, C, H, W]. N is the number of key frame + proposals. + ref_x (Tensor): of shape [M, C, H, W]. 
M is the number of reference + frame proposals. + num_x_per_img (list[int]): The `x` contains proposals of + multi-images. `num_x_per_img` denotes the number of proposals + for each key image. + num_x_per_ref_img (list[int]): The `ref_x` contains proposals of + multi-images. `num_x_per_ref_img` denotes the number of + proposals for each reference image. + + Returns: + tuple[tuple[Tensor], tuple[Tensor]]: Each tuple of tensor denotes + the embed features belonging to an image in a batch. + """ + x_split = self.extract_feat(x, num_x_per_img) + ref_x_split = self.extract_feat(ref_x, num_x_per_ref_img) + + return x_split, ref_x_split + + def get_targets(self, sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor]) -> Tuple[List, List]: + """Calculate the ground truth for all samples in a batch according to + the sampling_results. + + Args: + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). + + Returns: + Tuple[list[Tensor]]: Ground truth for proposals in a batch. + Containing the following list of Tensors: + + - track_id_targets (list[Tensor]): The instance ids of + Gt_labels for all proposals in a batch, each tensor in list + has shape (num_proposals,). + - track_id_weights (list[Tensor]): Labels_weights for + all proposals in a batch, each tensor in list has + shape (num_proposals,). + """ + track_id_targets = [] + track_id_weights = [] + + for res, gt_instance_id, ref_gt_instance_id in zip( + sampling_results, gt_instance_ids, ref_gt_instance_ids): + pos_instance_ids = gt_instance_id[res.pos_assigned_gt_inds] + pos_match_id = gt_instance_id.new_zeros(len(pos_instance_ids)) + for i, id in enumerate(pos_instance_ids): + if id in ref_gt_instance_id: + pos_match_id[i] = ref_gt_instance_id.tolist().index(id) + 1 + + track_id_target = gt_instance_id.new_zeros( + len(res.bboxes), dtype=torch.int64) + track_id_target[:len(res.pos_bboxes)] = pos_match_id + track_id_weight = res.bboxes.new_zeros(len(res.bboxes)) + track_id_weight[:len(res.pos_bboxes)] = 1.0 + + track_id_targets.append(track_id_target) + track_id_weights.append(track_id_weight) + + return track_id_targets, track_id_weights + + def loss( + self, + bbox_feats: Tensor, + ref_bbox_feats: Tensor, + num_bbox_per_img: int, + num_bbox_per_ref_img: int, + sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor], + reduction_override: Optional[str] = None, + ) -> dict: + """Calculate the loss in a batch. + + Args: + bbox_feats (Tensor): of shape [N, C, H, W]. N is the number of + bboxes. + ref_bbox_feats (Tensor): of shape [M, C, H, W]. M is the number of + reference bboxes. + num_bbox_per_img (list[int]): The `bbox_feats` contains proposals + of multi-images. `num_bbox_per_img` denotes the number of + proposals for each key image. + num_bbox_per_ref_img (list[int]): The `ref_bbox_feats` contains + proposals of multi-images. `num_bbox_per_ref_img` denotes the + number of proposals for each reference image. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. 
+ gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). + reduction_override (str, optional): The method used to reduce the + loss. Options are "none", "mean" and "sum". + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + x_split, ref_x_split = self(bbox_feats, ref_bbox_feats, + num_bbox_per_img, num_bbox_per_ref_img) + + losses = self.loss_by_feat(x_split, ref_x_split, sampling_results, + gt_instance_ids, ref_gt_instance_ids, + reduction_override) + return losses + + def loss_by_feat(self, + x_split: Tuple[Tensor], + ref_x_split: Tuple[Tensor], + sampling_results: List[SamplingResult], + gt_instance_ids: List[Tensor], + ref_gt_instance_ids: List[Tensor], + reduction_override: Optional[str] = None) -> dict: + """Calculate losses. + + Args: + x_split (Tensor): The embed features belonging to key image. + ref_x_split (Tensor): The embed features belonging to ref image. + sampling_results (List[obj:SamplingResult]): Assign results of + all images in a batch after sampling. + gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes of + all images in a batch, each tensor has shape (num_gt, ). + ref_gt_instance_ids (list[Tensor]): The instance ids of gt_bboxes + of all reference images in a batch, each tensor has shape + (num_gt, ). + reduction_override (str, optional): The method used to reduce the + loss. Options are "none", "mean" and "sum". + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + track_id_targets, track_id_weights = self.get_targets( + sampling_results, gt_instance_ids, ref_gt_instance_ids) + assert isinstance(track_id_targets, list) + assert isinstance(track_id_weights, list) + assert len(track_id_weights) == len(track_id_targets) + + losses = defaultdict(list) + similarity_logits = [] + for one_x, one_ref_x in zip(x_split, ref_x_split): + similarity_logit = embed_similarity( + one_x, one_ref_x, method='dot_product') + dummy = similarity_logit.new_zeros(one_x.shape[0], 1) + similarity_logit = torch.cat((dummy, similarity_logit), dim=1) + similarity_logits.append(similarity_logit) + assert isinstance(similarity_logits, list) + assert len(similarity_logits) == len(track_id_targets) + + for similarity_logit, track_id_target, track_id_weight in zip( + similarity_logits, track_id_targets, track_id_weights): + avg_factor = max(torch.sum(track_id_target > 0).float().item(), 1.) 
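+            # `avg_factor` counts the positive proposals that were matched to
+            # a reference instance (target id > 0); clamping at 1. avoids a
+            # zero division when a frame pair has no matched positives.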
+ if similarity_logit.numel() > 0: + loss_match = self.loss_match( + similarity_logit, + track_id_target, + track_id_weight, + avg_factor=avg_factor, + reduction_override=reduction_override) + if isinstance(loss_match, dict): + for key, value in loss_match.items(): + losses[key].append(value) + else: + losses['loss_match'].append(loss_match) + + valid_index = track_id_weight > 0 + valid_similarity_logit = similarity_logit[valid_index] + valid_track_id_target = track_id_target[valid_index] + if self.custom_activation: + match_accuracy = self.loss_match.get_accuracy( + valid_similarity_logit, valid_track_id_target) + for key, value in match_accuracy.items(): + losses[key].append(value) + else: + losses['match_accuracy'].append( + accuracy(valid_similarity_logit, + valid_track_id_target)) + + for key, value in losses.items(): + losses[key] = sum(losses[key]) / len(similarity_logits) + return losses + + def predict(self, roi_feats: Tensor, + prev_roi_feats: Tensor) -> List[Tensor]: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + roi_feats (Tensor): Feature map of current images rois. + prev_roi_feats (Tensor): Feature map of previous images rois. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + x_split, ref_x_split = self(roi_feats, prev_roi_feats, + [roi_feats.shape[0]], + [prev_roi_feats.shape[0]]) + + similarity_logits = self.predict_by_feat(x_split, ref_x_split) + + return similarity_logits + + def predict_by_feat(self, x_split: Tuple[Tensor], + ref_x_split: Tuple[Tensor]) -> List[Tensor]: + """Get similarity_logits. + + Args: + x_split (Tensor): The embed features belonging to key image. + ref_x_split (Tensor): The embed features belonging to ref image. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + similarity_logits = [] + for one_x, one_ref_x in zip(x_split, ref_x_split): + similarity_logit = embed_similarity( + one_x, one_ref_x, method='dot_product') + dummy = similarity_logit.new_zeros(one_x.shape[0], 1) + similarity_logit = torch.cat((dummy, similarity_logit), dim=1) + similarity_logits.append(similarity_logit) + return similarity_logits diff --git a/head_extractor/build/lib/mmdet/models/tracking_heads/roi_track_head.py b/head_extractor/build/lib/mmdet/models/tracking_heads/roi_track_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c51c810022cc856411e1de83278e38fdc2b670c8 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/tracking_heads/roi_track_head.py @@ -0,0 +1,178 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta +from typing import List, Optional, Tuple + +from mmengine.model import BaseModule +from torch import Tensor + +from mmdet.registry import MODELS, TASK_UTILS +from mmdet.structures import TrackSampleList +from mmdet.structures.bbox import bbox2roi +from mmdet.utils import InstanceList + + +@MODELS.register_module() +class RoITrackHead(BaseModule, metaclass=ABCMeta): + """The roi track head. + + This module is used in multi-object tracking methods, such as MaskTrack + R-CNN. + + Args: + roi_extractor (dict): Configuration of roi extractor. Defaults to None. + embed_head (dict): Configuration of embed head. Defaults to None. + train_cfg (dict): Configuration when training. Defaults to None. + test_cfg (dict): Configuration when testing. Defaults to None. 
+ init_cfg (dict): Configuration of initialization. Defaults to None. + """ + + def __init__(self, + roi_extractor: Optional[dict] = None, + embed_head: Optional[dict] = None, + regress_head: Optional[dict] = None, + train_cfg: Optional[dict] = None, + test_cfg: Optional[dict] = None, + init_cfg: Optional[dict] = None, + *args, + **kwargs): + super().__init__(init_cfg=init_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if embed_head is not None: + self.init_embed_head(roi_extractor, embed_head) + + if regress_head is not None: + raise NotImplementedError('Regression head is not supported yet.') + + self.init_assigner_sampler() + + def init_embed_head(self, roi_extractor, embed_head) -> None: + """Initialize ``embed_head``""" + self.roi_extractor = MODELS.build(roi_extractor) + self.embed_head = MODELS.build(embed_head) + + def init_assigner_sampler(self) -> None: + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + self.bbox_assigner = TASK_UTILS.build(self.train_cfg.assigner) + self.bbox_sampler = TASK_UTILS.build( + self.train_cfg.sampler, default_args=dict(context=self)) + + @property + def with_track(self) -> bool: + """bool: whether the multi-object tracker has an embed head""" + return hasattr(self, 'embed_head') and self.embed_head is not None + + def extract_roi_feats( + self, feats: List[Tensor], + bboxes: List[Tensor]) -> Tuple[Tuple[Tensor], List[int]]: + """Extract roi features. + + Args: + feats (list[Tensor]): list of multi-level image features. + bboxes (list[Tensor]): list of bboxes in sampling result. + + Returns: + tuple[tuple[Tensor], list[int]]: The extracted roi features and + the number of bboxes in each image. + """ + rois = bbox2roi(bboxes) + bbox_feats = self.roi_extractor(feats[:self.roi_extractor.num_inputs], + rois) + num_bbox_per_img = [len(bbox) for bbox in bboxes] + return bbox_feats, num_bbox_per_img + + def loss(self, key_feats: List[Tensor], ref_feats: List[Tensor], + rpn_results_list: InstanceList, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + key_feats (list[Tensor]): list of multi-level image features. + ref_feats (list[Tensor]): list of multi-level ref_img features. + rpn_results_list (list[:obj:`InstanceData`]): List of region + proposals. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. 
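+
+        Note:
+            The returned dict comes from ``self.embed_head.loss``; with the
+            default :class:`RoIEmbedHead` it typically contains
+            ``loss_match`` and ``match_accuracy`` entries (the exact keys
+            depend on the configured ``loss_match``).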
+ """ + assert self.with_track + batch_gt_instances = [] + ref_batch_gt_instances = [] + batch_gt_instances_ignore = [] + gt_instance_ids = [] + ref_gt_instance_ids = [] + for track_data_sample in data_samples: + key_data_sample = track_data_sample.get_key_frames()[0] + ref_data_sample = track_data_sample.get_ref_frames()[0] + batch_gt_instances.append(key_data_sample.gt_instances) + ref_batch_gt_instances.append(ref_data_sample.gt_instances) + if 'ignored_instances' in key_data_sample: + batch_gt_instances_ignore.append( + key_data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + + gt_instance_ids.append(key_data_sample.gt_instances.instances_ids) + ref_gt_instance_ids.append( + ref_data_sample.gt_instances.instances_ids) + + losses = dict() + num_imgs = len(data_samples) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None] * num_imgs + sampling_results = [] + for i in range(num_imgs): + rpn_results = rpn_results_list[i] + + assign_result = self.bbox_assigner.assign( + rpn_results, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = self.bbox_sampler.sample( + assign_result, + rpn_results, + batch_gt_instances[i], + feats=[lvl_feat[i][None] for lvl_feat in key_feats]) + sampling_results.append(sampling_result) + + bboxes = [res.bboxes for res in sampling_results] + bbox_feats, num_bbox_per_img = self.extract_roi_feats( + key_feats, bboxes) + + # batch_size is 1 + ref_gt_bboxes = [ + ref_batch_gt_instance.bboxes + for ref_batch_gt_instance in ref_batch_gt_instances + ] + ref_bbox_feats, num_bbox_per_ref_img = self.extract_roi_feats( + ref_feats, ref_gt_bboxes) + + loss_track = self.embed_head.loss(bbox_feats, ref_bbox_feats, + num_bbox_per_img, + num_bbox_per_ref_img, + sampling_results, gt_instance_ids, + ref_gt_instance_ids) + losses.update(loss_track) + + return losses + + def predict(self, roi_feats: Tensor, + prev_roi_feats: Tensor) -> List[Tensor]: + """Perform forward propagation of the tracking head and predict + tracking results on the features of the upstream network. + + Args: + roi_feats (Tensor): Feature map of current images rois. + prev_roi_feats (Tensor): Feature map of previous images rois. + + Returns: + list[Tensor]: The predicted similarity_logits of each pair of key + image and reference image. + """ + return self.embed_head.predict(roi_feats, prev_roi_feats)[0] diff --git a/head_extractor/build/lib/mmdet/models/utils/__init__.py b/head_extractor/build/lib/mmdet/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a00d9a37f33169dc1c523c68db55f823dd0424fa --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/__init__.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .gaussian_target import (gather_feat, gaussian_radius, + gen_gaussian_target, get_local_maximum, + get_topk_from_heatmap, transpose_and_gather_feat) +from .image import imrenormalize +from .make_divisible import make_divisible +# Disable yapf because it conflicts with isort. 
+# yapf: disable +from .misc import (align_tensor, aligned_bilinear, center_of_mass, + empty_instances, filter_gt_instances, + filter_scores_and_topk, flip_tensor, generate_coordinate, + images_to_levels, interpolate_as, levels_to_images, + mask2ndarray, multi_apply, relative_coordinate_maps, + rename_loss_dict, reweight_loss_dict, + samplelist_boxtype2tensor, select_single_mlvl, + sigmoid_geometric_mean, unfold_wo_center, unmap, + unpack_gt_instances) +from .panoptic_gt_processing import preprocess_panoptic_gt +from .point_sample import (get_uncertain_point_coords_with_randomness, + get_uncertainty) +from .vlfuse_helper import BertEncoderLayer, VLFuse, permute_and_flatten +from .wbf import weighted_boxes_fusion + +__all__ = [ + 'gaussian_radius', 'gen_gaussian_target', 'make_divisible', + 'get_local_maximum', 'get_topk_from_heatmap', 'transpose_and_gather_feat', + 'interpolate_as', 'sigmoid_geometric_mean', 'gather_feat', + 'preprocess_panoptic_gt', 'get_uncertain_point_coords_with_randomness', + 'get_uncertainty', 'unpack_gt_instances', 'empty_instances', + 'center_of_mass', 'filter_scores_and_topk', 'flip_tensor', + 'generate_coordinate', 'levels_to_images', 'mask2ndarray', 'multi_apply', + 'select_single_mlvl', 'unmap', 'images_to_levels', + 'samplelist_boxtype2tensor', 'filter_gt_instances', 'rename_loss_dict', + 'reweight_loss_dict', 'relative_coordinate_maps', 'aligned_bilinear', + 'unfold_wo_center', 'imrenormalize', 'VLFuse', 'permute_and_flatten', + 'BertEncoderLayer', 'align_tensor', 'weighted_boxes_fusion' +] diff --git a/head_extractor/build/lib/mmdet/models/utils/gaussian_target.py b/head_extractor/build/lib/mmdet/models/utils/gaussian_target.py new file mode 100644 index 0000000000000000000000000000000000000000..5bf4d558ce05c4f953e1c3fcf75016e5874afce1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/gaussian_target.py @@ -0,0 +1,268 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from math import sqrt + +import torch +import torch.nn.functional as F + + +def gaussian2D(radius, sigma=1, dtype=torch.float32, device='cpu'): + """Generate 2D gaussian kernel. + + Args: + radius (int): Radius of gaussian kernel. + sigma (int): Sigma of gaussian function. Default: 1. + dtype (torch.dtype): Dtype of gaussian tensor. Default: torch.float32. + device (str): Device of gaussian tensor. Default: 'cpu'. + + Returns: + h (Tensor): Gaussian kernel with a + ``(2 * radius + 1) * (2 * radius + 1)`` shape. + """ + x = torch.arange( + -radius, radius + 1, dtype=dtype, device=device).view(1, -1) + y = torch.arange( + -radius, radius + 1, dtype=dtype, device=device).view(-1, 1) + + h = (-(x * x + y * y) / (2 * sigma * sigma)).exp() + + h[h < torch.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def gen_gaussian_target(heatmap, center, radius, k=1): + """Generate 2D gaussian heatmap. + + Args: + heatmap (Tensor): Input heatmap, the gaussian kernel will cover on + it and maintain the max value. + center (list[int]): Coord of gaussian kernel's center. + radius (int): Radius of gaussian kernel. + k (int): Coefficient of gaussian kernel. Default: 1. + + Returns: + out_heatmap (Tensor): Updated heatmap covered by gaussian kernel. 
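+
+    Example:
+        An illustrative sketch (not from the original file); the 5x5 heatmap
+        and unit radius are arbitrary demonstration values:
+
+        >>> import torch
+        >>> heatmap = torch.zeros((5, 5))
+        >>> out = gen_gaussian_target(heatmap, [2, 2], radius=1)
+        >>> float(out[2, 2])  # the kernel peaks at the center
+        1.0
+        >>> out is heatmap  # the input heatmap is updated in place
+        True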
+ """ + diameter = 2 * radius + 1 + gaussian_kernel = gaussian2D( + radius, sigma=diameter / 6, dtype=heatmap.dtype, device=heatmap.device) + + x, y = center + + height, width = heatmap.shape[:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian_kernel[radius - top:radius + bottom, + radius - left:radius + right] + out_heatmap = heatmap + torch.max( + masked_heatmap, + masked_gaussian * k, + out=out_heatmap[y - top:y + bottom, x - left:x + right]) + + return out_heatmap + + +def gaussian_radius(det_size, min_overlap): + r"""Generate 2D gaussian radius. + + This function is modified from the `official github repo + `_. + + Given ``min_overlap``, radius could computed by a quadratic equation + according to Vieta's formulas. + + There are 3 cases for computing gaussian radius, details are following: + + - Explanation of figure: ``lt`` and ``br`` indicates the left-top and + bottom-right corner of ground truth box. ``x`` indicates the + generated corner at the limited position when ``radius=r``. + + - Case1: one corner is inside the gt box and the other is outside. + + .. code:: text + + |< width >| + + lt-+----------+ - + | | | ^ + +--x----------+--+ + | | | | + | | | | height + | | overlap | | + | | | | + | | | | v + +--+---------br--+ - + | | | + +----------+--x + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{(w-r)*(h-r)}{w*h+(w+h)r-r^2} \ge {iou} \quad\Rightarrow\quad + {r^2-(w+h)r+\cfrac{1-iou}{1+iou}*w*h} \ge 0 \\ + {a} = 1,\quad{b} = {-(w+h)},\quad{c} = {\cfrac{1-iou}{1+iou}*w*h} + {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a} + + - Case2: both two corners are inside the gt box. + + .. code:: text + + |< width >| + + lt-+----------+ - + | | | ^ + +--x-------+ | + | | | | + | |overlap| | height + | | | | + | +-------x--+ + | | | v + +----------+-br - + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{(w-2*r)*(h-2*r)}{w*h} \ge {iou} \quad\Rightarrow\quad + {4r^2-2(w+h)r+(1-iou)*w*h} \ge 0 \\ + {a} = 4,\quad {b} = {-2(w+h)},\quad {c} = {(1-iou)*w*h} + {r} \le \cfrac{-b-\sqrt{b^2-4*a*c}}{2*a} + + - Case3: both two corners are outside the gt box. + + .. code:: text + + |< width >| + + x--+----------------+ + | | | + +-lt-------------+ | - + | | | | ^ + | | | | + | | overlap | | height + | | | | + | | | | v + | +------------br--+ - + | | | + +----------------+--x + + To ensure IoU of generated box and gt box is larger than ``min_overlap``: + + .. math:: + \cfrac{w*h}{(w+2*r)*(h+2*r)} \ge {iou} \quad\Rightarrow\quad + {4*iou*r^2+2*iou*(w+h)r+(iou-1)*w*h} \le 0 \\ + {a} = {4*iou},\quad {b} = {2*iou*(w+h)},\quad {c} = {(iou-1)*w*h} \\ + {r} \le \cfrac{-b+\sqrt{b^2-4*a*c}}{2*a} + + Args: + det_size (list[int]): Shape of object. + min_overlap (float): Min IoU with ground truth for boxes generated by + keypoints inside the gaussian kernel. + + Returns: + radius (int): Radius of gaussian kernel. 
+ """ + height, width = det_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = sqrt(b1**2 - 4 * a1 * c1) + r1 = (b1 - sq1) / (2 * a1) + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = sqrt(b2**2 - 4 * a2 * c2) + r2 = (b2 - sq2) / (2 * a2) + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = sqrt(b3**2 - 4 * a3 * c3) + r3 = (b3 + sq3) / (2 * a3) + return min(r1, r2, r3) + + +def get_local_maximum(heat, kernel=3): + """Extract local maximum pixel with given kernel. + + Args: + heat (Tensor): Target heatmap. + kernel (int): Kernel size of max pooling. Default: 3. + + Returns: + heat (Tensor): A heatmap where local maximum pixels maintain its + own value and other positions are 0. + """ + pad = (kernel - 1) // 2 + hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) + keep = (hmax == heat).float() + return heat * keep + + +def get_topk_from_heatmap(scores, k=20): + """Get top k positions from heatmap. + + Args: + scores (Tensor): Target heatmap with shape + [batch, num_classes, height, width]. + k (int): Target number. Default: 20. + + Returns: + tuple[torch.Tensor]: Scores, indexes, categories and coords of + topk keypoint. Containing following Tensors: + + - topk_scores (Tensor): Max scores of each topk keypoint. + - topk_inds (Tensor): Indexes of each topk keypoint. + - topk_clses (Tensor): Categories of each topk keypoint. + - topk_ys (Tensor): Y-coord of each topk keypoint. + - topk_xs (Tensor): X-coord of each topk keypoint. + """ + batch, _, height, width = scores.size() + topk_scores, topk_inds = torch.topk(scores.view(batch, -1), k) + topk_clses = topk_inds // (height * width) + topk_inds = topk_inds % (height * width) + topk_ys = topk_inds // width + topk_xs = (topk_inds % width).int().float() + return topk_scores, topk_inds, topk_clses, topk_ys, topk_xs + + +def gather_feat(feat, ind, mask=None): + """Gather feature according to index. + + Args: + feat (Tensor): Target feature map. + ind (Tensor): Target coord index. + mask (Tensor | None): Mask of feature map. Default: None. + + Returns: + feat (Tensor): Gathered feature. + """ + dim = feat.size(2) + ind = ind.unsqueeze(2).repeat(1, 1, dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + + +def transpose_and_gather_feat(feat, ind): + """Transpose and gather feature according to index. + + Args: + feat (Tensor): Target feature map. + ind (Tensor): Target coord index. + + Returns: + feat (Tensor): Transposed and gathered feature. + """ + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = gather_feat(feat, ind) + return feat diff --git a/head_extractor/build/lib/mmdet/models/utils/image.py b/head_extractor/build/lib/mmdet/models/utils/image.py new file mode 100644 index 0000000000000000000000000000000000000000..16b5787a78232e46f47585c99526ca2b4ca9d1a1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/image.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import mmcv +import numpy as np +import torch +from torch import Tensor + + +def imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict, + new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]: + """Re-normalize the image. + + Args: + img (Tensor | ndarray): Input image. 
If the input is a Tensor, the + shape is (1, C, H, W). If the input is a ndarray, the shape + is (H, W, C). + img_norm_cfg (dict): Original configuration for the normalization. + new_img_norm_cfg (dict): New configuration for the normalization. + + Returns: + Tensor | ndarray: Output image with the same type and shape of + the input. + """ + if isinstance(img, torch.Tensor): + assert img.ndim == 4 and img.shape[0] == 1 + new_img = img.squeeze(0).cpu().numpy().transpose(1, 2, 0) + new_img = _imrenormalize(new_img, img_norm_cfg, new_img_norm_cfg) + new_img = new_img.transpose(2, 0, 1)[None] + return torch.from_numpy(new_img).to(img) + else: + return _imrenormalize(img, img_norm_cfg, new_img_norm_cfg) + + +def _imrenormalize(img: Union[Tensor, np.ndarray], img_norm_cfg: dict, + new_img_norm_cfg: dict) -> Union[Tensor, np.ndarray]: + """Re-normalize the image.""" + img_norm_cfg = img_norm_cfg.copy() + new_img_norm_cfg = new_img_norm_cfg.copy() + for k, v in img_norm_cfg.items(): + if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): + img_norm_cfg[k] = np.array(v, dtype=img.dtype) + # reverse cfg + if 'bgr_to_rgb' in img_norm_cfg: + img_norm_cfg['rgb_to_bgr'] = img_norm_cfg['bgr_to_rgb'] + img_norm_cfg.pop('bgr_to_rgb') + for k, v in new_img_norm_cfg.items(): + if (k == 'mean' or k == 'std') and not isinstance(v, np.ndarray): + new_img_norm_cfg[k] = np.array(v, dtype=img.dtype) + img = mmcv.imdenormalize(img, **img_norm_cfg) + img = mmcv.imnormalize(img, **new_img_norm_cfg) + return img diff --git a/head_extractor/build/lib/mmdet/models/utils/make_divisible.py b/head_extractor/build/lib/mmdet/models/utils/make_divisible.py new file mode 100644 index 0000000000000000000000000000000000000000..ed42c2eeea2a6aed03a0be5516b8d1ef1139e486 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/make_divisible.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number to the nearest value that can be + divisible by the divisor. It is taken from the original tf repo. It ensures + that all layers have a channel number that is divisible by divisor. It can + be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float): The minimum ratio of the rounded channel number to + the original channel number. Default: 0.9. + + Returns: + int: The modified output channel number. + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value diff --git a/head_extractor/build/lib/mmdet/models/utils/misc.py b/head_extractor/build/lib/mmdet/models/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..2cf429153ba7e0be025396b069aef8212144e34d --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/misc.py @@ -0,0 +1,697 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from functools import partial
+from typing import List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+from mmengine.structures import InstanceData
+from mmengine.utils import digit_version
+from six.moves import map, zip
+from torch import Tensor
+from torch.autograd import Function
+from torch.nn import functional as F
+
+from mmdet.structures import SampleList
+from mmdet.structures.bbox import BaseBoxes, get_box_type, stack_boxes
+from mmdet.structures.mask import BitmapMasks, PolygonMasks
+from mmdet.utils import OptInstanceList
+
+
+class SigmoidGeometricMean(Function):
+    """Forward and backward function of geometric mean of two sigmoid
+    functions.
+
+    This implementation with analytical gradient function substitutes
+    the autograd function of (x.sigmoid() * y.sigmoid()).sqrt(). The
+    original implementation incurs NaN during gradient backpropagation
+    if both x and y are very small values.
+    """
+
+    @staticmethod
+    def forward(ctx, x, y):
+        x_sigmoid = x.sigmoid()
+        y_sigmoid = y.sigmoid()
+        z = (x_sigmoid * y_sigmoid).sqrt()
+        ctx.save_for_backward(x_sigmoid, y_sigmoid, z)
+        return z
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        x_sigmoid, y_sigmoid, z = ctx.saved_tensors
+        grad_x = grad_output * z * (1 - x_sigmoid) / 2
+        grad_y = grad_output * z * (1 - y_sigmoid) / 2
+        return grad_x, grad_y
+
+
+sigmoid_geometric_mean = SigmoidGeometricMean.apply
+
+
+def interpolate_as(source, target, mode='bilinear', align_corners=False):
+    """Interpolate the `source` to the shape of the `target`.
+
+    The `source` must be a Tensor, but the `target` can be a Tensor or a
+    np.ndarray with the shape (..., target_h, target_w).
+
+    Args:
+        source (Tensor): A 3D/4D Tensor with the shape (N, H, W) or
+            (N, C, H, W).
+        target (Tensor | np.ndarray): The interpolation target with the shape
+            (..., target_h, target_w).
+        mode (str): Algorithm used for interpolation. The options are the
+            same as those in F.interpolate(). Default: ``'bilinear'``.
+        align_corners (bool): The same as the argument in F.interpolate().
+
+    Returns:
+        Tensor: The interpolated source Tensor.
+    """
+    assert len(target.shape) >= 2
+
+    def _interpolate_as(source, target, mode='bilinear', align_corners=False):
+        """Interpolate the `source` (4D) to the shape of the `target`."""
+        target_h, target_w = target.shape[-2:]
+        source_h, source_w = source.shape[-2:]
+        if target_h != source_h or target_w != source_w:
+            source = F.interpolate(
+                source,
+                size=(target_h, target_w),
+                mode=mode,
+                align_corners=align_corners)
+        return source
+
+    if len(source.shape) == 3:
+        source = source[:, None, :, :]
+        source = _interpolate_as(source, target, mode, align_corners)
+        return source[:, 0, :, :]
+    else:
+        return _interpolate_as(source, target, mode, align_corners)
+
+
+def unpack_gt_instances(batch_data_samples: SampleList) -> tuple:
+    """Unpack ``gt_instances``, ``gt_instances_ignore`` and ``img_metas`` based
+    on ``batch_data_samples``.
+
+    Args:
+        batch_data_samples (List[:obj:`DetDataSample`]): The Data
+            Samples. It usually includes information such as
+            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+
+    Returns:
+        tuple:
+
+            - batch_gt_instances (list[:obj:`InstanceData`]): Batch of
+              gt_instance. It usually includes ``bboxes`` and ``labels``
+              attributes.
+            - batch_gt_instances_ignore (list[:obj:`InstanceData`]):
+              Batch of gt_instances_ignore. It includes ``bboxes`` attribute
+              data that is ignored during training and testing.
+              Defaults to None.
+ - batch_img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + """ + batch_gt_instances = [] + batch_gt_instances_ignore = [] + batch_img_metas = [] + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + batch_gt_instances.append(data_sample.gt_instances) + if 'ignored_instances' in data_sample: + batch_gt_instances_ignore.append(data_sample.ignored_instances) + else: + batch_gt_instances_ignore.append(None) + + return batch_gt_instances, batch_gt_instances_ignore, batch_img_metas + + +def empty_instances(batch_img_metas: List[dict], + device: torch.device, + task_type: str, + instance_results: OptInstanceList = None, + mask_thr_binary: Union[int, float] = 0, + box_type: Union[str, type] = 'hbox', + use_box_type: bool = False, + num_classes: int = 80, + score_per_cls: bool = False) -> List[InstanceData]: + """Handle predicted instances when RoI is empty. + + Note: If ``instance_results`` is not None, it will be modified + in place internally, and then return ``instance_results`` + + Args: + batch_img_metas (list[dict]): List of image information. + device (torch.device): Device of tensor. + task_type (str): Expected returned task type. it currently + supports bbox and mask. + instance_results (list[:obj:`InstanceData`]): List of instance + results. + mask_thr_binary (int, float): mask binarization threshold. + Defaults to 0. + box_type (str or type): The empty box type. Defaults to `hbox`. + use_box_type (bool): Whether to warp boxes with the box type. + Defaults to False. + num_classes (int): num_classes of bbox_head. Defaults to 80. + score_per_cls (bool): Whether to generate classwise score for + the empty instance. ``score_per_cls`` will be True when the model + needs to produce raw results without nms. Defaults to False. + + Returns: + list[:obj:`InstanceData`]: Detection results of each image + """ + assert task_type in ('bbox', 'mask'), 'Only support bbox and mask,' \ + f' but got {task_type}' + + if instance_results is not None: + assert len(instance_results) == len(batch_img_metas) + + results_list = [] + for img_id in range(len(batch_img_metas)): + if instance_results is not None: + results = instance_results[img_id] + assert isinstance(results, InstanceData) + else: + results = InstanceData() + + if task_type == 'bbox': + _, box_type = get_box_type(box_type) + bboxes = torch.zeros(0, box_type.box_dim, device=device) + if use_box_type: + bboxes = box_type(bboxes, clone=False) + results.bboxes = bboxes + score_shape = (0, num_classes + 1) if score_per_cls else (0, ) + results.scores = torch.zeros(score_shape, device=device) + results.labels = torch.zeros((0, ), + device=device, + dtype=torch.long) + else: + # TODO: Handle the case where rescale is false + img_h, img_w = batch_img_metas[img_id]['ori_shape'][:2] + # the type of `im_mask` will be torch.bool or torch.uint8, + # where uint8 if for visualization and debugging. + im_mask = torch.zeros( + 0, + img_h, + img_w, + device=device, + dtype=torch.bool if mask_thr_binary >= 0 else torch.uint8) + results.masks = im_mask + results_list.append(results) + return results_list + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. 
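+
+    Example:
+        An illustrative sketch (not from the original file):
+
+        >>> def square_and_cube(x):
+        ...     return x ** 2, x ** 3
+        >>> multi_apply(square_and_cube, [1, 2, 3])
+        ([1, 4, 9], [1, 8, 27])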
+ + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains \ + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def unmap(data, count, inds, fill=0): + """Unmap a subset of item (data) back to the original set of items (of size + count)""" + if data.dim() == 1: + ret = data.new_full((count, ), fill) + ret[inds.type(torch.bool)] = data + else: + new_size = (count, ) + data.size()[1:] + ret = data.new_full(new_size, fill) + ret[inds.type(torch.bool), :] = data + return ret + + +def mask2ndarray(mask): + """Convert Mask to ndarray.. + + Args: + mask (:obj:`BitmapMasks` or :obj:`PolygonMasks` or + torch.Tensor or np.ndarray): The mask to be converted. + + Returns: + np.ndarray: Ndarray mask of shape (n, h, w) that has been converted + """ + if isinstance(mask, (BitmapMasks, PolygonMasks)): + mask = mask.to_ndarray() + elif isinstance(mask, torch.Tensor): + mask = mask.detach().cpu().numpy() + elif not isinstance(mask, np.ndarray): + raise TypeError(f'Unsupported {type(mask)} data type') + return mask + + +def flip_tensor(src_tensor, flip_direction): + """flip tensor base on flip_direction. + + Args: + src_tensor (Tensor): input feature map, shape (B, C, H, W). + flip_direction (str): The flipping direction. Options are + 'horizontal', 'vertical', 'diagonal'. + + Returns: + out_tensor (Tensor): Flipped tensor. + """ + assert src_tensor.ndim == 4 + valid_directions = ['horizontal', 'vertical', 'diagonal'] + assert flip_direction in valid_directions + if flip_direction == 'horizontal': + out_tensor = torch.flip(src_tensor, [3]) + elif flip_direction == 'vertical': + out_tensor = torch.flip(src_tensor, [2]) + else: + out_tensor = torch.flip(src_tensor, [2, 3]) + return out_tensor + + +def select_single_mlvl(mlvl_tensors, batch_id, detach=True): + """Extract a multi-scale single image tensor from a multi-scale batch + tensor based on batch index. + + Note: The default value of detach is True, because the proposal gradient + needs to be detached during the training of the two-stage model. E.g + Cascade Mask R-CNN. + + Args: + mlvl_tensors (list[Tensor]): Batch tensor for all scale levels, + each is a 4D-tensor. + batch_id (int): Batch index. + detach (bool): Whether detach gradient. Default True. + + Returns: + list[Tensor]: Multi-scale single image tensor. + """ + assert isinstance(mlvl_tensors, (list, tuple)) + num_levels = len(mlvl_tensors) + + if detach: + mlvl_tensor_list = [ + mlvl_tensors[i][batch_id].detach() for i in range(num_levels) + ] + else: + mlvl_tensor_list = [ + mlvl_tensors[i][batch_id] for i in range(num_levels) + ] + return mlvl_tensor_list + + +def filter_scores_and_topk(scores, score_thr, topk, results=None): + """Filter results using score threshold and topk candidates. + + Args: + scores (Tensor): The scores, shape (num_bboxes, K). + score_thr (float): The score filter threshold. + topk (int): The number of topk candidates. + results (dict or list or Tensor, Optional): The results to + which the filtering rule is to be applied. The shape + of each item is (num_bboxes, N). + + Returns: + tuple: Filtered results + + - scores (Tensor): The scores after being filtered, \ + shape (num_bboxes_filtered, ). + - labels (Tensor): The class labels, shape \ + (num_bboxes_filtered, ). 
+ - anchor_idxs (Tensor): The anchor indexes, shape \ + (num_bboxes_filtered, ). + - filtered_results (dict or list or Tensor, Optional): \ + The filtered results. The shape of each item is \ + (num_bboxes_filtered, N). + """ + valid_mask = scores > score_thr + scores = scores[valid_mask] + valid_idxs = torch.nonzero(valid_mask) + + num_topk = min(topk, valid_idxs.size(0)) + # torch.sort is actually faster than .topk (at least on GPUs) + scores, idxs = scores.sort(descending=True) + scores = scores[:num_topk] + topk_idxs = valid_idxs[idxs[:num_topk]] + keep_idxs, labels = topk_idxs.unbind(dim=1) + + filtered_results = None + if results is not None: + if isinstance(results, dict): + filtered_results = {k: v[keep_idxs] for k, v in results.items()} + elif isinstance(results, list): + filtered_results = [result[keep_idxs] for result in results] + elif isinstance(results, torch.Tensor): + filtered_results = results[keep_idxs] + else: + raise NotImplementedError(f'Only supports dict or list or Tensor, ' + f'but get {type(results)}.') + return scores, labels, keep_idxs, filtered_results + + +def center_of_mass(mask, esp=1e-6): + """Calculate the centroid coordinates of the mask. + + Args: + mask (Tensor): The mask to be calculated, shape (h, w). + esp (float): Avoid dividing by zero. Default: 1e-6. + + Returns: + tuple[Tensor]: the coordinates of the center point of the mask. + + - center_h (Tensor): the center point of the height. + - center_w (Tensor): the center point of the width. + """ + h, w = mask.shape + grid_h = torch.arange(h, device=mask.device)[:, None] + grid_w = torch.arange(w, device=mask.device) + normalizer = mask.sum().float().clamp(min=esp) + center_h = (mask * grid_h).sum() / normalizer + center_w = (mask * grid_w).sum() / normalizer + return center_h, center_w + + +def generate_coordinate(featmap_sizes, device='cuda'): + """Generate the coordinate. + + Args: + featmap_sizes (tuple): The feature to be calculated, + of shape (N, C, W, H). + device (str): The device where the feature will be put on. + Returns: + coord_feat (Tensor): The coordinate feature, of shape (N, 2, W, H). + """ + + x_range = torch.linspace(-1, 1, featmap_sizes[-1], device=device) + y_range = torch.linspace(-1, 1, featmap_sizes[-2], device=device) + y, x = torch.meshgrid(y_range, x_range) + y = y.expand([featmap_sizes[0], 1, -1, -1]) + x = x.expand([featmap_sizes[0], 1, -1, -1]) + coord_feat = torch.cat([x, y], 1) + + return coord_feat + + +def levels_to_images(mlvl_tensor: List[torch.Tensor]) -> List[torch.Tensor]: + """Concat multi-level feature maps by image. + + [feature_level0, feature_level1...] -> [feature_image0, feature_image1...] + Convert the shape of each element in mlvl_tensor from (N, C, H, W) to + (N, H*W , C), then split the element to N elements with shape (H*W, C), and + concat elements in same image of all level along first dimension. + + Args: + mlvl_tensor (list[Tensor]): list of Tensor which collect from + corresponding level. 
Each element is of shape (N, C, H, W)
+
+    Returns:
+        list[Tensor]: A list that contains N tensors and each tensor is
+        of shape (num_elements, C)
+    """
+    batch_size = mlvl_tensor[0].size(0)
+    batch_list = [[] for _ in range(batch_size)]
+    channels = mlvl_tensor[0].size(1)
+    for t in mlvl_tensor:
+        t = t.permute(0, 2, 3, 1)
+        t = t.view(batch_size, -1, channels).contiguous()
+        for img in range(batch_size):
+            batch_list[img].append(t[img])
+    return [torch.cat(item, 0) for item in batch_list]
+
+
+def images_to_levels(target, num_levels):
+    """Convert targets by image to targets by feature level.
+
+    [target_img0, target_img1] -> [target_level0, target_level1, ...]
+    """
+    target = stack_boxes(target, 0)
+    level_targets = []
+    start = 0
+    for n in num_levels:
+        end = start + n
+        # level_targets.append(target[:, start:end].squeeze(0))
+        level_targets.append(target[:, start:end])
+        start = end
+    return level_targets
+
+
+def samplelist_boxtype2tensor(batch_data_samples: SampleList) -> SampleList:
+    for data_samples in batch_data_samples:
+        if 'gt_instances' in data_samples:
+            bboxes = data_samples.gt_instances.get('bboxes', None)
+            if isinstance(bboxes, BaseBoxes):
+                data_samples.gt_instances.bboxes = bboxes.tensor
+        if 'pred_instances' in data_samples:
+            bboxes = data_samples.pred_instances.get('bboxes', None)
+            if isinstance(bboxes, BaseBoxes):
+                data_samples.pred_instances.bboxes = bboxes.tensor
+        if 'ignored_instances' in data_samples:
+            bboxes = data_samples.ignored_instances.get('bboxes', None)
+            if isinstance(bboxes, BaseBoxes):
+                data_samples.ignored_instances.bboxes = bboxes.tensor
+
+
+_torch_version_div_indexing = (
+    'parrots' not in torch.__version__
+    and digit_version(torch.__version__) >= digit_version('1.8'))
+
+
+def floordiv(dividend, divisor, rounding_mode='trunc'):
+    if _torch_version_div_indexing:
+        return torch.div(dividend, divisor, rounding_mode=rounding_mode)
+    else:
+        return dividend // divisor
+
+
+def _filter_gt_instances_by_score(batch_data_samples: SampleList,
+                                  score_thr: float) -> SampleList:
+    """Filter ground truth (GT) instances by score.
+
+    Args:
+        batch_data_samples (SampleList): The Data
+            Samples. It usually includes information such as
+            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+        score_thr (float): The score filter threshold.
+
+    Returns:
+        SampleList: The Data Samples filtered by score.
+    """
+    for data_samples in batch_data_samples:
+        assert 'scores' in data_samples.gt_instances, \
+            'scores do not exist in instances'
+        if data_samples.gt_instances.bboxes.shape[0] > 0:
+            data_samples.gt_instances = data_samples.gt_instances[
+                data_samples.gt_instances.scores > score_thr]
+    return batch_data_samples
+
+
+def _filter_gt_instances_by_size(batch_data_samples: SampleList,
+                                 wh_thr: tuple) -> SampleList:
+    """Filter ground truth (GT) instances by size.
+
+    Args:
+        batch_data_samples (SampleList): The Data
+            Samples. It usually includes information such as
+            `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`.
+        wh_thr (tuple): Minimum width and height of bbox.
+
+    Returns:
+        SampleList: The Data Samples filtered by size.
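+
+    Example:
+        A minimal sketch (not from the original file); the sample below is
+        constructed purely for demonstration and assumes plain ``(N, 4)``
+        tensor bboxes:
+
+        >>> import torch
+        >>> from mmengine.structures import InstanceData
+        >>> from mmdet.structures import DetDataSample
+        >>> sample = DetDataSample()
+        >>> sample.gt_instances = InstanceData(
+        ...     bboxes=torch.tensor([[0., 0., 4., 4.], [0., 0., 20., 20.]]))
+        >>> out = _filter_gt_instances_by_size([sample], wh_thr=(8, 8))
+        >>> len(out[0].gt_instances)  # only the 20x20 box survives
+        1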
+ """ + for data_samples in batch_data_samples: + bboxes = data_samples.gt_instances.bboxes + if bboxes.shape[0] > 0: + w = bboxes[:, 2] - bboxes[:, 0] + h = bboxes[:, 3] - bboxes[:, 1] + data_samples.gt_instances = data_samples.gt_instances[ + (w > wh_thr[0]) & (h > wh_thr[1])] + return batch_data_samples + + +def filter_gt_instances(batch_data_samples: SampleList, + score_thr: float = None, + wh_thr: tuple = None): + """Filter ground truth (GT) instances by score and/or size. + + Args: + batch_data_samples (SampleList): The Data + Samples. It usually includes information such as + `gt_instance`, `gt_panoptic_seg` and `gt_sem_seg`. + score_thr (float): The score filter threshold. + wh_thr (tuple): Minimum width and height of bbox. + + Returns: + SampleList: The Data Samples filtered by score and/or size. + """ + + if score_thr is not None: + batch_data_samples = _filter_gt_instances_by_score( + batch_data_samples, score_thr) + if wh_thr is not None: + batch_data_samples = _filter_gt_instances_by_size( + batch_data_samples, wh_thr) + return batch_data_samples + + +def rename_loss_dict(prefix: str, losses: dict) -> dict: + """Rename the key names in loss dict by adding a prefix. + + Args: + prefix (str): The prefix for loss components. + losses (dict): A dictionary of loss components. + + Returns: + dict: A dictionary of loss components with prefix. + """ + return {prefix + k: v for k, v in losses.items()} + + +def reweight_loss_dict(losses: dict, weight: float) -> dict: + """Reweight losses in the dict by weight. + + Args: + losses (dict): A dictionary of loss components. + weight (float): Weight for loss components. + + Returns: + dict: A dictionary of weighted loss components. + """ + for name, loss in losses.items(): + if 'loss' in name: + if isinstance(loss, Sequence): + losses[name] = [item * weight for item in loss] + else: + losses[name] = loss * weight + return losses + + +def relative_coordinate_maps( + locations: Tensor, + centers: Tensor, + strides: Tensor, + size_of_interest: int, + feat_sizes: Tuple[int], +) -> Tensor: + """Generate the relative coordinate maps with feat_stride. + + Args: + locations (Tensor): The prior location of mask feature map. + It has shape (num_priors, 2). + centers (Tensor): The prior points of a object in + all feature pyramid. It has shape (num_pos, 2) + strides (Tensor): The prior strides of a object in + all feature pyramid. It has shape (num_pos, 1) + size_of_interest (int): The size of the region used in rel coord. + feat_sizes (Tuple[int]): The feature size H and W, which has 2 dims. + Returns: + rel_coord_feat (Tensor): The coordinate feature + of shape (num_pos, 2, H, W). 
+ """ + + H, W = feat_sizes + rel_coordinates = centers.reshape(-1, 1, 2) - locations.reshape(1, -1, 2) + rel_coordinates = rel_coordinates.permute(0, 2, 1).float() + rel_coordinates = rel_coordinates / ( + strides[:, None, None] * size_of_interest) + return rel_coordinates.reshape(-1, 2, H, W) + + +def aligned_bilinear(tensor: Tensor, factor: int) -> Tensor: + """aligned bilinear, used in original implement in CondInst: + + https://github.com/aim-uofa/AdelaiDet/blob/\ + c0b2092ce72442b0f40972f7c6dda8bb52c46d16/adet/utils/comm.py#L23 + """ + + assert tensor.dim() == 4 + assert factor >= 1 + assert int(factor) == factor + + if factor == 1: + return tensor + + h, w = tensor.size()[2:] + tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode='replicate') + oh = factor * h + 1 + ow = factor * w + 1 + tensor = F.interpolate( + tensor, size=(oh, ow), mode='bilinear', align_corners=True) + tensor = F.pad( + tensor, pad=(factor // 2, 0, factor // 2, 0), mode='replicate') + + return tensor[:, :, :oh - 1, :ow - 1] + + +def unfold_wo_center(x, kernel_size: int, dilation: int) -> Tensor: + """unfold_wo_center, used in original implement in BoxInst: + + https://github.com/aim-uofa/AdelaiDet/blob/\ + 4a3a1f7372c35b48ebf5f6adc59f135a0fa28d60/\ + adet/modeling/condinst/condinst.py#L53 + """ + assert x.dim() == 4 + assert kernel_size % 2 == 1 + + # using SAME padding + padding = (kernel_size + (dilation - 1) * (kernel_size - 1)) // 2 + unfolded_x = F.unfold( + x, kernel_size=kernel_size, padding=padding, dilation=dilation) + unfolded_x = unfolded_x.reshape( + x.size(0), x.size(1), -1, x.size(2), x.size(3)) + # remove the center pixels + size = kernel_size**2 + unfolded_x = torch.cat( + (unfolded_x[:, :, :size // 2], unfolded_x[:, :, size // 2 + 1:]), + dim=2) + + return unfolded_x + + +def padding_to(input_tensor: Tensor, max_len: int = 300) -> Tensor: + """Pad the first dimension of `input_tensor` to `max_len`. + + Args: + input_tensor (Tensor): The tensor to be padded, + max_len (int): Padding target size in the first dimension. + Default: 300 + https://github.com/jshilong/DDQ/blob/ddq_detr/projects/models/utils.py#L19 + Returns: + Tensor: The tensor padded with the first dimension size `max_len`. + """ + if max_len is None: + return input_tensor + num_padding = max_len - len(input_tensor) + if input_tensor.dim() > 1: + padding = input_tensor.new_zeros( + num_padding, *input_tensor.size()[1:], dtype=input_tensor.dtype) + else: + padding = input_tensor.new_zeros(num_padding, dtype=input_tensor.dtype) + output_tensor = torch.cat([input_tensor, padding], dim=0) + return output_tensor + + +def align_tensor(inputs: List[Tensor], + max_len: Optional[int] = None) -> Tensor: + """Pad each input to `max_len`, then stack them. If `max_len` is None, then + it is the max size of the first dimension of each input. + + https://github.com/jshilong/DDQ/blob/ddq_detr/projects/models/\ + utils.py#L12 + + Args: + inputs (list[Tensor]): The tensors to be padded, + Each input should have the same shape except the first dimension. + max_len (int): Padding target size in the first dimension. + Default: None + Returns: + Tensor: Stacked inputs after padding in the first dimension. 
+ """ + if max_len is None: + max_len = max([len(item) for item in inputs]) + + return torch.stack([padding_to(item, max_len) for item in inputs]) diff --git a/head_extractor/build/lib/mmdet/models/utils/panoptic_gt_processing.py b/head_extractor/build/lib/mmdet/models/utils/panoptic_gt_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..7a3bc95fc04040b4a2a13fa63f2d02f092f725e6 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/panoptic_gt_processing.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from torch import Tensor + + +def preprocess_panoptic_gt(gt_labels: Tensor, gt_masks: Tensor, + gt_semantic_seg: Tensor, num_things: int, + num_stuff: int) -> Tuple[Tensor, Tensor]: + """Preprocess the ground truth for a image. + + Args: + gt_labels (Tensor): Ground truth labels of each bbox, + with shape (num_gts, ). + gt_masks (BitmapMasks): Ground truth masks of each instances + of a image, shape (num_gts, h, w). + gt_semantic_seg (Tensor | None): Ground truth of semantic + segmentation with the shape (1, h, w). + [0, num_thing_class - 1] means things, + [num_thing_class, num_class-1] means stuff, + 255 means VOID. It's None when training instance segmentation. + + Returns: + tuple[Tensor, Tensor]: a tuple containing the following targets. + + - labels (Tensor): Ground truth class indices for a + image, with shape (n, ), n is the sum of number + of stuff type and number of instance in a image. + - masks (Tensor): Ground truth mask for a image, with + shape (n, h, w). Contains stuff and things when training + panoptic segmentation, and things only when training + instance segmentation. + """ + num_classes = num_things + num_stuff + things_masks = gt_masks.to_tensor( + dtype=torch.bool, device=gt_labels.device) + + if gt_semantic_seg is None: + masks = things_masks.long() + return gt_labels, masks + + things_labels = gt_labels + gt_semantic_seg = gt_semantic_seg.squeeze(0) + + semantic_labels = torch.unique( + gt_semantic_seg, + sorted=False, + return_inverse=False, + return_counts=False) + stuff_masks_list = [] + stuff_labels_list = [] + for label in semantic_labels: + if label < num_things or label >= num_classes: + continue + stuff_mask = gt_semantic_seg == label + stuff_masks_list.append(stuff_mask) + stuff_labels_list.append(label) + + if len(stuff_masks_list) > 0: + stuff_masks = torch.stack(stuff_masks_list, dim=0) + stuff_labels = torch.stack(stuff_labels_list, dim=0) + labels = torch.cat([things_labels, stuff_labels], dim=0) + masks = torch.cat([things_masks, stuff_masks], dim=0) + else: + labels = things_labels + masks = things_masks + + masks = masks.long() + return labels, masks diff --git a/head_extractor/build/lib/mmdet/models/utils/point_sample.py b/head_extractor/build/lib/mmdet/models/utils/point_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..1afc957f3da7d1dc030c21d40311c768c6952ea4 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/point_sample.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.ops import point_sample +from torch import Tensor + + +def get_uncertainty(mask_preds: Tensor, labels: Tensor) -> Tensor: + """Estimate uncertainty based on pred logits. + + We estimate uncertainty as L1 distance between 0.0 and the logits + prediction in 'mask_preds' for the foreground class in `classes`. 
diff --git a/head_extractor/build/lib/mmdet/models/utils/point_sample.py b/head_extractor/build/lib/mmdet/models/utils/point_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..1afc957f3da7d1dc030c21d40311c768c6952ea4
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/models/utils/point_sample.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import point_sample
+from torch import Tensor
+
+
+def get_uncertainty(mask_preds: Tensor, labels: Tensor) -> Tensor:
+    """Estimate uncertainty based on pred logits.
+
+    We estimate the uncertainty as the L1 distance between 0.0 and the
+    logit prediction in 'mask_preds' for the foreground class in `labels`.
+
+    Args:
+        mask_preds (Tensor): mask prediction logits, shape (num_rois,
+            num_classes, mask_height, mask_width).
+
+        labels (Tensor): Either predicted or ground truth label for
+            each predicted mask, of length num_rois.
+
+    Returns:
+        scores (Tensor): Uncertainty scores with the most uncertain
+            locations having the highest uncertainty score,
+            shape (num_rois, 1, mask_height, mask_width)
+    """
+    if mask_preds.shape[1] == 1:
+        gt_class_logits = mask_preds.clone()
+    else:
+        inds = torch.arange(mask_preds.shape[0], device=mask_preds.device)
+        gt_class_logits = mask_preds[inds, labels].unsqueeze(1)
+    return -torch.abs(gt_class_logits)
+
+
+def get_uncertain_point_coords_with_randomness(
+        mask_preds: Tensor, labels: Tensor, num_points: int,
+        oversample_ratio: float, importance_sample_ratio: float) -> Tensor:
+    """Get ``num_points`` most uncertain points with random points during
+    train.
+
+    Sample points in [0, 1] x [0, 1] coordinate space based on their
+    uncertainty. The uncertainties are calculated for each point using
+    the 'get_uncertainty()' function that takes a point's logit prediction
+    as input.
+
+    Args:
+        mask_preds (Tensor): A tensor of shape (num_rois, num_classes,
+            mask_height, mask_width) for class-specific or class-agnostic
+            prediction.
+        labels (Tensor): The ground truth class for each instance.
+        num_points (int): The number of points to sample.
+        oversample_ratio (float): Oversampling parameter.
+        importance_sample_ratio (float): Ratio of points that are sampled
+            via importance sampling.
+
+    Returns:
+        point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+            that contains the coordinates of the sampled points.
+    """
+    assert oversample_ratio >= 1
+    assert 0 <= importance_sample_ratio <= 1
+    batch_size = mask_preds.shape[0]
+    num_sampled = int(num_points * oversample_ratio)
+    point_coords = torch.rand(
+        batch_size, num_sampled, 2, device=mask_preds.device)
+    point_logits = point_sample(mask_preds, point_coords)
+    # It is crucial to calculate uncertainty based on the sampled
+    # prediction value for the points. Calculating uncertainties of the
+    # coarse predictions first and sampling them for points leads to
+    # incorrect results. To illustrate this: assume uncertainty func(
+    # logits)=-abs(logits), a sampled point between two coarse
+    # predictions with -1 and 1 logits has 0 logits, and therefore 0
+    # uncertainty value. However, if we calculate uncertainties for the
+    # coarse predictions first, both will have -1 uncertainty,
+    # and the sampled point will get -1 uncertainty.
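+    # For example, with num_points=100, oversample_ratio=3.0 and
+    # importance_sample_ratio=0.75 (illustrative values), 300 candidate
+    # points are sampled above, the 75 most uncertain are kept below, and
+    # 25 extra uniformly random points are appended at the end.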
+ point_uncertainties = get_uncertainty(point_logits, labels) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=mask_preds.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_roi_coords = torch.rand( + batch_size, num_random_points, 2, device=mask_preds.device) + point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) + return point_coords diff --git a/head_extractor/build/lib/mmdet/models/utils/vlfuse_helper.py b/head_extractor/build/lib/mmdet/models/utils/vlfuse_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..76b54de317c1f24d7cb40573954f988fd94fef42 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/vlfuse_helper.py @@ -0,0 +1,773 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Modified from https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/utils/fuse_helper.py # noqa +# and https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/modeling/rpn/modeling_bert.py # noqa +import math +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from mmcv.cnn.bricks import DropPath +from torch import Tensor + +try: + from transformers import BertConfig, BertPreTrainedModel + from transformers.modeling_utils import apply_chunking_to_forward + from transformers.models.bert.modeling_bert import \ + BertAttention as HFBertAttention + from transformers.models.bert.modeling_bert import \ + BertIntermediate as HFBertIntermediate + from transformers.models.bert.modeling_bert import \ + BertOutput as HFBertOutput +except ImportError: + BertConfig = None + BertPreTrainedModel = object + apply_chunking_to_forward = None + HFBertAttention = object + HFBertIntermediate = object + HFBertOutput = object + +MAX_CLAMP_VALUE = 50000 + + +def permute_and_flatten(layer: Tensor, N: int, A: int, C: int, H: int, + W: int) -> Tensor: + """Permute and then flatten a tensor, + + from size (N, A, C, H, W) to (N, H * W * A, C). + + Args: + layer (Tensor): Tensor of shape (N, C, H, W). + N (int): Batch size. + A (int): Number of attention heads. + C (int): Number of channels. + H (int): Height of feature map. + W (int): Width of feature map. + + Returns: + Tensor: A Tensor of shape (N, H * W * A, C). + """ + layer = layer.view(N, A, C, H, W) + layer = layer.permute(0, 3, 4, 1, 2) + layer = layer.reshape(N, -1, C) + return layer + + +def clamp_values(vector: Tensor) -> Tensor: + """Clamp the values of a vector to the range [-MAX_CLAMP_VALUE, + MAX_CLAMP_VALUE]. + + Args: + vector (Tensor): Tensor of shape (N, C, H, W). + + Returns: + Tensor: A Tensor of shape (N, C, H, W) with clamped values. + """ + vector = torch.clamp(vector, min=-MAX_CLAMP_VALUE, max=MAX_CLAMP_VALUE) + return vector + + +class BiMultiHeadAttention(nn.Module): + """Bidirectional fusion Multi-Head Attention layer. + + Args: + v_dim (int): The dimension of the vision input. + l_dim (int): The dimension of the language input. + embed_dim (int): The embedding dimension for the attention operation. + num_heads (int): The number of attention heads. + dropout (float, optional): The dropout probability. Defaults to 0.1. 
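+
+    Example:
+        >>> # Illustrative shapes only; the tensors are random.
+        >>> import torch
+        >>> attn = BiMultiHeadAttention(v_dim=8, l_dim=8, embed_dim=8,
+        ...                             num_heads=2)
+        >>> v, l = torch.rand(1, 10, 8), torch.rand(1, 5, 8)
+        >>> out_v, out_l = attn(v, l)
+        >>> out_v.shape, out_l.shape
+        (torch.Size([1, 10, 8]), torch.Size([1, 5, 8]))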
+ """ + + def __init__(self, + v_dim: int, + l_dim: int, + embed_dim: int, + num_heads: int, + dropout: float = 0.1): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), 'embed_dim must be divisible by num_heads ' \ + f'(got `embed_dim`: {self.embed_dim} ' \ + f'and `num_heads`: {self.num_heads}).' + self.scale = self.head_dim**(-0.5) + self.dropout = dropout + + self.v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.l_proj = nn.Linear(self.l_dim, self.embed_dim) + self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) + + self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) + self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) + + self.stable_softmax_2d = False + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.v_proj.weight) + self.v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.l_proj.weight) + self.l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_v_proj.weight) + self.values_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_l_proj.weight) + self.values_l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_v_proj.weight) + self.out_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_l_proj.weight) + self.out_l_proj.bias.data.fill_(0) + + def forward( + self, + vision: Tensor, + lang: Tensor, + attention_mask_v: Optional[Tensor] = None, + attention_mask_l: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor]: + bsz, tgt_len, _ = vision.size() + + query_states = self.v_proj(vision) * self.scale + key_states = self._shape(self.l_proj(lang), -1, bsz) + value_v_states = self._shape(self.values_v_proj(vision), -1, bsz) + value_l_states = self._shape(self.values_l_proj(lang), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, + bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_v_states = value_v_states.view(*proj_shape) + value_l_states = value_l_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f'Attention weights should be of ' + f'size {(bsz * self.num_heads, tgt_len, src_len)}, ' + f'but is {attn_weights.size()}') + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + # Do not increase -50000, data type half has quite limited range + attn_weights = torch.clamp(attn_weights, min=-MAX_CLAMP_VALUE) + if self.clamp_max_for_overflow: + # Do not increase 50000, data type half has quite limited range + attn_weights = torch.clamp(attn_weights, max=MAX_CLAMP_VALUE) + + attn_weights_T = attn_weights.transpose(1, 2) + attn_weights_l = ( + attn_weights_T - + torch.max(attn_weights_T, dim=-1, keepdim=True)[0]) + if self.clamp_min_for_underflow: + # Do not increase -50000, data type half has quite limited range + attn_weights_l = torch.clamp(attn_weights_l, 
min=-MAX_CLAMP_VALUE) + if self.clamp_max_for_overflow: + # Do not increase 50000, data type half has quite limited range + attn_weights_l = torch.clamp(attn_weights_l, max=MAX_CLAMP_VALUE) + + if attention_mask_v is not None: + attention_mask_v = ( + attention_mask_v[:, None, + None, :].repeat(1, self.num_heads, 1, + 1).flatten(0, 1)) + attn_weights_l.masked_fill_(attention_mask_v, float('-inf')) + + attn_weights_l = attn_weights_l.softmax(dim=-1) + + if attention_mask_l is not None: + assert (attention_mask_l.dim() == 2) + attention_mask = attention_mask_l.unsqueeze(1).unsqueeze(1) + attention_mask = attention_mask.expand(bsz, 1, tgt_len, src_len) + attention_mask = attention_mask.masked_fill( + attention_mask == 0, -9e15) + + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError('Attention mask should be of ' + f'size {(bsz, 1, tgt_len, src_len)}') + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, + src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, + src_len) + + attn_weights_v = nn.functional.softmax(attn_weights, dim=-1) + + attn_probs_v = F.dropout( + attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout( + attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = torch.bmm(attn_probs_v, value_l_states) + attn_output_l = torch.bmm(attn_probs_l, value_v_states) + + if attn_output_v.size() != (bsz * self.num_heads, tgt_len, + self.head_dim): + raise ValueError( + '`attn_output_v` should be of ' + f'size {(bsz, self.num_heads, tgt_len, self.head_dim)}, ' + f'but is {attn_output_v.size()}') + + if attn_output_l.size() != (bsz * self.num_heads, src_len, + self.head_dim): + raise ValueError( + '`attn_output_l` should be of size ' + f'{(bsz, self.num_heads, src_len, self.head_dim)}, ' + f'but is {attn_output_l.size()}') + + attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, + self.head_dim) + attn_output_v = attn_output_v.transpose(1, 2) + attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) + + attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, + self.head_dim) + attn_output_l = attn_output_l.transpose(1, 2) + attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +class BiAttentionBlock(nn.Module): + """BiAttentionBlock Module: + + First, multi-level visual features are concat; Then the concat visual + feature and lang feature are fused by attention; Finally the newly visual + feature are split into multi levels. + + Args: + v_dim (int): The dimension of the visual features. + l_dim (int): The dimension of the language feature. + embed_dim (int): The embedding dimension for the attention operation. + num_heads (int): The number of attention heads. + dropout (float, optional): The dropout probability. Defaults to 0.1. + drop_path (float, optional): The drop path probability. + Defaults to 0.0. + init_values (float, optional): + The initial value for the scaling parameter. + Defaults to 1e-4. 
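+
+    Example:
+        >>> # Illustrative sketch: five pyramid levels fused with a
+        >>> # 5-token language feature; per-level shapes are preserved.
+        >>> import torch
+        >>> block = BiAttentionBlock(v_dim=8, l_dim=8, embed_dim=8,
+        ...                          num_heads=2)
+        >>> feats = [torch.rand(1, 8, s, s) for s in (8, 4, 2, 2, 1)]
+        >>> outs = block(*feats, torch.rand(1, 5, 8))
+        >>> [tuple(o.shape) for o in outs[:5]]
+        [(1, 8, 8, 8), (1, 8, 4, 4), (1, 8, 2, 2), (1, 8, 2, 2), (1, 8, 1, 1)]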
+ """ + + def __init__(self, + v_dim: int, + l_dim: int, + embed_dim: int, + num_heads: int, + dropout: float = 0.1, + drop_path: float = .0, + init_values: float = 1e-4): + super().__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, + l_dim=l_dim, + embed_dim=embed_dim, + num_heads=num_heads, + dropout=dropout) + + # add layer scale for training stability + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.gamma_v = nn.Parameter( + init_values * torch.ones(v_dim), requires_grad=True) + self.gamma_l = nn.Parameter( + init_values * torch.ones(l_dim), requires_grad=True) + + def forward(self, + vf0: Tensor, + vf1: Tensor, + vf2: Tensor, + vf3: Tensor, + vf4: Tensor, + lang_feature: Tensor, + attention_mask_l=None): + visual_features = [vf0, vf1, vf2, vf3, vf4] + size_per_level, visual_features_flatten = [], [] + for i, feat_per_level in enumerate(visual_features): + bs, c, h, w = feat_per_level.shape + size_per_level.append([h, w]) + feat = permute_and_flatten(feat_per_level, bs, -1, c, h, w) + visual_features_flatten.append(feat) + visual_features_flatten = torch.cat(visual_features_flatten, dim=1) + new_v, new_lang_feature = self.single_attention_call( + visual_features_flatten, + lang_feature, + attention_mask_l=attention_mask_l) + # [bs, N, C] -> [bs, C, N] + new_v = new_v.transpose(1, 2).contiguous() + + start = 0 + # fvfs is mean fusion_visual_features + fvfs = [] + for (h, w) in size_per_level: + new_v_per_level = new_v[:, :, + start:start + h * w].view(bs, -1, h, + w).contiguous() + fvfs.append(new_v_per_level) + start += h * w + + return fvfs[0], fvfs[1], fvfs[2], fvfs[3], fvfs[4], new_lang_feature + + def single_attention_call( + self, + visual: Tensor, + lang: Tensor, + attention_mask_v: Optional[Tensor] = None, + attention_mask_l: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor]: + """Perform a single attention call between the visual and language + inputs. + + Args: + visual (Tensor): The visual input tensor. + lang (Tensor): The language input tensor. + attention_mask_v (Optional[Tensor]): + An optional attention mask tensor for the visual input. + attention_mask_l (Optional[Tensor]): + An optional attention mask tensor for the language input. + + Returns: + Tuple[Tensor, Tensor]: A tuple containing the updated + visual and language tensors after the attention call. + """ + visual = self.layer_norm_v(visual) + lang = self.layer_norm_l(lang) + delta_v, delta_l = self.attn( + visual, + lang, + attention_mask_v=attention_mask_v, + attention_mask_l=attention_mask_l) + # visual, lang = visual + delta_v, l + delta_l + visual = visual + self.drop_path(self.gamma_v * delta_v) + lang = lang + self.drop_path(self.gamma_l * delta_l) + return visual, lang + + +class SingleScaleBiAttentionBlock(BiAttentionBlock): + """This is a single-scale implementation of `BiAttentionBlock`. + + The only differenece between it and `BiAttentionBlock` is that the + `forward` function of `SingleScaleBiAttentionBlock` only accepts a single + flatten visual feature map, while the `forward` function in + `BiAttentionBlock` accepts multiple visual feature maps. + """ + + def forward(self, + visual_feature: Tensor, + lang_feature: Tensor, + attention_mask_v=None, + attention_mask_l=None): + """Single-scale forward pass. + + Args: + visual_feature (Tensor): The visual input tensor. Tensor of + shape (bs, patch_len, ch). 
+ lang_feature (Tensor): The language input tensor. Tensor of + shape (bs, text_len, ch). + attention_mask_v (_type_, optional): Visual feature attention + mask. Defaults to None. + attention_mask_l (_type_, optional): Language feature attention + mask.Defaults to None. + """ + new_v, new_lang_feature = self.single_attention_call( + visual_feature, + lang_feature, + attention_mask_v=attention_mask_v, + attention_mask_l=attention_mask_l) + return new_v, new_lang_feature + + +class VLFuse(nn.Module): + """Early Fusion Module. + + Args: + v_dim (int): Dimension of visual features. + l_dim (int): Dimension of language features. + embed_dim (int): The embedding dimension for the attention operation. + num_heads (int): Number of attention heads. + dropout (float): Dropout probability. + drop_path (float): Drop path probability. + use_checkpoint (bool): Whether to use PyTorch's checkpoint function. + """ + + def __init__(self, + v_dim: int = 256, + l_dim: int = 768, + embed_dim: int = 2048, + num_heads: int = 8, + dropout: float = 0.1, + drop_path: float = 0.0, + use_checkpoint: bool = False): + super().__init__() + self.use_checkpoint = use_checkpoint + self.b_attn = BiAttentionBlock( + v_dim=v_dim, + l_dim=l_dim, + embed_dim=embed_dim, + num_heads=num_heads, + dropout=dropout, + drop_path=drop_path, + init_values=1.0 / 6.0) + + def forward(self, x: dict) -> dict: + """Forward pass of the VLFuse module.""" + visual_features = x['visual'] + language_dict_features = x['lang'] + + if self.use_checkpoint: + # vf is mean visual_features + # checkpoint does not allow complex data structures as input, + # such as list, so we must split them. + vf0, vf1, vf2, vf3, vf4, language_features = checkpoint.checkpoint( + self.b_attn, *visual_features, + language_dict_features['hidden'], + language_dict_features['masks']) + else: + vf0, vf1, vf2, vf3, vf4, language_features = self.b_attn( + *visual_features, language_dict_features['hidden'], + language_dict_features['masks']) + + language_dict_features['hidden'] = language_features + fused_language_dict_features = language_dict_features + + features_dict = { + 'visual': [vf0, vf1, vf2, vf3, vf4], + 'lang': fused_language_dict_features + } + + return features_dict + + +class BertEncoderLayer(BertPreTrainedModel): + """A modified version of the `BertLayer` class from the + `transformers.models.bert.modeling_bert` module. + + Args: + config (:class:`~transformers.BertConfig`): + The configuration object that + contains various parameters for the model. + clamp_min_for_underflow (bool, optional): + Whether to clamp the minimum value of the hidden states + to prevent underflow. Defaults to `False`. + clamp_max_for_overflow (bool, optional): + Whether to clamp the maximum value of the hidden states + to prevent overflow. Defaults to `False`. 
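+
+    Example:
+        >>> # Illustrative sketch; assumes `transformers` is installed.
+        >>> from transformers import BertConfig
+        >>> layer = BertEncoderLayer(BertConfig(),
+        ...                          clamp_min_for_underflow=True,
+        ...                          clamp_max_for_overflow=True)
+        >>> # Its forward consumes the fused feature dict:
+        >>> # {'visual': [...], 'lang': {'hidden': ..., 'masks': ...}}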
+ """ + + def __init__(self, + config: BertConfig, + clamp_min_for_underflow: bool = False, + clamp_max_for_overflow: bool = False): + super().__init__(config) + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + + self.attention = BertAttention(config, clamp_min_for_underflow, + clamp_max_for_overflow) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, inputs: Dict[str, Dict[str, torch.Tensor]] + ) -> Dict[str, Dict[str, torch.Tensor]]: + """Applies the BertEncoderLayer to the input features.""" + language_dict_features = inputs['lang'] + hidden_states = language_dict_features['hidden'] + attention_mask = language_dict_features['masks'] + + device = hidden_states.device + input_shape = hidden_states.size()[:-1] + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device) + + self_attention_outputs = self.attention( + hidden_states, + extended_attention_mask, + None, + output_attentions=False, + past_key_value=None) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] + layer_output = apply_chunking_to_forward(self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output) + outputs = (layer_output, ) + outputs + hidden_states = outputs[0] + + language_dict_features['hidden'] = hidden_states + + features_dict = { + 'visual': inputs['visual'], + 'lang': language_dict_features + } + + return features_dict + + def feed_forward_chunk(self, attention_output: Tensor) -> Tensor: + """Applies the intermediate and output layers of the BertEncoderLayer + to a chunk of the input sequence.""" + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# The following code is the same as the Huggingface code, +# with the only difference being the additional clamp operation. +class BertSelfAttention(nn.Module): + """BERT self-attention layer from Huggingface transformers. + + Compared to the BertSelfAttention of Huggingface, only add the clamp. + + Args: + config (:class:`~transformers.BertConfig`): + The configuration object that + contains various parameters for the model. + clamp_min_for_underflow (bool, optional): + Whether to clamp the minimum value of the hidden states + to prevent underflow. Defaults to `False`. + clamp_max_for_overflow (bool, optional): + Whether to clamp the maximum value of the hidden states + to prevent overflow. Defaults to `False`. 
+ """ + + def __init__(self, + config: BertConfig, + clamp_min_for_underflow: bool = False, + clamp_max_for_overflow: bool = False): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and \ + not hasattr(config, 'embedding_size'): + raise ValueError(f'The hidden size ({config.hidden_size}) is ' + 'not a multiple of the number of attention ' + f'heads ({config.num_attention_heads})') + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / + config.num_attention_heads) + self.all_head_size = self.num_attention_heads * \ + self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, + 'position_embedding_type', + 'absolute') + if self.position_embedding_type == 'relative_key' or \ + self.position_embedding_type == 'relative_key_query': + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, + self.attention_head_size) + self.clamp_min_for_underflow = clamp_min_for_underflow + self.clamp_max_for_overflow = clamp_max_for_overflow + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: Tensor) -> Tensor: + """Transpose the dimensions of `x`.""" + new_x_shape = x.size()[:-1] + (self.num_attention_heads, + self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + head_mask: Optional[Tensor] = None, + encoder_hidden_states: Optional[Tensor] = None, + encoder_attention_mask: Optional[Tensor] = None, + past_key_value: Optional[Tuple[Tensor, Tensor]] = None, + output_attentions: bool = False, + ) -> Tuple[Tensor, ...]: + """Perform a forward pass through the BERT self-attention layer.""" + + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores( + self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores( + self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" + # to get the raw attention scores. 
+ attention_scores = torch.matmul(query_layer, + key_layer.transpose(-1, -2)) + + if self.position_embedding_type == 'relative_key' or \ + self.position_embedding_type == 'relative_key_query': + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange( + seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == 'relative_key': + relative_position_scores = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == 'relative_key_query': + relative_position_scores_query = torch.einsum( + 'bhld,lrd->bhlr', query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + 'bhrd,lrd->bhlr', key_layer, positional_embedding) + attention_scores = attention_scores + \ + relative_position_scores_query + \ + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt( + self.attention_head_size) + + if self.clamp_min_for_underflow: + attention_scores = torch.clamp( + attention_scores, min=-MAX_CLAMP_VALUE + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attention_scores = torch.clamp( + attention_scores, max=MAX_CLAMP_VALUE + ) # Do not increase 50000, data type half has quite limited range + + if attention_mask is not None: + # Apply the attention mask is + # (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + ( + self.all_head_size, ) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, + attention_probs) if output_attentions else (context_layer, ) + + if self.is_decoder: + outputs = outputs + (past_key_value, ) + return outputs + + +class BertAttention(HFBertAttention): + """BertAttention is made up of self-attention and intermediate+output. + + Compared to the BertAttention of Huggingface, only add the clamp. + + Args: + config (:class:`~transformers.BertConfig`): + The configuration object that + contains various parameters for the model. + clamp_min_for_underflow (bool, optional): + Whether to clamp the minimum value of the hidden states + to prevent underflow. Defaults to `False`. + clamp_max_for_overflow (bool, optional): + Whether to clamp the maximum value of the hidden states + to prevent overflow. Defaults to `False`. 
+ """ + + def __init__(self, + config: BertConfig, + clamp_min_for_underflow: bool = False, + clamp_max_for_overflow: bool = False): + super().__init__(config) + self.self = BertSelfAttention(config, clamp_min_for_underflow, + clamp_max_for_overflow) + + +class BertIntermediate(HFBertIntermediate): + """Modified from transformers.models.bert.modeling_bert.BertIntermediate. + + Compared to the BertIntermediate of Huggingface, only add the clamp. + """ + + def forward(self, hidden_states: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = clamp_values(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = clamp_values(hidden_states) + return hidden_states + + +class BertOutput(HFBertOutput): + """Modified from transformers.models.bert.modeling_bert.BertOutput. + + Compared to the BertOutput of Huggingface, only add the clamp. + """ + + def forward(self, hidden_states: Tensor, input_tensor: Tensor) -> Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = clamp_values(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = clamp_values(hidden_states) + return hidden_states diff --git a/head_extractor/build/lib/mmdet/models/utils/wbf.py b/head_extractor/build/lib/mmdet/models/utils/wbf.py new file mode 100644 index 0000000000000000000000000000000000000000..b26a2c669a520467c6fcf52d0eec53a69834a16a --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/utils/wbf.py @@ -0,0 +1,250 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import warnings +from typing import Tuple + +import numpy as np +import torch +from torch import Tensor + + +# References: https://github.com/ZFTurbo/Weighted-Boxes-Fusion +def weighted_boxes_fusion( + bboxes_list: list, + scores_list: list, + labels_list: list, + weights: list = None, + iou_thr: float = 0.55, + skip_box_thr: float = 0.0, + conf_type: str = 'avg', + allows_overflow: bool = False) -> Tuple[Tensor, Tensor, Tensor]: + """weighted boxes fusion is a method for + fusing predictions from different object detection models, which utilizes + confidence scores of all proposed bounding boxes to construct averaged + boxes. + + Args: + bboxes_list(list): list of boxes predictions from each model, + each box is 4 numbers. + scores_list(list): list of scores for each model + labels_list(list): list of labels for each model + weights: list of weights for each model. + Default: None, which means weight == 1 for each model + iou_thr: IoU value for boxes to be a match + skip_box_thr: exclude boxes with score lower than this variable. + conf_type: how to calculate confidence in weighted boxes. + 'avg': average value, + 'max': maximum value, + 'box_and_model_avg': box and model wise hybrid weighted average, + 'absent_model_aware_avg': weighted average that takes into + account the absent model. + allows_overflow: false if we want confidence score not exceed 1.0. + + Returns: + bboxes(Tensor): boxes coordinates (Order of boxes: x1, y1, x2, y2). + scores(Tensor): confidence scores + labels(Tensor): boxes labels + """ + + if weights is None: + weights = np.ones(len(bboxes_list)) + if len(weights) != len(bboxes_list): + print('Warning: incorrect number of weights {}. Must be: ' + '{}. 
Set weights equal to 1.'.format( + len(weights), len(bboxes_list))) + weights = np.ones(len(bboxes_list)) + weights = np.array(weights) + + if conf_type not in [ + 'avg', 'max', 'box_and_model_avg', 'absent_model_aware_avg' + ]: + print('Unknown conf_type: {}. Must be "avg", ' + '"max" or "box_and_model_avg", ' + 'or "absent_model_aware_avg"'.format(conf_type)) + exit() + + filtered_boxes = prefilter_boxes(bboxes_list, scores_list, labels_list, + weights, skip_box_thr) + if len(filtered_boxes) == 0: + return torch.Tensor(), torch.Tensor(), torch.Tensor() + + overall_boxes = [] + + for label in filtered_boxes: + boxes = filtered_boxes[label] + new_boxes = [] + weighted_boxes = np.empty((0, 8)) + + # Clusterize boxes + for j in range(0, len(boxes)): + index, best_iou = find_matching_box_fast(weighted_boxes, boxes[j], + iou_thr) + + if index != -1: + new_boxes[index].append(boxes[j]) + weighted_boxes[index] = get_weighted_box( + new_boxes[index], conf_type) + else: + new_boxes.append([boxes[j].copy()]) + weighted_boxes = np.vstack((weighted_boxes, boxes[j].copy())) + + # Rescale confidence based on number of models and boxes + for i in range(len(new_boxes)): + clustered_boxes = new_boxes[i] + if conf_type == 'box_and_model_avg': + clustered_boxes = np.array(clustered_boxes) + # weighted average for boxes + weighted_boxes[i, 1] = weighted_boxes[i, 1] * len( + clustered_boxes) / weighted_boxes[i, 2] + # identify unique model index by model index column + _, idx = np.unique(clustered_boxes[:, 3], return_index=True) + # rescale by unique model weights + weighted_boxes[i, 1] = weighted_boxes[i, 1] * clustered_boxes[ + idx, 2].sum() / weights.sum() + elif conf_type == 'absent_model_aware_avg': + clustered_boxes = np.array(clustered_boxes) + # get unique model index in the cluster + models = np.unique(clustered_boxes[:, 3]).astype(int) + # create a mask to get unused model weights + mask = np.ones(len(weights), dtype=bool) + mask[models] = False + # absent model aware weighted average + weighted_boxes[ + i, 1] = weighted_boxes[i, 1] * len(clustered_boxes) / ( + weighted_boxes[i, 2] + weights[mask].sum()) + elif conf_type == 'max': + weighted_boxes[i, 1] = weighted_boxes[i, 1] / weights.max() + elif not allows_overflow: + weighted_boxes[i, 1] = weighted_boxes[i, 1] * min( + len(weights), len(clustered_boxes)) / weights.sum() + else: + weighted_boxes[i, 1] = weighted_boxes[i, 1] * len( + clustered_boxes) / weights.sum() + overall_boxes.append(weighted_boxes) + overall_boxes = np.concatenate(overall_boxes, axis=0) + overall_boxes = overall_boxes[overall_boxes[:, 1].argsort()[::-1]] + + bboxes = torch.Tensor(overall_boxes[:, 4:]) + scores = torch.Tensor(overall_boxes[:, 1]) + labels = torch.Tensor(overall_boxes[:, 0]).int() + + return bboxes, scores, labels + + +def prefilter_boxes(boxes, scores, labels, weights, thr): + + new_boxes = dict() + + for t in range(len(boxes)): + + if len(boxes[t]) != len(scores[t]): + print('Error. Length of boxes arrays not equal to ' + 'length of scores array: {} != {}'.format( + len(boxes[t]), len(scores[t]))) + exit() + + if len(boxes[t]) != len(labels[t]): + print('Error. 
Length of boxes arrays not equal to ' + 'length of labels array: {} != {}'.format( + len(boxes[t]), len(labels[t]))) + exit() + + for j in range(len(boxes[t])): + score = scores[t][j] + if score < thr: + continue + label = int(labels[t][j]) + box_part = boxes[t][j] + x1 = float(box_part[0]) + y1 = float(box_part[1]) + x2 = float(box_part[2]) + y2 = float(box_part[3]) + + # Box data checks + if x2 < x1: + warnings.warn('X2 < X1 value in box. Swap them.') + x1, x2 = x2, x1 + if y2 < y1: + warnings.warn('Y2 < Y1 value in box. Swap them.') + y1, y2 = y2, y1 + if (x2 - x1) * (y2 - y1) == 0.0: + warnings.warn('Zero area box skipped: {}.'.format(box_part)) + continue + + # [label, score, weight, model index, x1, y1, x2, y2] + b = [ + int(label), + float(score) * weights[t], weights[t], t, x1, y1, x2, y2 + ] + + if label not in new_boxes: + new_boxes[label] = [] + new_boxes[label].append(b) + + # Sort each list in dict by score and transform it to numpy array + for k in new_boxes: + current_boxes = np.array(new_boxes[k]) + new_boxes[k] = current_boxes[current_boxes[:, 1].argsort()[::-1]] + + return new_boxes + + +def get_weighted_box(boxes, conf_type='avg'): + + box = np.zeros(8, dtype=np.float32) + conf = 0 + conf_list = [] + w = 0 + for b in boxes: + box[4:] += (b[1] * b[4:]) + conf += b[1] + conf_list.append(b[1]) + w += b[2] + box[0] = boxes[0][0] + if conf_type in ('avg', 'box_and_model_avg', 'absent_model_aware_avg'): + box[1] = conf / len(boxes) + elif conf_type == 'max': + box[1] = np.array(conf_list).max() + box[2] = w + box[3] = -1 + box[4:] /= conf + + return box + + +def find_matching_box_fast(boxes_list, new_box, match_iou): + + def bb_iou_array(boxes, new_box): + # bb intersection over union + xA = np.maximum(boxes[:, 0], new_box[0]) + yA = np.maximum(boxes[:, 1], new_box[1]) + xB = np.minimum(boxes[:, 2], new_box[2]) + yB = np.minimum(boxes[:, 3], new_box[3]) + + interArea = np.maximum(xB - xA, 0) * np.maximum(yB - yA, 0) + + # compute the area of both the prediction and ground-truth rectangles + boxAArea = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + boxBArea = (new_box[2] - new_box[0]) * (new_box[3] - new_box[1]) + + iou = interArea / (boxAArea + boxBArea - interArea) + + return iou + + if boxes_list.shape[0] == 0: + return -1, match_iou + + boxes = boxes_list + + ious = bb_iou_array(boxes[:, 4:], new_box[4:]) + + ious[boxes[:, 0] != new_box[0]] = -1 + + best_idx = np.argmax(ious) + best_iou = ious[best_idx] + + if best_iou <= match_iou: + best_iou = match_iou + best_idx = -1 + + return best_idx, best_iou diff --git a/head_extractor/build/lib/mmdet/models/vis/__init__.py b/head_extractor/build/lib/mmdet/models/vis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ab63a9066bcf6cd25d7c9063cc66d9b0390b3d42 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/vis/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mask2former_vis import Mask2FormerVideo +from .masktrack_rcnn import MaskTrackRCNN + +__all__ = ['Mask2FormerVideo', 'MaskTrackRCNN'] diff --git a/head_extractor/build/lib/mmdet/models/vis/mask2former_vis.py b/head_extractor/build/lib/mmdet/models/vis/mask2former_vis.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab04296e120622f4b5e28739f4c3323d253f7d5 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/vis/mask2former_vis.py @@ -0,0 +1,120 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Union + +from torch import Tensor + +from mmdet.models.mot import BaseMOTModel +from mmdet.registry import MODELS +from mmdet.structures import TrackDataSample, TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class Mask2FormerVideo(BaseMOTModel): + r"""Implementation of `Masked-attention Mask + Transformer for Universal Image Segmentation + `_. + + Args: + backbone (dict): Configuration of backbone. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + Defaults to None. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + backbone: Optional[dict] = None, + track_head: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super(BaseMOTModel, self).__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + if backbone is not None: + self.backbone = MODELS.build(backbone) + + if track_head is not None: + self.track_head = MODELS.build(track_head) + + self.num_classes = self.track_head.num_classes + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Overload in order to load mmdet pretrained ckpt.""" + for key in list(state_dict): + if key.startswith('panoptic_head'): + state_dict[key.replace('panoptic', + 'track')] = state_dict.pop(key) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> Union[dict, tuple]: + """ + Args: + inputs (Tensor): Input images of shape (N, T, C, H, W). + These should usually be mean centered and std scaled. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + # shape (N * T, C, H, W) + img = inputs.flatten(0, 1) + + x = self.backbone(img) + losses = self.track_head.loss(x, data_samples) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True) -> TrackSampleList: + """Predict results from a batch of inputs and data samples with + postprocessing. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + + assert len(data_samples) == 1, \ + 'Mask2former only support 1 batch size per gpu for now.' 
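+
+        # e.g. a T-frame video arrives as inputs of shape (1, T, C, H, W)
+        # and is unbatched to (T, C, H, W) below before feature extraction.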
+ + # [T, C, H, W] + img = inputs[0] + track_data_sample = data_samples[0] + feats = self.backbone(img) + pred_track_ins_list = self.track_head.predict(feats, track_data_sample, + rescale) + + det_data_samples_list = [] + for idx, pred_track_ins in enumerate(pred_track_ins_list): + img_data_sample = track_data_sample[idx] + img_data_sample.pred_track_instances = pred_track_ins + det_data_samples_list.append(img_data_sample) + + results = TrackDataSample() + results.video_data_samples = det_data_samples_list + return [results] diff --git a/head_extractor/build/lib/mmdet/models/vis/masktrack_rcnn.py b/head_extractor/build/lib/mmdet/models/vis/masktrack_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..9c28e7b8529d3d53d5a59ecff0ea46662d035f23 --- /dev/null +++ b/head_extractor/build/lib/mmdet/models/vis/masktrack_rcnn.py @@ -0,0 +1,181 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch +from torch import Tensor + +from mmdet.models.mot import BaseMOTModel +from mmdet.registry import MODELS +from mmdet.structures import TrackSampleList +from mmdet.utils import OptConfigType, OptMultiConfig + + +@MODELS.register_module() +class MaskTrackRCNN(BaseMOTModel): + """Video Instance Segmentation. + + This video instance segmentor is the implementation of`MaskTrack R-CNN + `_. + + Args: + detector (dict): Configuration of detector. Defaults to None. + track_head (dict): Configuration of track head. Defaults to None. + tracker (dict): Configuration of tracker. Defaults to None. + data_preprocessor (dict or ConfigDict, optional): The pre-process + config of :class:`TrackDataPreprocessor`. it usually includes, + ``pad_size_divisor``, ``pad_value``, ``mean`` and ``std``. + init_cfg (dict or list[dict]): Configuration of initialization. + Defaults to None. + """ + + def __init__(self, + detector: Optional[dict] = None, + track_head: Optional[dict] = None, + tracker: Optional[dict] = None, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__(data_preprocessor, init_cfg) + + if detector is not None: + self.detector = MODELS.build(detector) + assert hasattr(self.detector, 'roi_head'), \ + 'MaskTrack R-CNN only supports two stage detectors.' + + if track_head is not None: + self.track_head = MODELS.build(track_head) + if tracker is not None: + self.tracker = MODELS.build(tracker) + + def loss(self, inputs: Tensor, data_samples: TrackSampleList, + **kwargs) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Dict[str, Tensor]): of shape (N, T, C, H, W) encoding + input images. Typically these should be mean centered and std + scaled. The N denotes batch size. The T denotes the number of + frames. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `gt_instance`. + + Returns: + dict: A dictionary of loss components. + """ + + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + assert inputs.size(1) == 2, \ + 'MaskTrackRCNN can only have 1 key frame and 1 reference frame.' 
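+
+        # With inputs of shape (N, 2, C, H, W), dim 1 holds exactly one key
+        # and one reference frame per sample; they are gathered below via
+        # the key_frames_inds/ref_frames_inds recorded in each data sample.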
+ + # split the data_samples into two aspects: key frames and reference + # frames + ref_data_samples, key_data_samples = [], [] + key_frame_inds, ref_frame_inds = [], [] + + # set cat_id of gt_labels to 0 in RPN + for track_data_sample in data_samples: + key_data_sample = track_data_sample.get_key_frames()[0] + key_data_samples.append(key_data_sample) + ref_data_sample = track_data_sample.get_ref_frames()[0] + ref_data_samples.append(ref_data_sample) + key_frame_inds.append(track_data_sample.key_frames_inds[0]) + ref_frame_inds.append(track_data_sample.ref_frames_inds[0]) + + key_frame_inds = torch.tensor(key_frame_inds, dtype=torch.int64) + ref_frame_inds = torch.tensor(ref_frame_inds, dtype=torch.int64) + batch_inds = torch.arange(len(inputs)) + key_imgs = inputs[batch_inds, key_frame_inds].contiguous() + ref_imgs = inputs[batch_inds, ref_frame_inds].contiguous() + + x = self.detector.extract_feat(key_imgs) + ref_x = self.detector.extract_feat(ref_imgs) + + losses = dict() + + # RPN forward and loss + if self.detector.with_rpn: + proposal_cfg = self.detector.train_cfg.get( + 'rpn_proposal', self.detector.test_cfg.rpn) + + rpn_losses, rpn_results_list = self.detector.rpn_head. \ + loss_and_predict(x, + key_data_samples, + proposal_cfg=proposal_cfg, + **kwargs) + + # avoid get same name with roi_head loss + keys = rpn_losses.keys() + for key in keys: + if 'loss' in key and 'rpn' not in key: + rpn_losses[f'rpn_{key}'] = rpn_losses.pop(key) + losses.update(rpn_losses) + else: + # TODO: Not support currently, should have a check at Fast R-CNN + assert key_data_samples[0].get('proposals', None) is not None + # use pre-defined proposals in InstanceData for the second stage + # to extract ROI features. + rpn_results_list = [ + key_data_sample.proposals + for key_data_sample in key_data_samples + ] + + losses_detect = self.detector.roi_head.loss(x, rpn_results_list, + key_data_samples, **kwargs) + losses.update(losses_detect) + + losses_track = self.track_head.loss(x, ref_x, rpn_results_list, + data_samples, **kwargs) + losses.update(losses_track) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: TrackSampleList, + rescale: bool = True, + **kwargs) -> TrackSampleList: + """Test without augmentation. + + Args: + inputs (Tensor): of shape (N, T, C, H, W) encoding + input images. The N denotes batch size. + The T denotes the number of frames in a video. + data_samples (list[:obj:`TrackDataSample`]): The batch + data samples. It usually includes information such + as `video_data_samples`. + rescale (bool, Optional): If False, then returned bboxes and masks + will fit the scale of img, otherwise, returned bboxes and masks + will fit the scale of original image shape. Defaults to True. + + Returns: + TrackSampleList: Tracking results of the inputs. + """ + assert inputs.dim() == 5, 'The img must be 5D Tensor (N, T, C, H, W).' + + assert len(data_samples) == 1, \ + 'MaskTrackRCNN only support 1 batch size per gpu for now.' 
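+
+        # Frames are processed sequentially below: detect on each frame,
+        # then associate with the tracker, which is reset whenever a new
+        # video starts (frame_id == 0).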
+ + track_data_sample = data_samples[0] + video_len = len(track_data_sample) + if track_data_sample[0].frame_id == 0: + self.tracker.reset() + + for frame_id in range(video_len): + img_data_sample = track_data_sample[frame_id] + single_img = inputs[:, frame_id].contiguous() + x = self.detector.extract_feat(single_img) + + rpn_results_list = self.detector.rpn_head.predict( + x, [img_data_sample]) + # det_results List[InstanceData] + det_results = self.detector.roi_head.predict( + x, rpn_results_list, [img_data_sample], rescale=rescale) + assert len(det_results) == 1, 'Batch inference is not supported.' + assert 'masks' in det_results[0], 'There are no mask results.' + + img_data_sample.pred_instances = det_results[0] + frame_pred_track_instances = self.tracker.track( + model=self, feats=x, data_sample=img_data_sample, **kwargs) + img_data_sample.pred_track_instances = frame_pred_track_instances + + return [track_data_sample] diff --git a/head_extractor/build/lib/mmdet/registry.py b/head_extractor/build/lib/mmdet/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5b2b28a4f80a488994b48a99043a20c604e55e --- /dev/null +++ b/head_extractor/build/lib/mmdet/registry.py @@ -0,0 +1,121 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMDetection provides 17 registry nodes to support using modules across +projects. Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. +""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry( + 'runner', parent=MMENGINE_RUNNERS, locations=['mmdet.engine.runner']) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', + parent=MMENGINE_RUNNER_CONSTRUCTORS, + locations=['mmdet.engine.runner']) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry( + 'loop', parent=MMENGINE_LOOPS, locations=['mmdet.engine.runner']) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', 
parent=MMENGINE_HOOKS, locations=['mmdet.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmdet.datasets']) +DATA_SAMPLERS = Registry( + 'data sampler', + parent=MMENGINE_DATA_SAMPLERS, + locations=['mmdet.datasets.samplers']) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmdet.datasets.transforms']) + +# manage all kinds of modules inheriting `nn.Module` +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmdet.models']) +# manage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmdet.models']) +# manage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry( + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmdet.models']) + +# manage all kinds of optimizers like `SGD` and `Adam` +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmdet.engine.optimizers']) +# manage optimizer wrapper +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmdet.engine.optimizers']) +# manage constructors that customize the optimization hyperparameters. +OPTIM_WRAPPER_CONSTRUCTORS = Registry( + 'optimizer constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmdet.engine.optimizers']) +# manage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmdet.engine.schedulers']) +# manage all kinds of metrics +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmdet.evaluation']) +# manage evaluator +EVALUATOR = Registry( + 'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmdet.evaluation']) + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmdet.models']) + +# manage visualizer +VISUALIZERS = Registry( + 'visualizer', + parent=MMENGINE_VISUALIZERS, + locations=['mmdet.visualization']) +# manage visualizer backend +VISBACKENDS = Registry( + 'vis_backend', + parent=MMENGINE_VISBACKENDS, + locations=['mmdet.visualization']) + +# manage logprocessor +LOG_PROCESSORS = Registry( + 'log_processor', + parent=MMENGINE_LOG_PROCESSORS, + # TODO: update the location when mmdet has its own log processor + locations=['mmdet.engine']) diff --git a/head_extractor/build/lib/mmdet/structures/__init__.py b/head_extractor/build/lib/mmdet/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..381c6a4f4549c2c4395d994cbd860a3e52eb9994 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .det_data_sample import DetDataSample, OptSampleList, SampleList +from .reid_data_sample import ReIDDataSample +from .track_data_sample import (OptTrackSampleList, TrackDataSample, + TrackSampleList) + +__all__ = [ + 'DetDataSample', 'SampleList', 'OptSampleList', 'TrackDataSample', + 'TrackSampleList', 'OptTrackSampleList', 'ReIDDataSample' +] diff --git a/head_extractor/build/lib/mmdet/structures/bbox/__init__.py b/head_extractor/build/lib/mmdet/structures/bbox/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4d531986509ad1b2141118449aab39343bbde82c --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/bbox/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_boxes import BaseBoxes +from .bbox_overlaps import bbox_overlaps +from .box_type import (autocast_box_type, convert_box_type, get_box_type, + register_box, register_box_converter) +from .horizontal_boxes import HorizontalBoxes +from .transforms import bbox_cxcyah_to_xyxy # noqa: E501 +from .transforms import (bbox2corner, bbox2distance, bbox2result, bbox2roi, + bbox_cxcywh_to_xyxy, bbox_flip, bbox_mapping, + bbox_mapping_back, bbox_project, bbox_rescale, + bbox_xyxy_to_cxcyah, bbox_xyxy_to_cxcywh, cat_boxes, + corner2bbox, distance2bbox, empty_box_as, + find_inside_bboxes, get_box_tensor, get_box_wh, + roi2bbox, scale_boxes, stack_boxes) + +__all__ = [ + 'bbox_overlaps', 'bbox_flip', 'bbox_mapping', 'bbox_mapping_back', + 'bbox2roi', 'roi2bbox', 'bbox2result', 'distance2bbox', 'bbox2distance', + 'bbox_rescale', 'bbox_cxcywh_to_xyxy', 'bbox_xyxy_to_cxcywh', + 'find_inside_bboxes', 'bbox2corner', 'corner2bbox', 'bbox_project', + 'BaseBoxes', 'convert_box_type', 'get_box_type', 'register_box', + 'register_box_converter', 'HorizontalBoxes', 'autocast_box_type', + 'cat_boxes', 'stack_boxes', 'scale_boxes', 'get_box_wh', 'get_box_tensor', + 'empty_box_as', 'bbox_xyxy_to_cxcyah', 'bbox_cxcyah_to_xyxy' +] diff --git a/head_extractor/build/lib/mmdet/structures/bbox/base_boxes.py b/head_extractor/build/lib/mmdet/structures/bbox/base_boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..0ed667664a8a57a1b9b7e422af03d41274882747 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/bbox/base_boxes.py @@ -0,0 +1,549 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod, abstractproperty, abstractstaticmethod +from typing import List, Optional, Sequence, Tuple, Type, TypeVar, Union + +import numpy as np +import torch +from torch import BoolTensor, Tensor + +from mmdet.structures.mask.structures import BitmapMasks, PolygonMasks + +T = TypeVar('T') +DeviceType = Union[str, torch.device] +IndexType = Union[slice, int, list, torch.LongTensor, torch.cuda.LongTensor, + torch.BoolTensor, torch.cuda.BoolTensor, np.ndarray] +MaskType = Union[BitmapMasks, PolygonMasks] + + +class BaseBoxes(metaclass=ABCMeta): + """The base class for 2D box types. + + The functions of ``BaseBoxes`` lie in three fields: + + - Verify the boxes shape. + - Support tensor-like operations. + - Define abstract functions for 2D boxes. + + In ``__init__`` , ``BaseBoxes`` verifies the validity of the data shape + w.r.t ``box_dim``. The tensor with the dimension >= 2 and the length + of the last dimension being ``box_dim`` will be regarded as valid. + ``BaseBoxes`` will restore them at the field ``tensor``. It's necessary + to override ``box_dim`` in subclass to guarantee the data shape is + correct. 
+ + There are many basic tensor-like functions implemented in ``BaseBoxes``. + In most cases, users can operate ``BaseBoxes`` instance like a normal + tensor. To protect the validity of data shape, All tensor-like functions + cannot modify the last dimension of ``self.tensor``. + + When creating a new box type, users need to inherit from ``BaseBoxes`` + and override abstract methods and specify the ``box_dim``. Then, register + the new box type by using the decorator ``register_box_type``. + + Args: + data (Tensor or np.ndarray or Sequence): The box data with shape + (..., box_dim). + dtype (torch.dtype, Optional): data type of boxes. Defaults to None. + device (str or torch.device, Optional): device of boxes. + Default to None. + clone (bool): Whether clone ``boxes`` or not. Defaults to True. + """ + + # Used to verify the last dimension length + # Should override it in subclass. + box_dim: int = 0 + + def __init__(self, + data: Union[Tensor, np.ndarray, Sequence], + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceType] = None, + clone: bool = True) -> None: + if isinstance(data, (np.ndarray, Tensor, Sequence)): + data = torch.as_tensor(data) + else: + raise TypeError('boxes should be Tensor, ndarray, or Sequence, ', + f'but got {type(data)}') + + if device is not None or dtype is not None: + data = data.to(dtype=dtype, device=device) + # Clone the data to avoid potential bugs + if clone: + data = data.clone() + # handle the empty input like [] + if data.numel() == 0: + data = data.reshape((-1, self.box_dim)) + + assert data.dim() >= 2 and data.size(-1) == self.box_dim, \ + ('The boxes dimension must >= 2 and the length of the last ' + f'dimension must be {self.box_dim}, but got boxes with ' + f'shape {data.shape}.') + self.tensor = data + + def convert_to(self, dst_type: Union[str, type]) -> 'BaseBoxes': + """Convert self to another box type. + + Args: + dst_type (str or type): destination box type. + + Returns: + :obj:`BaseBoxes`: destination box type object . + """ + from .box_type import convert_box_type + return convert_box_type(self, dst_type=dst_type) + + def empty_boxes(self: T, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceType] = None) -> T: + """Create empty box. + + Args: + dtype (torch.dtype, Optional): data type of boxes. + device (str or torch.device, Optional): device of boxes. + + Returns: + T: empty boxes with shape of (0, box_dim). + """ + empty_box = self.tensor.new_zeros( + 0, self.box_dim, dtype=dtype, device=device) + return type(self)(empty_box, clone=False) + + def fake_boxes(self: T, + sizes: Tuple[int], + fill: float = 0, + dtype: Optional[torch.dtype] = None, + device: Optional[DeviceType] = None) -> T: + """Create fake boxes with specific sizes and fill values. + + Args: + sizes (Tuple[int]): The size of fake boxes. The last value must + be equal with ``self.box_dim``. + fill (float): filling value. Defaults to 0. + dtype (torch.dtype, Optional): data type of boxes. + device (str or torch.device, Optional): device of boxes. + + Returns: + T: Fake boxes with shape of ``sizes``. 
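+ + Example: + >>> # A minimal sketch, assuming ``HorizontalBoxes`` (whose + >>> # ``box_dim`` is 4) as the concrete subclass. + >>> import torch + >>> from mmdet.structures.bbox import HorizontalBoxes + >>> boxes = HorizontalBoxes(torch.rand(5, 4)) + >>> boxes.fake_boxes((3, 4), fill=-1).tensor.shape + torch.Size([3, 4])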
+ """ + fake_boxes = self.tensor.new_full( + sizes, fill, dtype=dtype, device=device) + return type(self)(fake_boxes, clone=False) + + def __getitem__(self: T, index: IndexType) -> T: + """Rewrite getitem to protect the last dimension shape.""" + boxes = self.tensor + if isinstance(index, np.ndarray): + index = torch.as_tensor(index, device=self.device) + if isinstance(index, Tensor) and index.dtype == torch.bool: + assert index.dim() < boxes.dim() + elif isinstance(index, tuple): + assert len(index) < boxes.dim() + # `Ellipsis`(...) is commonly used in index like [None, ...]. + # When `Ellipsis` is in index, it must be the last item. + if Ellipsis in index: + assert index[-1] is Ellipsis + + boxes = boxes[index] + if boxes.dim() == 1: + boxes = boxes.reshape(1, -1) + return type(self)(boxes, clone=False) + + def __setitem__(self: T, index: IndexType, values: Union[Tensor, T]) -> T: + """Rewrite setitem to protect the last dimension shape.""" + assert type(values) is type(self), \ + 'The value to be set must be the same box type as self' + values = values.tensor + + if isinstance(index, np.ndarray): + index = torch.as_tensor(index, device=self.device) + if isinstance(index, Tensor) and index.dtype == torch.bool: + assert index.dim() < self.tensor.dim() + elif isinstance(index, tuple): + assert len(index) < self.tensor.dim() + # `Ellipsis`(...) is commonly used in index like [None, ...]. + # When `Ellipsis` is in index, it must be the last item. + if Ellipsis in index: + assert index[-1] is Ellipsis + + self.tensor[index] = values + + def __len__(self) -> int: + """Return the length of self.tensor first dimension.""" + return self.tensor.size(0) + + def __deepcopy__(self, memo): + """Only clone the ``self.tensor`` when applying deepcopy.""" + cls = self.__class__ + other = cls.__new__(cls) + memo[id(self)] = other + other.tensor = self.tensor.clone() + return other + + def __repr__(self) -> str: + """Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n' + str(self.tensor) + ')' + + def new_tensor(self, *args, **kwargs) -> Tensor: + """Reload ``new_tensor`` from self.tensor.""" + return self.tensor.new_tensor(*args, **kwargs) + + def new_full(self, *args, **kwargs) -> Tensor: + """Reload ``new_full`` from self.tensor.""" + return self.tensor.new_full(*args, **kwargs) + + def new_empty(self, *args, **kwargs) -> Tensor: + """Reload ``new_empty`` from self.tensor.""" + return self.tensor.new_empty(*args, **kwargs) + + def new_ones(self, *args, **kwargs) -> Tensor: + """Reload ``new_ones`` from self.tensor.""" + return self.tensor.new_ones(*args, **kwargs) + + def new_zeros(self, *args, **kwargs) -> Tensor: + """Reload ``new_zeros`` from self.tensor.""" + return self.tensor.new_zeros(*args, **kwargs) + + def size(self, dim: Optional[int] = None) -> Union[int, torch.Size]: + """Reload new_zeros from self.tensor.""" + # self.tensor.size(dim) cannot work when dim=None. 
+ return self.tensor.size() if dim is None else self.tensor.size(dim) + + def dim(self) -> int: + """Reload ``dim`` from self.tensor.""" + return self.tensor.dim() + + @property + def device(self) -> torch.device: + """Reload ``device`` from self.tensor.""" + return self.tensor.device + + @property + def dtype(self) -> torch.dtype: + """Reload ``dtype`` from self.tensor.""" + return self.tensor.dtype + + @property + def shape(self) -> torch.Size: + return self.tensor.shape + + def numel(self) -> int: + """Reload ``numel`` from self.tensor.""" + return self.tensor.numel() + + def numpy(self) -> np.ndarray: + """Reload ``numpy`` from self.tensor.""" + return self.tensor.numpy() + + def to(self: T, *args, **kwargs) -> T: + """Reload ``to`` from self.tensor.""" + return type(self)(self.tensor.to(*args, **kwargs), clone=False) + + def cpu(self: T) -> T: + """Reload ``cpu`` from self.tensor.""" + return type(self)(self.tensor.cpu(), clone=False) + + def cuda(self: T, *args, **kwargs) -> T: + """Reload ``cuda`` from self.tensor.""" + return type(self)(self.tensor.cuda(*args, **kwargs), clone=False) + + def clone(self: T) -> T: + """Reload ``clone`` from self.tensor.""" + return type(self)(self.tensor) + + def detach(self: T) -> T: + """Reload ``detach`` from self.tensor.""" + return type(self)(self.tensor.detach(), clone=False) + + def view(self: T, *shape: Tuple[int]) -> T: + """Reload ``view`` from self.tensor.""" + return type(self)(self.tensor.view(shape), clone=False) + + def reshape(self: T, *shape: Tuple[int]) -> T: + """Reload ``reshape`` from self.tensor.""" + return type(self)(self.tensor.reshape(shape), clone=False) + + def expand(self: T, *sizes: Tuple[int]) -> T: + """Reload ``expand`` from self.tensor.""" + return type(self)(self.tensor.expand(sizes), clone=False) + + def repeat(self: T, *sizes: Tuple[int]) -> T: + """Reload ``repeat`` from self.tensor.""" + return type(self)(self.tensor.repeat(sizes), clone=False) + + def transpose(self: T, dim0: int, dim1: int) -> T: + """Reload ``transpose`` from self.tensor.""" + ndim = self.tensor.dim() + assert dim0 != -1 and dim0 != ndim - 1 + assert dim1 != -1 and dim1 != ndim - 1 + return type(self)(self.tensor.transpose(dim0, dim1), clone=False) + + def permute(self: T, *dims: Tuple[int]) -> T: + """Reload ``permute`` from self.tensor.""" + assert dims[-1] == -1 or dims[-1] == self.tensor.dim() - 1 + return type(self)(self.tensor.permute(dims), clone=False) + + def split(self: T, + split_size_or_sections: Union[int, Sequence[int]], + dim: int = 0) -> List[T]: + """Reload ``split`` from self.tensor.""" + assert dim != -1 and dim != self.tensor.dim() - 1 + boxes_list = self.tensor.split(split_size_or_sections, dim=dim) + return [type(self)(boxes, clone=False) for boxes in boxes_list] + + def chunk(self: T, chunks: int, dim: int = 0) -> List[T]: + """Reload ``chunk`` from self.tensor.""" + assert dim != -1 and dim != self.tensor.dim() - 1 + boxes_list = self.tensor.chunk(chunks, dim=dim) + return [type(self)(boxes, clone=False) for boxes in boxes_list] + + def unbind(self: T, dim: int = 0) -> T: + """Reload ``unbind`` from self.tensor.""" + assert dim != -1 and dim != self.tensor.dim() - 1 + boxes_list = self.tensor.unbind(dim=dim) + return [type(self)(boxes, clone=False) for boxes in boxes_list] + + def flatten(self: T, start_dim: int = 0, end_dim: int = -2) -> T: + """Reload ``flatten`` from self.tensor.""" + assert end_dim != -1 and end_dim != self.tensor.dim() - 1 + return type(self)(self.tensor.flatten(start_dim, end_dim), clone=False) + 
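+ + # NOTE: the reshaping wrappers in this class guard the last (box_dim) + # dimension, so every returned view is still a valid box tensor.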
+ def squeeze(self: T, dim: Optional[int] = None) -> T: + """Reload ``squeeze`` from self.tensor.""" + boxes = self.tensor.squeeze() if dim is None else \ + self.tensor.squeeze(dim) + return type(self)(boxes, clone=False) + + def unsqueeze(self: T, dim: int) -> T: + """Reload ``unsqueeze`` from self.tensor.""" + assert dim != -1 and dim != self.tensor.dim() + return type(self)(self.tensor.unsqueeze(dim), clone=False) + + @classmethod + def cat(cls: Type[T], box_list: Sequence[T], dim: int = 0) -> T: + """Concatenates a box instance list into one single box instance. + Similar to ``torch.cat``. + + Args: + box_list (Sequence[T]): A sequence of box instances. + dim (int): The dimension over which the boxes are concatenated. + Defaults to 0. + + Returns: + T: Concatenated box instance. + """ + assert isinstance(box_list, Sequence) + if len(box_list) == 0: + raise ValueError('box_list should not be an empty list.') + + assert dim != -1 and dim != box_list[0].dim() - 1 + assert all(isinstance(boxes, cls) for boxes in box_list) + + th_box_list = [boxes.tensor for boxes in box_list] + return cls(torch.cat(th_box_list, dim=dim), clone=False) + + @classmethod + def stack(cls: Type[T], box_list: Sequence[T], dim: int = 0) -> T: + """Concatenates a sequence of tensors along a new dimension. Similar to + ``torch.stack``. + + Args: + box_list (Sequence[T]): A sequence of box instances. + dim (int): Dimension to insert. Defaults to 0. + + Returns: + T: Concatenated box instance. + """ + assert isinstance(box_list, Sequence) + if len(box_list) == 0: + raise ValueError('box_list should not be an empty list.') + + assert dim != -1 and dim != box_list[0].dim() + assert all(isinstance(boxes, cls) for boxes in box_list) + + th_box_list = [boxes.tensor for boxes in box_list] + return cls(torch.stack(th_box_list, dim=dim), clone=False) + + @abstractproperty + def centers(self) -> Tensor: + """Return a tensor representing the centers of boxes.""" + pass + + @abstractproperty + def areas(self) -> Tensor: + """Return a tensor representing the areas of boxes.""" + pass + + @abstractproperty + def widths(self) -> Tensor: + """Return a tensor representing the widths of boxes.""" + pass + + @abstractproperty + def heights(self) -> Tensor: + """Return a tensor representing the heights of boxes.""" + pass + + @abstractmethod + def flip_(self, + img_shape: Tuple[int, int], + direction: str = 'horizontal') -> None: + """Flip boxes horizontally or vertically in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + direction (str): Flip direction, options are "horizontal", + "vertical" and "diagonal". Defaults to "horizontal" + """ + pass + + @abstractmethod + def translate_(self, distances: Tuple[float, float]) -> None: + """Translate boxes in-place. + + Args: + distances (Tuple[float, float]): translate distances. The first + is horizontal distance and the second is vertical distance. + """ + pass + + @abstractmethod + def clip_(self, img_shape: Tuple[int, int]) -> None: + """Clip boxes according to the image shape in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + """ + pass + + @abstractmethod + def rotate_(self, center: Tuple[float, float], angle: float) -> None: + """Rotate all boxes in-place. + + Args: + center (Tuple[float, float]): Rotation origin. + angle (float): Rotation angle represented in degrees. Positive + values mean clockwise rotation.
+ """ + pass + + @abstractmethod + def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None: + """Geometric transformat boxes in-place. + + Args: + homography_matrix (Tensor or np.ndarray]): + Shape (3, 3) for geometric transformation. + """ + pass + + @abstractmethod + def rescale_(self, scale_factor: Tuple[float, float]) -> None: + """Rescale boxes w.r.t. rescale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + """ + pass + + @abstractmethod + def resize_(self, scale_factor: Tuple[float, float]) -> None: + """Resize the box width and height w.r.t scale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling box + shapes. The length should be 2. + """ + pass + + @abstractmethod + def is_inside(self, + img_shape: Tuple[int, int], + all_inside: bool = False, + allowed_border: int = 0) -> BoolTensor: + """Find boxes inside the image. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + all_inside (bool): Whether the boxes are all inside the image or + part inside the image. Defaults to False. + allowed_border (int): Boxes that extend beyond the image shape + boundary by more than ``allowed_border`` are considered + "outside" Defaults to 0. + Returns: + BoolTensor: A BoolTensor indicating whether the box is inside + the image. Assuming the original boxes have shape (m, n, box_dim), + the output has shape (m, n). + """ + pass + + @abstractmethod + def find_inside_points(self, + points: Tensor, + is_aligned: bool = False) -> BoolTensor: + """Find inside box points. Boxes dimension must be 2. + + Args: + points (Tensor): Points coordinates. Has shape of (m, 2). + is_aligned (bool): Whether ``points`` has been aligned with boxes + or not. If True, the length of boxes and ``points`` should be + the same. Defaults to False. + + Returns: + BoolTensor: A BoolTensor indicating whether a point is inside + boxes. Assuming the boxes has shape of (n, box_dim), if + ``is_aligned`` is False. The index has shape of (m, n). If + ``is_aligned`` is True, m should be equal to n and the index has + shape of (m, ). + """ + pass + + @abstractstaticmethod + def overlaps(boxes1: 'BaseBoxes', + boxes2: 'BaseBoxes', + mode: str = 'iou', + is_aligned: bool = False, + eps: float = 1e-6) -> Tensor: + """Calculate overlap between two set of boxes with their types + converted to the present box type. + + Args: + boxes1 (:obj:`BaseBoxes`): BaseBoxes with shape of (m, box_dim) + or empty. + boxes2 (:obj:`BaseBoxes`): BaseBoxes with shape of (n, box_dim) + or empty. + mode (str): "iou" (intersection over union), "iof" (intersection + over foreground). Defaults to "iou". + is_aligned (bool): If True, then m and n must be equal. Defaults + to False. + eps (float): A value added to the denominator for numerical + stability. Defaults to 1e-6. 
+ + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + pass + + @abstractstaticmethod + def from_instance_masks(masks: MaskType) -> 'BaseBoxes': + """Create boxes from instance masks. + + Args: + masks (:obj:`BitmapMasks` or :obj:`PolygonMasks`): BitmapMasks or + PolygonMasks instance with length of n. + + Returns: + :obj:`BaseBoxes`: Converted boxes with shape of (n, box_dim). + """ + pass diff --git a/head_extractor/build/lib/mmdet/structures/bbox/bbox_overlaps.py b/head_extractor/build/lib/mmdet/structures/bbox/bbox_overlaps.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3435d28b38a5479a6c791f52a76d8ba293a6eb --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/bbox/bbox_overlaps.py @@ -0,0 +1,199 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def fp16_clamp(x, min=None, max=None): + if not x.is_cuda and x.dtype == torch.float16: + # clamp for cpu float16, tensor fp16 has no clamp implementation + return x.float().clamp(min, max).half() + + return x.clamp(min, max) + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): + """Calculate overlap between two sets of bboxes. + + FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889 + Note: + Assume bboxes1 is M x 4, bboxes2 is N x 4; when mode is 'iou', + some new variables are generated when calculating IoU + with the bbox_overlaps function: + + 1) is_aligned is False + area1: M x 1 + area2: N x 1 + lt: M x N x 2 + rb: M x N x 2 + wh: M x N x 2 + overlap: M x N x 1 + union: M x N x 1 + ious: M x N x 1 + + Total memory: + S = (9 x N x M + N + M) * 4 Byte, + + When using FP16, we can reduce: + R = (9 x N x M + N + M) * 4 / 2 Byte + R is always larger than (N + M) * 4 * 2 when N and M >= 1, since + N + M <= N * M < 3 * N * M when N >= 2 and M >= 2, and + N + 1 < 3 * N when N or M is 1. + + Given M = 40 (ground truth), N = 400000 (three anchor boxes + per grid, FPN, R-CNNs), + R = 275 MB (a single pass) + + A special case (dense detection), M = 512 (ground truth), + R = 3516 MB = 3.43 GB + + When the batch size is B, reduce: + B x R + + Therefore, CUDA memory runs out frequently. + + Experiments on GeForce RTX 2080Ti (11019 MiB): + + | dtype | M | N | Use | Real | Ideal | + |:----:|:----:|:----:|:----:|:----:|:----:| + | FP32 | 512 | 400000 | 8020 MiB | -- | -- | + | FP16 | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB | + | FP32 | 40 | 400000 | 1540 MiB | -- | -- | + | FP16 | 40 | 400000 | 1264 MiB | 276 MiB | 275 MiB | + + 2) is_aligned is True + area1: N x 1 + area2: N x 1 + lt: N x 2 + rb: N x 2 + wh: N x 2 + overlap: N x 1 + union: N x 1 + ious: N x 1 + + Total memory: + S = 11 x N * 4 Byte + + When using FP16, we can reduce: + R = 11 x N * 4 / 2 Byte + + The same holds for 'giou' (which uses more memory than 'iou'). + + Time-wise, FP16 is generally faster than FP32. + + When gpu_assign_thr is not -1, it takes more time on cpu + but does not reduce memory. + Thus, we can halve the memory while keeping the speed. + + If ``is_aligned`` is ``False``, then calculate the overlaps between each + bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned + pair of bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (B, m, 4) in (x1, y1, x2, y2) format or empty. + bboxes2 (Tensor): shape (B, n, 4) in (x1, y1, x2, y2) format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal.
+ mode (str): "iou" (intersection over union), "iof" (intersection over + foreground) or "giou" (generalized intersection over union). + Default "iou". + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + + Example: + >>> empty = torch.empty(0, 4) + >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes' last dimension is 4 + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( + bboxes1[..., 3] - bboxes1[..., 1]) + area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( + bboxes2[..., 3] - bboxes2[..., 1]) + + if is_aligned: + lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] + rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] + + wh = fp16_clamp(rb - lt, min=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2]) + enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:]) + else: + lt = torch.max(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) # [B, rows, cols, 2] + rb = torch.min(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] + + wh = fp16_clamp(rb - lt, min=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + else: + union = area1[..., None] + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) + enclosed_rb = torch.max(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + # calculate gious + enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/head_extractor/build/lib/mmdet/structures/bbox/box_type.py b/head_extractor/build/lib/mmdet/structures/bbox/box_type.py new file mode 100644 
index 0000000000000000000000000000000000000000..c7eb5494c36c8efcbb414897f7c2532a6d3a1ddb --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/bbox/box_type.py @@ -0,0 +1,296 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Callable, Optional, Tuple, Type, Union + +import numpy as np +import torch +from torch import Tensor + +from .base_boxes import BaseBoxes + +BoxType = Union[np.ndarray, Tensor, BaseBoxes] + +box_types: dict = {} +_box_type_to_name: dict = {} +box_converters: dict = {} + + +def _register_box(name: str, box_type: Type, force: bool = False) -> None: + """Register a box type. + + Args: + name (str): The name of box type. + box_type (type): Box type class to be registered. + force (bool): Whether to override an existing class with the same + name. Defaults to False. + """ + assert issubclass(box_type, BaseBoxes) + name = name.lower() + + if not force and (name in box_types or box_type in _box_type_to_name): + raise KeyError(f'box type {name} has been registered') + elif name in box_types: + _box_type = box_types.pop(name) + _box_type_to_name.pop(_box_type) + elif box_type in _box_type_to_name: + _name = _box_type_to_name.pop(box_type) + box_types.pop(_name) + + box_types[name] = box_type + _box_type_to_name[box_type] = name + + +def register_box(name: str, + box_type: Type = None, + force: bool = False) -> Union[Type, Callable]: + """Register a box type. + + A record will be added to ``box_types``, whose key is the box type name + and value is the box type itself. Simultaneously, a reverse dictionary + ``_box_type_to_name`` will be updated. It can be used as a decorator or + a normal function. + + Args: + name (str): The name of box type. + box_type (type, Optional): Box type class to be registered. + Defaults to None. + force (bool): Whether to override the existing box type with the same + name. Defaults to False. + + Examples: + >>> from mmdet.structures.bbox import register_box + >>> from mmdet.structures.bbox import BaseBoxes + + >>> # as a decorator + >>> @register_box('hbox') + >>> class HorizontalBoxes(BaseBoxes): + >>> pass + + >>> # as a normal function + >>> class RotatedBoxes(BaseBoxes): + >>> pass + >>> register_box('rbox', RotatedBoxes) + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + + # use it as a normal method: register_box(name, box_type=BoxCls) + if box_type is not None: + _register_box(name=name, box_type=box_type, force=force) + return box_type + + # use it as a decorator: @register_box(name) + def _register(cls): + _register_box(name=name, box_type=cls, force=force) + return cls + + return _register + + +def _register_box_converter(src_type: Union[str, type], + dst_type: Union[str, type], + converter: Callable, + force: bool = False) -> None: + """Register a box converter. + + Args: + src_type (str or type): source box type name or class. + dst_type (str or type): destination box type name or class. + converter (Callable): Convert function. + force (bool): Whether to override the existing converter with the same + name. Defaults to False.
+ """ + assert callable(converter) + src_type_name, _ = get_box_type(src_type) + dst_type_name, _ = get_box_type(dst_type) + + converter_name = src_type_name + '2' + dst_type_name + if not force and converter_name in box_converters: + raise KeyError(f'The box converter from {src_type_name} to ' + f'{dst_type_name} has been registered.') + + box_converters[converter_name] = converter + + +def register_box_converter(src_type: Union[str, type], + dst_type: Union[str, type], + converter: Optional[Callable] = None, + force: bool = False) -> Callable: + """Register a box converter. + + A record will be added to ``box_converter``, whose key is + '{src_type_name}2{dst_type_name}' and value is the convert function. + It can be used as a decorator or a normal function. + + Args: + src_type (str or type): source box type name or class. + dst_type (str or type): destination box type name or class. + converter (Callable): Convert function. Defaults to None. + force (bool): Whether to override the existing box type with the same + name. Defaults to False. + + Examples: + >>> from mmdet.structures.bbox import register_box_converter + >>> # as a decorator + >>> @register_box_converter('hbox', 'rbox') + >>> def converter_A(boxes): + >>> pass + + >>> # as a normal function + >>> def converter_B(boxes): + >>> pass + >>> register_box_converter('rbox', 'hbox', converter_B) + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + + # use it as a normal method: + # register_box_converter(src_type, dst_type, converter=Func) + if converter is not None: + _register_box_converter( + src_type=src_type, + dst_type=dst_type, + converter=converter, + force=force) + return converter + + # use it as a decorator: @register_box_converter(name) + def _register(func): + _register_box_converter( + src_type=src_type, dst_type=dst_type, converter=func, force=force) + return func + + return _register + + +def get_box_type(box_type: Union[str, type]) -> Tuple[str, type]: + """get both box type name and class. + + Args: + box_type (str or type): Single box type name or class. + + Returns: + Tuple[str, type]: A tuple of box type name and class. + """ + if isinstance(box_type, str): + type_name = box_type.lower() + assert type_name in box_types, \ + f"Box type {type_name} hasn't been registered in box_types." + type_cls = box_types[type_name] + elif issubclass(box_type, BaseBoxes): + assert box_type in _box_type_to_name, \ + f"Box type {box_type} hasn't been registered in box_types." + type_name = _box_type_to_name[box_type] + type_cls = box_type + else: + raise KeyError('box_type must be a str or class inheriting from ' + f'BaseBoxes, but got {type(box_type)}.') + return type_name, type_cls + + +def convert_box_type(boxes: BoxType, + *, + src_type: Union[str, type] = None, + dst_type: Union[str, type] = None) -> BoxType: + """Convert boxes from source type to destination type. + + If ``boxes`` is a instance of BaseBoxes, the ``src_type`` will be set + as the type of ``boxes``. + + Args: + boxes (np.ndarray or Tensor or :obj:`BaseBoxes`): boxes need to + convert. + src_type (str or type, Optional): source box type. Defaults to None. + dst_type (str or type, Optional): destination box type. Defaults to + None. + + Returns: + Union[np.ndarray, Tensor, :obj:`BaseBoxes`]: Converted boxes. It's type + is consistent with the input's type. 
+ """ + assert dst_type is not None + dst_type_name, dst_type_cls = get_box_type(dst_type) + + is_box_cls = False + is_numpy = False + if isinstance(boxes, BaseBoxes): + src_type_name, _ = get_box_type(type(boxes)) + is_box_cls = True + elif isinstance(boxes, (Tensor, np.ndarray)): + assert src_type is not None + src_type_name, _ = get_box_type(src_type) + if isinstance(boxes, np.ndarray): + is_numpy = True + else: + raise TypeError('boxes must be a instance of BaseBoxes, Tensor or ' + f'ndarray, but get {type(boxes)}.') + + if src_type_name == dst_type_name: + return boxes + + converter_name = src_type_name + '2' + dst_type_name + assert converter_name in box_converters, \ + "Convert function hasn't been registered in box_converters." + converter = box_converters[converter_name] + + if is_box_cls: + boxes = converter(boxes.tensor) + return dst_type_cls(boxes) + elif is_numpy: + boxes = converter(torch.from_numpy(boxes)) + return boxes.numpy() + else: + return converter(boxes) + + +def autocast_box_type(dst_box_type='hbox') -> Callable: + """A decorator which automatically casts results['gt_bboxes'] to the + destination box type. + + It commenly used in mmdet.datasets.transforms to make the transforms up- + compatible with the np.ndarray type of results['gt_bboxes']. + + The speed of processing of np.ndarray and BaseBoxes data are the same: + + - np.ndarray: 0.0509 img/s + - BaseBoxes: 0.0551 img/s + + Args: + dst_box_type (str): Destination box type. + """ + _, box_type_cls = get_box_type(dst_box_type) + + def decorator(func: Callable) -> Callable: + + def wrapper(self, results: dict, *args, **kwargs) -> dict: + if ('gt_bboxes' not in results + or isinstance(results['gt_bboxes'], BaseBoxes)): + return func(self, results) + elif isinstance(results['gt_bboxes'], np.ndarray): + results['gt_bboxes'] = box_type_cls( + results['gt_bboxes'], clone=False) + if 'mix_results' in results: + for res in results['mix_results']: + if isinstance(res['gt_bboxes'], np.ndarray): + res['gt_bboxes'] = box_type_cls( + res['gt_bboxes'], clone=False) + + _results = func(self, results, *args, **kwargs) + + # In some cases, the function will process gt_bboxes in-place + # Simultaneously convert inputting and outputting gt_bboxes + # back to np.ndarray + if isinstance(_results, dict) and 'gt_bboxes' in _results: + if isinstance(_results['gt_bboxes'], BaseBoxes): + _results['gt_bboxes'] = _results['gt_bboxes'].numpy() + if isinstance(results['gt_bboxes'], BaseBoxes): + results['gt_bboxes'] = results['gt_bboxes'].numpy() + return _results + else: + raise TypeError( + "auto_box_type requires results['gt_bboxes'] to " + 'be BaseBoxes or np.ndarray, but got ' + f"{type(results['gt_bboxes'])}") + + return wrapper + + return decorator diff --git a/head_extractor/build/lib/mmdet/structures/bbox/horizontal_boxes.py b/head_extractor/build/lib/mmdet/structures/bbox/horizontal_boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..b3a78518105fda02cef2d3a2bcaceb410759165c --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/bbox/horizontal_boxes.py @@ -0,0 +1,432 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple, TypeVar, Union + +import cv2 +import numpy as np +import torch +from torch import BoolTensor, Tensor + +from mmdet.structures.mask.structures import BitmapMasks, PolygonMasks +from .base_boxes import BaseBoxes +from .bbox_overlaps import bbox_overlaps +from .box_type import register_box + +T = TypeVar('T') +DeviceType = Union[str, torch.device] +MaskType = Union[BitmapMasks, PolygonMasks] + + +@register_box(name='hbox') +class HorizontalBoxes(BaseBoxes): + """The horizontal box class used in MMDetection by default. + + The ``box_dim`` of ``HorizontalBoxes`` is 4, which means the length of + the last dimension of the data should be 4. Two modes of box data are + supported in ``HorizontalBoxes``: + + - 'xyxy': Each row of data indicates (x1, y1, x2, y2), which are the + coordinates of the left-top and right-bottom points. + - 'cxcywh': Each row of data indicates (x, y, w, h), where (x, y) are the + coordinates of the box centers and (w, h) are the width and height. + + ``HorizontalBoxes`` only stores data in 'xyxy' mode. If the data is + in 'cxcywh' mode, users need to pass ``in_mode='cxcywh'`` and the code + will convert the 'cxcywh' data to 'xyxy' automatically. + + Args: + data (Tensor or np.ndarray or Sequence): The box data with shape of + (..., 4). + dtype (torch.dtype, Optional): data type of boxes. Defaults to None. + device (str or torch.device, Optional): device of boxes. + Default to None. + clone (bool): Whether clone ``boxes`` or not. Defaults to True. + in_mode (str, Optional): the mode of boxes. If it is 'cxcywh', the + `data` will be converted to 'xyxy' mode. Defaults to None. + """ + + box_dim: int = 4 + + def __init__(self, + data: Union[Tensor, np.ndarray], + dtype: torch.dtype = None, + device: DeviceType = None, + clone: bool = True, + in_mode: Optional[str] = None) -> None: + super().__init__(data=data, dtype=dtype, device=device, clone=clone) + if isinstance(in_mode, str): + if in_mode not in ('xyxy', 'cxcywh'): + raise ValueError(f'Got invalid mode {in_mode}.') + if in_mode == 'cxcywh': + self.tensor = self.cxcywh_to_xyxy(self.tensor) + + @staticmethod + def cxcywh_to_xyxy(boxes: Tensor) -> Tensor: + """Convert box coordinates from (cx, cy, w, h) to (x1, y1, x2, y2). + + Args: + boxes (Tensor): cxcywh boxes tensor with shape of (..., 4). + + Returns: + Tensor: xyxy boxes tensor with shape of (..., 4). + """ + ctr, wh = boxes.split((2, 2), dim=-1) + return torch.cat([(ctr - wh / 2), (ctr + wh / 2)], dim=-1) + + @staticmethod + def xyxy_to_cxcywh(boxes: Tensor) -> Tensor: + """Convert box coordinates from (x1, y1, x2, y2) to (cx, cy, w, h). + + Args: + boxes (Tensor): xyxy boxes tensor with shape of (..., 4). + + Returns: + Tensor: cxcywh boxes tensor with shape of (..., 4).
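+ + Example: + >>> # A quick sanity check (sketch): a 10x10 box at the origin. + >>> import torch + >>> HorizontalBoxes.xyxy_to_cxcywh(torch.tensor([[0., 0., 10., 10.]])) + tensor([[ 5., 5., 10., 10.]])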
+ """ + xy1, xy2 = boxes.split((2, 2), dim=-1) + return torch.cat([(xy2 + xy1) / 2, (xy2 - xy1)], dim=-1) + + @property + def cxcywh(self) -> Tensor: + """Return a tensor representing the cxcywh boxes.""" + return self.xyxy_to_cxcywh(self.tensor) + + @property + def centers(self) -> Tensor: + """Return a tensor representing the centers of boxes.""" + boxes = self.tensor + return (boxes[..., :2] + boxes[..., 2:]) / 2 + + @property + def areas(self) -> Tensor: + """Return a tensor representing the areas of boxes.""" + boxes = self.tensor + return (boxes[..., 2] - boxes[..., 0]) * ( + boxes[..., 3] - boxes[..., 1]) + + @property + def widths(self) -> Tensor: + """Return a tensor representing the widths of boxes.""" + boxes = self.tensor + return boxes[..., 2] - boxes[..., 0] + + @property + def heights(self) -> Tensor: + """Return a tensor representing the heights of boxes.""" + boxes = self.tensor + return boxes[..., 3] - boxes[..., 1] + + def flip_(self, + img_shape: Tuple[int, int], + direction: str = 'horizontal') -> None: + """Flip boxes horizontally or vertically in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + direction (str): Flip direction, options are "horizontal", + "vertical" and "diagonal". Defaults to "horizontal" + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + flipped = self.tensor + boxes = flipped.clone() + if direction == 'horizontal': + flipped[..., 0] = img_shape[1] - boxes[..., 2] + flipped[..., 2] = img_shape[1] - boxes[..., 0] + elif direction == 'vertical': + flipped[..., 1] = img_shape[0] - boxes[..., 3] + flipped[..., 3] = img_shape[0] - boxes[..., 1] + else: + flipped[..., 0] = img_shape[1] - boxes[..., 2] + flipped[..., 1] = img_shape[0] - boxes[..., 3] + flipped[..., 2] = img_shape[1] - boxes[..., 0] + flipped[..., 3] = img_shape[0] - boxes[..., 1] + + def translate_(self, distances: Tuple[float, float]) -> None: + """Translate boxes in-place. + + Args: + distances (Tuple[float, float]): translate distances. The first + is horizontal distance and the second is vertical distance. + """ + boxes = self.tensor + assert len(distances) == 2 + self.tensor = boxes + boxes.new_tensor(distances).repeat(2) + + def clip_(self, img_shape: Tuple[int, int]) -> None: + """Clip boxes according to the image shape in-place. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + """ + boxes = self.tensor + boxes[..., 0::2] = boxes[..., 0::2].clamp(0, img_shape[1]) + boxes[..., 1::2] = boxes[..., 1::2].clamp(0, img_shape[0]) + + def rotate_(self, center: Tuple[float, float], angle: float) -> None: + """Rotate all boxes in-place. + + Args: + center (Tuple[float, float]): Rotation origin. + angle (float): Rotation angle represented in degrees. Positive + values mean clockwise rotation. + """ + boxes = self.tensor + rotation_matrix = boxes.new_tensor( + cv2.getRotationMatrix2D(center, -angle, 1)) + + corners = self.hbox2corner(boxes) + corners = torch.cat( + [corners, corners.new_ones(*corners.shape[:-1], 1)], dim=-1) + corners_T = torch.transpose(corners, -1, -2) + corners_T = torch.matmul(rotation_matrix, corners_T) + corners = torch.transpose(corners_T, -1, -2) + self.tensor = self.corner2hbox(corners) + + def project_(self, homography_matrix: Union[Tensor, np.ndarray]) -> None: + """Geometric transformat boxes in-place. + + Args: + homography_matrix (Tensor or np.ndarray]): + Shape (3, 3) for geometric transformation. 
+ """ + boxes = self.tensor + if isinstance(homography_matrix, np.ndarray): + homography_matrix = boxes.new_tensor(homography_matrix) + corners = self.hbox2corner(boxes) + corners = torch.cat( + [corners, corners.new_ones(*corners.shape[:-1], 1)], dim=-1) + corners_T = torch.transpose(corners, -1, -2) + corners_T = torch.matmul(homography_matrix, corners_T) + corners = torch.transpose(corners_T, -1, -2) + # Convert to homogeneous coordinates by normalization + corners = corners[..., :2] / corners[..., 2:3] + self.tensor = self.corner2hbox(corners) + + @staticmethod + def hbox2corner(boxes: Tensor) -> Tensor: + """Convert box coordinates from (x1, y1, x2, y2) to corners ((x1, y1), + (x2, y1), (x1, y2), (x2, y2)). + + Args: + boxes (Tensor): Horizontal box tensor with shape of (..., 4). + + Returns: + Tensor: Corner tensor with shape of (..., 4, 2). + """ + x1, y1, x2, y2 = torch.split(boxes, 1, dim=-1) + corners = torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=-1) + return corners.reshape(*corners.shape[:-1], 4, 2) + + @staticmethod + def corner2hbox(corners: Tensor) -> Tensor: + """Convert box coordinates from corners ((x1, y1), (x2, y1), (x1, y2), + (x2, y2)) to (x1, y1, x2, y2). + + Args: + corners (Tensor): Corner tensor with shape of (..., 4, 2). + + Returns: + Tensor: Horizontal box tensor with shape of (..., 4). + """ + if corners.numel() == 0: + return corners.new_zeros((0, 4)) + min_xy = corners.min(dim=-2)[0] + max_xy = corners.max(dim=-2)[0] + return torch.cat([min_xy, max_xy], dim=-1) + + def rescale_(self, scale_factor: Tuple[float, float]) -> None: + """Rescale boxes w.r.t. rescale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + """ + boxes = self.tensor + assert len(scale_factor) == 2 + scale_factor = boxes.new_tensor(scale_factor).repeat(2) + self.tensor = boxes * scale_factor + + def resize_(self, scale_factor: Tuple[float, float]) -> None: + """Resize the box width and height w.r.t scale_factor in-place. + + Note: + Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes + w.r.t ``scale_facotr``. The difference is that ``resize_`` only + changes the width and the height of boxes, but ``rescale_`` also + rescales the box centers simultaneously. + + Args: + scale_factor (Tuple[float, float]): factors for scaling box + shapes. The length should be 2. + """ + boxes = self.tensor + assert len(scale_factor) == 2 + ctrs = (boxes[..., 2:] + boxes[..., :2]) / 2 + wh = boxes[..., 2:] - boxes[..., :2] + scale_factor = boxes.new_tensor(scale_factor) + wh = wh * scale_factor + xy1 = ctrs - 0.5 * wh + xy2 = ctrs + 0.5 * wh + self.tensor = torch.cat([xy1, xy2], dim=-1) + + def is_inside(self, + img_shape: Tuple[int, int], + all_inside: bool = False, + allowed_border: int = 0) -> BoolTensor: + """Find boxes inside the image. + + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + all_inside (bool): Whether the boxes are all inside the image or + part inside the image. Defaults to False. + allowed_border (int): Boxes that extend beyond the image shape + boundary by more than ``allowed_border`` are considered + "outside" Defaults to 0. + Returns: + BoolTensor: A BoolTensor indicating whether the box is inside + the image. 
Assuming the original boxes have shape (m, n, 4), + the output has shape (m, n). + """ + img_h, img_w = img_shape + boxes = self.tensor + if all_inside: + return (boxes[:, 0] >= -allowed_border) & \ + (boxes[:, 1] >= -allowed_border) & \ + (boxes[:, 2] < img_w + allowed_border) & \ + (boxes[:, 3] < img_h + allowed_border) + else: + return (boxes[..., 0] < img_w + allowed_border) & \ + (boxes[..., 1] < img_h + allowed_border) & \ + (boxes[..., 2] > -allowed_border) & \ + (boxes[..., 3] > -allowed_border) + + def find_inside_points(self, + points: Tensor, + is_aligned: bool = False) -> BoolTensor: + """Find inside box points. Boxes dimension must be 2. + + Args: + points (Tensor): Points coordinates. Has shape of (m, 2). + is_aligned (bool): Whether ``points`` has been aligned with boxes + or not. If True, the length of boxes and ``points`` should be + the same. Defaults to False. + + Returns: + BoolTensor: A BoolTensor indicating whether a point is inside + boxes. Assuming the boxes has shape of (n, 4), if ``is_aligned`` + is False. The index has shape of (m, n). If ``is_aligned`` is + True, m should be equal to n and the index has shape of (m, ). + """ + boxes = self.tensor + assert boxes.dim() == 2, 'boxes dimension must be 2.' + + if not is_aligned: + boxes = boxes[None, :, :] + points = points[:, None, :] + else: + assert boxes.size(0) == points.size(0) + + x_min, y_min, x_max, y_max = boxes.unbind(dim=-1) + return (points[..., 0] >= x_min) & (points[..., 0] <= x_max) & \ + (points[..., 1] >= y_min) & (points[..., 1] <= y_max) + + def create_masks(self, img_shape: Tuple[int, int]) -> BitmapMasks: + """ + Args: + img_shape (Tuple[int, int]): A tuple of image height and width. + + Returns: + :obj:`BitmapMasks`: Converted masks + """ + img_h, img_w = img_shape + boxes = self.tensor + + xmin, ymin = boxes[:, 0:1], boxes[:, 1:2] + xmax, ymax = boxes[:, 2:3], boxes[:, 3:4] + gt_masks = np.zeros((len(boxes), img_h, img_w), dtype=np.uint8) + for i in range(len(boxes)): + gt_masks[i, + int(ymin[i]):int(ymax[i]), + int(xmin[i]):int(xmax[i])] = 1 + return BitmapMasks(gt_masks, img_h, img_w) + + @staticmethod + def overlaps(boxes1: BaseBoxes, + boxes2: BaseBoxes, + mode: str = 'iou', + is_aligned: bool = False, + eps: float = 1e-6) -> Tensor: + """Calculate overlap between two set of boxes with their types + converted to ``HorizontalBoxes``. + + Args: + boxes1 (:obj:`BaseBoxes`): BaseBoxes with shape of (m, box_dim) + or empty. + boxes2 (:obj:`BaseBoxes`): BaseBoxes with shape of (n, box_dim) + or empty. + mode (str): "iou" (intersection over union), "iof" (intersection + over foreground). Defaults to "iou". + is_aligned (bool): If True, then m and n must be equal. Defaults + to False. + eps (float): A value added to the denominator for numerical + stability. Defaults to 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + boxes1 = boxes1.convert_to('hbox') + boxes2 = boxes2.convert_to('hbox') + return bbox_overlaps( + boxes1.tensor, + boxes2.tensor, + mode=mode, + is_aligned=is_aligned, + eps=eps) + + @staticmethod + def from_instance_masks(masks: MaskType) -> 'HorizontalBoxes': + """Create horizontal boxes from instance masks. + + Args: + masks (:obj:`BitmapMasks` or :obj:`PolygonMasks`): BitmapMasks or + PolygonMasks instance with length of n. + + Returns: + :obj:`HorizontalBoxes`: Converted boxes with shape of (n, 4). 
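+ + Example: + >>> # Sketch: one fully-set 2x3 bitmap mask yields the box + >>> # (0, 0, 3, 2). + >>> import numpy as np + >>> from mmdet.structures.mask import BitmapMasks + >>> masks = BitmapMasks(np.ones((1, 2, 3), dtype=np.uint8), 2, 3) + >>> HorizontalBoxes.from_instance_masks(masks).tensor + tensor([[0., 0., 3., 2.]])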
+ """ + num_masks = len(masks) + boxes = np.zeros((num_masks, 4), dtype=np.float32) + if isinstance(masks, BitmapMasks): + x_any = masks.masks.any(axis=1) + y_any = masks.masks.any(axis=2) + for idx in range(num_masks): + x = np.where(x_any[idx, :])[0] + y = np.where(y_any[idx, :])[0] + if len(x) > 0 and len(y) > 0: + # use +1 for x_max and y_max so that the right and bottom + # boundary of instance masks are fully included by the box + boxes[idx, :] = np.array( + [x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=np.float32) + elif isinstance(masks, PolygonMasks): + for idx, poly_per_obj in enumerate(masks.masks): + # simply use a number that is big enough for comparison with + # coordinates + xy_min = np.array([masks.width * 2, masks.height * 2], + dtype=np.float32) + xy_max = np.zeros(2, dtype=np.float32) + for p in poly_per_obj: + xy = np.array(p).reshape(-1, 2).astype(np.float32) + xy_min = np.minimum(xy_min, np.min(xy, axis=0)) + xy_max = np.maximum(xy_max, np.max(xy, axis=0)) + boxes[idx, :2] = xy_min + boxes[idx, 2:] = xy_max + else: + raise TypeError( + '`masks` must be `BitmapMasks` or `PolygonMasks`, ' + f'but got {type(masks)}.') + return HorizontalBoxes(boxes) diff --git a/head_extractor/build/lib/mmdet/structures/bbox/transforms.py b/head_extractor/build/lib/mmdet/structures/bbox/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..287e6aa6fcaeaf09a8a2838a04a97157cd02a00c --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/bbox/transforms.py @@ -0,0 +1,498 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +from torch import Tensor + +from mmdet.structures.bbox import BaseBoxes + + +def find_inside_bboxes(bboxes: Tensor, img_h: int, img_w: int) -> Tensor: + """Find bboxes as long as a part of bboxes is inside the image. + + Args: + bboxes (Tensor): Shape (N, 4). + img_h (int): Image height. + img_w (int): Image width. + + Returns: + Tensor: Index of the remaining bboxes. + """ + inside_inds = (bboxes[:, 0] < img_w) & (bboxes[:, 2] > 0) \ + & (bboxes[:, 1] < img_h) & (bboxes[:, 3] > 0) + return inside_inds + + +def bbox_flip(bboxes: Tensor, + img_shape: Tuple[int], + direction: str = 'horizontal') -> Tensor: + """Flip bboxes horizontally or vertically. + + Args: + bboxes (Tensor): Shape (..., 4*k) + img_shape (Tuple[int]): Image shape. + direction (str): Flip direction, options are "horizontal", "vertical", + "diagonal". Default: "horizontal" + + Returns: + Tensor: Flipped bboxes. 
+ """ + assert bboxes.shape[-1] % 4 == 0 + assert direction in ['horizontal', 'vertical', 'diagonal'] + flipped = bboxes.clone() + if direction == 'horizontal': + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + elif direction == 'vertical': + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + else: + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + return flipped + + +def bbox_mapping(bboxes: Tensor, + img_shape: Tuple[int], + scale_factor: Union[float, Tuple[float]], + flip: bool, + flip_direction: str = 'horizontal') -> Tensor: + """Map bboxes from the original image scale to testing scale.""" + new_bboxes = bboxes * bboxes.new_tensor(scale_factor) + if flip: + new_bboxes = bbox_flip(new_bboxes, img_shape, flip_direction) + return new_bboxes + + +def bbox_mapping_back(bboxes: Tensor, + img_shape: Tuple[int], + scale_factor: Union[float, Tuple[float]], + flip: bool, + flip_direction: str = 'horizontal') -> Tensor: + """Map bboxes from testing scale to original image scale.""" + new_bboxes = bbox_flip(bboxes, img_shape, + flip_direction) if flip else bboxes + new_bboxes = new_bboxes.view(-1, 4) / new_bboxes.new_tensor(scale_factor) + return new_bboxes.view(bboxes.shape) + + +def bbox2roi(bbox_list: List[Union[Tensor, BaseBoxes]]) -> Tensor: + """Convert a list of bboxes to roi format. + + Args: + bbox_list (List[Union[Tensor, :obj:`BaseBoxes`]): a list of bboxes + corresponding to a batch of images. + + Returns: + Tensor: shape (n, box_dim + 1), where ``box_dim`` depends on the + different box types. For example, If the box type in ``bbox_list`` + is HorizontalBoxes, the output shape is (n, 5). Each row of data + indicates [batch_ind, x1, y1, x2, y2]. + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + bboxes = get_box_tensor(bboxes) + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes], dim=-1) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def roi2bbox(rois: Tensor) -> List[Tensor]: + """Convert rois to bounding box format. + + Args: + rois (Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + List[Tensor]: Converted boxes of corresponding rois. + """ + bbox_list = [] + img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) + for img_id in img_ids: + inds = (rois[:, 0] == img_id.item()) + bbox = rois[inds, 1:] + bbox_list.append(bbox) + return bbox_list + + +# TODO remove later +def bbox2result(bboxes: Union[Tensor, np.ndarray], labels: Union[Tensor, + np.ndarray], + num_classes: int) -> List[np.ndarray]: + """Convert detection results to a list of numpy arrays. 
+ + Args: + bboxes (Tensor | np.ndarray): shape (n, 5) + labels (Tensor | np.ndarray): shape (n, ) + num_classes (int): class number, including background class + + Returns: + List(np.ndarray]): bbox results of each class + """ + if bboxes.shape[0] == 0: + return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)] + else: + if isinstance(bboxes, torch.Tensor): + bboxes = bboxes.detach().cpu().numpy() + labels = labels.detach().cpu().numpy() + return [bboxes[labels == i, :] for i in range(num_classes)] + + +def distance2bbox( + points: Tensor, + distance: Tensor, + max_shape: Optional[Union[Sequence[int], Tensor, + Sequence[Sequence[int]]]] = None +) -> Tensor: + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) or (N, 4) + max_shape (Union[Sequence[int], Tensor, Sequence[Sequence[int]]], + optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + + Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + + bboxes = torch.stack([x1, y1, x2, y2], -1) + + if max_shape is not None: + if bboxes.dim() == 2 and not torch.onnx.is_in_onnx_export(): + # speed up + bboxes[:, 0::2].clamp_(min=0, max=max_shape[1]) + bboxes[:, 1::2].clamp_(min=0, max=max_shape[0]) + return bboxes + + # clip bboxes with dynamic `min` and `max` for onnx + if torch.onnx.is_in_onnx_export(): + # TODO: delete + from mmdet.core.export import dynamic_clip_for_onnx + x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return bboxes + if not isinstance(max_shape, torch.Tensor): + max_shape = x1.new_tensor(max_shape) + max_shape = max_shape[..., :2].type_as(x1) + if max_shape.ndim == 2: + assert bboxes.ndim == 3 + assert max_shape.size(0) == bboxes.size(0) + + min_xy = x1.new_tensor(0) + max_xy = torch.cat([max_shape, max_shape], + dim=-1).flip(-1).unsqueeze(-2) + bboxes = torch.where(bboxes < min_xy, min_xy, bboxes) + bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) + + return bboxes + + +def bbox2distance(points: Tensor, + bbox: Tensor, + max_dis: Optional[float] = None, + eps: float = 0.1) -> Tensor: + """Decode bounding box based on distances. + + Args: + points (Tensor): Shape (n, 2) or (b, n, 2), [x, y]. + bbox (Tensor): Shape (n, 4) or (b, n, 4), "xyxy" format + max_dis (float, optional): Upper bound of the distance. + eps (float): a small value to ensure target < max_dis, instead <= + + Returns: + Tensor: Decoded distances. + """ + left = points[..., 0] - bbox[..., 0] + top = points[..., 1] - bbox[..., 1] + right = bbox[..., 2] - points[..., 0] + bottom = bbox[..., 3] - points[..., 1] + if max_dis is not None: + left = left.clamp(min=0, max=max_dis - eps) + top = top.clamp(min=0, max=max_dis - eps) + right = right.clamp(min=0, max=max_dis - eps) + bottom = bottom.clamp(min=0, max=max_dis - eps) + return torch.stack([left, top, right, bottom], -1) + + +def bbox_rescale(bboxes: Tensor, scale_factor: float = 1.0) -> Tensor: + """Rescale bounding box w.r.t. scale_factor. 
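+
+    Boxes are rescaled about their own centers: widths and heights are
+    multiplied by ``scale_factor`` while each center point stays fixed.
+    RoIs of shape (n, 5) keep their batch-index column unchanged.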
+ + Args: + bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois + scale_factor (float): rescale factor + + Returns: + Tensor: Rescaled bboxes. + """ + if bboxes.size(1) == 5: + bboxes_ = bboxes[:, 1:] + inds_ = bboxes[:, 0] + else: + bboxes_ = bboxes + cx = (bboxes_[:, 0] + bboxes_[:, 2]) * 0.5 + cy = (bboxes_[:, 1] + bboxes_[:, 3]) * 0.5 + w = bboxes_[:, 2] - bboxes_[:, 0] + h = bboxes_[:, 3] - bboxes_[:, 1] + w = w * scale_factor + h = h * scale_factor + x1 = cx - 0.5 * w + x2 = cx + 0.5 * w + y1 = cy - 0.5 * h + y2 = cy + 0.5 * h + if bboxes.size(1) == 5: + rescaled_bboxes = torch.stack([inds_, x1, y1, x2, y2], dim=-1) + else: + rescaled_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return rescaled_bboxes + + +def bbox_cxcywh_to_xyxy(bbox: Tensor) -> Tensor: + """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)] + return torch.cat(bbox_new, dim=-1) + + +def bbox_xyxy_to_cxcywh(bbox: Tensor) -> Tensor: + """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, w, h). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)] + return torch.cat(bbox_new, dim=-1) + + +def bbox2corner(bboxes: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from (x1, y1, x2, y2) to corners ((x1, y1), + (x2, y1), (x1, y2), (x2, y2)). + + Args: + bboxes (Tensor): Shape (n, 4) for bboxes. + Returns: + Tensor: Shape (n*4, 2) for corners. + """ + x1, y1, x2, y2 = torch.split(bboxes, 1, dim=1) + return torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=1).reshape(-1, 2) + + +def corner2bbox(corners: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from corners ((x1, y1), (x2, y1), (x1, y2), + (x2, y2)) to (x1, y1, x2, y2). + + Args: + corners (Tensor): Shape (n*4, 2) for corners. + Returns: + Tensor: Shape (n, 4) for bboxes. + """ + corners = corners.reshape(-1, 4, 2) + min_xy = corners.min(dim=1)[0] + max_xy = corners.max(dim=1)[0] + return torch.cat([min_xy, max_xy], dim=1) + + +def bbox_project( + bboxes: Union[torch.Tensor, np.ndarray], + homography_matrix: Union[torch.Tensor, np.ndarray], + img_shape: Optional[Tuple[int, int]] = None +) -> Union[torch.Tensor, np.ndarray]: + """Geometric transformation for bbox. + + Args: + bboxes (Union[torch.Tensor, np.ndarray]): Shape (n, 4) for bboxes. + homography_matrix (Union[torch.Tensor, np.ndarray]): + Shape (3, 3) for geometric transformation. + img_shape (Tuple[int, int], optional): Image shape. Defaults to None. + Returns: + Union[torch.Tensor, np.ndarray]: Converted bboxes. 
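+
+    Example (a minimal sketch; an identity homography leaves boxes
+    unchanged):
+        >>> import torch
+        >>> bboxes = torch.tensor([[1., 2., 3., 4.]])
+        >>> out = bbox_project(bboxes, torch.eye(3))
+        >>> assert torch.allclose(out, bboxes)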
+ """ + bboxes_type = type(bboxes) + if bboxes_type is np.ndarray: + bboxes = torch.from_numpy(bboxes) + if isinstance(homography_matrix, np.ndarray): + homography_matrix = torch.from_numpy(homography_matrix) + corners = bbox2corner(bboxes) + corners = torch.cat( + [corners, corners.new_ones(corners.shape[0], 1)], dim=1) + corners = torch.matmul(homography_matrix, corners.t()).t() + # Convert to homogeneous coordinates by normalization + corners = corners[:, :2] / corners[:, 2:3] + bboxes = corner2bbox(corners) + if img_shape is not None: + bboxes[:, 0::2] = bboxes[:, 0::2].clamp(0, img_shape[1]) + bboxes[:, 1::2] = bboxes[:, 1::2].clamp(0, img_shape[0]) + if bboxes_type is np.ndarray: + bboxes = bboxes.numpy() + return bboxes + + +def cat_boxes(data_list: List[Union[Tensor, BaseBoxes]], + dim: int = 0) -> Union[Tensor, BaseBoxes]: + """Concatenate boxes with type of tensor or box type. + + Args: + data_list (List[Union[Tensor, :obj:`BaseBoxes`]]): A list of tensors + or box types need to be concatenated. + dim (int): The dimension over which the box are concatenated. + Defaults to 0. + + Returns: + Union[Tensor, :obj`BaseBoxes`]: Concatenated results. + """ + if data_list and isinstance(data_list[0], BaseBoxes): + return data_list[0].cat(data_list, dim=dim) + else: + return torch.cat(data_list, dim=dim) + + +def stack_boxes(data_list: List[Union[Tensor, BaseBoxes]], + dim: int = 0) -> Union[Tensor, BaseBoxes]: + """Stack boxes with type of tensor or box type. + + Args: + data_list (List[Union[Tensor, :obj:`BaseBoxes`]]): A list of tensors + or box types need to be stacked. + dim (int): The dimension over which the box are stacked. + Defaults to 0. + + Returns: + Union[Tensor, :obj`BaseBoxes`]: Stacked results. + """ + if data_list and isinstance(data_list[0], BaseBoxes): + return data_list[0].stack(data_list, dim=dim) + else: + return torch.stack(data_list, dim=dim) + + +def scale_boxes(boxes: Union[Tensor, BaseBoxes], + scale_factor: Tuple[float, float]) -> Union[Tensor, BaseBoxes]: + """Scale boxes with type of tensor or box type. + + Args: + boxes (Tensor or :obj:`BaseBoxes`): boxes need to be scaled. Its type + can be a tensor or a box type. + scale_factor (Tuple[float, float]): factors for scaling boxes. + The length should be 2. + + Returns: + Union[Tensor, :obj:`BaseBoxes`]: Scaled boxes. + """ + if isinstance(boxes, BaseBoxes): + boxes.rescale_(scale_factor) + return boxes + else: + # Tensor boxes will be treated as horizontal boxes + repeat_num = int(boxes.size(-1) / 2) + scale_factor = boxes.new_tensor(scale_factor).repeat((1, repeat_num)) + return boxes * scale_factor + + +def get_box_wh(boxes: Union[Tensor, BaseBoxes]) -> Tuple[Tensor, Tensor]: + """Get the width and height of boxes with type of tensor or box type. + + Args: + boxes (Tensor or :obj:`BaseBoxes`): boxes with type of tensor + or box type. + + Returns: + Tuple[Tensor, Tensor]: the width and height of boxes. + """ + if isinstance(boxes, BaseBoxes): + w = boxes.widths + h = boxes.heights + else: + # Tensor boxes will be treated as horizontal boxes by defaults + w = boxes[:, 2] - boxes[:, 0] + h = boxes[:, 3] - boxes[:, 1] + return w, h + + +def get_box_tensor(boxes: Union[Tensor, BaseBoxes]) -> Tensor: + """Get tensor data from box type boxes. + + Args: + boxes (Tensor or BaseBoxes): boxes with type of tensor or box type. + If its type is a tensor, the boxes will be directly returned. + If its type is a box type, the `boxes.tensor` will be returned. + + Returns: + Tensor: boxes tensor. 
+ """ + if isinstance(boxes, BaseBoxes): + boxes = boxes.tensor + return boxes + + +def empty_box_as(boxes: Union[Tensor, BaseBoxes]) -> Union[Tensor, BaseBoxes]: + """Generate empty box according to input ``boxes` type and device. + + Args: + boxes (Tensor or :obj:`BaseBoxes`): boxes with type of tensor + or box type. + + Returns: + Union[Tensor, BaseBoxes]: Generated empty box. + """ + if isinstance(boxes, BaseBoxes): + return boxes.empty_boxes() + else: + # Tensor boxes will be treated as horizontal boxes by defaults + return boxes.new_zeros(0, 4) + + +def bbox_xyxy_to_cxcyah(bboxes: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, ratio, h). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + cx = (bboxes[:, 2] + bboxes[:, 0]) / 2 + cy = (bboxes[:, 3] + bboxes[:, 1]) / 2 + w = bboxes[:, 2] - bboxes[:, 0] + h = bboxes[:, 3] - bboxes[:, 1] + xyah = torch.stack([cx, cy, w / h, h], -1) + return xyah + + +def bbox_cxcyah_to_xyxy(bboxes: torch.Tensor) -> torch.Tensor: + """Convert bbox coordinates from (cx, cy, ratio, h) to (x1, y1, x2, y2). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + Returns: + Tensor: Converted bboxes. + """ + cx, cy, ratio, h = bboxes.split((1, 1, 1, 1), dim=-1) + w = ratio * h + x1y1x2y2 = [cx - w / 2.0, cy - h / 2.0, cx + w / 2.0, cy + h / 2.0] + return torch.cat(x1y1x2y2, dim=-1) diff --git a/head_extractor/build/lib/mmdet/structures/det_data_sample.py b/head_extractor/build/lib/mmdet/structures/det_data_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..37dd74725ed2ff5eb8a088c9d23a9ac5469b07a3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/det_data_sample.py @@ -0,0 +1,237 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +from mmengine.structures import BaseDataElement, InstanceData, PixelData + + +class DetDataSample(BaseDataElement): + """A data structure interface of MMDetection. They are used as interfaces + between different components. + + The attributes in ``DetDataSample`` are divided into several parts: + + - ``proposals``(InstanceData): Region proposals used in two-stage + detectors. + - ``gt_instances``(InstanceData): Ground truth of instance annotations. + - ``pred_instances``(InstanceData): Instances of detection predictions. + - ``pred_track_instances``(InstanceData): Instances of tracking + predictions. + - ``ignored_instances``(InstanceData): Instances to be ignored during + training/testing. + - ``gt_panoptic_seg``(PixelData): Ground truth of panoptic + segmentation. + - ``pred_panoptic_seg``(PixelData): Prediction of panoptic + segmentation. + - ``gt_sem_seg``(PixelData): Ground truth of semantic segmentation. + - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation. + + Examples: + >>> import torch + >>> import numpy as np + >>> from mmengine.structures import InstanceData + >>> from mmdet.structures import DetDataSample + + >>> data_sample = DetDataSample() + >>> img_meta = dict(img_shape=(800, 1196), + ... 
pad_shape=(800, 1216)) + >>> gt_instances = InstanceData(metainfo=img_meta) + >>> gt_instances.bboxes = torch.rand((5, 4)) + >>> gt_instances.labels = torch.rand((5,)) + >>> data_sample.gt_instances = gt_instances + >>> assert 'img_shape' in data_sample.gt_instances.metainfo_keys() + >>> len(data_sample.gt_instances) + 5 + >>> print(data_sample) + + ) at 0x7f21fb1b9880> + >>> pred_instances = InstanceData(metainfo=img_meta) + >>> pred_instances.bboxes = torch.rand((5, 4)) + >>> pred_instances.scores = torch.rand((5,)) + >>> data_sample = DetDataSample(pred_instances=pred_instances) + >>> assert 'pred_instances' in data_sample + + >>> pred_track_instances = InstanceData(metainfo=img_meta) + >>> pred_track_instances.bboxes = torch.rand((5, 4)) + >>> pred_track_instances.scores = torch.rand((5,)) + >>> data_sample = DetDataSample( + ... pred_track_instances=pred_track_instances) + >>> assert 'pred_track_instances' in data_sample + + >>> data_sample = DetDataSample() + >>> gt_instances_data = dict( + ... bboxes=torch.rand(2, 4), + ... labels=torch.rand(2), + ... masks=np.random.rand(2, 2, 2)) + >>> gt_instances = InstanceData(**gt_instances_data) + >>> data_sample.gt_instances = gt_instances + >>> assert 'gt_instances' in data_sample + >>> assert 'masks' in data_sample.gt_instances + + >>> data_sample = DetDataSample() + >>> gt_panoptic_seg_data = dict(panoptic_seg=torch.rand(2, 4)) + >>> gt_panoptic_seg = PixelData(**gt_panoptic_seg_data) + >>> data_sample.gt_panoptic_seg = gt_panoptic_seg + >>> print(data_sample) + + gt_panoptic_seg: + ) at 0x7f66c2bb7280> + >>> data_sample = DetDataSample() + >>> gt_segm_seg_data = dict(segm_seg=torch.rand(2, 2, 2)) + >>> gt_segm_seg = PixelData(**gt_segm_seg_data) + >>> data_sample.gt_segm_seg = gt_segm_seg + >>> assert 'gt_segm_seg' in data_sample + >>> assert 'segm_seg' in data_sample.gt_segm_seg + """ + + @property + def proposals(self) -> InstanceData: + return self._proposals + + @proposals.setter + def proposals(self, value: InstanceData): + self.set_field(value, '_proposals', dtype=InstanceData) + + @proposals.deleter + def proposals(self): + del self._proposals + + @property + def gt_instances(self) -> InstanceData: + return self._gt_instances + + @gt_instances.setter + def gt_instances(self, value: InstanceData): + self.set_field(value, '_gt_instances', dtype=InstanceData) + + @gt_instances.deleter + def gt_instances(self): + del self._gt_instances + + @property + def pred_instances(self) -> InstanceData: + return self._pred_instances + + @pred_instances.setter + def pred_instances(self, value: InstanceData): + self.set_field(value, '_pred_instances', dtype=InstanceData) + + @pred_instances.deleter + def pred_instances(self): + del self._pred_instances + + # directly add ``pred_track_instances`` in ``DetDataSample`` + # so that the ``TrackDataSample`` does not bother to access the + # instance-level information. 
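+    # Each annotated field below follows the same pattern: the property
+    # reads a private attribute, the setter routes through ``set_field``
+    # to enforce the expected value type, and the deleter removes the
+    # private attribute.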
+ @property + def pred_track_instances(self) -> InstanceData: + return self._pred_track_instances + + @pred_track_instances.setter + def pred_track_instances(self, value: InstanceData): + self.set_field(value, '_pred_track_instances', dtype=InstanceData) + + @pred_track_instances.deleter + def pred_track_instances(self): + del self._pred_track_instances + + @property + def ignored_instances(self) -> InstanceData: + return self._ignored_instances + + @ignored_instances.setter + def ignored_instances(self, value: InstanceData): + self.set_field(value, '_ignored_instances', dtype=InstanceData) + + @ignored_instances.deleter + def ignored_instances(self): + del self._ignored_instances + + @property + def gt_panoptic_seg(self) -> PixelData: + return self._gt_panoptic_seg + + @gt_panoptic_seg.setter + def gt_panoptic_seg(self, value: PixelData): + self.set_field(value, '_gt_panoptic_seg', dtype=PixelData) + + @gt_panoptic_seg.deleter + def gt_panoptic_seg(self): + del self._gt_panoptic_seg + + @property + def pred_panoptic_seg(self) -> PixelData: + return self._pred_panoptic_seg + + @pred_panoptic_seg.setter + def pred_panoptic_seg(self, value: PixelData): + self.set_field(value, '_pred_panoptic_seg', dtype=PixelData) + + @pred_panoptic_seg.deleter + def pred_panoptic_seg(self): + del self._pred_panoptic_seg + + @property + def gt_sem_seg(self) -> PixelData: + return self._gt_sem_seg + + @gt_sem_seg.setter + def gt_sem_seg(self, value: PixelData): + self.set_field(value, '_gt_sem_seg', dtype=PixelData) + + @gt_sem_seg.deleter + def gt_sem_seg(self): + del self._gt_sem_seg + + @property + def pred_sem_seg(self) -> PixelData: + return self._pred_sem_seg + + @pred_sem_seg.setter + def pred_sem_seg(self, value: PixelData): + self.set_field(value, '_pred_sem_seg', dtype=PixelData) + + @pred_sem_seg.deleter + def pred_sem_seg(self): + del self._pred_sem_seg + + +SampleList = List[DetDataSample] +OptSampleList = Optional[SampleList] diff --git a/head_extractor/build/lib/mmdet/structures/mask/__init__.py b/head_extractor/build/lib/mmdet/structures/mask/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f78394701df1b493259c4c23a79aea5c5cb8be95 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/mask/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .mask_target import mask_target +from .structures import (BaseInstanceMasks, BitmapMasks, PolygonMasks, + bitmap_to_polygon, polygon_to_bitmap) +from .utils import encode_mask_results, mask2bbox, split_combined_polys + +__all__ = [ + 'split_combined_polys', 'mask_target', 'BaseInstanceMasks', 'BitmapMasks', + 'PolygonMasks', 'encode_mask_results', 'mask2bbox', 'polygon_to_bitmap', + 'bitmap_to_polygon' +] diff --git a/head_extractor/build/lib/mmdet/structures/mask/mask_target.py b/head_extractor/build/lib/mmdet/structures/mask/mask_target.py new file mode 100644 index 0000000000000000000000000000000000000000..b2fc5f1878300446b114c9f57c6a885fea8c927c --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/mask/mask_target.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from torch.nn.modules.utils import _pair + + +def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, + cfg): + """Compute mask target for positive proposals in multiple images. + + Args: + pos_proposals_list (list[Tensor]): Positive proposals in multiple + images, each has shape (num_pos, 4). 
+ pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each + positive proposals, each has shape (num_pos,). + gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of + each image. + cfg (dict): Config dict that specifies the mask size. + + Returns: + Tensor: Mask target of each image, has shape (num_pos, w, h). + + Example: + >>> from mmengine.config import Config + >>> import mmdet + >>> from mmdet.data_elements.mask import BitmapMasks + >>> from mmdet.data_elements.mask.mask_target import * + >>> H, W = 17, 18 + >>> cfg = Config({'mask_size': (13, 14)}) + >>> rng = np.random.RandomState(0) + >>> # Positive proposals (tl_x, tl_y, br_x, br_y) for each image + >>> pos_proposals_list = [ + >>> torch.Tensor([ + >>> [ 7.2425, 5.5929, 13.9414, 14.9541], + >>> [ 7.3241, 3.6170, 16.3850, 15.3102], + >>> ]), + >>> torch.Tensor([ + >>> [ 4.8448, 6.4010, 7.0314, 9.7681], + >>> [ 5.9790, 2.6989, 7.4416, 4.8580], + >>> [ 0.0000, 0.0000, 0.1398, 9.8232], + >>> ]), + >>> ] + >>> # Corresponding class index for each proposal for each image + >>> pos_assigned_gt_inds_list = [ + >>> torch.LongTensor([7, 0]), + >>> torch.LongTensor([5, 4, 1]), + >>> ] + >>> # Ground truth mask for each true object for each image + >>> gt_masks_list = [ + >>> BitmapMasks(rng.rand(8, H, W), height=H, width=W), + >>> BitmapMasks(rng.rand(6, H, W), height=H, width=W), + >>> ] + >>> mask_targets = mask_target( + >>> pos_proposals_list, pos_assigned_gt_inds_list, + >>> gt_masks_list, cfg) + >>> assert mask_targets.shape == (5,) + cfg['mask_size'] + """ + cfg_list = [cfg for _ in range(len(pos_proposals_list))] + mask_targets = map(mask_target_single, pos_proposals_list, + pos_assigned_gt_inds_list, gt_masks_list, cfg_list) + mask_targets = list(mask_targets) + if len(mask_targets) > 0: + mask_targets = torch.cat(mask_targets) + return mask_targets + + +def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg): + """Compute mask target for each positive proposal in the image. + + Args: + pos_proposals (Tensor): Positive proposals. + pos_assigned_gt_inds (Tensor): Assigned GT inds of positive proposals. + gt_masks (:obj:`BaseInstanceMasks`): GT masks in the format of Bitmap + or Polygon. + cfg (dict): Config dict that indicate the mask size. + + Returns: + Tensor: Mask target of each positive proposals in the image. 
+ + Example: + >>> from mmengine.config import Config + >>> import mmdet + >>> from mmdet.data_elements.mask import BitmapMasks + >>> from mmdet.data_elements.mask.mask_target import * # NOQA + >>> H, W = 32, 32 + >>> cfg = Config({'mask_size': (7, 11)}) + >>> rng = np.random.RandomState(0) + >>> # Masks for each ground truth box (relative to the image) + >>> gt_masks_data = rng.rand(3, H, W) + >>> gt_masks = BitmapMasks(gt_masks_data, height=H, width=W) + >>> # Predicted positive boxes in one image + >>> pos_proposals = torch.FloatTensor([ + >>> [ 16.2, 5.5, 19.9, 20.9], + >>> [ 17.3, 13.6, 19.3, 19.3], + >>> [ 14.8, 16.4, 17.0, 23.7], + >>> [ 0.0, 0.0, 16.0, 16.0], + >>> [ 4.0, 0.0, 20.0, 16.0], + >>> ]) + >>> # For each predicted proposal, its assignment to a gt mask + >>> pos_assigned_gt_inds = torch.LongTensor([0, 1, 2, 1, 1]) + >>> mask_targets = mask_target_single( + >>> pos_proposals, pos_assigned_gt_inds, gt_masks, cfg) + >>> assert mask_targets.shape == (5,) + cfg['mask_size'] + """ + device = pos_proposals.device + mask_size = _pair(cfg.mask_size) + binarize = not cfg.get('soft_mask_target', False) + num_pos = pos_proposals.size(0) + if num_pos > 0: + proposals_np = pos_proposals.cpu().numpy() + maxh, maxw = gt_masks.height, gt_masks.width + proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw) + proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh) + pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() + + mask_targets = gt_masks.crop_and_resize( + proposals_np, + mask_size, + device=device, + inds=pos_assigned_gt_inds, + binarize=binarize).to_ndarray() + + mask_targets = torch.from_numpy(mask_targets).float().to(device) + else: + mask_targets = pos_proposals.new_zeros((0, ) + mask_size) + + return mask_targets diff --git a/head_extractor/build/lib/mmdet/structures/mask/structures.py b/head_extractor/build/lib/mmdet/structures/mask/structures.py new file mode 100644 index 0000000000000000000000000000000000000000..b4fdd27570b0d11d92eba4e8f854e153750135a4 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/mask/structures.py @@ -0,0 +1,1193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import itertools +from abc import ABCMeta, abstractmethod +from typing import Sequence, Type, TypeVar + +import cv2 +import mmcv +import numpy as np +import pycocotools.mask as maskUtils +import shapely.geometry as geometry +import torch +from mmcv.ops.roi_align import roi_align + +T = TypeVar('T') + + +class BaseInstanceMasks(metaclass=ABCMeta): + """Base class for instance masks.""" + + @abstractmethod + def rescale(self, scale, interpolation='nearest'): + """Rescale masks as large as possible while keeping the aspect ratio. + For details can refer to `mmcv.imrescale`. + + Args: + scale (tuple[int]): The maximum size (h, w) of rescaled mask. + interpolation (str): Same as :func:`mmcv.imrescale`. + + Returns: + BaseInstanceMasks: The rescaled masks. + """ + + @abstractmethod + def resize(self, out_shape, interpolation='nearest'): + """Resize masks to the given out_shape. + + Args: + out_shape: Target (h, w) of resized mask. + interpolation (str): See :func:`mmcv.imresize`. + + Returns: + BaseInstanceMasks: The resized masks. + """ + + @abstractmethod + def flip(self, flip_direction='horizontal'): + """Flip masks alone the given direction. + + Args: + flip_direction (str): Either 'horizontal' or 'vertical'. + + Returns: + BaseInstanceMasks: The flipped masks. 
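+
+        Example (a minimal sketch, using the ``BitmapMasks`` subclass
+        defined below):
+            >>> masks = BitmapMasks.random(num_masks=2, height=8, width=8)
+            >>> flipped = masks.flip('vertical')
+            >>> assert len(flipped) == len(masks)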
+ """ + + @abstractmethod + def pad(self, out_shape, pad_val): + """Pad masks to the given size of (h, w). + + Args: + out_shape (tuple[int]): Target (h, w) of padded mask. + pad_val (int): The padded value. + + Returns: + BaseInstanceMasks: The padded masks. + """ + + @abstractmethod + def crop(self, bbox): + """Crop each mask by the given bbox. + + Args: + bbox (ndarray): Bbox in format [x1, y1, x2, y2], shape (4, ). + + Return: + BaseInstanceMasks: The cropped masks. + """ + + @abstractmethod + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device, + interpolation='bilinear', + binarize=True): + """Crop and resize masks by the given bboxes. + + This function is mainly used in mask targets computation. + It firstly align mask to bboxes by assigned_inds, then crop mask by the + assigned bbox and resize to the size of (mask_h, mask_w) + + Args: + bboxes (Tensor): Bboxes in format [x1, y1, x2, y2], shape (N, 4) + out_shape (tuple[int]): Target (h, w) of resized mask + inds (ndarray): Indexes to assign masks to each bbox, + shape (N,) and values should be between [0, num_masks - 1]. + device (str): Device of bboxes + interpolation (str): See `mmcv.imresize` + binarize (bool): if True fractional values are rounded to 0 or 1 + after the resize operation. if False and unsupported an error + will be raised. Defaults to True. + + Return: + BaseInstanceMasks: the cropped and resized masks. + """ + + @abstractmethod + def expand(self, expanded_h, expanded_w, top, left): + """see :class:`Expand`.""" + + @property + @abstractmethod + def areas(self): + """ndarray: areas of each instance.""" + + @abstractmethod + def to_ndarray(self): + """Convert masks to the format of ndarray. + + Return: + ndarray: Converted masks in the format of ndarray. + """ + + @abstractmethod + def to_tensor(self, dtype, device): + """Convert masks to the format of Tensor. + + Args: + dtype (str): Dtype of converted mask. + device (torch.device): Device of converted masks. + + Returns: + Tensor: Converted masks in the format of Tensor. + """ + + @abstractmethod + def translate(self, + out_shape, + offset, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Translate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | float): Border value. Default 0. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + Translated masks. + """ + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. Default 0. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + ndarray: Sheared masks. + """ + + @abstractmethod + def rotate(self, out_shape, angle, center=None, scale=1.0, border_value=0): + """Rotate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. 
If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + border_value (int | float): Border value. Default 0 for masks. + + Returns: + Rotated masks. + """ + + def get_bboxes(self, dst_type='hbb'): + """Get the certain type boxes from masks. + + Please refer to ``mmdet.structures.bbox.box_type`` for more details of + the box type. + + Args: + dst_type: Destination box type. + + Returns: + :obj:`BaseBoxes`: Certain type boxes. + """ + from ..bbox import get_box_type + _, box_type_cls = get_box_type(dst_type) + return box_type_cls.from_instance_masks(self) + + @classmethod + @abstractmethod + def cat(cls: Type[T], masks: Sequence[T]) -> T: + """Concatenate a sequence of masks into one single mask instance. + + Args: + masks (Sequence[T]): A sequence of mask instances. + + Returns: + T: Concatenated mask instance. + """ + + +class BitmapMasks(BaseInstanceMasks): + """This class represents masks in the form of bitmaps. + + Args: + masks (ndarray): ndarray of masks in shape (N, H, W), where N is + the number of objects. + height (int): height of masks + width (int): width of masks + + Example: + >>> from mmdet.data_elements.mask.structures import * # NOQA + >>> num_masks, H, W = 3, 32, 32 + >>> rng = np.random.RandomState(0) + >>> masks = (rng.rand(num_masks, H, W) > 0.1).astype(np.int64) + >>> self = BitmapMasks(masks, height=H, width=W) + + >>> # demo crop_and_resize + >>> num_boxes = 5 + >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes) + >>> out_shape = (14, 14) + >>> inds = torch.randint(0, len(self), size=(num_boxes,)) + >>> device = 'cpu' + >>> interpolation = 'bilinear' + >>> new = self.crop_and_resize( + ... bboxes, out_shape, inds, device, interpolation) + >>> assert len(new) == num_boxes + >>> assert new.height, new.width == out_shape + """ + + def __init__(self, masks, height, width): + self.height = height + self.width = width + if len(masks) == 0: + self.masks = np.empty((0, self.height, self.width), dtype=np.uint8) + else: + assert isinstance(masks, (list, np.ndarray)) + if isinstance(masks, list): + assert isinstance(masks[0], np.ndarray) + assert masks[0].ndim == 2 # (H, W) + else: + assert masks.ndim == 3 # (N, H, W) + + self.masks = np.stack(masks).reshape(-1, height, width) + assert self.masks.shape[1] == self.height + assert self.masks.shape[2] == self.width + + def __getitem__(self, index): + """Index the BitmapMask. + + Args: + index (int | ndarray): Indices in the format of integer or ndarray. + + Returns: + :obj:`BitmapMasks`: Indexed bitmap masks. 
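+
+        Example (a minimal sketch):
+            >>> import numpy as np
+            >>> masks = BitmapMasks.random(num_masks=4)
+            >>> assert len(masks[0]) == 1
+            >>> assert len(masks[np.array([0, 2])]) == 2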
+ """ + masks = self.masks[index].reshape(-1, self.height, self.width) + return BitmapMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation='nearest'): + """See :func:`BaseInstanceMasks.rescale`.""" + if len(self.masks) == 0: + new_w, new_h = mmcv.rescale_size((self.width, self.height), scale) + rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8) + else: + rescaled_masks = np.stack([ + mmcv.imrescale(mask, scale, interpolation=interpolation) + for mask in self.masks + ]) + height, width = rescaled_masks.shape[1:] + return BitmapMasks(rescaled_masks, height, width) + + def resize(self, out_shape, interpolation='nearest'): + """See :func:`BaseInstanceMasks.resize`.""" + if len(self.masks) == 0: + resized_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + resized_masks = np.stack([ + mmcv.imresize( + mask, out_shape[::-1], interpolation=interpolation) + for mask in self.masks + ]) + return BitmapMasks(resized_masks, *out_shape) + + def flip(self, flip_direction='horizontal'): + """See :func:`BaseInstanceMasks.flip`.""" + assert flip_direction in ('horizontal', 'vertical', 'diagonal') + + if len(self.masks) == 0: + flipped_masks = self.masks + else: + flipped_masks = np.stack([ + mmcv.imflip(mask, direction=flip_direction) + for mask in self.masks + ]) + return BitmapMasks(flipped_masks, self.height, self.width) + + def pad(self, out_shape, pad_val=0): + """See :func:`BaseInstanceMasks.pad`.""" + if len(self.masks) == 0: + padded_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + padded_masks = np.stack([ + mmcv.impad(mask, shape=out_shape, pad_val=pad_val) + for mask in self.masks + ]) + return BitmapMasks(padded_masks, *out_shape) + + def crop(self, bbox): + """See :func:`BaseInstanceMasks.crop`.""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = np.empty((0, h, w), dtype=np.uint8) + else: + cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w] + return BitmapMasks(cropped_masks, h, w) + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear', + binarize=True): + """See :func:`BaseInstanceMasks.crop_and_resize`.""" + if len(self.masks) == 0: + empty_masks = np.empty((0, *out_shape), dtype=np.uint8) + return BitmapMasks(empty_masks, *out_shape) + + # convert bboxes to tensor + if isinstance(bboxes, np.ndarray): + bboxes = torch.from_numpy(bboxes).to(device=device) + if isinstance(inds, np.ndarray): + inds = torch.from_numpy(inds).to(device=device) + + num_bbox = bboxes.shape[0] + fake_inds = torch.arange( + num_bbox, device=device).to(dtype=bboxes.dtype)[:, None] + rois = torch.cat([fake_inds, bboxes], dim=1) # Nx5 + rois = rois.to(device=device) + if num_bbox > 0: + gt_masks_th = torch.from_numpy(self.masks).to(device).index_select( + 0, inds).to(dtype=rois.dtype) + targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape, + 1.0, 0, 'avg', True).squeeze(1) + if binarize: + resized_masks = (targets >= 
0.5).cpu().numpy() + else: + resized_masks = targets.cpu().numpy() + else: + resized_masks = [] + return BitmapMasks(resized_masks, *out_shape) + + def expand(self, expanded_h, expanded_w, top, left): + """See :func:`BaseInstanceMasks.expand`.""" + if len(self.masks) == 0: + expanded_mask = np.empty((0, expanded_h, expanded_w), + dtype=np.uint8) + else: + expanded_mask = np.zeros((len(self), expanded_h, expanded_w), + dtype=np.uint8) + expanded_mask[:, top:top + self.height, + left:left + self.width] = self.masks + return BitmapMasks(expanded_mask, expanded_h, expanded_w) + + def translate(self, + out_shape, + offset, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Translate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | float): Border value. Default 0 for masks. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + BitmapMasks: Translated BitmapMasks. + + Example: + >>> from mmdet.data_elements.mask.structures import BitmapMasks + >>> self = BitmapMasks.random(dtype=np.uint8) + >>> out_shape = (32, 32) + >>> offset = 4 + >>> direction = 'horizontal' + >>> border_value = 0 + >>> interpolation = 'bilinear' + >>> # Note, There seem to be issues when: + >>> # * the mask dtype is not supported by cv2.AffineWarp + >>> new = self.translate(out_shape, offset, direction, + >>> border_value, interpolation) + >>> assert len(new) == len(self) + >>> assert new.height, new.width == out_shape + """ + if len(self.masks) == 0: + translated_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + masks = self.masks + if masks.shape[-2:] != out_shape: + empty_masks = np.zeros((masks.shape[0], *out_shape), + dtype=masks.dtype) + min_h = min(out_shape[0], masks.shape[1]) + min_w = min(out_shape[1], masks.shape[2]) + empty_masks[:, :min_h, :min_w] = masks[:, :min_h, :min_w] + masks = empty_masks + translated_masks = mmcv.imtranslate( + masks.transpose((1, 2, 0)), + offset, + direction, + border_value=border_value, + interpolation=interpolation) + if translated_masks.ndim == 2: + translated_masks = translated_masks[:, :, None] + translated_masks = translated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(translated_masks, *out_shape) + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + BitmapMasks: The sheared masks. 
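+
+        Example (a minimal sketch):
+            >>> masks = BitmapMasks.random(num_masks=2, height=16, width=16)
+            >>> sheared = masks.shear((16, 16), magnitude=0.5)
+            >>> assert (sheared.height, sheared.width) == (16, 16)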
+ """ + if len(self.masks) == 0: + sheared_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + sheared_masks = mmcv.imshear( + self.masks.transpose((1, 2, 0)), + magnitude, + direction, + border_value=border_value, + interpolation=interpolation) + if sheared_masks.ndim == 2: + sheared_masks = sheared_masks[:, :, None] + sheared_masks = sheared_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(sheared_masks, *out_shape) + + def rotate(self, + out_shape, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear'): + """Rotate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + border_value (int | float): Border value. Default 0 for masks. + interpolation (str): Same as in :func:`mmcv.imrotate`. + + Returns: + BitmapMasks: Rotated BitmapMasks. + """ + if len(self.masks) == 0: + rotated_masks = np.empty((0, *out_shape), dtype=self.masks.dtype) + else: + rotated_masks = mmcv.imrotate( + self.masks.transpose((1, 2, 0)), + angle, + center=center, + scale=scale, + border_value=border_value, + interpolation=interpolation) + if rotated_masks.ndim == 2: + # case when only one mask, (h, w) + rotated_masks = rotated_masks[:, :, None] # (h, w, 1) + rotated_masks = rotated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(rotated_masks, *out_shape) + + @property + def areas(self): + """See :py:attr:`BaseInstanceMasks.areas`.""" + return self.masks.sum((1, 2)) + + def to_ndarray(self): + """See :func:`BaseInstanceMasks.to_ndarray`.""" + return self.masks + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + return torch.tensor(self.masks, dtype=dtype, device=device) + + @classmethod + def random(cls, + num_masks=3, + height=32, + width=32, + dtype=np.uint8, + rng=None): + """Generate random bitmap masks for demo / testing purposes. + + Example: + >>> from mmdet.data_elements.mask.structures import BitmapMasks + >>> self = BitmapMasks.random() + >>> print('self = {}'.format(self)) + self = BitmapMasks(num_masks=3, height=32, width=32) + """ + from mmdet.utils.util_random import ensure_rng + rng = ensure_rng(rng) + masks = (rng.rand(num_masks, height, width) > 0.1).astype(dtype) + self = cls(masks, height=height, width=width) + return self + + @classmethod + def cat(cls: Type[T], masks: Sequence[T]) -> T: + """Concatenate a sequence of masks into one single mask instance. + + Args: + masks (Sequence[BitmapMasks]): A sequence of mask instances. + + Returns: + BitmapMasks: Concatenated mask instance. + """ + assert isinstance(masks, Sequence) + if len(masks) == 0: + raise ValueError('masks should not be an empty list.') + assert all(isinstance(m, cls) for m in masks) + + mask_array = np.concatenate([m.masks for m in masks], axis=0) + return cls(mask_array, *mask_array.shape[1:]) + + +class PolygonMasks(BaseInstanceMasks): + """This class represents masks in the form of polygons. + + Polygons is a list of three levels. 
The first level of the list + corresponds to objects, the second level to the polys that compose the + object, the third level to the poly coordinates + + Args: + masks (list[list[ndarray]]): The first level of the list + corresponds to objects, the second level to the polys that + compose the object, the third level to the poly coordinates + height (int): height of masks + width (int): width of masks + + Example: + >>> from mmdet.data_elements.mask.structures import * # NOQA + >>> masks = [ + >>> [ np.array([0, 0, 10, 0, 10, 10., 0, 10, 0, 0]) ] + >>> ] + >>> height, width = 16, 16 + >>> self = PolygonMasks(masks, height, width) + + >>> # demo translate + >>> new = self.translate((16, 16), 4., direction='horizontal') + >>> assert np.all(new.masks[0][0][1::2] == masks[0][0][1::2]) + >>> assert np.all(new.masks[0][0][0::2] == masks[0][0][0::2] + 4) + + >>> # demo crop_and_resize + >>> num_boxes = 3 + >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes) + >>> out_shape = (16, 16) + >>> inds = torch.randint(0, len(self), size=(num_boxes,)) + >>> device = 'cpu' + >>> interpolation = 'bilinear' + >>> new = self.crop_and_resize( + ... bboxes, out_shape, inds, device, interpolation) + >>> assert len(new) == num_boxes + >>> assert new.height, new.width == out_shape + """ + + def __init__(self, masks, height, width): + assert isinstance(masks, list) + if len(masks) > 0: + assert isinstance(masks[0], list) + assert isinstance(masks[0][0], np.ndarray) + + self.height = height + self.width = width + self.masks = masks + + def __getitem__(self, index): + """Index the polygon masks. + + Args: + index (ndarray | List): The indices. + + Returns: + :obj:`PolygonMasks`: The indexed polygon masks. + """ + if isinstance(index, np.ndarray): + if index.dtype == bool: + index = np.where(index)[0].tolist() + else: + index = index.tolist() + if isinstance(index, list): + masks = [self.masks[i] for i in index] + else: + try: + masks = self.masks[index] + except Exception: + raise ValueError( + f'Unsupported input of type {type(index)} for indexing!') + if len(masks) and isinstance(masks[0], np.ndarray): + masks = [masks] # ensure a list of three levels + return PolygonMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation=None): + """see :func:`BaseInstanceMasks.rescale`""" + new_w, new_h = mmcv.rescale_size((self.width, self.height), scale) + if len(self.masks) == 0: + rescaled_masks = PolygonMasks([], new_h, new_w) + else: + rescaled_masks = self.resize((new_h, new_w)) + return rescaled_masks + + def resize(self, out_shape, interpolation=None): + """see :func:`BaseInstanceMasks.resize`""" + if len(self.masks) == 0: + resized_masks = PolygonMasks([], *out_shape) + else: + h_scale = out_shape[0] / self.height + w_scale = out_shape[1] / self.width + resized_masks = [] + for poly_per_obj in self.masks: + resized_poly = [] + for p in poly_per_obj: + p = p.copy() + p[0::2] = p[0::2] * w_scale + p[1::2] = p[1::2] * h_scale + resized_poly.append(p) + resized_masks.append(resized_poly) + resized_masks = PolygonMasks(resized_masks, *out_shape) + return resized_masks + + def flip(self, flip_direction='horizontal'): + """see :func:`BaseInstanceMasks.flip`""" + assert flip_direction in ('horizontal', 
'vertical', 'diagonal') + if len(self.masks) == 0: + flipped_masks = PolygonMasks([], self.height, self.width) + else: + flipped_masks = [] + for poly_per_obj in self.masks: + flipped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if flip_direction == 'horizontal': + p[0::2] = self.width - p[0::2] + elif flip_direction == 'vertical': + p[1::2] = self.height - p[1::2] + else: + p[0::2] = self.width - p[0::2] + p[1::2] = self.height - p[1::2] + flipped_poly_per_obj.append(p) + flipped_masks.append(flipped_poly_per_obj) + flipped_masks = PolygonMasks(flipped_masks, self.height, + self.width) + return flipped_masks + + def crop(self, bbox): + """see :func:`BaseInstanceMasks.crop`""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = PolygonMasks([], h, w) + else: + # reference: https://github.com/facebookresearch/fvcore/blob/main/fvcore/transforms/transform.py # noqa + crop_box = geometry.box(x1, y1, x2, y2).buffer(0.0) + cropped_masks = [] + # suppress shapely warnings util it incorporates GEOS>=3.11.2 + # reference: https://github.com/shapely/shapely/issues/1345 + initial_settings = np.seterr() + np.seterr(invalid='ignore') + for poly_per_obj in self.masks: + cropped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + p = geometry.Polygon(p.reshape(-1, 2)).buffer(0.0) + # polygon must be valid to perform intersection. + if not p.is_valid: + continue + cropped = p.intersection(crop_box) + if cropped.is_empty: + continue + if isinstance(cropped, + geometry.collection.BaseMultipartGeometry): + cropped = cropped.geoms + else: + cropped = [cropped] + # one polygon may be cropped to multiple ones + for poly in cropped: + # ignore lines or points + if not isinstance( + poly, geometry.Polygon) or not poly.is_valid: + continue + coords = np.asarray(poly.exterior.coords) + # remove an extra identical vertex at the end + coords = coords[:-1] + coords[:, 0] -= x1 + coords[:, 1] -= y1 + cropped_poly_per_obj.append(coords.reshape(-1)) + # a dummy polygon to avoid misalignment between masks and boxes + if len(cropped_poly_per_obj) == 0: + cropped_poly_per_obj = [np.array([0, 0, 0, 0, 0, 0])] + cropped_masks.append(cropped_poly_per_obj) + np.seterr(**initial_settings) + cropped_masks = PolygonMasks(cropped_masks, h, w) + return cropped_masks + + def pad(self, out_shape, pad_val=0): + """padding has no effect on polygons`""" + return PolygonMasks(self.masks, *out_shape) + + def expand(self, *args, **kwargs): + """TODO: Add expand for polygon""" + raise NotImplementedError + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear', + binarize=True): + """see :func:`BaseInstanceMasks.crop_and_resize`""" + out_h, out_w = out_shape + if len(self.masks) == 0: + return PolygonMasks([], out_h, out_w) + + if not binarize: + raise ValueError('Polygons are always binary, ' + 'setting binarize=False is unsupported') + + resized_masks = [] + for i in range(len(bboxes)): + mask = self.masks[inds[i]] + bbox = bboxes[i, :] + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + h_scale = out_h / max(h, 0.1) # avoid too large scale + w_scale = out_w / max(w, 0.1) + + resized_mask = [] + for p in mask: + p = p.copy() + # crop + # pycocotools will clip 
the boundary + p[0::2] = p[0::2] - bbox[0] + p[1::2] = p[1::2] - bbox[1] + + # resize + p[0::2] = p[0::2] * w_scale + p[1::2] = p[1::2] * h_scale + resized_mask.append(p) + resized_masks.append(resized_mask) + return PolygonMasks(resized_masks, *out_shape) + + def translate(self, + out_shape, + offset, + direction='horizontal', + border_value=None, + interpolation=None): + """Translate the PolygonMasks. + + Example: + >>> self = PolygonMasks.random(dtype=np.int64) + >>> out_shape = (self.height, self.width) + >>> new = self.translate(out_shape, 4., direction='horizontal') + >>> assert np.all(new.masks[0][0][1::2] == self.masks[0][0][1::2]) + >>> assert np.all(new.masks[0][0][0::2] == self.masks[0][0][0::2] + 4) # noqa: E501 + """ + assert border_value is None or border_value == 0, \ + 'Here border_value is not '\ + f'used, and defaultly should be None or 0. got {border_value}.' + if len(self.masks) == 0: + translated_masks = PolygonMasks([], *out_shape) + else: + translated_masks = [] + for poly_per_obj in self.masks: + translated_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if direction == 'horizontal': + p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1]) + elif direction == 'vertical': + p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0]) + translated_poly_per_obj.append(p) + translated_masks.append(translated_poly_per_obj) + translated_masks = PolygonMasks(translated_masks, *out_shape) + return translated_masks + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """See :func:`BaseInstanceMasks.shear`.""" + if len(self.masks) == 0: + sheared_masks = PolygonMasks([], *out_shape) + else: + sheared_masks = [] + if direction == 'horizontal': + shear_matrix = np.stack([[1, magnitude], + [0, 1]]).astype(np.float32) + elif direction == 'vertical': + shear_matrix = np.stack([[1, 0], [magnitude, + 1]]).astype(np.float32) + for poly_per_obj in self.masks: + sheared_poly = [] + for p in poly_per_obj: + p = np.stack([p[0::2], p[1::2]], axis=0) # [2, n] + new_coords = np.matmul(shear_matrix, p) # [2, n] + new_coords[0, :] = np.clip(new_coords[0, :], 0, + out_shape[1]) + new_coords[1, :] = np.clip(new_coords[1, :], 0, + out_shape[0]) + sheared_poly.append( + new_coords.transpose((1, 0)).reshape(-1)) + sheared_masks.append(sheared_poly) + sheared_masks = PolygonMasks(sheared_masks, *out_shape) + return sheared_masks + + def rotate(self, + out_shape, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear'): + """See :func:`BaseInstanceMasks.rotate`.""" + if len(self.masks) == 0: + rotated_masks = PolygonMasks([], *out_shape) + else: + rotated_masks = [] + rotate_matrix = cv2.getRotationMatrix2D(center, -angle, scale) + for poly_per_obj in self.masks: + rotated_poly = [] + for p in poly_per_obj: + p = p.copy() + coords = np.stack([p[0::2], p[1::2]], axis=1) # [n, 2] + # pad 1 to convert from format [x, y] to homogeneous + # coordinates format [x, y, 1] + coords = np.concatenate( + (coords, np.ones((coords.shape[0], 1), coords.dtype)), + axis=1) # [n, 3] + rotated_coords = np.matmul( + rotate_matrix[None, :, :], + coords[:, :, None])[..., 0] # [n, 2, 1] -> [n, 2] + rotated_coords[:, 0] = np.clip(rotated_coords[:, 0], 0, + out_shape[1]) + rotated_coords[:, 1] = np.clip(rotated_coords[:, 1], 0, + out_shape[0]) + rotated_poly.append(rotated_coords.reshape(-1)) + rotated_masks.append(rotated_poly) + rotated_masks = PolygonMasks(rotated_masks, *out_shape) + return rotated_masks + + def 
to_bitmap(self): + """convert polygon masks to bitmap masks.""" + bitmap_masks = self.to_ndarray() + return BitmapMasks(bitmap_masks, self.height, self.width) + + @property + def areas(self): + """Compute areas of masks. + + This func is modified from `detectron2 + `_. + The function only works with Polygons using the shoelace formula. + + Return: + ndarray: areas of each instance + """ # noqa: W501 + area = [] + for polygons_per_obj in self.masks: + area_per_obj = 0 + for p in polygons_per_obj: + area_per_obj += self._polygon_area(p[0::2], p[1::2]) + area.append(area_per_obj) + return np.asarray(area) + + def _polygon_area(self, x, y): + """Compute the area of a component of a polygon. + + Using the shoelace formula: + https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + + Args: + x (ndarray): x coordinates of the component + y (ndarray): y coordinates of the component + + Return: + float: the are of the component + """ # noqa: 501 + return 0.5 * np.abs( + np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) + + def to_ndarray(self): + """Convert masks to the format of ndarray.""" + if len(self.masks) == 0: + return np.empty((0, self.height, self.width), dtype=np.uint8) + bitmap_masks = [] + for poly_per_obj in self.masks: + bitmap_masks.append( + polygon_to_bitmap(poly_per_obj, self.height, self.width)) + return np.stack(bitmap_masks) + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + if len(self.masks) == 0: + return torch.empty((0, self.height, self.width), + dtype=dtype, + device=device) + ndarray_masks = self.to_ndarray() + return torch.tensor(ndarray_masks, dtype=dtype, device=device) + + @classmethod + def random(cls, + num_masks=3, + height=32, + width=32, + n_verts=5, + dtype=np.float32, + rng=None): + """Generate random polygon masks for demo / testing purposes. + + Adapted from [1]_ + + References: + .. [1] https://gitlab.kitware.com/computer-vision/kwimage/-/blob/928cae35ca8/kwimage/structs/polygon.py#L379 # noqa: E501 + + Example: + >>> from mmdet.data_elements.mask.structures import PolygonMasks + >>> self = PolygonMasks.random() + >>> print('self = {}'.format(self)) + """ + from mmdet.utils.util_random import ensure_rng + rng = ensure_rng(rng) + + def _gen_polygon(n, irregularity, spikeyness): + """Creates the polygon by sampling points on a circle around the + centre. Random noise is added by varying the angular spacing + between sequential points, and by varying the radial distance of + each point from the centre. + + Based on original code by Mike Ounsworth + + Args: + n (int): number of vertices + irregularity (float): [0,1] indicating how much variance there + is in the angular spacing of vertices. [0,1] will map to + [0, 2pi/numberOfVerts] + spikeyness (float): [0,1] indicating how much variance there is + in each vertex from the circle of radius aveRadius. [0,1] + will map to [0, aveRadius] + + Returns: + a list of vertices, in CCW order. 
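+                (Concretely, the vertices are returned as an ndarray of
+                shape (n, 2); the counter-clockwise ordering is enforced
+                afterwards by ``_order_vertices``.)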
+ """ + from scipy.stats import truncnorm + + # Generate around the unit circle + cx, cy = (0.0, 0.0) + radius = 1 + + tau = np.pi * 2 + + irregularity = np.clip(irregularity, 0, 1) * 2 * np.pi / n + spikeyness = np.clip(spikeyness, 1e-9, 1) + + # generate n angle steps + lower = (tau / n) - irregularity + upper = (tau / n) + irregularity + angle_steps = rng.uniform(lower, upper, n) + + # normalize the steps so that point 0 and point n+1 are the same + k = angle_steps.sum() / (2 * np.pi) + angles = (angle_steps / k).cumsum() + rng.uniform(0, tau) + + # Convert high and low values to be wrt the standard normal range + # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html + low = 0 + high = 2 * radius + mean = radius + std = spikeyness + a = (low - mean) / std + b = (high - mean) / std + tnorm = truncnorm(a=a, b=b, loc=mean, scale=std) + + # now generate the points + radii = tnorm.rvs(n, random_state=rng) + x_pts = cx + radii * np.cos(angles) + y_pts = cy + radii * np.sin(angles) + + points = np.hstack([x_pts[:, None], y_pts[:, None]]) + + # Scale to 0-1 space + points = points - points.min(axis=0) + points = points / points.max(axis=0) + + # Randomly place within 0-1 space + points = points * (rng.rand() * .8 + .2) + min_pt = points.min(axis=0) + max_pt = points.max(axis=0) + + high = (1 - max_pt) + low = (0 - min_pt) + offset = (rng.rand(2) * (high - low)) + low + points = points + offset + return points + + def _order_vertices(verts): + """ + References: + https://stackoverflow.com/questions/1709283/how-can-i-sort-a-coordinate-list-for-a-rectangle-counterclockwise + """ + mlat = verts.T[0].sum() / len(verts) + mlng = verts.T[1].sum() / len(verts) + + tau = np.pi * 2 + angle = (np.arctan2(mlat - verts.T[0], verts.T[1] - mlng) + + tau) % tau + sortx = angle.argsort() + verts = verts.take(sortx, axis=0) + return verts + + # Generate a random exterior for each requested mask + masks = [] + for _ in range(num_masks): + exterior = _order_vertices(_gen_polygon(n_verts, 0.9, 0.9)) + exterior = (exterior * [(width, height)]).astype(dtype) + masks.append([exterior.ravel()]) + + self = cls(masks, height, width) + return self + + @classmethod + def cat(cls: Type[T], masks: Sequence[T]) -> T: + """Concatenate a sequence of masks into one single mask instance. + + Args: + masks (Sequence[PolygonMasks]): A sequence of mask instances. + + Returns: + PolygonMasks: Concatenated mask instance. + """ + assert isinstance(masks, Sequence) + if len(masks) == 0: + raise ValueError('masks should not be an empty list.') + assert all(isinstance(m, cls) for m in masks) + + mask_list = list(itertools.chain(*[m.masks for m in masks])) + return cls(mask_list, masks[0].height, masks[0].width) + + +def polygon_to_bitmap(polygons, height, width): + """Convert masks from the form of polygons to bitmaps. + + Args: + polygons (list[ndarray]): masks in polygon representation + height (int): mask height + width (int): mask width + + Return: + ndarray: the converted masks in bitmap representation + """ + rles = maskUtils.frPyObjects(polygons, height, width) + rle = maskUtils.merge(rles) + bitmap_mask = maskUtils.decode(rle).astype(bool) + return bitmap_mask + + +def bitmap_to_polygon(bitmap): + """Convert masks from the form of bitmaps to polygons. + + Args: + bitmap (ndarray): masks in bitmap representation. + + Return: + list[ndarray]: the converted mask in polygon representation. + bool: whether the mask has holes. 
+ """ + bitmap = np.ascontiguousarray(bitmap).astype(np.uint8) + # cv2.RETR_CCOMP: retrieves all of the contours and organizes them + # into a two-level hierarchy. At the top level, there are external + # boundaries of the components. At the second level, there are + # boundaries of the holes. If there is another contour inside a hole + # of a connected component, it is still put at the top level. + # cv2.CHAIN_APPROX_NONE: stores absolutely all the contour points. + outs = cv2.findContours(bitmap, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) + contours = outs[-2] + hierarchy = outs[-1] + if hierarchy is None: + return [], False + # hierarchy[i]: 4 elements, for the indexes of next, previous, + # parent, or nested contours. If there is no corresponding contour, + # it will be -1. + with_hole = (hierarchy.reshape(-1, 4)[:, 3] >= 0).any() + contours = [c.reshape(-1, 2) for c in contours] + return contours, with_hole diff --git a/head_extractor/build/lib/mmdet/structures/mask/utils.py b/head_extractor/build/lib/mmdet/structures/mask/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6bd445e4fce1a312949f222d54d230a1a622d726 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/mask/utils.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pycocotools.mask as mask_util +import torch +from mmengine.utils import slice_list + + +def split_combined_polys(polys, poly_lens, polys_per_mask): + """Split the combined 1-D polys into masks. + + A mask is represented as a list of polys, and a poly is represented as + a 1-D array. In dataset, all masks are concatenated into a single 1-D + tensor. Here we need to split the tensor into original representations. + + Args: + polys (list): a list (length = image num) of 1-D tensors + poly_lens (list): a list (length = image num) of poly length + polys_per_mask (list): a list (length = image num) of poly number + of each mask + + Returns: + list: a list (length = image num) of list (length = mask num) of \ + list (length = poly num) of numpy array. + """ + mask_polys_list = [] + for img_id in range(len(polys)): + polys_single = polys[img_id] + polys_lens_single = poly_lens[img_id].tolist() + polys_per_mask_single = polys_per_mask[img_id].tolist() + + split_polys = slice_list(polys_single, polys_lens_single) + mask_polys = slice_list(split_polys, polys_per_mask_single) + mask_polys_list.append(mask_polys) + return mask_polys_list + + +# TODO: move this function to more proper place +def encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. + + Args: + mask_results (list): bitmap mask results. + + Returns: + list | tuple: RLE encoded mask. + """ + encoded_mask_results = [] + for mask in mask_results: + encoded_mask_results.append( + mask_util.encode( + np.array(mask[:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return encoded_mask_results + + +def mask2bbox(masks): + """Obtain tight bounding boxes of binary masks. + + Args: + masks (Tensor): Binary mask of shape (n, h, w). + + Returns: + Tensor: Bboxe with shape (n, 4) of \ + positive region in binary mask. 
+ """ + N = masks.shape[0] + bboxes = masks.new_zeros((N, 4), dtype=torch.float32) + x_any = torch.any(masks, dim=1) + y_any = torch.any(masks, dim=2) + for i in range(N): + x = torch.where(x_any[i, :])[0] + y = torch.where(y_any[i, :])[0] + if len(x) > 0 and len(y) > 0: + bboxes[i, :] = bboxes.new_tensor( + [x[0], y[0], x[-1] + 1, y[-1] + 1]) + + return bboxes diff --git a/head_extractor/build/lib/mmdet/structures/reid_data_sample.py b/head_extractor/build/lib/mmdet/structures/reid_data_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..69958eece3671c9040c1f5561e724ca2d5f8e155 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/reid_data_sample.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from numbers import Number +from typing import Sequence, Union + +import mmengine +import numpy as np +import torch +from mmengine.structures import BaseDataElement, LabelData + + +def format_label(value: Union[torch.Tensor, np.ndarray, Sequence, int], + num_classes: int = None) -> LabelData: + """Convert label of various python types to :obj:`mmengine.LabelData`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int`. + + Args: + value (torch.Tensor | numpy.ndarray | Sequence | int): Label value. + num_classes (int, optional): The number of classes. If not None, set + it to the metainfo. Defaults to None. + + Returns: + :obj:`mmengine.LabelData`: The foramtted label data. + """ + + # Handle single number + if isinstance(value, (torch.Tensor, np.ndarray)) and value.ndim == 0: + value = int(value.item()) + + if isinstance(value, np.ndarray): + value = torch.from_numpy(value) + elif isinstance(value, Sequence) and not mmengine.utils.is_str(value): + value = torch.tensor(value) + elif isinstance(value, int): + value = torch.LongTensor([value]) + elif not isinstance(value, torch.Tensor): + raise TypeError(f'Type {type(value)} is not an available label type.') + + metainfo = {} + if num_classes is not None: + metainfo['num_classes'] = num_classes + if value.max() >= num_classes: + raise ValueError(f'The label data ({value}) should not ' + f'exceed num_classes ({num_classes}).') + label = LabelData(label=value, metainfo=metainfo) + return label + + +class ReIDDataSample(BaseDataElement): + """A data structure interface of ReID task. + + It's used as interfaces between different components. + + Meta field: + img_shape (Tuple): The shape of the corresponding input image. + Used for visualization. + ori_shape (Tuple): The original shape of the corresponding image. + Used for visualization. + num_classes (int): The number of all categories. + Used for label format conversion. + + Data field: + gt_label (LabelData): The ground truth label. + pred_label (LabelData): The predicted label. + scores (torch.Tensor): The outputs of model. 
+ """ + + @property + def gt_label(self): + return self._gt_label + + @gt_label.setter + def gt_label(self, value: LabelData): + self.set_field(value, '_gt_label', dtype=LabelData) + + @gt_label.deleter + def gt_label(self): + del self._gt_label + + def set_gt_label( + self, value: Union[np.ndarray, torch.Tensor, Sequence[Number], Number] + ) -> 'ReIDDataSample': + """Set label of ``gt_label``.""" + label = format_label(value, self.get('num_classes')) + if 'gt_label' in self: # setting for the second time + self.gt_label.label = label.label + else: # setting for the first time + self.gt_label = label + return self + + def set_gt_score(self, value: torch.Tensor) -> 'ReIDDataSample': + """Set score of ``gt_label``.""" + assert isinstance(value, torch.Tensor), \ + f'The value should be a torch.Tensor but got {type(value)}.' + assert value.ndim == 1, \ + f'The dims of value should be 1, but got {value.ndim}.' + + if 'num_classes' in self: + assert value.size(0) == self.num_classes, \ + f"The length of value ({value.size(0)}) doesn't "\ + f'match the num_classes ({self.num_classes}).' + metainfo = {'num_classes': self.num_classes} + else: + metainfo = {'num_classes': value.size(0)} + + if 'gt_label' in self: # setting for the second time + self.gt_label.score = value + else: # setting for the first time + self.gt_label = LabelData(score=value, metainfo=metainfo) + return self + + @property + def pred_feature(self): + return self._pred_feature + + @pred_feature.setter + def pred_feature(self, value: torch.Tensor): + self.set_field(value, '_pred_feature', dtype=torch.Tensor) + + @pred_feature.deleter + def pred_feature(self): + del self._pred_feature diff --git a/head_extractor/build/lib/mmdet/structures/track_data_sample.py b/head_extractor/build/lib/mmdet/structures/track_data_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..d005a5a42f57682d0b76d60d3dae463c4b4dc727 --- /dev/null +++ b/head_extractor/build/lib/mmdet/structures/track_data_sample.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Sequence + +import numpy as np +import torch +from mmengine.structures import BaseDataElement + +from .det_data_sample import DetDataSample + + +class TrackDataSample(BaseDataElement): + """A data structure interface of tracking task in MMDetection. It is used + as interfaces between different components. + + This data structure can be viewd as a wrapper of multiple DetDataSample to + some extent. Specifically, it only contains a property: + ``video_data_samples`` which is a list of DetDataSample, each of which + corresponds to a single frame. If you want to get the property of a single + frame, you must first get the corresponding ``DetDataSample`` by indexing + and then get the property of the frame, such as ``gt_instances``, + ``pred_instances`` and so on. As for metainfo, it differs from + ``DetDataSample`` in that each value corresponds to the metainfo key is a + list where each element corresponds to information of a single frame. + + Examples: + >>> import torch + >>> from mmengine.structures import InstanceData + >>> from mmdet.structures import DetDataSample, TrackDataSample + >>> track_data_sample = TrackDataSample() + >>> # set the 1st frame + >>> frame1_data_sample = DetDataSample(metainfo=dict( + ... 
img_shape=(100, 100), frame_id=0))
+        >>> frame1_gt_instances = InstanceData()
+        >>> frame1_gt_instances.bbox = torch.zeros([2, 4])
+        >>> frame1_data_sample.gt_instances = frame1_gt_instances
+        >>> # set the 2nd frame
+        >>> frame2_data_sample = DetDataSample(metainfo=dict(
+        ...     img_shape=(100, 100), frame_id=1))
+        >>> frame2_gt_instances = InstanceData()
+        >>> frame2_gt_instances.bbox = torch.ones([3, 4])
+        >>> frame2_data_sample.gt_instances = frame2_gt_instances
+        >>> track_data_sample.video_data_samples = [frame1_data_sample,
+        ...                                         frame2_data_sample]
+        >>> # set metainfo for track_data_sample
+        >>> track_data_sample.set_metainfo(dict(key_frames_inds=[0]))
+        >>> track_data_sample.set_metainfo(dict(ref_frames_inds=[1]))
+        >>> print(track_data_sample)  # doctest: +SKIP
+        <TrackDataSample(
+            ...
+        ) at 0x7f64bd2237f0>
+        >>> print(len(track_data_sample))
+        2
+        >>> key_data_sample = track_data_sample.get_key_frames()
+        >>> print(key_data_sample[0].frame_id)
+        0
+        >>> ref_data_sample = track_data_sample.get_ref_frames()
+        >>> print(ref_data_sample[0].frame_id)
+        1
+        >>> frame1_data_sample = track_data_sample[0]
+        >>> print(frame1_data_sample.gt_instances.bbox)
+        tensor([[0., 0., 0., 0.],
+                [0., 0., 0., 0.]])
+        >>> # Tensor-like methods
+        >>> cuda_track_data_sample = track_data_sample.to('cuda')
+        >>> cuda_track_data_sample = track_data_sample.cuda()
+        >>> cpu_track_data_sample = track_data_sample.cpu()
+        >>> cpu_track_data_sample = track_data_sample.to('cpu')
+        >>> fp16_instances = cuda_track_data_sample.to(
+        ...     device=None, dtype=torch.float16, non_blocking=False,
+        ...     copy=False, memory_format=torch.preserve_format)
+    """
+
+    @property
+    def video_data_samples(self) -> List[DetDataSample]:
+        return self._video_data_samples
+
+    @video_data_samples.setter
+    def video_data_samples(self, value: List[DetDataSample]):
+        if isinstance(value, DetDataSample):
+            value = [value]
+        assert isinstance(value, list), 'video_data_samples must be a list'
+        assert isinstance(value[0], DetDataSample), \
+            'video_data_samples must be a list of DetDataSample, but got ' \
+            f'{value[0]}'
+        self.set_field(value, '_video_data_samples', dtype=list)
+
+    @video_data_samples.deleter
+    def video_data_samples(self):
+        del self._video_data_samples
+
+    def __getitem__(self, index):
+        assert hasattr(self,
+                       '_video_data_samples'), 'video_data_samples not set'
+        return self._video_data_samples[index]
+
+    def get_key_frames(self):
+        assert hasattr(self, 'key_frames_inds'), \
+            'key_frames_inds not set'
+        assert isinstance(self.key_frames_inds, Sequence)
+        key_frames_info = []
+        for index in self.key_frames_inds:
+            key_frames_info.append(self[index])
+        return key_frames_info
+
+    def get_ref_frames(self):
+        assert hasattr(self, 'ref_frames_inds'), \
+            'ref_frames_inds not set'
+        assert isinstance(self.ref_frames_inds, Sequence)
+        ref_frames_info = []
+        for index in self.ref_frames_inds:
+            ref_frames_info.append(self[index])
+        return ref_frames_info
+
+    def __len__(self):
+        return len(self._video_data_samples) if hasattr(
+            self, '_video_data_samples') else 0
+
+    # TODO: add UT for this Tensor-like method
+    # Tensor-like methods
+    def to(self, *args, **kwargs) -> 'BaseDataElement':
+        """Apply ``to(*args, **kwargs)`` to every element of every data
+        field that supports it."""
+        new_data = self.new()
+        for k, v_list in self.items():
+            data_list = []
+            for v in v_list:
+                if hasattr(v, 'to'):
+                    v = v.to(*args, **kwargs)
+                data_list.append(v)
+            if len(data_list) > 0:
+                new_data.set_data({f'{k}': data_list})
+        return new_data
+
+    # Tensor-like methods
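+    # The converters below all follow one pattern: iterate over every data
+    # field (whose value is a list with one entry per frame) and apply the
+    # corresponding tensor method to each element that supports it.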
def cpu(self) -> 'BaseDataElement': + """Convert all tensors to CPU in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.cpu() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def cuda(self) -> 'BaseDataElement': + """Convert all tensors to GPU in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.cuda() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def npu(self) -> 'BaseDataElement': + """Convert all tensors to NPU in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.npu() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def detach(self) -> 'BaseDataElement': + """Detach all tensors in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.detach() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def numpy(self) -> 'BaseDataElement': + """Convert all tensors to np.ndarray in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.detach().cpu().numpy() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + def to_tensor(self) -> 'BaseDataElement': + """Convert all np.ndarray to tensor in data.""" + new_data = self.new() + for k, v_list in self.items(): + data_list = [] + for v in v_list: + if isinstance(v, np.ndarray): + v = torch.from_numpy(v) + elif isinstance(v, BaseDataElement): + v = v.to_tensor() + data_list.append(v) + if len(data_list) > 0: + new_data.set_data({f'{k}': data_list}) + return new_data + + # Tensor-like methods + def clone(self) -> 'BaseDataElement': + """Deep copy the current data element. + + Returns: + BaseDataElement: The copy of current data element. + """ + clone_data = self.__class__() + clone_data.set_metainfo(dict(self.metainfo_items())) + + for k, v_list in self.items(): + clone_item_list = [] + for v in v_list: + clone_item_list.append(v.clone()) + clone_data.set_data({k: clone_item_list}) + return clone_data + + +TrackSampleList = List[TrackDataSample] +OptTrackSampleList = Optional[TrackSampleList] diff --git a/head_extractor/build/lib/mmdet/testing/__init__.py b/head_extractor/build/lib/mmdet/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..766fb471022ee6f2e4e1ff13a52040ae57772e53 --- /dev/null +++ b/head_extractor/build/lib/mmdet/testing/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from ._fast_stop_training_hook import FastStopTrainingHook  # noqa: F401,F403
+from ._utils import (demo_mm_inputs, demo_mm_proposals,
+                     demo_mm_sampling_results, demo_track_inputs,
+                     get_detector_cfg, get_roi_head_cfg, random_boxes,
+                     replace_to_ceph)
+
+__all__ = [
+    'demo_mm_inputs', 'get_detector_cfg', 'get_roi_head_cfg',
+    'demo_mm_proposals', 'demo_mm_sampling_results', 'replace_to_ceph',
+    'demo_track_inputs', 'random_boxes'
+]
diff --git a/head_extractor/build/lib/mmdet/testing/_fast_stop_training_hook.py b/head_extractor/build/lib/mmdet/testing/_fast_stop_training_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8e3d11439f875d2c9a6ce6b8a0b33acc832c2c5
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/testing/_fast_stop_training_hook.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.hooks import Hook
+
+from mmdet.registry import HOOKS
+
+
+@HOOKS.register_module()
+class FastStopTrainingHook(Hook):
+    """Stop training after ``stop_iter_or_epoch`` iterations or epochs by
+    raising ``RuntimeError``, so that tests exit quickly."""
+
+    def __init__(self, by_epoch, save_ckpt=False, stop_iter_or_epoch=5):
+        self.by_epoch = by_epoch
+        self.save_ckpt = save_ckpt
+        self.stop_iter_or_epoch = stop_iter_or_epoch
+
+    def after_train_iter(self, runner, batch_idx: int, data_batch: None,
+                         outputs: None) -> None:
+        if self.save_ckpt and self.by_epoch:
+            # If training is epoch-based and checkpoints should be saved,
+            # at least one full epoch must run, so do not stop mid-epoch.
+            return
+        if runner.iter >= self.stop_iter_or_epoch:
+            raise RuntimeError('quick exit')
+
+    def after_train_epoch(self, runner) -> None:
+        if runner.epoch >= self.stop_iter_or_epoch - 1:
+            raise RuntimeError('quick exit')
diff --git a/head_extractor/build/lib/mmdet/testing/_utils.py b/head_extractor/build/lib/mmdet/testing/_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4d3a86deab17e9c5acd1b1fe7f42e0bfa78943d
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/testing/_utils.py
@@ -0,0 +1,469 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from os.path import dirname, exists, join
+
+import numpy as np
+import torch
+from mmengine.config import Config
+from mmengine.dataset import pseudo_collate
+from mmengine.structures import InstanceData, PixelData
+
+from mmdet.utils.util_random import ensure_rng
+from ..registry import TASK_UTILS
+from ..structures import DetDataSample, TrackDataSample
+from ..structures.bbox import HorizontalBoxes
+
+
+def _get_config_directory():
+    """Find the predefined detector config directory."""
+    try:
+        # Assume we are running in the source mmdetection repo
+        repo_dpath = dirname(dirname(dirname(__file__)))
+    except NameError:
+        # For IPython development when this __file__ is not defined
+        import mmdet
+        repo_dpath = dirname(dirname(mmdet.__file__))
+    config_dpath = join(repo_dpath, 'configs')
+    if not exists(config_dpath):
+        raise Exception('Cannot find config path')
+    return config_dpath
+
+
+def _get_config_module(fname):
+    """Load a configuration as a python module."""
+    config_dpath = _get_config_directory()
+    config_fpath = join(config_dpath, fname)
+    config_mod = Config.fromfile(config_fpath)
+    return config_mod
+
+
+def get_detector_cfg(fname):
+    """Grab configs necessary to create a detector.
+
+    These are deep copied to allow for safe modification of parameters
+    without influencing other tests.
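+
+    Example (illustrative; any config file under ``configs/`` works, the
+    path below is just one that ships with MMDetection):
+        >>> cfg = get_detector_cfg(
+        ...     'retinanet/retinanet_r50_fpn_1x_coco.py')  # doctest: +SKIP
+        >>> cfg.type  # doctest: +SKIP
+        'RetinaNet'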
+ """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + return model + + +def get_roi_head_cfg(fname): + """Grab configs necessary to create a roi_head. + + These are deep copied to allow for safe modification of parameters without + influencing other tests. + """ + config = _get_config_module(fname) + model = copy.deepcopy(config.model) + + roi_head = model.roi_head + train_cfg = None if model.train_cfg is None else model.train_cfg.rcnn + test_cfg = None if model.test_cfg is None else model.test_cfg.rcnn + roi_head.update(dict(train_cfg=train_cfg, test_cfg=test_cfg)) + return roi_head + + +def _rand_bboxes(rng, num_boxes, w, h): + cx, cy, bw, bh = rng.rand(num_boxes, 4).T + + tl_x = ((cx * w) - (w * bw / 2)).clip(0, w) + tl_y = ((cy * h) - (h * bh / 2)).clip(0, h) + br_x = ((cx * w) + (w * bw / 2)).clip(0, w) + br_y = ((cy * h) + (h * bh / 2)).clip(0, h) + + bboxes = np.vstack([tl_x, tl_y, br_x, br_y]).T + return bboxes + + +def _rand_masks(rng, num_boxes, bboxes, img_w, img_h): + from mmdet.structures.mask import BitmapMasks + masks = np.zeros((num_boxes, img_h, img_w)) + for i, bbox in enumerate(bboxes): + bbox = bbox.astype(np.int32) + mask = (rng.rand(1, bbox[3] - bbox[1], bbox[2] - bbox[0]) > + 0.3).astype(np.int64) + masks[i:i + 1, bbox[1]:bbox[3], bbox[0]:bbox[2]] = mask + return BitmapMasks(masks, height=img_h, width=img_w) + + +def demo_mm_inputs(batch_size=2, + image_shapes=(3, 128, 128), + num_items=None, + num_classes=10, + sem_seg_output_strides=1, + with_mask=False, + with_semantic=False, + use_box_type=False, + device='cpu', + texts=None, + custom_entities=False): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Defaults to 2. + image_shapes (List[tuple], Optional): image shape. + Defaults to (3, 128, 128) + num_items (None | List[int]): specifies the number + of boxes in each batch item. Default to None. + num_classes (int): number of different labels a + box might have. Defaults to 10. + with_mask (bool): Whether to return mask annotation. + Defaults to False. + with_semantic (bool): whether to return semantic. + Defaults to False. + device (str): Destination device type. Defaults to cpu. 
+ """ + rng = np.random.RandomState(0) + + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + if isinstance(num_items, list): + assert len(num_items) == batch_size + + if texts is not None: + assert batch_size == len(texts) + + packed_inputs = [] + for idx in range(batch_size): + image_shape = image_shapes[idx] + c, h, w = image_shape + + image = rng.randint(0, 255, size=image_shape, dtype=np.uint8) + + mm_inputs = dict() + mm_inputs['inputs'] = torch.from_numpy(image).to(device) + + img_meta = { + 'img_id': idx, + 'img_shape': image_shape[1:], + 'ori_shape': image_shape[1:], + 'filename': '.png', + 'scale_factor': np.array([1.1, 1.2]), + 'flip': False, + 'flip_direction': None, + 'border': [1, 1, 1, 1] # Only used by CenterNet + } + + if texts: + img_meta['text'] = texts[idx] + img_meta['custom_entities'] = custom_entities + + data_sample = DetDataSample() + data_sample.set_metainfo(img_meta) + + # gt_instances + gt_instances = InstanceData() + if num_items is None: + num_boxes = rng.randint(1, 10) + else: + num_boxes = num_items[idx] + + bboxes = _rand_bboxes(rng, num_boxes, w, h) + labels = rng.randint(1, num_classes, size=num_boxes) + # TODO: remove this part when all model adapted with BaseBoxes + if use_box_type: + gt_instances.bboxes = HorizontalBoxes(bboxes, dtype=torch.float32) + else: + gt_instances.bboxes = torch.FloatTensor(bboxes) + gt_instances.labels = torch.LongTensor(labels) + + if with_mask: + masks = _rand_masks(rng, num_boxes, bboxes, w, h) + gt_instances.masks = masks + + # TODO: waiting for ci to be fixed + # masks = np.random.randint(0, 2, (len(bboxes), h, w), dtype=np.uint8) + # gt_instances.mask = BitmapMasks(masks, h, w) + + data_sample.gt_instances = gt_instances + + # ignore_instances + ignore_instances = InstanceData() + bboxes = _rand_bboxes(rng, num_boxes, w, h) + if use_box_type: + ignore_instances.bboxes = HorizontalBoxes( + bboxes, dtype=torch.float32) + else: + ignore_instances.bboxes = torch.FloatTensor(bboxes) + data_sample.ignored_instances = ignore_instances + + # gt_sem_seg + if with_semantic: + # assume gt_semantic_seg using scale 1/8 of the img + gt_semantic_seg = torch.from_numpy( + np.random.randint( + 0, + num_classes, (1, h // sem_seg_output_strides, + w // sem_seg_output_strides), + dtype=np.uint8)) + gt_sem_seg_data = dict(sem_seg=gt_semantic_seg) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + + mm_inputs['data_samples'] = data_sample.to(device) + + # TODO: gt_ignore + + packed_inputs.append(mm_inputs) + data = pseudo_collate(packed_inputs) + return data + + +def demo_mm_proposals(image_shapes, num_proposals, device='cpu'): + """Create a list of fake porposals. + + Args: + image_shapes (list[tuple[int]]): Batch image shapes. + num_proposals (int): The number of fake proposals. 
+ """ + rng = np.random.RandomState(0) + + results = [] + for img_shape in image_shapes: + result = InstanceData() + w, h = img_shape[1:] + proposals = _rand_bboxes(rng, num_proposals, w, h) + result.bboxes = torch.from_numpy(proposals).float() + result.scores = torch.from_numpy(rng.rand(num_proposals)).float() + result.labels = torch.zeros(num_proposals).long() + results.append(result.to(device)) + return results + + +def demo_mm_sampling_results(proposals_list, + batch_gt_instances, + batch_gt_instances_ignore=None, + assigner_cfg=None, + sampler_cfg=None, + feats=None): + """Create sample results that can be passed to BBoxHead.get_targets.""" + assert len(proposals_list) == len(batch_gt_instances) + if batch_gt_instances_ignore is None: + batch_gt_instances_ignore = [None for _ in batch_gt_instances] + else: + assert len(batch_gt_instances_ignore) == len(batch_gt_instances) + + default_assigner_cfg = dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + ignore_iof_thr=-1) + assigner_cfg = assigner_cfg if assigner_cfg is not None \ + else default_assigner_cfg + default_sampler_cfg = dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True) + sampler_cfg = sampler_cfg if sampler_cfg is not None \ + else default_sampler_cfg + bbox_assigner = TASK_UTILS.build(assigner_cfg) + bbox_sampler = TASK_UTILS.build(sampler_cfg) + + sampling_results = [] + for i in range(len(batch_gt_instances)): + if feats is not None: + feats = [lvl_feat[i][None] for lvl_feat in feats] + # rename proposals.bboxes to proposals.priors + proposals = proposals_list[i] + proposals.priors = proposals.pop('bboxes') + + assign_result = bbox_assigner.assign(proposals, batch_gt_instances[i], + batch_gt_instances_ignore[i]) + sampling_result = bbox_sampler.sample( + assign_result, proposals, batch_gt_instances[i], feats=feats) + sampling_results.append(sampling_result) + + return sampling_results + + +def demo_track_inputs(batch_size=1, + num_frames=2, + key_frames_inds=None, + image_shapes=(3, 128, 128), + num_items=None, + num_classes=1, + with_mask=False, + with_semantic=False): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Default to 1. + num_frames (int): The number of frames. + key_frames_inds (List): The indices of key frames. + image_shapes (List[tuple], Optional): image shape. + Default to (3, 128, 128) + num_items (None | List[int]): specifies the number + of boxes in each batch item. Default to None. + num_classes (int): number of different labels a + box might have. Default to 1. + with_mask (bool): Whether to return mask annotation. + Defaults to False. + with_semantic (bool): whether to return semantic. + Default to False. 
+ """ + rng = np.random.RandomState(0) + + # Make sure the length of image_shapes is equal to ``batch_size`` + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + packed_inputs = [] + for idx in range(batch_size): + mm_inputs = dict(inputs=dict()) + _, h, w = image_shapes[idx] + + imgs = rng.randint( + 0, 255, size=(num_frames, *image_shapes[idx]), dtype=np.uint8) + mm_inputs['inputs'] = torch.from_numpy(imgs) + + img_meta = { + 'img_id': idx, + 'img_shape': image_shapes[idx][-2:], + 'ori_shape': image_shapes[idx][-2:], + 'filename': '.png', + 'scale_factor': np.array([1.1, 1.2]), + 'flip': False, + 'flip_direction': None, + 'is_video_data': True, + } + + video_data_samples = [] + for i in range(num_frames): + data_sample = DetDataSample() + img_meta['frame_id'] = i + data_sample.set_metainfo(img_meta) + + # gt_instances + gt_instances = InstanceData() + if num_items is None: + num_boxes = rng.randint(1, 10) + else: + num_boxes = num_items[idx] + + bboxes = _rand_bboxes(rng, num_boxes, w, h) + labels = rng.randint(0, num_classes, size=num_boxes) + instances_id = rng.randint(100, num_classes + 100, size=num_boxes) + gt_instances.bboxes = torch.FloatTensor(bboxes) + gt_instances.labels = torch.LongTensor(labels) + gt_instances.instances_ids = torch.LongTensor(instances_id) + + if with_mask: + masks = _rand_masks(rng, num_boxes, bboxes, w, h) + gt_instances.masks = masks + + data_sample.gt_instances = gt_instances + # ignore_instances + ignore_instances = InstanceData() + bboxes = _rand_bboxes(rng, num_boxes, w, h) + ignore_instances.bboxes = bboxes + data_sample.ignored_instances = ignore_instances + + video_data_samples.append(data_sample) + + track_data_sample = TrackDataSample() + track_data_sample.video_data_samples = video_data_samples + if key_frames_inds is not None: + assert isinstance( + key_frames_inds, + list) and len(key_frames_inds) < num_frames and max( + key_frames_inds) < num_frames + ref_frames_inds = [ + i for i in range(num_frames) if i not in key_frames_inds + ] + track_data_sample.set_metainfo( + dict(key_frames_inds=key_frames_inds)) + track_data_sample.set_metainfo( + dict(ref_frames_inds=ref_frames_inds)) + mm_inputs['data_samples'] = track_data_sample + + # TODO: gt_ignore + packed_inputs.append(mm_inputs) + data = pseudo_collate(packed_inputs) + return data + + +def random_boxes(num=1, scale=1, rng=None): + """Simple version of ``kwimage.Boxes.random`` + Returns: + Tensor: shape (n, 4) in x1, y1, x2, y2 format. 
+ References: + https://gitlab.kitware.com/computer-vision/kwimage/blob/master/kwimage/structs/boxes.py#L1390 # noqa: E501 + Example: + >>> num = 3 + >>> scale = 512 + >>> rng = 0 + >>> boxes = random_boxes(num, scale, rng) + >>> print(boxes) + tensor([[280.9925, 278.9802, 308.6148, 366.1769], + [216.9113, 330.6978, 224.0446, 456.5878], + [405.3632, 196.3221, 493.3953, 270.7942]]) + """ + rng = ensure_rng(rng) + + tlbr = rng.rand(num, 4).astype(np.float32) + + tl_x = np.minimum(tlbr[:, 0], tlbr[:, 2]) + tl_y = np.minimum(tlbr[:, 1], tlbr[:, 3]) + br_x = np.maximum(tlbr[:, 0], tlbr[:, 2]) + br_y = np.maximum(tlbr[:, 1], tlbr[:, 3]) + + tlbr[:, 0] = tl_x * scale + tlbr[:, 1] = tl_y * scale + tlbr[:, 2] = br_x * scale + tlbr[:, 3] = br_y * scale + + boxes = torch.from_numpy(tlbr) + return boxes + + +# TODO: Support full ceph +def replace_to_ceph(cfg): + backend_args = dict( + backend='petrel', + path_mapping=dict({ + './data/': 's3://openmmlab/datasets/detection/', + 'data/': 's3://openmmlab/datasets/detection/' + })) + + # TODO: name is a reserved interface, which will be used later. + def _process_pipeline(dataset, name): + + def replace_img(pipeline): + if pipeline['type'] == 'LoadImageFromFile': + pipeline['backend_args'] = backend_args + + def replace_ann(pipeline): + if pipeline['type'] == 'LoadAnnotations' or pipeline[ + 'type'] == 'LoadPanopticAnnotations': + pipeline['backend_args'] = backend_args + + if 'pipeline' in dataset: + replace_img(dataset.pipeline[0]) + replace_ann(dataset.pipeline[1]) + if 'dataset' in dataset: + # dataset wrapper + replace_img(dataset.dataset.pipeline[0]) + replace_ann(dataset.dataset.pipeline[1]) + else: + # dataset wrapper + replace_img(dataset.dataset.pipeline[0]) + replace_ann(dataset.dataset.pipeline[1]) + + def _process_evaluator(evaluator, name): + if evaluator['type'] == 'CocoPanopticMetric': + evaluator['backend_args'] = backend_args + + # half ceph + _process_pipeline(cfg.train_dataloader.dataset, cfg.filename) + _process_pipeline(cfg.val_dataloader.dataset, cfg.filename) + _process_pipeline(cfg.test_dataloader.dataset, cfg.filename) + _process_evaluator(cfg.val_evaluator, cfg.filename) + _process_evaluator(cfg.test_evaluator, cfg.filename) diff --git a/head_extractor/build/lib/mmdet/utils/__init__.py b/head_extractor/build/lib/mmdet/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..449a890bac411f84790eb3d014175e3a48757847 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .collect_env import collect_env +from .compat_config import compat_cfg +from .dist_utils import (all_reduce_dict, allreduce_grads, reduce_mean, + sync_random_seed) +from .logger import get_caller_name, log_img_scale +from .memory import AvoidCUDAOOM, AvoidOOM +from .misc import (find_latest_checkpoint, get_test_pipeline_cfg, + update_data_root) +from .mot_error_visualize import imshow_mot_errors +from .replace_cfg_vals import replace_cfg_vals +from .setup_env import (register_all_modules, setup_cache_size_limit_of_dynamo, + setup_multi_processes) +from .split_batch import split_batch +from .typing_utils import (ConfigType, InstanceList, MultiConfig, + OptConfigType, OptInstanceList, OptMultiConfig, + OptPixelList, PixelList, RangeType) + +__all__ = [ + 'collect_env', 'find_latest_checkpoint', 'update_data_root', + 'setup_multi_processes', 'get_caller_name', 'log_img_scale', 'compat_cfg', + 'split_batch', 'register_all_modules', 'replace_cfg_vals', 'AvoidOOM', + 'AvoidCUDAOOM', 'all_reduce_dict', 'allreduce_grads', 'reduce_mean', + 'sync_random_seed', 'ConfigType', 'InstanceList', 'MultiConfig', + 'OptConfigType', 'OptInstanceList', 'OptMultiConfig', 'OptPixelList', + 'PixelList', 'RangeType', 'get_test_pipeline_cfg', + 'setup_cache_size_limit_of_dynamo', 'imshow_mot_errors' +] diff --git a/head_extractor/build/lib/mmdet/utils/benchmark.py b/head_extractor/build/lib/mmdet/utils/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..5419b2d175e3c48c063a39ae28758b386f9ab597 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/benchmark.py @@ -0,0 +1,529 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import time +from functools import partial +from typing import List, Optional, Union + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import fuse_conv_bn +# TODO need update +# from mmcv.runner import wrap_fp16_model +from mmengine import MMLogger +from mmengine.config import Config +from mmengine.device import get_max_cuda_memory +from mmengine.dist import get_world_size +from mmengine.runner import Runner, load_checkpoint +from mmengine.utils.dl_utils import set_multi_processing +from torch.nn.parallel import DistributedDataParallel + +from mmdet.registry import DATASETS, MODELS + +try: + import psutil +except ImportError: + psutil = None + + +def custom_round(value: Union[int, float], + factor: Union[int, float], + precision: int = 2) -> float: + """Custom round function.""" + return round(value / factor, precision) + + +gb_round = partial(custom_round, factor=1024**3) + + +def print_log(msg: str, logger: Optional[MMLogger] = None) -> None: + """Print a log message.""" + if logger is None: + print(msg, flush=True) + else: + logger.info(msg) + + +def print_process_memory(p: psutil.Process, + logger: Optional[MMLogger] = None) -> None: + """print process memory info.""" + mem_used = gb_round(psutil.virtual_memory().used) + memory_full_info = p.memory_full_info() + uss_mem = gb_round(memory_full_info.uss) + if hasattr(memory_full_info, 'pss'): + pss_mem = gb_round(memory_full_info.pss) + + for children in p.children(): + child_mem_info = children.memory_full_info() + uss_mem += gb_round(child_mem_info.uss) + if hasattr(child_mem_info, 'pss'): + pss_mem += gb_round(child_mem_info.pss) + + process_count = 1 + len(p.children()) + + log_msg = f'(GB) mem_used: {mem_used:.2f} | uss: {uss_mem:.2f} | ' + if hasattr(memory_full_info, 'pss'): + log_msg += f'pss: {pss_mem:.2f} | ' + log_msg += f'total_proc: {process_count}' + 
print_log(log_msg, logger) + + +class BaseBenchmark: + """The benchmark base class. + + The ``run`` method is an external calling interface, and it will + call the ``run_once`` method ``repeat_num`` times for benchmarking. + Finally, call the ``average_multiple_runs`` method to further process + the results of multiple runs. + + Args: + max_iter (int): maximum iterations of benchmark. + log_interval (int): interval of logging. + num_warmup (int): Number of Warmup. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + max_iter: int, + log_interval: int, + num_warmup: int, + logger: Optional[MMLogger] = None): + self.max_iter = max_iter + self.log_interval = log_interval + self.num_warmup = num_warmup + self.logger = logger + + def run(self, repeat_num: int = 1) -> dict: + """benchmark entry method. + + Args: + repeat_num (int): Number of repeat benchmark. + Defaults to 1. + """ + assert repeat_num >= 1 + + results = [] + for _ in range(repeat_num): + results.append(self.run_once()) + + results = self.average_multiple_runs(results) + return results + + def run_once(self) -> dict: + """Executes the benchmark once.""" + raise NotImplementedError() + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + raise NotImplementedError() + + +class InferenceBenchmark(BaseBenchmark): + """The inference benchmark class. It will be statistical inference FPS, + CUDA memory and CPU memory information. + + Args: + cfg (mmengine.Config): config. + checkpoint (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + distributed (bool): distributed testing flag. + is_fuse_conv_bn (bool): Whether to fuse conv and bn, this will + slightly increase the inference speed. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + cfg: Config, + checkpoint: str, + distributed: bool, + is_fuse_conv_bn: bool, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + + assert get_world_size( + ) == 1, 'Inference benchmark does not allow distributed multi-GPU' + + self.cfg = copy.deepcopy(cfg) + self.distributed = distributed + + if psutil is None: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + + self._process = psutil.Process() + env_cfg = self.cfg.get('env_cfg') + if env_cfg.get('cudnn_benchmark'): + torch.backends.cudnn.benchmark = True + + mp_cfg: dict = env_cfg.get('mp_cfg', {}) + set_multi_processing(**mp_cfg, distributed=self.distributed) + + print_log('before build: ', self.logger) + print_process_memory(self._process, self.logger) + + self.model = self._init_model(checkpoint, is_fuse_conv_bn) + + # Because multiple processes will occupy additional CPU resources, + # FPS statistics will be more unstable when num_workers is not 0. + # It is reasonable to set num_workers to 0. 
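+        # The overrides below therefore pin the loader to the main process
+        # with a batch size of 1, so that the timing loop measures pure
+        # single-image inference rather than data-loading throughput.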
+ dataloader_cfg = cfg.test_dataloader + dataloader_cfg['num_workers'] = 0 + dataloader_cfg['batch_size'] = 1 + dataloader_cfg['persistent_workers'] = False + self.data_loader = Runner.build_dataloader(dataloader_cfg) + + print_log('after build: ', self.logger) + print_process_memory(self._process, self.logger) + + def _init_model(self, checkpoint: str, is_fuse_conv_bn: bool) -> nn.Module: + """Initialize the model.""" + model = MODELS.build(self.cfg.model) + # TODO need update + # fp16_cfg = self.cfg.get('fp16', None) + # if fp16_cfg is not None: + # wrap_fp16_model(model) + + load_checkpoint(model, checkpoint, map_location='cpu') + if is_fuse_conv_bn: + model = fuse_conv_bn(model) + + model = model.cuda() + + if self.distributed: + model = DistributedDataParallel( + model, + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=False) + + model.eval() + return model + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + for i, data in enumerate(self.data_loader): + + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + torch.cuda.synchronize() + start_time = time.perf_counter() + + with torch.no_grad(): + self.model.test_step(data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + cuda_memory = get_max_cuda_memory() + + print_log( + f'Done image [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} img/s, ' + f'times per image: {1000 / fps:.1f} ms/img, ' + f'cuda memory: {cuda_memory} MB', self.logger) + print_process_memory(self._process, self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' + 'times per image: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + 'ms/img', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} img/s, ' + f'times per image: {1000 / fps_list_[0]:.1f} ms/img', + self.logger) + + print_log(f'cuda memory: {get_max_cuda_memory()} MB', self.logger) + print_process_memory(self._process, self.logger) + + return outputs + + +class DataLoaderBenchmark(BaseBenchmark): + """The dataloader benchmark class. It will be statistical inference FPS and + CPU memory information. + + Args: + cfg (mmengine.Config): config. + distributed (bool): distributed testing flag. + dataset_type (str): benchmark data type, only supports ``train``, + ``val`` and ``test``. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. 
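+
+    Example (illustrative sketch; requires a complete config on disk, so
+    it is skipped under doctest):
+        >>> from mmengine.config import Config  # doctest: +SKIP
+        >>> cfg = Config.fromfile(
+        ...     'configs/retinanet/retinanet_r50_fpn_1x_coco.py')  # doctest: +SKIP
+        >>> DataLoaderBenchmark(cfg, False, 'val').run(repeat_num=1)  # doctest: +SKIP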
+ """ + + def __init__(self, + cfg: Config, + distributed: bool, + dataset_type: str, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + + assert dataset_type in ['train', 'val', 'test'], \ + 'dataset_type only supports train,' \ + f' val and test, but got {dataset_type}' + assert get_world_size( + ) == 1, 'Dataloader benchmark does not allow distributed multi-GPU' + + self.cfg = copy.deepcopy(cfg) + self.distributed = distributed + + if psutil is None: + raise ImportError('psutil is not installed, please install it by: ' + 'pip install psutil') + self._process = psutil.Process() + + mp_cfg = self.cfg.get('env_cfg', {}).get('mp_cfg') + if mp_cfg is not None: + set_multi_processing(distributed=self.distributed, **mp_cfg) + else: + set_multi_processing(distributed=self.distributed) + + print_log('before build: ', self.logger) + print_process_memory(self._process, self.logger) + + if dataset_type == 'train': + self.data_loader = Runner.build_dataloader(cfg.train_dataloader) + elif dataset_type == 'test': + self.data_loader = Runner.build_dataloader(cfg.test_dataloader) + else: + self.data_loader = Runner.build_dataloader(cfg.val_dataloader) + + self.batch_size = self.data_loader.batch_size + self.num_workers = self.data_loader.num_workers + + print_log('after build: ', self.logger) + print_process_memory(self._process, self.logger) + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + # benchmark with 2000 image and take the average + start_time = time.perf_counter() + for i, data in enumerate(self.data_loader): + elapsed = time.perf_counter() - start_time + + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + + print_log( + f'Done batch [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} batch/s, ' + f'times per batch: {1000 / fps:.1f} ms/batch, ' + f'batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + print_process_memory(self._process, self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + start_time = time.perf_counter() + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' + 'times per batch: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + f'ms/batch, batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} batch/s, ' + f'times per batch: {1000 / fps_list_[0]:.1f} ms/batch, ' + f'batch size: {self.batch_size}, num_workers: ' + f'{self.num_workers}', self.logger) + + print_process_memory(self._process, self.logger) + + return outputs + + +class 
DatasetBenchmark(BaseBenchmark): + """The dataset benchmark class. It will be statistical inference FPS, FPS + pre transform and CPU memory information. + + Args: + cfg (mmengine.Config): config. + dataset_type (str): benchmark data type, only supports ``train``, + ``val`` and ``test``. + max_iter (int): maximum iterations of benchmark. Defaults to 2000. + log_interval (int): interval of logging. Defaults to 50. + num_warmup (int): Number of Warmup. Defaults to 5. + logger (MMLogger, optional): Formatted logger used to record messages. + """ + + def __init__(self, + cfg: Config, + dataset_type: str, + max_iter: int = 2000, + log_interval: int = 50, + num_warmup: int = 5, + logger: Optional[MMLogger] = None): + super().__init__(max_iter, log_interval, num_warmup, logger) + assert dataset_type in ['train', 'val', 'test'], \ + 'dataset_type only supports train,' \ + f' val and test, but got {dataset_type}' + assert get_world_size( + ) == 1, 'Dataset benchmark does not allow distributed multi-GPU' + self.cfg = copy.deepcopy(cfg) + + if dataset_type == 'train': + dataloader_cfg = copy.deepcopy(cfg.train_dataloader) + elif dataset_type == 'test': + dataloader_cfg = copy.deepcopy(cfg.test_dataloader) + else: + dataloader_cfg = copy.deepcopy(cfg.val_dataloader) + + dataset_cfg = dataloader_cfg.pop('dataset') + dataset = DATASETS.build(dataset_cfg) + if hasattr(dataset, 'full_init'): + dataset.full_init() + self.dataset = dataset + + def run_once(self) -> dict: + """Executes the benchmark once.""" + pure_inf_time = 0 + fps = 0 + + total_index = list(range(len(self.dataset))) + np.random.shuffle(total_index) + + start_time = time.perf_counter() + for i, idx in enumerate(total_index): + if (i + 1) % self.log_interval == 0: + print_log('==================================', self.logger) + + get_data_info_start_time = time.perf_counter() + data_info = self.dataset.get_data_info(idx) + get_data_info_elapsed = time.perf_counter( + ) - get_data_info_start_time + + if (i + 1) % self.log_interval == 0: + print_log(f'get_data_info - {get_data_info_elapsed * 1000} ms', + self.logger) + + for t in self.dataset.pipeline.transforms: + transform_start_time = time.perf_counter() + data_info = t(data_info) + transform_elapsed = time.perf_counter() - transform_start_time + + if (i + 1) % self.log_interval == 0: + print_log( + f'{t.__class__.__name__} - ' + f'{transform_elapsed * 1000} ms', self.logger) + + if data_info is None: + break + + elapsed = time.perf_counter() - start_time + + if i >= self.num_warmup: + pure_inf_time += elapsed + if (i + 1) % self.log_interval == 0: + fps = (i + 1 - self.num_warmup) / pure_inf_time + + print_log( + f'Done img [{i + 1:<3}/{self.max_iter}], ' + f'fps: {fps:.1f} img/s, ' + f'times per img: {1000 / fps:.1f} ms/img', self.logger) + + if (i + 1) == self.max_iter: + fps = (i + 1 - self.num_warmup) / pure_inf_time + break + + start_time = time.perf_counter() + + return {'fps': fps} + + def average_multiple_runs(self, results: List[dict]) -> dict: + """Average the results of multiple runs.""" + print_log('============== Done ==================', self.logger) + + fps_list_ = [round(result['fps'], 1) for result in results] + avg_fps_ = sum(fps_list_) / len(fps_list_) + outputs = {'avg_fps': avg_fps_, 'fps_list': fps_list_} + + if len(fps_list_) > 1: + times_pre_image_list_ = [ + round(1000 / result['fps'], 1) for result in results + ] + avg_times_pre_image_ = sum(times_pre_image_list_) / len( + times_pre_image_list_) + + print_log( + f'Overall fps: {fps_list_}[{avg_fps_:.1f}] img/s, ' 
+ 'times per img: ' + f'{times_pre_image_list_}[{avg_times_pre_image_:.1f}] ' + 'ms/img', self.logger) + else: + print_log( + f'Overall fps: {fps_list_[0]:.1f} img/s, ' + f'times per img: {1000 / fps_list_[0]:.1f} ms/img', + self.logger) + + return outputs diff --git a/head_extractor/build/lib/mmdet/utils/collect_env.py b/head_extractor/build/lib/mmdet/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..b0eed80fe2e4630b78ea3b13fde6046914e47e8b --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/collect_env.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_base_env + +import mmdet + + +def collect_env(): + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMDetection'] = mmdet.__version__ + '+' + get_git_hash()[:7] + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/head_extractor/build/lib/mmdet/utils/compat_config.py b/head_extractor/build/lib/mmdet/utils/compat_config.py new file mode 100644 index 0000000000000000000000000000000000000000..133adb65c2276401eca947e223e5b7c1760de418 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/compat_config.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings + +from mmengine.config import ConfigDict + + +def compat_cfg(cfg): + """This function would modify some filed to keep the compatibility of + config. + + For example, it will move some args which will be deprecated to the correct + fields. + """ + cfg = copy.deepcopy(cfg) + cfg = compat_imgs_per_gpu(cfg) + cfg = compat_loader_args(cfg) + cfg = compat_runner_args(cfg) + return cfg + + +def compat_runner_args(cfg): + if 'runner' not in cfg: + cfg.runner = ConfigDict({ + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + }) + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + return cfg + + +def compat_imgs_per_gpu(cfg): + cfg = copy.deepcopy(cfg) + if 'imgs_per_gpu' in cfg.data: + warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + warnings.warn( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + return cfg + + +def compat_loader_args(cfg): + """Deprecated sample_per_gpu in cfg.data.""" + + cfg = copy.deepcopy(cfg) + if 'train_dataloader' not in cfg.data: + cfg.data['train_dataloader'] = ConfigDict() + if 'val_dataloader' not in cfg.data: + cfg.data['val_dataloader'] = ConfigDict() + if 'test_dataloader' not in cfg.data: + cfg.data['test_dataloader'] = ConfigDict() + + # special process for train_dataloader + if 'samples_per_gpu' in cfg.data: + + samples_per_gpu = cfg.data.pop('samples_per_gpu') + assert 'samples_per_gpu' not in \ + cfg.data.train_dataloader, ('`samples_per_gpu` are set ' + 'in `data` field and ` ' + 'data.train_dataloader` ' + 'at the same time. 
' + 'Please only set it in ' + '`data.train_dataloader`. ') + cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu + + if 'persistent_workers' in cfg.data: + + persistent_workers = cfg.data.pop('persistent_workers') + assert 'persistent_workers' not in \ + cfg.data.train_dataloader, ('`persistent_workers` are set ' + 'in `data` field and ` ' + 'data.train_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.train_dataloader`. ') + cfg.data.train_dataloader['persistent_workers'] = persistent_workers + + if 'workers_per_gpu' in cfg.data: + + workers_per_gpu = cfg.data.pop('workers_per_gpu') + cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu + cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu + cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu + + # special process for val_dataloader + if 'samples_per_gpu' in cfg.data.val: + # keep default value of `sample_per_gpu` is 1 + assert 'samples_per_gpu' not in \ + cfg.data.val_dataloader, ('`samples_per_gpu` are set ' + 'in `data.val` field and ` ' + 'data.val_dataloader` at ' + 'the same time. ' + 'Please only set it in ' + '`data.val_dataloader`. ') + cfg.data.val_dataloader['samples_per_gpu'] = \ + cfg.data.val.pop('samples_per_gpu') + # special process for val_dataloader + + # in case the test dataset is concatenated + if isinstance(cfg.data.test, dict): + if 'samples_per_gpu' in cfg.data.test: + assert 'samples_per_gpu' not in \ + cfg.data.test_dataloader, ('`samples_per_gpu` are set ' + 'in `data.test` field and ` ' + 'data.test_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.test_dataloader`. ') + + cfg.data.test_dataloader['samples_per_gpu'] = \ + cfg.data.test.pop('samples_per_gpu') + + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + if 'samples_per_gpu' in ds_cfg: + assert 'samples_per_gpu' not in \ + cfg.data.test_dataloader, ('`samples_per_gpu` are set ' + 'in `data.test` field and ` ' + 'data.test_dataloader` at' + ' the same time. ' + 'Please only set it in ' + '`data.test_dataloader`. ') + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu + + return cfg diff --git a/head_extractor/build/lib/mmdet/utils/contextmanagers.py b/head_extractor/build/lib/mmdet/utils/contextmanagers.py new file mode 100644 index 0000000000000000000000000000000000000000..fa12bfcaff1e781b0a8cc7d7c8b839c2f2955a05 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/contextmanagers.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import asyncio +import contextlib +import logging +import os +import time +from typing import List + +import torch + +logger = logging.getLogger(__name__) + +DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False)) + + +@contextlib.asynccontextmanager +async def completed(trace_name='', + name='', + sleep_interval=0.05, + streams: List[torch.cuda.Stream] = None): + """Async context manager that waits for work to complete on given CUDA + streams.""" + if not torch.cuda.is_available(): + yield + return + + stream_before_context_switch = torch.cuda.current_stream() + if not streams: + streams = [stream_before_context_switch] + else: + streams = [s if s else stream_before_context_switch for s in streams] + + end_events = [ + torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams + ] + + if DEBUG_COMPLETED_TIME: + start = torch.cuda.Event(enable_timing=True) + stream_before_context_switch.record_event(start) + + cpu_start = time.monotonic() + logger.debug('%s %s starting, streams: %s', trace_name, name, streams) + grad_enabled_before = torch.is_grad_enabled() + try: + yield + finally: + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_end = time.monotonic() + for i, stream in enumerate(streams): + event = end_events[i] + stream.record_event(event) + + grad_enabled_after = torch.is_grad_enabled() + + # observed change of torch.is_grad_enabled() during concurrent run of + # async_test_bboxes code + assert (grad_enabled_before == grad_enabled_after + ), 'Unexpected is_grad_enabled() value change' + + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, name, + are_done, streams) + with torch.cuda.stream(stream_before_context_switch): + while not all(are_done): + await asyncio.sleep(sleep_interval) + are_done = [e.query() for e in end_events] + logger.debug( + '%s %s completed: %s streams: %s', + trace_name, + name, + are_done, + streams, + ) + + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_time = (cpu_end - cpu_start) * 1000 + stream_times_ms = '' + for i, stream in enumerate(streams): + elapsed_time = start.elapsed_time(end_events[i]) + stream_times_ms += f' {stream} {elapsed_time:.2f} ms' + logger.info('%s %s %.2f ms %s', trace_name, name, cpu_time, + stream_times_ms) + + +@contextlib.asynccontextmanager +async def concurrent(streamqueue: asyncio.Queue, + trace_name='concurrent', + name='stream'): + """Run code concurrently in different streams. + + :param streamqueue: asyncio.Queue instance. + + Queue tasks define the pool of streams used for concurrent execution. 
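+
+    A minimal setup sketch (the single-stream pool, `model` and `inputs`
+    are illustrative):
+
+        >>> streamqueue = asyncio.Queue()
+        >>> streamqueue.put_nowait(torch.cuda.Stream())
+        >>> async with concurrent(streamqueue):
+        ...     outputs = model(inputs)  # hypothetical model call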
+ """ + if not torch.cuda.is_available(): + yield + return + + initial_stream = torch.cuda.current_stream() + + with torch.cuda.stream(initial_stream): + stream = await streamqueue.get() + assert isinstance(stream, torch.cuda.Stream) + + try: + with torch.cuda.stream(stream): + logger.debug('%s %s is starting, stream: %s', trace_name, name, + stream) + yield + current = torch.cuda.current_stream() + assert current == stream + logger.debug('%s %s has finished, stream: %s', trace_name, + name, stream) + finally: + streamqueue.task_done() + streamqueue.put_nowait(stream) diff --git a/head_extractor/build/lib/mmdet/utils/dist_utils.py b/head_extractor/build/lib/mmdet/utils/dist_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2f2c8614a181ec0594ba157002a2760737e2c6e3 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/dist_utils.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import pickle +import warnings +from collections import OrderedDict + +import numpy as np +import torch +import torch.distributed as dist +from mmengine.dist import get_dist_info +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +def reduce_mean(tensor): + """"Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor + + +def obj2tensor(pyobj, device='cuda'): + """Serialize picklable python object to tensor.""" + storage = torch.ByteStorage.from_buffer(pickle.dumps(pyobj)) + return torch.ByteTensor(storage).to(device=device) + + +def tensor2obj(tensor): + """Deserialize tensor to picklable python object.""" + return pickle.loads(tensor.cpu().numpy().tobytes()) + + +@functools.lru_cache() +def _get_global_gloo_group(): + """Return a process group based on gloo backend, containing all the ranks + The result is cached.""" + if dist.get_backend() == 'nccl': + return dist.new_group(backend='gloo') + else: + return dist.group.WORLD + + +def all_reduce_dict(py_dict, op='sum', group=None, to_float=True): + """Apply all reduce function for python dict object. 
+ + The code is modified from https://github.com/Megvii- + BaseDetection/YOLOX/blob/main/yolox/utils/allreduce_norm.py. + + NOTE: make sure that py_dict in different ranks has the same keys and + the values should be in the same shape. Currently only supports + nccl backend. + + Args: + py_dict (dict): Dict to be applied all reduce op. + op (str): Operator, could be 'sum' or 'mean'. Default: 'sum' + group (:obj:`torch.distributed.group`, optional): Distributed group, + Default: None. + to_float (bool): Whether to convert all values of dict to float. + Default: True. + + Returns: + OrderedDict: reduced python dict object. + """ + warnings.warn( + 'group` is deprecated. Currently only supports NCCL backend.') + _, world_size = get_dist_info() + if world_size == 1: + return py_dict + + # all reduce logic across different devices. + py_key = list(py_dict.keys()) + if not isinstance(py_dict, OrderedDict): + py_key_tensor = obj2tensor(py_key) + dist.broadcast(py_key_tensor, src=0) + py_key = tensor2obj(py_key_tensor) + + tensor_shapes = [py_dict[k].shape for k in py_key] + tensor_numels = [py_dict[k].numel() for k in py_key] + + if to_float: + warnings.warn('Note: the "to_float" is True, you need to ' + 'ensure that the behavior is reasonable.') + flatten_tensor = torch.cat( + [py_dict[k].flatten().float() for k in py_key]) + else: + flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key]) + + dist.all_reduce(flatten_tensor, op=dist.ReduceOp.SUM) + if op == 'mean': + flatten_tensor /= world_size + + split_tensors = [ + x.reshape(shape) for x, shape in zip( + torch.split(flatten_tensor, tensor_numels), tensor_shapes) + ] + out_dict = {k: v for k, v in zip(py_key, split_tensors)} + if isinstance(py_dict, OrderedDict): + out_dict = OrderedDict(out_dict) + return out_dict + + +def sync_random_seed(seed=None, device='cuda'): + """Make sure different ranks share the same seed. + + All workers must call this function, otherwise it will deadlock. + This method is generally used in `DistributedSampler`, + because the seed should be identical across all processes + in the distributed group. + + In distributed sampling, different ranks should sample non-overlapped + data in the dataset. Therefore, this function is used to make sure that + each rank shuffles the data indices in the same order based + on the same seed. Then different ranks could use different indices + to select non-overlapped data from the same data list. + + Args: + seed (int, Optional): The seed. Default to None. + device (str): The device where the seed will be put on. + Default to 'cuda'. + + Returns: + int: Seed to be used. + """ + if seed is None: + seed = np.random.randint(2**31) + assert isinstance(seed, int) + + rank, world_size = get_dist_info() + + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + dist.broadcast(random_num, src=0) + return random_num.item() diff --git a/head_extractor/build/lib/mmdet/utils/large_image.py b/head_extractor/build/lib/mmdet/utils/large_image.py new file mode 100644 index 0000000000000000000000000000000000000000..f1f07c2bdc6958f2b3bdd69da0a639276252a91e --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/large_image.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
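+# Flow sketch (illustrative): a large image is sliced into patches, the
+# detector runs on every patch, `shift_predictions` maps the per-patch
+# boxes/masks back into source-image coordinates using the patch offsets,
+# and `merge_results_by_nms` then removes duplicates along patch borders.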
+from typing import Sequence, Tuple
+
+import torch
+from mmcv.ops import batched_nms
+from mmengine.structures import InstanceData
+
+from mmdet.structures import DetDataSample, SampleList
+
+
+def shift_rbboxes(bboxes: torch.Tensor, offset: Sequence[int]):
+    """Shift rotated bboxes with offset.
+
+    Args:
+        bboxes (Tensor): The rotated bboxes to be translated.
+            With shape (n, 5), which means (x, y, w, h, a).
+        offset (Sequence[int]): The translation offsets with shape of (2, ).
+    Returns:
+        Tensor: Shifted rotated bboxes.
+    """
+    offset_tensor = bboxes.new_tensor(offset)
+    shifted_bboxes = bboxes.clone()
+    shifted_bboxes[:, 0:2] = shifted_bboxes[:, 0:2] + offset_tensor
+    return shifted_bboxes
+
+
+def shift_predictions(det_data_samples: SampleList,
+                      offsets: Sequence[Tuple[int, int]],
+                      src_image_shape: Tuple[int, int]) -> InstanceData:
+    """Shift predictions back to the original image.
+
+    Args:
+        det_data_samples (List[:obj:`DetDataSample`]): A list of patch results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): The (height, width) of the large
+            source image.
+    Returns:
+        :obj:`InstanceData`: Shifted predictions, concatenated over patches.
+    """
+    try:
+        from sahi.slicing import shift_bboxes, shift_masks
+    except ImportError:
+        raise ImportError('Please run "pip install -U sahi" '
+                          'to install sahi first for large image inference.')
+
+    assert len(det_data_samples) == len(offsets), \
+        '`det_data_samples` should have the same length as `offsets`.'
+    shifted_predictions = []
+    for det_data_sample, offset in zip(det_data_samples, offsets):
+        pred_inst = det_data_sample.pred_instances.clone()
+
+        # Check bbox type
+        if pred_inst.bboxes.size(-1) == 4:
+            # Horizontal bboxes
+            shifted_bboxes = shift_bboxes(pred_inst.bboxes, offset)
+        elif pred_inst.bboxes.size(-1) == 5:
+            # Rotated bboxes
+            shifted_bboxes = shift_rbboxes(pred_inst.bboxes, offset)
+        else:
+            raise NotImplementedError
+
+        # shift bboxes and masks
+        pred_inst.bboxes = shifted_bboxes
+        if 'masks' in det_data_sample:
+            pred_inst.masks = shift_masks(pred_inst.masks, offset,
+                                          src_image_shape)
+
+        shifted_predictions.append(pred_inst.clone())
+
+    shifted_predictions = InstanceData.cat(shifted_predictions)
+
+    return shifted_predictions
+
+
+def merge_results_by_nms(results: SampleList,
+                         offsets: Sequence[Tuple[int, int]],
+                         src_image_shape: Tuple[int, int],
+                         nms_cfg: dict) -> DetDataSample:
+    """Merge patch results by nms.
+
+    Args:
+        results (List[:obj:`DetDataSample`]): A list of patch results.
+        offsets (Sequence[Tuple[int, int]]): Positions of the left top points
+            of patches.
+        src_image_shape (Tuple[int, int]): The (height, width) of the large
+            source image.
+        nms_cfg (dict): It should specify the nms type and other parameters
+            like `iou_threshold`.
+    Returns:
+        :obj:`DetDataSample`: merged results.
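+
+    A minimal call sketch (values are illustrative):
+
+        >>> nms_cfg = dict(type='nms', iou_threshold=0.5)
+        >>> merged = merge_results_by_nms(
+        ...     results, offsets, src_image_shape=(1024, 1024),
+        ...     nms_cfg=nms_cfg)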
+ """ + shifted_instances = shift_predictions(results, offsets, src_image_shape) + + _, keeps = batched_nms( + boxes=shifted_instances.bboxes, + scores=shifted_instances.scores, + idxs=shifted_instances.labels, + nms_cfg=nms_cfg) + merged_instances = shifted_instances[keeps] + + merged_result = results[0].clone() + merged_result.pred_instances = merged_instances + return merged_result diff --git a/head_extractor/build/lib/mmdet/utils/logger.py b/head_extractor/build/lib/mmdet/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..9fec08bbad5517c9169eedb15b4768e7d88d39c7 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/logger.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect + +from mmengine.logging import print_log + + +def get_caller_name(): + """Get name of caller method.""" + # this_func_frame = inspect.stack()[0][0] # i.e., get_caller_name + # callee_frame = inspect.stack()[1][0] # e.g., log_img_scale + caller_frame = inspect.stack()[2][0] # e.g., caller of log_img_scale + caller_method = caller_frame.f_code.co_name + try: + caller_class = caller_frame.f_locals['self'].__class__.__name__ + return f'{caller_class}.{caller_method}' + except KeyError: # caller is a function + return caller_method + + +def log_img_scale(img_scale, shape_order='hw', skip_square=False): + """Log image size. + + Args: + img_scale (tuple): Image size to be logged. + shape_order (str, optional): The order of image shape. + 'hw' for (height, width) and 'wh' for (width, height). + Defaults to 'hw'. + skip_square (bool, optional): Whether to skip logging for square + img_scale. Defaults to False. + + Returns: + bool: Whether to have done logging. + """ + if shape_order == 'hw': + height, width = img_scale + elif shape_order == 'wh': + width, height = img_scale + else: + raise ValueError(f'Invalid shape_order {shape_order}.') + + if skip_square and (height == width): + return False + + caller = get_caller_name() + print_log( + f'image shape: height={height}, width={width} in {caller}', + logger='current') + + return True diff --git a/head_extractor/build/lib/mmdet/utils/memory.py b/head_extractor/build/lib/mmdet/utils/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..b6f9cbc7f9e5f54e2cc429e5e655b2a27d38d61f --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/memory.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import abc +from contextlib import contextmanager +from functools import wraps + +import torch +from mmengine.logging import MMLogger + + +def cast_tensor_type(inputs, src_type=None, dst_type=None): + """Recursively convert Tensor in inputs from ``src_type`` to ``dst_type``. + + Args: + inputs: Inputs that to be casted. + src_type (torch.dtype | torch.device): Source type. + src_type (torch.dtype | torch.device): Destination type. + + Returns: + The same type with inputs, but all contained Tensors have been cast. 
+ """ + assert dst_type is not None + if isinstance(inputs, torch.Tensor): + if isinstance(dst_type, torch.device): + # convert Tensor to dst_device + if hasattr(inputs, 'to') and \ + hasattr(inputs, 'device') and \ + (inputs.device == src_type or src_type is None): + return inputs.to(dst_type) + else: + return inputs + else: + # convert Tensor to dst_dtype + if hasattr(inputs, 'to') and \ + hasattr(inputs, 'dtype') and \ + (inputs.dtype == src_type or src_type is None): + return inputs.to(dst_type) + else: + return inputs + # we need to ensure that the type of inputs to be casted are the same + # as the argument `src_type`. + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type=src_type, dst_type=dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type=src_type, dst_type=dst_type) + for item in inputs) + # TODO: Currently not supported + # elif isinstance(inputs, InstanceData): + # for key, value in inputs.items(): + # inputs[key] = cast_tensor_type( + # value, src_type=src_type, dst_type=dst_type) + # return inputs + else: + return inputs + + +@contextmanager +def _ignore_torch_cuda_oom(): + """A context which ignores CUDA OOM exception from pytorch. + + Code is modified from + # noqa: E501 + """ + try: + yield + except RuntimeError as e: + # NOTE: the string may change? + if 'CUDA out of memory. ' in str(e): + pass + else: + raise + + +class AvoidOOM: + """Try to convert inputs to FP16 and CPU if got a PyTorch's CUDA Out of + Memory error. It will do the following steps: + + 1. First retry after calling `torch.cuda.empty_cache()`. + 2. If that still fails, it will then retry by converting inputs + to FP16. + 3. If that still fails trying to convert inputs to CPUs. + In this case, it expects the function to dispatch to + CPU implementation. + + Args: + to_cpu (bool): Whether to convert outputs to CPU if get an OOM + error. This will slow down the code significantly. + Defaults to True. + test (bool): Skip `_ignore_torch_cuda_oom` operate that can use + lightweight data in unit test, only used in + test unit. Defaults to False. + + Examples: + >>> from mmdet.utils.memory import AvoidOOM + >>> AvoidCUDAOOM = AvoidOOM() + >>> output = AvoidOOM.retry_if_cuda_oom( + >>> some_torch_function)(input1, input2) + >>> # To use as a decorator + >>> # from mmdet.utils import AvoidCUDAOOM + >>> @AvoidCUDAOOM.retry_if_cuda_oom + >>> def function(*args, **kwargs): + >>> return None + ``` + + Note: + 1. The output may be on CPU even if inputs are on GPU. Processing + on CPU will slow down the code significantly. + 2. When converting inputs to CPU, it will only look at each argument + and check if it has `.device` and `.to` for conversion. Nested + structures of tensors are not supported. + 3. Since the function might be called more than once, it has to be + stateless. + """ + + def __init__(self, to_cpu=True, test=False): + self.to_cpu = to_cpu + self.test = test + + def retry_if_cuda_oom(self, func): + """Makes a function retry itself after encountering pytorch's CUDA OOM + error. + + The implementation logic is referred to + https://github.com/facebookresearch/detectron2/blob/main/detectron2/utils/memory.py + + Args: + func: a stateless callable that takes tensor-like objects + as arguments. + Returns: + func: a callable which retries `func` if OOM is encountered. 
+ """ # noqa: W605 + + @wraps(func) + def wrapped(*args, **kwargs): + + # raw function + if not self.test: + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # Clear cache and retry + torch.cuda.empty_cache() + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # get the type and device of first tensor + dtype, device = None, None + values = args + tuple(kwargs.values()) + for value in values: + if isinstance(value, torch.Tensor): + dtype = value.dtype + device = value.device + break + if dtype is None or device is None: + raise ValueError('There is no tensor in the inputs, ' + 'cannot get dtype and device.') + + # Convert to FP16 + fp16_args = cast_tensor_type(args, dst_type=torch.half) + fp16_kwargs = cast_tensor_type(kwargs, dst_type=torch.half) + logger = MMLogger.get_current_instance() + logger.warning(f'Attempting to copy inputs of {str(func)} ' + 'to FP16 due to CUDA OOM') + + # get input tensor type, the output type will same as + # the first parameter type. + with _ignore_torch_cuda_oom(): + output = func(*fp16_args, **fp16_kwargs) + output = cast_tensor_type( + output, src_type=torch.half, dst_type=dtype) + if not self.test: + return output + logger.warning('Using FP16 still meet CUDA OOM') + + # Try on CPU. This will slow down the code significantly, + # therefore print a notice. + if self.to_cpu: + logger.warning(f'Attempting to copy inputs of {str(func)} ' + 'to CPU due to CUDA OOM') + cpu_device = torch.empty(0).device + cpu_args = cast_tensor_type(args, dst_type=cpu_device) + cpu_kwargs = cast_tensor_type(kwargs, dst_type=cpu_device) + + # convert outputs to GPU + with _ignore_torch_cuda_oom(): + logger.warning(f'Convert outputs to GPU (device={device})') + output = func(*cpu_args, **cpu_kwargs) + output = cast_tensor_type( + output, src_type=cpu_device, dst_type=device) + return output + + warnings.warn('Cannot convert output to GPU due to CUDA OOM, ' + 'the output is now on CPU, which might cause ' + 'errors if the output need to interact with GPU ' + 'data in subsequent operations') + logger.warning('Cannot convert output to GPU due to ' + 'CUDA OOM, the output is on CPU now.') + + return func(*cpu_args, **cpu_kwargs) + else: + # may still get CUDA OOM error + return func(*args, **kwargs) + + return wrapped + + +# To use AvoidOOM as a decorator +AvoidCUDAOOM = AvoidOOM() diff --git a/head_extractor/build/lib/mmdet/utils/misc.py b/head_extractor/build/lib/mmdet/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..8dfb394465196cbd1e60c96f5be3aaee416d59cf --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/misc.py @@ -0,0 +1,149 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import glob +import os +import os.path as osp +import urllib +import warnings +from typing import Union + +import torch +from mmengine.config import Config, ConfigDict +from mmengine.logging import print_log +from mmengine.utils import scandir + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', + '.tiff', '.webp') + + +def find_latest_checkpoint(path, suffix='pth'): + """Find the latest checkpoint from the working directory. + + Args: + path(str): The path to find checkpoints. + suffix(str): File extension. + Defaults to pth. + + Returns: + latest_path(str | None): File path of the latest checkpoint. + References: + .. 
[1] https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/patch.py
+    """
+    if not osp.exists(path):
+        warnings.warn('The path of checkpoints does not exist.')
+        return None
+    if osp.exists(osp.join(path, f'latest.{suffix}')):
+        return osp.join(path, f'latest.{suffix}')
+
+    checkpoints = glob.glob(osp.join(path, f'*.{suffix}'))
+    if len(checkpoints) == 0:
+        warnings.warn('There are no checkpoints in the path.')
+        return None
+    latest = -1
+    latest_path = None
+    for checkpoint in checkpoints:
+        count = int(osp.basename(checkpoint).split('_')[-1].split('.')[0])
+        if count > latest:
+            latest = count
+            latest_path = checkpoint
+    return latest_path
+
+
+def update_data_root(cfg, logger=None):
+    """Update data root according to env MMDET_DATASETS.
+
+    If set env MMDET_DATASETS, update cfg.data_root according to
+    MMDET_DATASETS. Otherwise, using cfg.data_root as default.
+
+    Args:
+        cfg (:obj:`Config`): The model config to be modified.
+        logger (logging.Logger | str | None): the way to print msg
+    """
+    assert isinstance(cfg, Config), \
+        f'cfg got wrong type: {type(cfg)}, expected mmengine.Config'
+
+    if 'MMDET_DATASETS' in os.environ:
+        dst_root = os.environ['MMDET_DATASETS']
+        print_log(f'MMDET_DATASETS has been set to be {dst_root}. '
+                  f'Using {dst_root} as data root.')
+    else:
+        return
+
+    def update(cfg, src_str, dst_str):
+        for k, v in cfg.items():
+            if isinstance(v, ConfigDict):
+                update(cfg[k], src_str, dst_str)
+            if isinstance(v, str) and src_str in v:
+                cfg[k] = v.replace(src_str, dst_str)
+
+    update(cfg.data, cfg.data_root, dst_root)
+    cfg.data_root = dst_root
+
+
+def get_test_pipeline_cfg(cfg: Union[str, ConfigDict]) -> ConfigDict:
+    """Get the test dataset pipeline from entire config.
+
+    Args:
+        cfg (str or :obj:`ConfigDict`): the entire config. Can be a config
+            file or a ``ConfigDict``.
+
+    Returns:
+        :obj:`ConfigDict`: the config of test dataset.
+    """
+    if isinstance(cfg, str):
+        cfg = Config.fromfile(cfg)
+
+    def _get_test_pipeline_cfg(dataset_cfg):
+        if 'pipeline' in dataset_cfg:
+            return dataset_cfg.pipeline
+        # handle dataset wrapper
+        elif 'dataset' in dataset_cfg:
+            return _get_test_pipeline_cfg(dataset_cfg.dataset)
+        # handle dataset wrappers like ConcatDataset
+        elif 'datasets' in dataset_cfg:
+            return _get_test_pipeline_cfg(dataset_cfg.datasets[0])
+
+        raise RuntimeError('Cannot find `pipeline` in `test_dataloader`')
+
+    return _get_test_pipeline_cfg(cfg.test_dataloader.dataset)
+
+
+def get_file_list(source_root: str) -> tuple:
+    """Get file list.
+
+    Args:
+        source_root (str): image or video source path
+
+    Returns:
+        source_file_path_list (list): A list of all source files.
+        source_type (dict): Source type: file or url or dir.
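+
+    Example (the path is illustrative):
+
+        >>> files, source_type = get_file_list('demo/images/')
+        >>> assert source_type['is_dir']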
+ """ + is_dir = os.path.isdir(source_root) + is_url = source_root.startswith(('http:/', 'https:/')) + is_file = os.path.splitext(source_root)[-1].lower() in IMG_EXTENSIONS + + source_file_path_list = [] + if is_dir: + # when input source is dir + for file in scandir(source_root, IMG_EXTENSIONS, recursive=True): + source_file_path_list.append(os.path.join(source_root, file)) + elif is_url: + # when input source is url + filename = os.path.basename( + urllib.parse.unquote(source_root).split('?')[0]) + file_save_path = os.path.join(os.getcwd(), filename) + print(f'Downloading source file to {file_save_path}') + torch.hub.download_url_to_file(source_root, file_save_path) + source_file_path_list = [file_save_path] + elif is_file: + # when input source is single image + source_file_path_list = [source_root] + else: + print('Cannot find image file.') + + source_type = dict(is_dir=is_dir, is_url=is_url, is_file=is_file) + + return source_file_path_list, source_type diff --git a/head_extractor/build/lib/mmdet/utils/mot_error_visualize.py b/head_extractor/build/lib/mmdet/utils/mot_error_visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..01bf8645d340aa1f5ab8251211a719f2de9845b1 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/mot_error_visualize.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Union + +try: + import seaborn as sns +except ImportError: + sns = None +import cv2 +import matplotlib.pyplot as plt +import mmcv +import numpy as np +from matplotlib.patches import Rectangle +from mmengine.utils import mkdir_or_exist + + +def imshow_mot_errors(*args, backend: str = 'cv2', **kwargs): + """Show the wrong tracks on the input image. + + Args: + backend (str, optional): Backend of visualization. + Defaults to 'cv2'. + """ + if backend == 'cv2': + return _cv2_show_wrong_tracks(*args, **kwargs) + elif backend == 'plt': + return _plt_show_wrong_tracks(*args, **kwargs) + else: + raise NotImplementedError() + + +def _cv2_show_wrong_tracks(img: Union[str, np.ndarray], + bboxes: np.ndarray, + ids: np.ndarray, + error_types: np.ndarray, + thickness: int = 2, + font_scale: float = 0.4, + text_width: int = 10, + text_height: int = 15, + show: bool = False, + wait_time: int = 100, + out_file: str = None) -> np.ndarray: + """Show the wrong tracks with opencv. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): A ndarray of shape (k, 5). + ids (ndarray): A ndarray of shape (k, ). + error_types (ndarray): A ndarray of shape (k, ), where 0 denotes + false positives, 1 denotes false negative and 2 denotes ID switch. + thickness (int, optional): Thickness of lines. + Defaults to 2. + font_scale (float, optional): Font scale to draw id and score. + Defaults to 0.4. + text_width (int, optional): Width to draw id and score. + Defaults to 10. + text_height (int, optional): Height to draw id and score. + Defaults to 15. + show (bool, optional): Whether to show the image on the fly. + Defaults to False. + wait_time (int, optional): Value of waitKey param. + Defaults to 100. + out_file (str, optional): The filename to write the image. + Defaults to None. + + Returns: + ndarray: Visualized image. + """ + if sns is None: + raise ImportError('please run pip install seaborn') + assert bboxes.ndim == 2, \ + f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.' + assert ids.ndim == 1, \ + f' ids ndim should be 1, but its ndim is {ids.ndim}.' 
+ assert error_types.ndim == 1, \ + f' error_types ndim should be 1, but its ndim is {error_types.ndim}.' + assert bboxes.shape[0] == ids.shape[0], \ + 'bboxes.shape[0] and ids.shape[0] should have the same length.' + assert bboxes.shape[1] == 5, \ + f' bboxes.shape[1] should be 5, but its {bboxes.shape[1]}.' + + bbox_colors = sns.color_palette() + # red, yellow, blue + bbox_colors = [bbox_colors[3], bbox_colors[1], bbox_colors[0]] + bbox_colors = [[int(255 * _c) for _c in bbox_color][::-1] + for bbox_color in bbox_colors] + + if isinstance(img, str): + img = mmcv.imread(img) + else: + assert img.ndim == 3 + + img_shape = img.shape + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + + for bbox, error_type, id in zip(bboxes, error_types, ids): + x1, y1, x2, y2 = bbox[:4].astype(np.int32) + score = float(bbox[-1]) + + # bbox + bbox_color = bbox_colors[error_type] + cv2.rectangle(img, (x1, y1), (x2, y2), bbox_color, thickness=thickness) + + # FN does not have id and score + if error_type == 1: + continue + + # score + text = '{:.02f}'.format(score) + width = (len(text) - 1) * text_width + img[y1:y1 + text_height, x1:x1 + width, :] = bbox_color + cv2.putText( + img, + text, (x1, y1 + text_height - 2), + cv2.FONT_HERSHEY_COMPLEX, + font_scale, + color=(0, 0, 0)) + + # id + text = str(id) + width = len(text) * text_width + img[y1 + text_height:y1 + text_height * 2, + x1:x1 + width, :] = bbox_color + cv2.putText( + img, + str(id), (x1, y1 + text_height * 2 - 2), + cv2.FONT_HERSHEY_COMPLEX, + font_scale, + color=(0, 0, 0)) + + if show: + mmcv.imshow(img, wait_time=wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + + return img + + +def _plt_show_wrong_tracks(img: Union[str, np.ndarray], + bboxes: np.ndarray, + ids: np.ndarray, + error_types: np.ndarray, + thickness: float = 0.1, + font_scale: float = 3.0, + text_width: int = 8, + text_height: int = 13, + show: bool = False, + wait_time: int = 100, + out_file: str = None) -> np.ndarray: + """Show the wrong tracks with matplotlib. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): A ndarray of shape (k, 5). + ids (ndarray): A ndarray of shape (k, ). + error_types (ndarray): A ndarray of shape (k, ), where 0 denotes + false positives, 1 denotes false negative and 2 denotes ID switch. + thickness (float, optional): Thickness of lines. + Defaults to 0.1. + font_scale (float, optional): Font scale to draw id and score. + Defaults to 3.0. + text_width (int, optional): Width to draw id and score. + Defaults to 8. + text_height (int, optional): Height to draw id and score. + Defaults to 13. + show (bool, optional): Whether to show the image on the fly. + Defaults to False. + wait_time (int, optional): Value of waitKey param. + Defaults to 100. + out_file (str, optional): The filename to write the image. + Defaults to None. + + Returns: + ndarray: Original image. + """ + assert bboxes.ndim == 2, \ + f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.' + assert ids.ndim == 1, \ + f' ids ndim should be 1, but its ndim is {ids.ndim}.' + assert error_types.ndim == 1, \ + f' error_types ndim should be 1, but its ndim is {error_types.ndim}.' + assert bboxes.shape[0] == ids.shape[0], \ + 'bboxes.shape[0] and ids.shape[0] should have the same length.' + assert bboxes.shape[1] == 5, \ + f' bboxes.shape[1] should be 5, but its {bboxes.shape[1]}.' 
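+
+    # the palette chosen below maps error types to colors:
+    # 0 (false positive) -> red, 1 (false negative) -> yellow,
+    # 2 (ID switch) -> blue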
+
+    if sns is None:
+        raise ImportError('please run pip install seaborn')
+
+    bbox_colors = sns.color_palette()
+    # red, yellow, blue
+    bbox_colors = [bbox_colors[3], bbox_colors[1], bbox_colors[0]]
+
+    if isinstance(img, str):
+        img = plt.imread(img)
+    else:
+        assert img.ndim == 3
+        img = mmcv.bgr2rgb(img)
+
+    img_shape = img.shape
+    bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+    bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
+
+    plt.imshow(img)
+    plt.gca().set_axis_off()
+    plt.autoscale(False)
+    plt.subplots_adjust(
+        top=1, bottom=0, right=1, left=0, hspace=None, wspace=None)
+    plt.margins(0, 0)
+    plt.gca().xaxis.set_major_locator(plt.NullLocator())
+    plt.gca().yaxis.set_major_locator(plt.NullLocator())
+    plt.rcParams['figure.figsize'] = img_shape[1], img_shape[0]
+
+    for bbox, error_type, id in zip(bboxes, error_types, ids):
+        x1, y1, x2, y2, score = bbox
+        w, h = int(x2 - x1), int(y2 - y1)
+        left_top = (int(x1), int(y1))
+
+        # bbox
+        plt.gca().add_patch(
+            Rectangle(
+                left_top,
+                w,
+                h,
+                linewidth=thickness,
+                edgecolor=bbox_colors[error_type],
+                facecolor='none'))
+
+        # FN does not have id and score
+        if error_type == 1:
+            continue
+
+        # score
+        text = '{:.02f}'.format(score)
+        width = len(text) * text_width
+        plt.gca().add_patch(
+            Rectangle((left_top[0], left_top[1]),
+                      width,
+                      text_height,
+                      linewidth=thickness,
+                      edgecolor=bbox_colors[error_type],
+                      facecolor=bbox_colors[error_type]))
+
+        plt.text(
+            left_top[0],
+            left_top[1] + text_height + 2,
+            text,
+            fontsize=font_scale)
+
+        # id
+        text = str(id)
+        width = len(text) * text_width
+        plt.gca().add_patch(
+            Rectangle((left_top[0], left_top[1] + text_height + 1),
+                      width,
+                      text_height,
+                      linewidth=thickness,
+                      edgecolor=bbox_colors[error_type],
+                      facecolor=bbox_colors[error_type]))
+        plt.text(
+            left_top[0],
+            left_top[1] + 2 * (text_height + 1),
+            text,
+            fontsize=font_scale)
+
+    if out_file is not None:
+        mkdir_or_exist(osp.abspath(osp.dirname(out_file)))
+        plt.savefig(out_file, dpi=300, bbox_inches='tight', pad_inches=0.0)
+
+    if show:
+        plt.draw()
+        plt.pause(wait_time / 1000.)
+
+    plt.clf()
+    return img
diff --git a/head_extractor/build/lib/mmdet/utils/profiling.py b/head_extractor/build/lib/mmdet/utils/profiling.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f53f456c72db57bfa69a8d022c92d153580209e
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/utils/profiling.py
@@ -0,0 +1,40 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import contextlib
+import sys
+import time
+
+import torch
+
+if sys.version_info >= (3, 7):
+
+    @contextlib.contextmanager
+    def profile_time(trace_name,
+                     name,
+                     enabled=True,
+                     stream=None,
+                     end_stream=None):
+        """Print time spent by CPU and GPU.
+
+        Useful as a temporary context manager to find sweet spots of code
+        suitable for async implementation.
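+
+        Example (names are illustrative):
+
+            >>> with profile_time('train', 'forward'):
+            ...     outputs = model(batch)  # hypothetical forward pass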
+ """ + if (not enabled) or not torch.cuda.is_available(): + yield + return + stream = stream if stream else torch.cuda.current_stream() + end_stream = end_stream if end_stream else stream + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + stream.record_event(start) + try: + cpu_start = time.monotonic() + yield + finally: + cpu_end = time.monotonic() + end_stream.record_event(end) + end.synchronize() + cpu_time = (cpu_end - cpu_start) * 1000 + gpu_time = start.elapsed_time(end) + msg = f'{trace_name} {name} cpu_time {cpu_time:.2f} ms ' + msg += f'gpu_time {gpu_time:.2f} ms stream {stream}' + print(msg, end_stream) diff --git a/head_extractor/build/lib/mmdet/utils/replace_cfg_vals.py b/head_extractor/build/lib/mmdet/utils/replace_cfg_vals.py new file mode 100644 index 0000000000000000000000000000000000000000..a3331a36ce5a22fcc4d4a955d757f5e8b6bfc6bb --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/replace_cfg_vals.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import re + +from mmengine.config import Config + + +def replace_cfg_vals(ori_cfg): + """Replace the string "${key}" with the corresponding value. + + Replace the "${key}" with the value of ori_cfg.key in the config. And + support replacing the chained ${key}. Such as, replace "${key0.key1}" + with the value of cfg.key0.key1. Code is modified from `vars.py + < https://github.com/microsoft/SoftTeacher/blob/main/ssod/utils/vars.py>`_ # noqa: E501 + + Args: + ori_cfg (mmengine.config.Config): + The origin config with "${key}" generated from a file. + + Returns: + updated_cfg [mmengine.config.Config]: + The config with "${key}" replaced by the corresponding value. + """ + + def get_value(cfg, key): + for k in key.split('.'): + cfg = cfg[k] + return cfg + + def replace_value(cfg): + if isinstance(cfg, dict): + return {key: replace_value(value) for key, value in cfg.items()} + elif isinstance(cfg, list): + return [replace_value(item) for item in cfg] + elif isinstance(cfg, tuple): + return tuple([replace_value(item) for item in cfg]) + elif isinstance(cfg, str): + # the format of string cfg may be: + # 1) "${key}", which will be replaced with cfg.key directly + # 2) "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx", + # which will be replaced with the string of the cfg.key + keys = pattern_key.findall(cfg) + values = [get_value(ori_cfg, key[2:-1]) for key in keys] + if len(keys) == 1 and keys[0] == cfg: + # the format of string cfg is "${key}" + cfg = values[0] + else: + for key, value in zip(keys, values): + # the format of string cfg is + # "xxx${key}xxx" or "xxx${key1}xxx${key2}xxx" + assert not isinstance(value, (dict, list, tuple)), \ + f'for the format of string cfg is ' \ + f"'xxxxx${key}xxxxx' or 'xxx${key}xxx${key}xxx', " \ + f"the type of the value of '${key}' " \ + f'can not be dict, list, or tuple' \ + f'but you input {type(value)} in {cfg}' + cfg = cfg.replace(key, str(value)) + return cfg + else: + return cfg + + # the pattern of string "${key}" + pattern_key = re.compile(r'\$\{[a-zA-Z\d_.]*\}') + # the type of ori_cfg._cfg_dict is mmengine.config.ConfigDict + updated_cfg = Config( + replace_value(ori_cfg._cfg_dict), filename=ori_cfg.filename) + # replace the model with model_wrapper + if updated_cfg.get('model_wrapper', None) is not None: + updated_cfg.model = updated_cfg.model_wrapper + updated_cfg.pop('model_wrapper') + return updated_cfg diff --git a/head_extractor/build/lib/mmdet/utils/setup_env.py b/head_extractor/build/lib/mmdet/utils/setup_env.py new 
file mode 100644
index 0000000000000000000000000000000000000000..a7b37845a883752a1659fabf62c7404cff971191
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/utils/setup_env.py
@@ -0,0 +1,118 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import datetime
+import logging
+import os
+import platform
+import warnings
+
+import cv2
+import torch.multiprocessing as mp
+from mmengine import DefaultScope
+from mmengine.logging import print_log
+from mmengine.utils import digit_version
+
+
+def setup_cache_size_limit_of_dynamo():
+    """Setup cache size limit of dynamo.
+
+    Note: Due to the dynamic shape of the loss calculation and
+    post-processing parts in the object detection algorithm, these
+    functions must be compiled every time they are run.
+    Setting a large value for torch._dynamo.config.cache_size_limit
+    may result in repeated compilation, which can slow down training
+    and testing speed. Therefore, we need to set the default value of
+    cache_size_limit smaller. An empirical value is 4.
+    """
+
+    import torch
+    if digit_version(torch.__version__) >= digit_version('2.0.0'):
+        if 'DYNAMO_CACHE_SIZE_LIMIT' in os.environ:
+            import torch._dynamo
+            cache_size_limit = int(os.environ['DYNAMO_CACHE_SIZE_LIMIT'])
+            torch._dynamo.config.cache_size_limit = cache_size_limit
+            print_log(
+                f'torch._dynamo.config.cache_size_limit has been forcibly '
+                f'set to {cache_size_limit}.',
+                logger='current',
+                level=logging.WARNING)
+
+
+def setup_multi_processes(cfg):
+    """Setup multi-processing environment variables."""
+    # set multi-process start method as `fork` to speed up the training
+    if platform.system() != 'Windows':
+        mp_start_method = cfg.get('mp_start_method', 'fork')
+        current_method = mp.get_start_method(allow_none=True)
+        if current_method is not None and current_method != mp_start_method:
+            warnings.warn(
+                f'Multi-processing start method `{mp_start_method}` is '
+                f'different from the previous setting `{current_method}`. '
+                f'It will be forcibly set to `{mp_start_method}`. You can '
+                f'change this behavior by changing `mp_start_method` in '
+                f'your config.')
+        mp.set_start_method(mp_start_method, force=True)
+
+    # disable opencv multithreading to avoid system being overloaded
+    opencv_num_threads = cfg.get('opencv_num_threads', 0)
+    cv2.setNumThreads(opencv_num_threads)
+
+    # setup OMP threads
+    # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py  # noqa
+    workers_per_gpu = cfg.data.get('workers_per_gpu', 1)
+    if 'train_dataloader' in cfg.data:
+        workers_per_gpu = \
+            max(cfg.data.train_dataloader.get('workers_per_gpu', 1),
+                workers_per_gpu)
+
+    if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        omp_num_threads = 1
+        warnings.warn(
+            f'Setting OMP_NUM_THREADS environment variable for each process '
+            f'to be {omp_num_threads} by default, to avoid your system being '
+            f'overloaded. Please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['OMP_NUM_THREADS'] = str(omp_num_threads)
+
+    # setup MKL threads
+    if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1:
+        mkl_num_threads = 1
+        warnings.warn(
+            f'Setting MKL_NUM_THREADS environment variable for each process '
+            f'to be {mkl_num_threads} by default, to avoid your system being '
+            f'overloaded. Please further tune the variable for optimal '
+            f'performance in your application as needed.')
+        os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads)
+
+
+def register_all_modules(init_default_scope: bool = True) -> None:
+    """Register all modules in mmdet into the registries.
+
+    Args:
+        init_default_scope (bool): Whether initialize the mmdet default scope.
+            When `init_default_scope=True`, the global default scope will be
+            set to `mmdet`, and all registries will build modules from mmdet's
+            registry node. To understand more about the registry, please refer
+            to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md
+            Defaults to True.
+    """  # noqa
+    import mmdet.datasets  # noqa: F401,F403
+    import mmdet.engine  # noqa: F401,F403
+    import mmdet.evaluation  # noqa: F401,F403
+    import mmdet.models  # noqa: F401,F403
+    import mmdet.visualization  # noqa: F401,F403
+
+    if init_default_scope:
+        never_created = DefaultScope.get_current_instance() is None \
+            or not DefaultScope.check_instance_created('mmdet')
+        if never_created:
+            DefaultScope.get_instance('mmdet', scope_name='mmdet')
+            return
+        current_scope = DefaultScope.get_current_instance()
+        if current_scope.scope_name != 'mmdet':
+            warnings.warn('The current default scope '
                          f'"{current_scope.scope_name}" is not "mmdet", '
+                          '`register_all_modules` will force the current '
+                          'default scope to be "mmdet". If this is not '
+                          'expected, please set `init_default_scope=False`.')
+            # avoid name conflict
+            new_instance_name = f'mmdet-{datetime.datetime.now()}'
+            DefaultScope.get_instance(new_instance_name, scope_name='mmdet')
diff --git a/head_extractor/build/lib/mmdet/utils/split_batch.py b/head_extractor/build/lib/mmdet/utils/split_batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..0276fb331f23c1a7f7451faf2a8f768e616d45fd
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/utils/split_batch.py
@@ -0,0 +1,45 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+
+def split_batch(img, img_metas, kwargs):
+    """Split data_batch by tags.
+
+    Code is modified from SoftTeacher
+    (https://github.com/microsoft/SoftTeacher).  # noqa: E501
+
+    Args:
+        img (Tensor): of shape (N, C, H, W) encoding input images.
+ Typically these should be mean centered and std scaled. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys, see + :class:`mmdet.datasets.pipelines.Collect`. + kwargs (dict): Specific to concrete implementation. + + Returns: + data_groups (dict): a dict that data_batch splited by tags, + such as 'sup', 'unsup_teacher', and 'unsup_student'. + """ + + # only stack img in the batch + def fuse_list(obj_list, obj): + return torch.stack(obj_list) if isinstance(obj, + torch.Tensor) else obj_list + + # select data with tag from data_batch + def select_group(data_batch, current_tag): + group_flag = [tag == current_tag for tag in data_batch['tag']] + return { + k: fuse_list([vv for vv, gf in zip(v, group_flag) if gf], v) + for k, v in data_batch.items() + } + + kwargs.update({'img': img, 'img_metas': img_metas}) + kwargs.update({'tag': [meta['tag'] for meta in img_metas]}) + tags = list(set(kwargs['tag'])) + data_groups = {tag: select_group(kwargs, tag) for tag in tags} + for tag, group in data_groups.items(): + group.pop('tag') + return data_groups diff --git a/head_extractor/build/lib/mmdet/utils/typing_utils.py b/head_extractor/build/lib/mmdet/utils/typing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6caf6de53274594e139dbe7c1973c747229bf010 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/typing_utils.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Collecting some commonly used type hint in mmdetection.""" +from typing import List, Optional, Sequence, Tuple, Union + +from mmengine.config import ConfigDict +from mmengine.structures import InstanceData, PixelData + +# TODO: Need to avoid circular import with assigner and sampler +# Type hint of config data +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +# Type hint of one or more config data +MultiConfig = Union[ConfigType, List[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +InstanceList = List[InstanceData] +OptInstanceList = Optional[InstanceList] + +PixelList = List[PixelData] +OptPixelList = Optional[PixelList] + +RangeType = Sequence[Tuple[int, int]] diff --git a/head_extractor/build/lib/mmdet/utils/util_mixins.py b/head_extractor/build/lib/mmdet/utils/util_mixins.py new file mode 100644 index 0000000000000000000000000000000000000000..b83b6617f5e4a202067e1659bf448962a2a2bc72 --- /dev/null +++ b/head_extractor/build/lib/mmdet/utils/util_mixins.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This module defines the :class:`NiceRepr` mixin class, which defines a +``__repr__`` and ``__str__`` method that only depend on a custom ``__nice__`` +method, which you must define. This means you only have to overload one +function instead of two. Furthermore, if the object defines a ``__len__`` +method, then the ``__nice__`` method defaults to something sensible, otherwise +it is treated as abstract and raises ``NotImplementedError``. + +To use simply have your object inherit from :class:`NiceRepr` +(multi-inheritance should be ok). + +This code was copied from the ubelt library: https://github.com/Erotemic/ubelt + +Example: + >>> # Objects that define __nice__ have a default __str__ and __repr__ + >>> class Student(NiceRepr): + ... def __init__(self, name): + ... self.name = name + ... def __nice__(self): + ... 
return self.name
+    >>> s1 = Student('Alice')
+    >>> s2 = Student('Bob')
+    >>> print(f's1 = {s1}')
+    >>> print(f's2 = {s2}')
+    s1 = <Student(Alice)>
+    s2 = <Student(Bob)>
+
+Example:
+    >>> # Objects that define __len__ have a default __nice__
+    >>> class Group(NiceRepr):
+    ...    def __init__(self, data):
+    ...        self.data = data
+    ...    def __len__(self):
+    ...        return len(self.data)
+    >>> g = Group([1, 2, 3])
+    >>> print(f'g = {g}')
+    g = <Group(3)>
+"""
+import warnings
+
+
+class NiceRepr:
+    """Inherit from this class and define ``__nice__`` to "nicely" print your
+    objects.
+
+    Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function
+    Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``.
+    If the inheriting class has a ``__len__``, method then the default
+    ``__nice__`` method will return its length.
+
+    Example:
+        >>> class Foo(NiceRepr):
+        ...    def __nice__(self):
+        ...        return 'info'
+        >>> foo = Foo()
+        >>> assert str(foo) == '<Foo(info)>'
+        >>> assert repr(foo).startswith('<Foo(info) at ')
+
+    Example:
+        >>> class Bar(NiceRepr):
+        ...    pass
+        >>> bar = Bar()
+        >>> import pytest
+        >>> with pytest.warns(None) as record:
+        >>>     assert 'object at' in str(bar)
+        >>>     assert 'object at' in repr(bar)
+
+    Example:
+        >>> class Baz(NiceRepr):
+        ...    def __len__(self):
+        ...        return 5
+        >>> baz = Baz()
+        >>> assert str(baz) == '<Baz(5)>'
+    """
+
+    def __nice__(self):
+        """str: a "nice" summary string describing this module"""
+        if hasattr(self, '__len__'):
+            # It is a common pattern for objects to use __len__ in __nice__
+            # As a convenience we define a default __nice__ for these objects
+            return str(len(self))
+        else:
+            # In all other cases force the subclass to overload __nice__
+            raise NotImplementedError(
+                f'Define the __nice__ method for {self.__class__!r}')
+
+    def __repr__(self):
+        """str: the string of the module"""
+        try:
+            nice = self.__nice__()
+            classname = self.__class__.__name__
+            return f'<{classname}({nice}) at {hex(id(self))}>'
+        except NotImplementedError as ex:
+            warnings.warn(str(ex), category=RuntimeWarning)
+            return object.__repr__(self)
+
+    def __str__(self):
+        """str: the string of the module"""
+        try:
+            classname = self.__class__.__name__
+            nice = self.__nice__()
+            return f'<{classname}({nice})>'
+        except NotImplementedError as ex:
+            warnings.warn(str(ex), category=RuntimeWarning)
+            return object.__repr__(self)
diff --git a/head_extractor/build/lib/mmdet/utils/util_random.py b/head_extractor/build/lib/mmdet/utils/util_random.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc1ecb6c03b026156c9947cb6d356a822448be0f
--- /dev/null
+++ b/head_extractor/build/lib/mmdet/utils/util_random.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Helpers for random number generators."""
+import numpy as np
+
+
+def ensure_rng(rng=None):
+    """Coerces input into a random number generator.
+
+    If the input is None, then a global random state is returned.
+
+    If the input is a numeric value, then that is used as a seed to construct
+    a random state. Otherwise the input is returned as-is.
+
+    Adapted from [1]_.
+
+    Args:
+        rng (int | numpy.random.RandomState | None):
+            if None, then defaults to the global rng. Otherwise this can be an
+            integer or a RandomState class.
+    Returns:
+        (numpy.random.RandomState): rng -
+            a numpy random number generator
+
+    References:
+        ..
[1] https://gitlab.kitware.com/computer-vision/kwarray/blob/master/kwarray/util_random.py#L270 # noqa: E501 + """ + + if rng is None: + rng = np.random.mtrand._rand + elif isinstance(rng, int): + rng = np.random.RandomState(rng) + else: + rng = rng + return rng diff --git a/head_extractor/build/lib/mmdet/version.py b/head_extractor/build/lib/mmdet/version.py new file mode 100644 index 0000000000000000000000000000000000000000..47989fc0a31f8d8eaa3adff72ab83db61b25b529 --- /dev/null +++ b/head_extractor/build/lib/mmdet/version.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +__version__ = '3.3.0' +short_version = __version__ + + +def parse_version_info(version_str): + """Parse a version string into a tuple. + + Args: + version_str (str): The version string. + Returns: + tuple[int | str]: The version info, e.g., "1.3.0" is parsed into + (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1'). + """ + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/head_extractor/build/lib/mmdet/visualization/__init__.py b/head_extractor/build/lib/mmdet/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7edaed9d8701b1be72ff2f7ca646b865007e2eb --- /dev/null +++ b/head_extractor/build/lib/mmdet/visualization/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .local_visualizer import DetLocalVisualizer, TrackLocalVisualizer +from .palette import get_palette, jitter_color, palette_val + +__all__ = [ + 'palette_val', 'get_palette', 'DetLocalVisualizer', 'jitter_color', + 'TrackLocalVisualizer' +] diff --git a/head_extractor/build/lib/mmdet/visualization/local_visualizer.py b/head_extractor/build/lib/mmdet/visualization/local_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..cc6521c56eb167c2c94a3f058594d9e832fb15ad --- /dev/null +++ b/head_extractor/build/lib/mmdet/visualization/local_visualizer.py @@ -0,0 +1,699 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Tuple, Union + +import cv2 +import mmcv +import numpy as np + +try: + import seaborn as sns +except ImportError: + sns = None +import torch +from mmengine.dist import master_only +from mmengine.structures import InstanceData, PixelData +from mmengine.visualization import Visualizer + +from ..evaluation import INSTANCE_OFFSET +from ..registry import VISUALIZERS +from ..structures import DetDataSample +from ..structures.mask import BitmapMasks, PolygonMasks, bitmap_to_polygon +from .palette import _get_adaptive_scales, get_palette, jitter_color + + +@VISUALIZERS.register_module() +class DetLocalVisualizer(Visualizer): + """MMDetection Local Visualizer. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): the origin image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + bbox_color (str, tuple(int), optional): Color of bbox lines. + The tuple of color should be in BGR order. Defaults to None. 
+ text_color (str, tuple(int), optional): Color of texts. + The tuple of color should be in BGR order. + Defaults to (200, 200, 200). + mask_color (str, tuple(int), optional): Color of masks. + The tuple of color should be in BGR order. + Defaults to None. + line_width (int, float): The linewidth of lines. + Defaults to 3. + alpha (int, float): The transparency of bboxes or mask. + Defaults to 0.8. + + Examples: + >>> import numpy as np + >>> import torch + >>> from mmengine.structures import InstanceData + >>> from mmdet.structures import DetDataSample + >>> from mmdet.visualization import DetLocalVisualizer + + >>> det_local_visualizer = DetLocalVisualizer() + >>> image = np.random.randint(0, 256, + ... size=(10, 12, 3)).astype('uint8') + >>> gt_instances = InstanceData() + >>> gt_instances.bboxes = torch.Tensor([[1, 2, 2, 5]]) + >>> gt_instances.labels = torch.randint(0, 2, (1,)) + >>> gt_det_data_sample = DetDataSample() + >>> gt_det_data_sample.gt_instances = gt_instances + >>> det_local_visualizer.add_datasample('image', image, + ... gt_det_data_sample) + >>> det_local_visualizer.add_datasample( + ... 'image', image, gt_det_data_sample, + ... out_file='out_file.jpg') + >>> det_local_visualizer.add_datasample( + ... 'image', image, gt_det_data_sample, + ... show=True) + >>> pred_instances = InstanceData() + >>> pred_instances.bboxes = torch.Tensor([[2, 4, 4, 8]]) + >>> pred_instances.labels = torch.randint(0, 2, (1,)) + >>> pred_det_data_sample = DetDataSample() + >>> pred_det_data_sample.pred_instances = pred_instances + >>> det_local_visualizer.add_datasample('image', image, + ... gt_det_data_sample, + ... pred_det_data_sample) + """ + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + bbox_color: Optional[Union[str, Tuple[int]]] = None, + text_color: Optional[Union[str, + Tuple[int]]] = (200, 200, 200), + mask_color: Optional[Union[str, Tuple[int]]] = None, + line_width: Union[int, float] = 3, + alpha: float = 0.8) -> None: + super().__init__( + name=name, + image=image, + vis_backends=vis_backends, + save_dir=save_dir) + self.bbox_color = bbox_color + self.text_color = text_color + self.mask_color = mask_color + self.line_width = line_width + self.alpha = alpha + # Set default value. When calling + # `DetLocalVisualizer().dataset_meta=xxx`, + # it will override the default value. + self.dataset_meta = {} + + def _draw_instances(self, image: np.ndarray, instances: ['InstanceData'], + classes: Optional[List[str]], + palette: Optional[List[tuple]]) -> np.ndarray: + """Draw instances of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + instances (:obj:`InstanceData`): Data structure for + instance-level annotations or predictions. + classes (List[str], optional): Category information. + palette (List[tuple], optional): Palette information + corresponding to the category. + + Returns: + np.ndarray: the drawn image which channel is RGB. 
+ """ + self.set_image(image) + + if 'bboxes' in instances and instances.bboxes.sum() > 0: + bboxes = instances.bboxes + labels = instances.labels + + max_label = int(max(labels) if len(labels) > 0 else 0) + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in labels] + + bbox_color = palette if self.bbox_color is None \ + else self.bbox_color + bbox_palette = get_palette(bbox_color, max_label + 1) + colors = [bbox_palette[label] for label in labels] + self.draw_bboxes( + bboxes, + edge_colors=colors, + alpha=self.alpha, + line_widths=self.line_width) + + positions = bboxes[:, :2] + self.line_width + areas = (bboxes[:, 3] - bboxes[:, 1]) * ( + bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas) + + for i, (pos, label) in enumerate(zip(positions, labels)): + if 'label_names' in instances: + label_text = instances.label_names[i] + else: + label_text = classes[ + label] if classes is not None else f'class {label}' + if 'scores' in instances: + score = round(float(instances.scores[i]) * 100, 1) + label_text += f': {score}' + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + if 'masks' in instances: + labels = instances.labels + masks = instances.masks + if isinstance(masks, torch.Tensor): + masks = masks.numpy() + elif isinstance(masks, (PolygonMasks, BitmapMasks)): + masks = masks.to_ndarray() + + masks = masks.astype(bool) + + max_label = int(max(labels) if len(labels) > 0 else 0) + mask_color = palette if self.mask_color is None \ + else self.mask_color + mask_palette = get_palette(mask_color, max_label + 1) + colors = [jitter_color(mask_palette[label]) for label in labels] + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in labels] + + polygons = [] + for i, mask in enumerate(masks): + contours, _ = bitmap_to_polygon(mask) + polygons.extend(contours) + self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha) + self.draw_binary_masks(masks, colors=colors, alphas=self.alpha) + + if len(labels) > 0 and \ + ('bboxes' not in instances or + instances.bboxes.sum() == 0): + # instances.bboxes.sum()==0 represent dummy bboxes. + # A typical example of SOLO does not exist bbox branch. 
+ areas = [] + positions = [] + for mask in masks: + _, _, stats, centroids = cv2.connectedComponentsWithStats( + mask.astype(np.uint8), connectivity=8) + if stats.shape[0] > 1: + largest_id = np.argmax(stats[1:, -1]) + 1 + positions.append(centroids[largest_id]) + areas.append(stats[largest_id, -1]) + areas = np.stack(areas, axis=0) + scales = _get_adaptive_scales(areas) + + for i, (pos, label) in enumerate(zip(positions, labels)): + if 'label_names' in instances: + label_text = instances.label_names[i] + else: + label_text = classes[ + label] if classes is not None else f'class {label}' + if 'scores' in instances: + score = round(float(instances.scores[i]) * 100, 1) + label_text += f': {score}' + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + horizontal_alignments='center', + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + return self.get_image() + + def _draw_panoptic_seg(self, image: np.ndarray, + panoptic_seg: ['PixelData'], + classes: Optional[List[str]], + palette: Optional[List]) -> np.ndarray: + """Draw panoptic seg of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + panoptic_seg (:obj:`PixelData`): Data structure for + pixel-level annotations or predictions. + classes (List[str], optional): Category information. + + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + # TODO: Is there a way to bypass? + num_classes = len(classes) + + panoptic_seg_data = panoptic_seg.sem_seg[0] + + ids = np.unique(panoptic_seg_data)[::-1] + + if 'label_names' in panoptic_seg: + # open set panoptic segmentation + classes = panoptic_seg.metainfo['label_names'] + ignore_index = panoptic_seg.metainfo.get('ignore_index', + len(classes)) + ids = ids[ids != ignore_index] + else: + # for VOID label + ids = ids[ids != num_classes] + + labels = np.array([id % INSTANCE_OFFSET for id in ids], dtype=np.int64) + segms = (panoptic_seg_data[None] == ids[:, None, None]) + + max_label = int(max(labels) if len(labels) > 0 else 0) + + mask_color = palette if self.mask_color is None \ + else self.mask_color + mask_palette = get_palette(mask_color, max_label + 1) + colors = [mask_palette[label] for label in labels] + + self.set_image(image) + + # draw segm + polygons = [] + for i, mask in enumerate(segms): + contours, _ = bitmap_to_polygon(mask) + polygons.extend(contours) + self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha) + self.draw_binary_masks(segms, colors=colors, alphas=self.alpha) + + # draw label + areas = [] + positions = [] + for mask in segms: + _, _, stats, centroids = cv2.connectedComponentsWithStats( + mask.astype(np.uint8), connectivity=8) + max_id = np.argmax(stats[1:, -1]) + 1 + positions.append(centroids[max_id]) + areas.append(stats[max_id, -1]) + areas = np.stack(areas, axis=0) + scales = _get_adaptive_scales(areas) + + text_palette = get_palette(self.text_color, max_label + 1) + text_colors = [text_palette[label] for label in labels] + + for i, (pos, label) in enumerate(zip(positions, labels)): + label_text = classes[label] + + self.draw_texts( + label_text, + pos, + colors=text_colors[i], + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }], + horizontal_alignments='center') + return self.get_image() + + def _draw_sem_seg(self, image: np.ndarray, sem_seg: PixelData, + classes: Optional[List], + palette: Optional[List]) -> np.ndarray: + """Draw semantic seg of GT or 
prediction. + + Args: + image (np.ndarray): The image to draw. + sem_seg (:obj:`PixelData`): Data structure for pixel-level + annotations or predictions. + classes (list, optional): Input classes for result rendering, as + the prediction of a segmentation model is a segment map with + label indices, `classes` is a list whose items correspond + to the label indices. If classes is not defined, the + visualizer will take `cityscapes` classes by default. + Defaults to None. + palette (list, optional): Input palette for result rendering, which + is a list of colors corresponding to the classes. + Defaults to None. + + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + sem_seg_data = sem_seg.sem_seg + if isinstance(sem_seg_data, torch.Tensor): + sem_seg_data = sem_seg_data.numpy() + + # 0 ~ num_class, the value 0 means background + ids = np.unique(sem_seg_data) + ignore_index = sem_seg.metainfo.get('ignore_index', 255) + ids = ids[ids != ignore_index] + + if 'label_names' in sem_seg: + # open set semseg + label_names = sem_seg.metainfo['label_names'] + else: + label_names = classes + + labels = np.array(ids, dtype=np.int64) + colors = [palette[label] for label in labels] + + self.set_image(image) + + # draw semantic masks + for i, (label, color) in enumerate(zip(labels, colors)): + masks = sem_seg_data == label + self.draw_binary_masks(masks, colors=[color], alphas=self.alpha) + label_text = label_names[label] + _, _, stats, centroids = cv2.connectedComponentsWithStats( + masks[0].astype(np.uint8), connectivity=8) + if stats.shape[0] > 1: + largest_id = np.argmax(stats[1:, -1]) + 1 + centroids = centroids[largest_id] + + areas = stats[largest_id, -1] + scales = _get_adaptive_scales(areas) + + self.draw_texts( + label_text, + centroids, + colors=(255, 255, 255), + font_sizes=int(13 * scales), + horizontal_alignments='center', + bboxes=[{ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + return self.get_image() + + @master_only + def add_datasample( + self, + name: str, + image: np.ndarray, + data_sample: Optional['DetDataSample'] = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: float = 0, + # TODO: Supported in mmengine's Visualizer. + out_file: Optional[str] = None, + pred_score_thr: float = 0.3, + step: int = 0) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. It is usually used when the display + is not available. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. + data_sample (:obj:`DetDataSample`, optional): A data + sample that contains annotations and predictions. + Defaults to None. + draw_gt (bool): Whether to draw GT DetDataSample. Defaults to True. + draw_pred (bool): Whether to draw Prediction DetDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + out_file (str): Path to output file. Defaults to None. + pred_score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + step (int): Global step value to record. Defaults to 0.
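+
+        Examples:
+            >>> # A hedged sketch; the visualizer, image and data sample
+            >>> # are assumed to be built as in the class docstring, and
+            >>> # 'vis.jpg' is an illustrative path.
+            >>> det_local_visualizer.add_datasample(
+            ...     'image', image, pred_det_data_sample,
+            ...     pred_score_thr=0.5, out_file='vis.jpg')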
+ """ + image = image.clip(0, 255).astype(np.uint8) + classes = self.dataset_meta.get('classes', None) + palette = self.dataset_meta.get('palette', None) + + gt_img_data = None + pred_img_data = None + + if data_sample is not None: + data_sample = data_sample.cpu() + + if draw_gt and data_sample is not None: + gt_img_data = image + if 'gt_instances' in data_sample: + gt_img_data = self._draw_instances(image, + data_sample.gt_instances, + classes, palette) + if 'gt_sem_seg' in data_sample: + gt_img_data = self._draw_sem_seg(gt_img_data, + data_sample.gt_sem_seg, + classes, palette) + + if 'gt_panoptic_seg' in data_sample: + assert classes is not None, 'class information is ' \ + 'not provided when ' \ + 'visualizing panoptic ' \ + 'segmentation results.' + gt_img_data = self._draw_panoptic_seg( + gt_img_data, data_sample.gt_panoptic_seg, classes, palette) + + if draw_pred and data_sample is not None: + pred_img_data = image + if 'pred_instances' in data_sample: + pred_instances = data_sample.pred_instances + pred_instances = pred_instances[ + pred_instances.scores > pred_score_thr] + pred_img_data = self._draw_instances(image, pred_instances, + classes, palette) + + if 'pred_sem_seg' in data_sample: + pred_img_data = self._draw_sem_seg(pred_img_data, + data_sample.pred_sem_seg, + classes, palette) + + if 'pred_panoptic_seg' in data_sample: + assert classes is not None, 'class information is ' \ + 'not provided when ' \ + 'visualizing panoptic ' \ + 'segmentation results.' + pred_img_data = self._draw_panoptic_seg( + pred_img_data, data_sample.pred_panoptic_seg.numpy(), + classes, palette) + + if gt_img_data is not None and pred_img_data is not None: + drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1) + elif gt_img_data is not None: + drawn_img = gt_img_data + elif pred_img_data is not None: + drawn_img = pred_img_data + else: + # Display the original image directly if nothing is drawn. + drawn_img = image + + # It is convenient for users to obtain the drawn image. + # For example, the user wants to obtain the drawn image and + # save it as a video during video inference. + self.set_image(drawn_img) + + if show: + self.show(drawn_img, win_name=name, wait_time=wait_time) + + if out_file is not None: + mmcv.imwrite(drawn_img[..., ::-1], out_file) + else: + self.add_image(name, drawn_img, step) + + +def random_color(seed): + """Random a color according to the input seed.""" + if sns is None: + raise RuntimeError('motmetrics is not installed,\ + please install it by: pip install seaborn') + np.random.seed(seed) + colors = sns.color_palette() + color = colors[np.random.choice(range(len(colors)))] + color = tuple([int(255 * c) for c in color]) + return color + + +@VISUALIZERS.register_module() +class TrackLocalVisualizer(Visualizer): + """Tracking Local Visualizer for the MOT, VIS tasks. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): the origin image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + line_width (int, float): The linewidth of lines. + Defaults to 3. + alpha (int, float): The transparency of bboxes or mask. + Defaults to 0.8. 
+ """ + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + line_width: Union[int, float] = 3, + alpha: float = 0.8) -> None: + super().__init__(name, image, vis_backends, save_dir) + self.line_width = line_width + self.alpha = alpha + # Set default value. When calling + # `TrackLocalVisualizer().dataset_meta=xxx`, + # it will override the default value. + self.dataset_meta = {} + + def _draw_instances(self, image: np.ndarray, + instances: InstanceData) -> np.ndarray: + """Draw instances of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + instances (:obj:`InstanceData`): Data structure for + instance-level annotations or predictions. + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + self.set_image(image) + classes = self.dataset_meta.get('classes', None) + + # get colors and texts + # for the MOT and VIS tasks + colors = [random_color(_id) for _id in instances.instances_id] + categories = [ + classes[label] if classes is not None else f'cls{label}' + for label in instances.labels + ] + if 'scores' in instances: + texts = [ + f'{category_name}\n{instance_id} | {score:.2f}' + for category_name, instance_id, score in zip( + categories, instances.instances_id, instances.scores) + ] + else: + texts = [ + f'{category_name}\n{instance_id}' for category_name, + instance_id in zip(categories, instances.instances_id) + ] + + # draw bboxes and texts + if 'bboxes' in instances: + # draw bboxes + bboxes = instances.bboxes.clone() + self.draw_bboxes( + bboxes, + edge_colors=colors, + alpha=self.alpha, + line_widths=self.line_width) + # draw texts + if texts is not None: + positions = bboxes[:, :2] + self.line_width + areas = (bboxes[:, 3] - bboxes[:, 1]) * ( + bboxes[:, 2] - bboxes[:, 0]) + scales = _get_adaptive_scales(areas.cpu().numpy()) + for i, pos in enumerate(positions): + self.draw_texts( + texts[i], + pos, + colors='black', + font_sizes=int(13 * scales[i]), + bboxes=[{ + 'facecolor': [c / 255 for c in colors[i]], + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }]) + + # draw masks + if 'masks' in instances: + masks = instances.masks + polygons = [] + for i, mask in enumerate(masks): + contours, _ = bitmap_to_polygon(mask) + polygons.extend(contours) + self.draw_polygons(polygons, edge_colors='w', alpha=self.alpha) + self.draw_binary_masks(masks, colors=colors, alphas=self.alpha) + + return self.get_image() + + @master_only + def add_datasample( + self, + name: str, + image: np.ndarray, + data_sample: DetDataSample = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: int = 0, + # TODO: Supported in mmengine's Viusalizer. + out_file: Optional[str] = None, + pred_score_thr: float = 0.3, + step: int = 0) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. t is usually used when the display + is not available. + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. + data_sample (OptTrackSampleList): A data + sample that contain annotations and predictions. + Defaults to None. 
+ draw_gt (bool): Whether to draw GT TrackDataSample. + Default to True. + draw_pred (bool): Whether to draw Prediction TrackDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Default to False. + wait_time (int): The interval of show (s). Defaults to 0. + out_file (str): Path to output file. Defaults to None. + pred_score_thr (float): The threshold to visualize the bboxes + and masks. Defaults to 0.3. + step (int): Global step value to record. Defaults to 0. + """ + gt_img_data = None + pred_img_data = None + + if data_sample is not None: + data_sample = data_sample.cpu() + + if draw_gt and data_sample is not None: + assert 'gt_instances' in data_sample + gt_img_data = self._draw_instances(image, data_sample.gt_instances) + + if draw_pred and data_sample is not None: + assert 'pred_track_instances' in data_sample + pred_instances = data_sample.pred_track_instances + if 'scores' in pred_instances: + pred_instances = pred_instances[ + pred_instances.scores > pred_score_thr].cpu() + pred_img_data = self._draw_instances(image, pred_instances) + + if gt_img_data is not None and pred_img_data is not None: + drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1) + elif gt_img_data is not None: + drawn_img = gt_img_data + else: + drawn_img = pred_img_data + + if show: + self.show(drawn_img, win_name=name, wait_time=wait_time) + + if out_file is not None: + mmcv.imwrite(drawn_img[..., ::-1], out_file) + else: + self.add_image(name, drawn_img, step) diff --git a/head_extractor/build/lib/mmdet/visualization/palette.py b/head_extractor/build/lib/mmdet/visualization/palette.py new file mode 100644 index 0000000000000000000000000000000000000000..3c402c08823a60759c984093ba7f05f1e310dbd9 --- /dev/null +++ b/head_extractor/build/lib/mmdet/visualization/palette.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple, Union + +import mmcv +import numpy as np +from mmengine.utils import is_str + + +def palette_val(palette: List[tuple]) -> List[tuple]: + """Convert palette to matplotlib palette. + + Args: + palette (List[tuple]): A list of color tuples. + + Returns: + List[tuple[float]]: A list of RGB matplotlib color tuples. + """ + new_palette = [] + for color in palette: + color = [c / 255 for c in color] + new_palette.append(tuple(color)) + return new_palette + + +def get_palette(palette: Union[List[tuple], str, tuple], + num_classes: int) -> List[Tuple[int]]: + """Get palette from various inputs. + + Args: + palette (list[tuple] | str | tuple): palette inputs. + num_classes (int): the number of classes. + + Returns: + list[tuple[int]]: A list of color tuples. 
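+
+    Examples:
+        >>> # Behaviour sketch grounded in the branches below: a single
+        >>> # tuple is repeated per class, and 'random' yields a
+        >>> # deterministic pseudo-random palette.
+        >>> get_palette((255, 0, 0), 3)
+        [(255, 0, 0), (255, 0, 0), (255, 0, 0)]
+        >>> len(get_palette('random', 5))
+        5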
+ """ + assert isinstance(num_classes, int) + + if isinstance(palette, list): + dataset_palette = palette + elif isinstance(palette, tuple): + dataset_palette = [palette] * num_classes + elif palette == 'random' or palette is None: + state = np.random.get_state() + # random color + np.random.seed(42) + palette = np.random.randint(0, 256, size=(num_classes, 3)) + np.random.set_state(state) + dataset_palette = [tuple(c) for c in palette] + elif palette == 'coco': + from mmdet.datasets import CocoDataset, CocoPanopticDataset + dataset_palette = CocoDataset.METAINFO['palette'] + if len(dataset_palette) < num_classes: + dataset_palette = CocoPanopticDataset.METAINFO['palette'] + elif palette == 'citys': + from mmdet.datasets import CityscapesDataset + dataset_palette = CityscapesDataset.METAINFO['palette'] + elif palette == 'voc': + from mmdet.datasets import VOCDataset + dataset_palette = VOCDataset.METAINFO['palette'] + elif is_str(palette): + dataset_palette = [mmcv.color_val(palette)[::-1]] * num_classes + else: + raise TypeError(f'Invalid type for palette: {type(palette)}') + + assert len(dataset_palette) >= num_classes, \ + 'The length of palette should not be less than `num_classes`.' + return dataset_palette + + +def _get_adaptive_scales(areas: np.ndarray, + min_area: int = 800, + max_area: int = 30000) -> np.ndarray: + """Get adaptive scales according to areas. + + The scale range is [0.5, 1.0]. When the area is less than + ``min_area``, the scale is 0.5 while the area is larger than + ``max_area``, the scale is 1.0. + + Args: + areas (ndarray): The areas of bboxes or masks with the + shape of (n, ). + min_area (int): Lower bound areas for adaptive scales. + Defaults to 800. + max_area (int): Upper bound areas for adaptive scales. + Defaults to 30000. + + Returns: + ndarray: The adaotive scales with the shape of (n, ). + """ + scales = 0.5 + (areas - min_area) // (max_area - min_area) + scales = np.clip(scales, 0.5, 1.0) + return scales + + +def jitter_color(color: tuple) -> tuple: + """Randomly jitter the given color in order to better distinguish instances + with the same class. + + Args: + color (tuple): The RGB color tuple. Each value is between [0, 255]. + + Returns: + tuple: The jittered color tuple. + """ + jitter = np.random.rand(3) + jitter = (jitter / np.linalg.norm(jitter) - 0.5) * 0.5 * 255 + color = np.clip(jitter + color, 0, 255).astype(np.uint8) + return tuple(color) diff --git a/head_extractor/build/lib/mmengine/__init__.py b/head_extractor/build/lib/mmengine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a436c950e8ffa39af69304efc180a1ebcaceb582 --- /dev/null +++ b/head_extractor/build/lib/mmengine/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# flake8: noqa +from .config import * +from .fileio import * +from .logging import * +from .registry import * +from .utils import * +from .version import __version__, version_info diff --git a/head_extractor/build/lib/mmengine/_strategy/__init__.py b/head_extractor/build/lib/mmengine/_strategy/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..764abcf868912838d490a9fb4f41625851de3bee --- /dev/null +++ b/head_extractor/build/lib/mmengine/_strategy/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION +from .base import BaseStrategy +from .colossalai import ColossalAIStrategy +from .deepspeed import DeepSpeedStrategy +from .distributed import DDPStrategy +from .single_device import SingleDeviceStrategy + +__all__ = [ + 'BaseStrategy', 'DDPStrategy', 'SingleDeviceStrategy', 'DeepSpeedStrategy', + 'ColossalAIStrategy' +] + +if digit_version(TORCH_VERSION) >= digit_version('2.0.0'): + try: + from .fsdp import FSDPStrategy # noqa:F401 + __all__.append('FSDPStrategy') + except: # noqa: E722 + pass diff --git a/head_extractor/build/lib/mmengine/_strategy/base.py b/head_extractor/build/lib/mmengine/_strategy/base.py new file mode 100644 index 0000000000000000000000000000000000000000..5df3a79c92da2cfe917113406ad8b9604df53276 --- /dev/null +++ b/head_extractor/build/lib/mmengine/_strategy/base.py @@ -0,0 +1,979 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +import platform +import time +from abc import ABCMeta, abstractmethod +from collections import OrderedDict +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +from torch.optim import Optimizer + +import mmengine +from mmengine.config import Config, ConfigDict +from mmengine.dist import (broadcast, get_dist_info, infer_launcher, + is_distributed) +from mmengine.logging import MMLogger +from mmengine.model.wrappers import is_model_wrapper +from mmengine.optim import (BaseOptimWrapper, OptimWrapperDict, + _ParamScheduler, build_optim_wrapper) +from mmengine.registry import MODELS, OPTIM_WRAPPERS, PARAM_SCHEDULERS +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import (TORCH_VERSION, collect_env, + set_multi_processing) + +ParamSchedulerType = Union[List[_ParamScheduler], Dict[str, + List[_ParamScheduler]]] + + +class BaseStrategy(metaclass=ABCMeta): + """Base class for all strategies. + + In the process of supporting FSDP, DeepSpeed, and ColossalAI, the + scalability of the Runner faced challenges, which led to the redefinition + of the Runner's responsibilities. The Strategy abstraction was split out, + which is responsible for constructing, initializing, and saving/loading + the state of training components such as models, optimizers, and parameter + schedulers. + + Warning: + This is an experimental feature, and its interface is subject to + change. + + Keyword Args: + work_dir (str): The working directory to save checkpoints. The logs + will be saved in the subdirectory of `work_dir` named + :attr:`timestamp`. Defaults to 'work_dirs'. + experiment_name (str, optional): Name of current experiment. If not + specified, timestamp will be used as :attr:`experiment_name`. + Defaults to None. + env_kwargs (dict, optional): Environment config passed in + :meth:`setup_env`. Defaults to None. + log_kwargs (dict, optional): Logger config passed in + :meth:`build_logger`. Defaults to None. + auto_scale_lr (dict, Optional): Config to scale the learning rate + automatically. It includes ``base_batch_size`` and ``enable``. + ``base_batch_size`` is the batch size that the optimizer lr is + based on. ``enable`` is the switch to turn on and off the feature. 
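+
+    Examples:
+        >>> # A hedged sketch with the single-device subclass exported
+        >>> # above; 'ToyModel' stands for a hypothetical module that is
+        >>> # registered in MODELS.
+        >>> from mmengine._strategy import SingleDeviceStrategy
+        >>> strategy = SingleDeviceStrategy(work_dir='work_dirs')
+        >>> model = strategy.prepare(dict(type='ToyModel'))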
+ """ + model: nn.Module + optim_wrapper: BaseOptimWrapper + param_schedulers: ParamSchedulerType + + def __init__( + self, + *, + work_dir: str = 'work_dirs', + experiment_name: Optional[str] = None, + env_kwargs: Optional[dict] = None, + log_kwargs: Optional[dict] = None, + auto_scale_lr: Optional[dict] = None, + ): + self._work_dir = osp.abspath(work_dir) + mmengine.mkdir_or_exist(self._work_dir) + + self._env_kwargs = env_kwargs or {} + self._setup_env(**self._env_kwargs) + + if experiment_name is not None: + self._experiment_name = f'{experiment_name}_{self.timestamp}' + else: + self._experiment_name = self.timestamp + + self._log_dir = osp.join(self.work_dir, self.timestamp) + mmengine.mkdir_or_exist(self._log_dir) + + log_kwargs = log_kwargs or {} + self.logger = self.build_logger(**log_kwargs) + + self._auto_scale_lr = auto_scale_lr + + self.dispatch_kwargs: dict = {} + self._prepared = False + + @property + def work_dir(self): + return self._work_dir + + @property + def log_dir(self): + return self._log_dir + + @property + def experiment_name(self): + return self._experiment_name + + @property + def launcher(self): + return self._launcher + + @property + def distributed(self): + return self._distributed + + @property + def seed(self): + return self._seed + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + @property + def timestamp(self): + return self._timestamp + + @property + def randomness(self): + return self._randomness + + @abstractmethod + def prepare( + self, + model: Union[nn.Module, dict], + *, + optim_wrapper: Union[BaseOptimWrapper, dict, None] = None, + param_scheduler: Union[_ParamScheduler, Dict, List, None] = None, + compile: Union[dict, bool] = False, + dispatch_kwargs: Optional[dict] = None, + ): + """Prepare model and some components. + + Args: + model (:obj:`torch.nn.Module` or dict): The model to be run. It + can be a dict used for building a model. + + Keyword Args: + optim_wrapper (BaseOptimWrapper or dict, optional): Computing the + gradient of model parameters and updating them. + Defaults to None. + See :meth:`build_optim_wrapper` for examples. + param_scheduler (_ParamScheduler or dict or list, optional): + Parameter scheduler for updating optimizer parameters. If + specified, :attr:`optim_wrapper` should also be specified. + Defaults to None. + See :meth:`build_param_scheduler` for examples. + compile (dict, optional): Config to compile model. + Defaults to False. Requires PyTorch>=2.0. + dispatch_kwargs (dict, optional): Kwargs to be passed to other + methods of Strategy. Defaults to None. + """ + + def _setup_env( + self, + *, + launcher: Optional[str] = None, + cudnn_benchmark: bool = False, + mp_cfg: Optional[dict] = None, + dist_cfg: Optional[dict] = None, + resource_limit: int = 4096, + randomness: dict = dict(seed=None), + ): + """Setup environment. + + This method will do the following things: + + 1. setup multi-processing + 2. setup distributed + 3. set random seed + + Keyword Args: + launcher (str, optional): Way to launcher multi-process. Supported + launchers are 'pytorch', 'mpi', 'slurm' and 'none'. If 'none' + is provided, non-distributed environment will be launched. + If launcher is None, the launcher will be inferred according + some specified environments. Defaults to None. + cudnn_benchmark (bool): Whether to enable cudnn benchmark. + Defaults to False. + mp_cfg (dict, optional): Multi-processing config. Defaults to None. 
+ dist_cfg (dict, optional): Distributed config. Defaults to None. + resource_limit (int): Resource limit. Defaults to 4096. + randomness (dict): Some settings to make the experiment as + reproducible as possible, like seed and deterministic. + Defaults to ``dict(seed=None)``. If seed is None, a random + number will be generated and it will be broadcasted to all + other processes if in distributed environment. + If ``cudnn_benchmark`` is ``True`` but ``deterministic`` is + ``True`` in ``randomness``, the value of + ``torch.backends.cudnn.benchmark`` will finally be ``False``. + """ + if launcher is None: + launcher = infer_launcher() + + self._launcher = launcher + if self._launcher == 'none': + self._distributed = False + else: + self._distributed = True + + if cudnn_benchmark: + torch.backends.cudnn.benchmark = True + + mp_cfg = mp_cfg if mp_cfg is not None else {} + set_multi_processing(**mp_cfg, distributed=self._distributed) + + # init distributed env first, since logger depends on the dist info. + if self._distributed and not is_distributed(): + dist_cfg = dist_cfg if dist_cfg is not None else {} + self._setup_distributed(launcher, **dist_cfg) + + self._rank, self._world_size = get_dist_info() + + timestamp = torch.tensor(time.time(), dtype=torch.float64) + # broadcast timestamp from 0 process to other processes + broadcast(timestamp) + self._timestamp = time.strftime('%Y%m%d_%H%M%S', + time.localtime(timestamp.item())) + + # https://github.com/pytorch/pytorch/issues/973 + # set resource limit + if platform.system() != 'Windows': + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(resource_limit, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, + (soft_limit, hard_limit)) + + self._randomness = randomness + self._set_randomness(**randomness) + + def _setup_distributed(self, *args, **kwargs): + """Setup distributed environment.""" + pass + + def _set_randomness( + self, + seed: Optional[int] = None, + diff_rank_seed: bool = False, + deterministic: bool = False, + ) -> None: + """Set random seed to guarantee reproducible results. + + Args: + seed (int, optional): A number to set random modules. + Defaults to None. + diff_rank_seed (bool): Whether or not to set different seeds + according to global rank. Defaults to False. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Defaults to False. + See https://pytorch.org/docs/stable/notes/randomness.html for + more details. + """ + from mmengine.runner import set_random_seed + self._seed = set_random_seed( + seed=seed, + deterministic=deterministic, + diff_rank_seed=diff_rank_seed) + + def build_model(self, model: Union[nn.Module, dict]) -> nn.Module: + """Build model. + + If ``model`` is a dict, it will be used to build a ``nn.Module`` + object. Otherwise, if ``model`` is a ``nn.Module`` object it will be + returned directly. + + An example of ``model``:: + + model = dict(type='ResNet') + + Args: + model (nn.Module or dict): A ``nn.Module`` object or a dict to + build ``nn.Module`` object. If ``model`` is a ``nn.Module`` + object, just returns itself. + + Note: + The returned model must implement ``train_step``, ``test_step`` + if ``runner.train`` or ``runner.test`` will be called. If + ``runner.val`` will be called or ``val_cfg`` is configured, + model must implement `val_step`.
+ + Returns: + nn.Module: Model built from ``model``. + """ + if isinstance(model, nn.Module): + return model + elif isinstance(model, dict): + model = MODELS.build(model) + return model # type: ignore + else: + raise TypeError('model should be a nn.Module object or dict, ' + f'but got {model}') + + def compile_model( + self, + model: nn.Module, + compile: Union[dict, bool] = False, + ) -> nn.Module: + """Compile model. + + Args: + model (nn.Module): Model to compile. + compile (dict or bool): Config to compile the model. + Defaults to False. + + Returns: + nn.Module: Compiled model. + """ + if isinstance(compile, bool) and not compile: + return model + + assert digit_version(TORCH_VERSION) >= digit_version('2.0.0'), ( + 'PyTorch >= 2.0.0 is required to enable torch.compile') + + if isinstance(compile, bool): + compile = dict() + + target = compile.pop('target', 'forward') + func = getattr(model, target) + compiled_func = torch.compile(func, **compile) + setattr(model, target, compiled_func) + self.logger.info('Model has been "compiled". The first few iterations ' + 'will be slow, please be patient.') + + return model + + def _init_model_weights(self, model: nn.Module) -> nn.Module: + """Initialize the model weights if the model has + :meth:`init_weights`""" + if (hasattr(model, 'init_weights') and self.dispatch_kwargs.get( + 'init_weights_for_test_or_val', True)): + model.init_weights() + # sync params and buffers + for _, params in model.state_dict().items(): + broadcast(params) + + return model + + def build_optim_wrapper( + self, + optim_wrapper: Union[Optimizer, BaseOptimWrapper, dict], + model: Optional[nn.Module] = None, + ) -> BaseOptimWrapper: + """Build optimizer wrapper. + + If ``optim_wrapper`` is a config dict for only one optimizer, + the keys must contain ``optimizer``, and ``type`` is optional. + It will build a :obj:`OptimWrapper` by default. + + If ``optim_wrapper`` is a config dict for multiple optimizers, i.e., + it has multiple keys and each key is for an optimizer wrapper, the + constructor must be specified since + :obj:`DefaultOptimizerConstructor` cannot handle the building of + training with multiple optimizers. + + If ``optim_wrapper`` is a dict of pre-built optimizer wrappers, i.e., + each value of ``optim_wrapper`` represents an ``OptimWrapper`` + instance. ``build_optim_wrapper`` will directly build the + :obj:`OptimWrapperDict` instance from ``optim_wrapper``. + + Args: + optim_wrapper (BaseOptimWrapper or dict): An OptimWrapper object or a + dict to build OptimWrapper objects. If ``optim_wrapper`` is an + OptimWrapper, just returns an ``OptimWrapper`` instance. + + Note: + For single optimizer training, if `optim_wrapper` is a config + dict, `type` is optional (defaults to :obj:`OptimWrapper`) and it + must contain `optimizer` to build the corresponding optimizer. + + Examples: + >>> # build an optimizer + >>> optim_wrapper_cfg = dict(type='OptimWrapper', optimizer=dict( + ... type='SGD', lr=0.01)) + >>> # optim_wrapper_cfg = dict(optimizer=dict(type='SGD', lr=0.01)) + >>> # is also valid.
+ >>> optim_wrapper = runner.build_optim_wrapper(optim_wrapper_cfg) + >>> optim_wrapper + Type: OptimWrapper + accumulative_counts: 1 + optimizer: + SGD ( + Parameter Group 0 + dampening: 0 + lr: 0.01 + momentum: 0 + nesterov: False + weight_decay: 0 + ) + >>> # build optimizer without `type` + >>> optim_wrapper_cfg = dict(optimizer=dict(type='SGD', lr=0.01)) + >>> optim_wrapper = runner.build_optim_wrapper(optim_wrapper_cfg) + >>> optim_wrapper + Type: OptimWrapper + accumulative_counts: 1 + optimizer: + SGD ( + Parameter Group 0 + dampening: 0 + lr: 0.01 + maximize: False + momentum: 0 + nesterov: False + weight_decay: 0 + ) + >>> # build multiple optimizers + >>> optim_wrapper_cfg = dict( + ... generator=dict(type='OptimWrapper', optimizer=dict( + ... type='SGD', lr=0.01)), + ... discriminator=dict(type='OptimWrapper', optimizer=dict( + ... type='Adam', lr=0.001)), + ... # need to customize a multiple optimizer constructor + ... constructor='CustomMultiOptimizerConstructor', + ...) + >>> optim_wrapper = runner.build_optim_wrapper(optim_wrapper_cfg) + >>> optim_wrapper + name: generator + Type: OptimWrapper + accumulative_counts: 1 + optimizer: + SGD ( + Parameter Group 0 + dampening: 0 + lr: 0.01 + momentum: 0 + nesterov: False + weight_decay: 0 + ) + name: discriminator + Type: OptimWrapper + accumulative_counts: 1 + optimizer: + Adam ( + Parameter Group 0 + dampening: 0 + lr: 0.001 + momentum: 0 + nesterov: False + weight_decay: 0 + ) + + Important: + If you need to build multiple optimizers, you should implement a + MultiOptimWrapperConstructor which gets parameters passed to + corresponding optimizers and composes the ``OptimWrapperDict``. + More details about how to customize OptimizerConstructor can be + found at `optimizer-docs`_. + + Returns: + BaseOptimWrapper: Optimizer wrapper built from ``optimizer_cfg``. + """ # noqa: E501 + if isinstance(optim_wrapper, BaseOptimWrapper): + return optim_wrapper + if isinstance(optim_wrapper, (dict, ConfigDict, Config)): + # optimizer must be defined for single optimizer training. + optimizer = optim_wrapper.get('optimizer', None) + + # If optimizer is a built `Optimizer` instance, the optimizer + # wrapper should be built by `OPTIM_WRAPPERS` registry. + if isinstance(optimizer, Optimizer): + optim_wrapper.setdefault('type', 'OptimWrapper') + return OPTIM_WRAPPERS.build(optim_wrapper) # type: ignore + + # If `optimizer` is not None or `constructor` is defined, it means + # the optimizer wrapper will be built by an optimizer wrapper + # constructor. Therefore, `build_optim_wrapper` should be called. + if optimizer is not None or 'constructor' in optim_wrapper: + assert model is not None + return build_optim_wrapper(model, optim_wrapper) + else: + # if `optimizer` is not defined, it should be the case of + # training with multiple optimizers. If `constructor` is not + # defined either, each value of `optim_wrapper` must be an + # `OptimWrapper` instance since `DefaultOptimizerConstructor` + # will not handle the case of training with multiple + # optimizers.
`build_optim_wrapper` will directly build the + # `OptimWrapperDict` instance from `optim_wrapper`. + optim_wrappers = OrderedDict() + for name, optim in optim_wrapper.items(): + if not isinstance(optim, BaseOptimWrapper): + raise ValueError( + 'each item must be an optimizer object when ' + '"type" and "constructor" are not in ' + f'optimizer, but got {name}={optim}') + optim_wrappers[name] = optim + return OptimWrapperDict(**optim_wrappers) + else: + raise TypeError('optimizer wrapper should be an OptimWrapper ' + f'object or dict, but got {optim_wrapper}') + + def _build_param_scheduler( + self, + scheduler: Union[_ParamScheduler, Dict, List], + optim_wrapper: BaseOptimWrapper, + default_args: dict, + ) -> List[_ParamScheduler]: + """Build parameter schedulers for a single optimizer. + + Args: + scheduler (_ParamScheduler or dict or list): A Param Scheduler + object or a dict or list of dict to build parameter schedulers. + optim_wrapper (BaseOptimWrapper): An optimizer wrapper object + that is passed to construct the ParamScheduler object. + + Returns: + list[_ParamScheduler]: List of parameter schedulers built from + ``scheduler``. + + Note: + If the train loop is built, when building parameter schedulers, + it supports setting the max epochs/iters as the default ``end`` + of schedulers, and supports converting epoch-based schedulers + to iter-based according to the ``convert_to_iter_based`` key. + """ + if not isinstance(scheduler, Sequence): + schedulers = [scheduler] + else: + schedulers = scheduler + + max_epochs = default_args.pop('max_epochs', None) + max_iters = default_args.pop('max_iters', None) + + param_schedulers = [] + for scheduler in schedulers: + if isinstance(scheduler, _ParamScheduler): + param_schedulers.append(scheduler) + elif isinstance(scheduler, dict): + _scheduler = copy.deepcopy(scheduler) + + # Set default end + if _scheduler.get('by_epoch', True): + if max_epochs is None: + raise ValueError( + 'max_epochs must be specified in default_args') + default_end = max_epochs + else: + if max_iters is None: + raise ValueError( + 'max_iters must be specified in default_args') + default_end = max_iters + _scheduler.setdefault('end', default_end) + self.logger.debug( + f'The `end` of {_scheduler["type"]} is not set. ' + 'Use the max epochs/iters of train loop as default.') + + param_schedulers.append( + PARAM_SCHEDULERS.build( + _scheduler, + default_args=dict( + optimizer=optim_wrapper, **default_args))) + else: + raise TypeError( + 'scheduler should be a _ParamScheduler object or dict, ' + f'but got {scheduler}') + return param_schedulers + + def build_param_scheduler( + self, + scheduler: Union[_ParamScheduler, Dict, List], + optim_wrapper: BaseOptimWrapper, + default_args: Optional[dict] = None, + ) -> ParamSchedulerType: + """Build parameter schedulers. + + ``build_param_scheduler`` should be called after + ``build_optim_wrapper`` because the building logic will change + according to the number of optimizers built by the runner. + The cases are as below: + + - Single optimizer: When only one optimizer is built and used in the + runner, ``build_param_scheduler`` will return a list of + parameter schedulers. + - Multiple optimizers: When two or more optimizers are built and used + in runner, ``build_param_scheduler`` will return a dict containing + the same keys with multiple optimizers and each value is a list of + parameter schedulers.
Note that, if you want different optimizers to + use different parameter schedulers to update optimizer's + hyper-parameters, the input parameter ``scheduler`` also needs to be + a dict and its keys are consistent with multiple optimizers. + Otherwise, the same parameter schedulers will be used to update + optimizer's hyper-parameters. + + Args: + scheduler (_ParamScheduler or dict or list): A Param Scheduler + object or a dict or list of dict to build parameter schedulers. + + Examples: + >>> # build one scheduler + >>> optim_cfg = dict(optimizer=dict(type='SGD', lr=0.01)) + >>> runner.optim_wrapper = runner.build_optim_wrapper( + >>> optim_cfg) + >>> scheduler_cfg = dict(type='MultiStepLR', milestones=[1, 2]) + >>> schedulers = runner.build_param_scheduler(scheduler_cfg) + >>> schedulers + [<mmengine.optim.scheduler.lr_scheduler.MultiStepLR object at 0x...>] # noqa: E501 + + >>> # build multiple schedulers + >>> scheduler_cfg = [ + ... dict(type='MultiStepLR', milestones=[1, 2]), + ... dict(type='StepLR', step_size=1) + ... ] + >>> schedulers = runner.build_param_scheduler(scheduler_cfg) + >>> schedulers + [<mmengine.optim.scheduler.lr_scheduler.MultiStepLR object at 0x...>, # noqa: E501 + <mmengine.optim.scheduler.lr_scheduler.StepLR object at 0x...>] + + The above examples only cover the case of one optimizer with one + scheduler or multiple schedulers. If you want to know how to set + parameter schedulers when using multiple optimizers, you can find + more examples in `optimizer-docs`_. + + Returns: + list[_ParamScheduler] or dict[str, list[_ParamScheduler]]: List of + parameter schedulers or a dictionary containing lists of parameter + schedulers built from ``scheduler``. + + .. _optimizer-docs: + https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html + """ + if default_args is None: + default_args = {} + if 'epoch_length' in self.dispatch_kwargs: + default_args['epoch_length'] = self.dispatch_kwargs[ + 'epoch_length'] + if 'max_epochs' in self.dispatch_kwargs: + default_args['max_epochs'] = self.dispatch_kwargs['max_epochs'] + if 'max_iters' in self.dispatch_kwargs: + default_args['max_iters'] = self.dispatch_kwargs['max_iters'] + + param_schedulers: ParamSchedulerType + if not isinstance(optim_wrapper, OptimWrapperDict): + # Since `OptimWrapperDict` inherits from `OptimWrapper`, + # `isinstance(self.optim_wrapper, OptimWrapper)` cannot tell + # whether `self.optim_wrapper` is an `OptimizerWrapper` or + # `OptimWrapperDict` instance. Therefore, here we simply check + # self.optim_wrapper is not an `OptimWrapperDict` instance and + # then assert it is an OptimWrapper instance. + assert isinstance(optim_wrapper, BaseOptimWrapper), ( + '`build_optimizer` should be called before ' + '`build_param_scheduler` because the latter depends ' + 'on the former') + param_schedulers = self._build_param_scheduler( + scheduler, optim_wrapper, default_args) # type: ignore + return param_schedulers + else: + param_schedulers = dict() + for name, optimizer in optim_wrapper.items(): + if isinstance(scheduler, dict) and 'type' not in scheduler: + # scheduler is a dict and each item is a ParamScheduler + # object or a config to build ParamScheduler objects + param_schedulers[name] = self._build_param_scheduler( + scheduler[name], optimizer, default_args) + else: + param_schedulers[name] = self._build_param_scheduler( + scheduler, optimizer, default_args) + + return param_schedulers + + def _scale_lr(self) -> None: + """Automatically scale the learning rate during training according to + the ratio of ``base_batch_size`` in ``auto_scale_lr`` and the real + batch size. + + It scales the learning rate linearly according to the + `paper <https://arxiv.org/abs/1706.02677>`_.
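+
+        For example, with ``base_batch_size=16`` and a real batch size of
+        ``world_size * train_micro_batch_size_per_gpu = 4 * 8 = 32``, every
+        ``lr`` in the optimizer's param groups is multiplied by
+        ``32 / 16 = 2.0``.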
+ + Note: + ``scale_lr`` must be called after building optimizer wrappers + and before building parameter schedulers. + """ + if (self._auto_scale_lr is None + or not self._auto_scale_lr.get('enable', False)): + return None + + assert 'base_batch_size' in self._auto_scale_lr, \ + 'Lack of `base_batch_size` in `auto_scale_lr`.' + + real_bs = self.world_size * self.dispatch_kwargs[ + 'train_micro_batch_size_per_gpu'] + base_bs = self._auto_scale_lr['base_batch_size'] + ratio = float(real_bs) / float(base_bs) + self.logger.info(f'LR is set based on batch size of {base_bs} ' + f'and the current batch size is {real_bs}. ' + f'Scaling the original LR by {ratio}.') + + def _is_built(schedulers): + if isinstance(schedulers, dict): + return False if 'type' in schedulers else any( + _is_built(s) for s in schedulers.values()) + if isinstance(schedulers, list): + return any(_is_built(s) for s in schedulers) + return isinstance(schedulers, _ParamScheduler) + + if hasattr(self, 'param_schedulers') and _is_built( + self.param_schedulers): + raise RuntimeError('`scale_lr` should be called before building ' + 'ParamScheduler because ParamScheduler will ' + 'store initial lr from optimizer wrappers') + + assert isinstance(self.optim_wrapper, BaseOptimWrapper), \ + '`scale_lr` should be called after building OptimWrapper' + + if isinstance(self.optim_wrapper, OptimWrapperDict): + wrappers = list(self.optim_wrapper.values()) + else: + wrappers = [self.optim_wrapper] # type: ignore + + for wrapper in wrappers: + for group in wrapper.optimizer.param_groups: + group['lr'] = group['lr'] * ratio + + def build_logger( + self, + log_level: Union[int, str] = 'INFO', + log_file: Optional[str] = None, + **kwargs, + ) -> MMLogger: + """Build a globally accessible MMLogger. + + Args: + log_level (int or str): The log level of MMLogger handlers. + Defaults to 'INFO'. + log_file (str, optional): Path of filename to save log. + Defaults to None. + **kwargs: Remaining parameters passed to ``MMLogger``. + + Returns: + MMLogger: An MMLogger object built from ``logger``. + """ + if log_file is None: + log_file = osp.join(self.log_dir, f'{self._timestamp}.log') + + log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs) + log_cfg.setdefault('name', self.experiment_name) + # `torch.compile` in PyTorch 2.0 could close all user defined handlers + # unexpectedly. Using file mode 'a' can help prevent abnormal + # termination of the FileHandler and ensure that the log file could + # be continuously updated during the lifespan of the runner.
+ log_cfg.setdefault('file_mode', 'a') + + return MMLogger.get_instance(**log_cfg) # type: ignore + + def model_state_dict(self) -> dict: + """Get model state dict.""" + from mmengine.runner import weights_to_cpu + return weights_to_cpu(self.model.state_dict()) + + def optim_state_dict(self) -> dict: + """Get optimizer state dict.""" + if isinstance(self.optim_wrapper, BaseOptimWrapper): + return self.optim_wrapper.state_dict() + else: + raise TypeError('self.optim_wrapper should be a `BaseOptimWrapper`' + f' instance, but got {self.optim_wrapper}') + + def scheduler_state_dict(self) -> Union[dict, list]: + """Get parameter scheduler state dict.""" + if isinstance(self.param_schedulers, dict): + state_dict: dict = dict() + for name, schedulers in self.param_schedulers.items(): + state_dict[name] = [] + for scheduler in schedulers: + state_dict[name].append(scheduler.state_dict()) + return state_dict + else: + state_list = [] + for scheduler in self.param_schedulers: # type: ignore + state_list.append(scheduler.state_dict()) + return state_list + + def load_model_state_dict( + self, + state_dict: dict, + *, + strict: bool = False, + revise_keys: list = [(r'^module.', '')], + ) -> None: + """Load model state from dict.""" + from mmengine.runner.checkpoint import _load_checkpoint_to_model + + if is_model_wrapper(self.model): + model = self.model.module + else: + model = self.model + + _load_checkpoint_to_model( + model, state_dict, strict=strict, revise_keys=revise_keys) + + def load_optim_state_dict(self, state_dict: dict) -> None: + """Load optimizer state from dict.""" + self.optim_wrapper.load_state_dict(state_dict) + + def load_scheduler_state_dict(self, state_dict: Union[dict, list]) -> None: + """Load scheduler state from dict.""" + if isinstance(self.param_schedulers, dict): + assert isinstance(state_dict, dict) + for name, schedulers in self.param_schedulers.items(): + for scheduler, ckpt_scheduler in zip(schedulers, + state_dict[name]): + scheduler.load_state_dict(ckpt_scheduler) + else: + for scheduler, ckpt_scheduler in zip( + self.param_schedulers, # type: ignore + state_dict): + scheduler.load_state_dict(ckpt_scheduler) + + def load_or_resume( + self, + *, + load_from: Optional[str] = None, + resume: Union[bool, str] = False, + ) -> Optional[dict]: + """Load checkpoint or resume from checkpoint. + + Args: + load_from (str, optional): The checkpoint file to load from. + Defaults to None. + resume (bool or str): Whether to resume training. Defaults to + False. If ``resume`` is True and ``load_from`` is None, the + latest checkpoint in ``work_dir`` is found and used + automatically; if none is found, resuming does nothing. If + ``resume`` is a string, it will be treated as the checkpoint + file to resume from.
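+
+        Examples:
+            >>> # A hedged sketch on a concrete strategy subclass; the
+            >>> # checkpoint path is illustrative.
+            >>> strategy.load_or_resume(resume='work_dirs/epoch_10.pth')
+            >>> # Or auto-resume from the latest checkpoint in work_dir:
+            >>> strategy.load_or_resume(resume=True)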
+ """ + from mmengine.runner import find_latest_checkpoint + + if not resume and load_from is None: + return None + + # decide to load from checkpoint or resume from checkpoint + resume_from = None + if isinstance(resume, str): + resume_from = resume + elif resume and load_from is None: + # auto resume from the latest checkpoint + resume_from = find_latest_checkpoint(self._work_dir) + self.logger.info( + f'Auto resumed from the latest checkpoint {resume_from}.') + elif resume and load_from is not None: + # resume from the specified checkpoint + resume_from = load_from + + if resume_from is not None: + return self.resume(resume_from) + elif load_from is not None: + return self.load_checkpoint(load_from) + + return None + + @abstractmethod + def load_checkpoint( + self, + filename: str, + *, + map_location: Union[str, Callable] = 'cpu', + strict: bool = False, + revise_keys: list = [(r'^module.', '')], + callback: Optional[Callable] = None, + ) -> dict: + """Load checkpoint from given ``filename``. + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + + Keyword Args: + map_location (str or callable): A string or a callable function to + specifying how to remap storage locations. + Defaults to 'cpu'. + strict (bool): strict (bool): Whether to allow different params for + the model and checkpoint. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Defaults to strip + the prefix 'module.' by [(r'^module\\.', '')]. + callback (callable, callable): Callback function to modify the + checkpoint after loading the checkpoint. + Defaults to None. + """ + + @abstractmethod + def resume( + self, + filename: str, + *, + resume_optimizer: bool = True, + resume_param_scheduler: bool = True, + map_location: Union[str, Callable] = 'default', + callback: Optional[Callable] = None, + ) -> dict: + """Resume training from given ``filename``. + + Four types of states will be resumed. + + - model state + - optimizer state + - scheduler state + - randomness state + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + + Keyword Args: + resume_optimizer (bool): Whether to resume optimizer state. + Defaults to True. + resume_param_scheduler (bool): Whether to resume param scheduler + state. Defaults to True. + map_location (str or callable):A string or a callable function to + specifying how to remap storage locations. + Defaults to 'default'. + callback (callable, callable): Callback function to modify the + checkpoint before saving the checkpoint. + Defaults to None. + """ + + @abstractmethod + def save_checkpoint( + self, + filename: str, + *, + save_optimizer: bool = True, + save_param_scheduler: bool = True, + extra_ckpt: Optional[dict] = None, + callback: Optional[Callable] = None, + ) -> None: + """Save checkpoint to given ``filename``. + + Args: + filename (str): Filename to save checkpoint. + + Keyword Args: + save_optimizer (bool): Whether to save the optimizer to + the checkpoint. Defaults to True. + save_param_scheduler (bool): Whether to save the param_scheduler + to the checkpoint. Defaults to True. + extra_ckpt (dict, optional): Extra checkpoint to save. + Defaults to None. + callback (callable, callable): Callback function to modify the + checkpoint before saving the checkpoint. + Defaults to None. 
+ """ + + def collect_env(self) -> Tuple[dict, dict]: + """Collect the information of the running environments.""" + system_env = collect_env() + runtime_env: OrderedDict = OrderedDict() + runtime_env.update(self._env_kwargs) + runtime_env.update(self.randomness) + runtime_env['Distributed launcher'] = self.launcher + runtime_env['Distributed training'] = self.distributed + runtime_env['GPU number'] = self.world_size + + return system_env, runtime_env + + def _prepared_components(self): + return_items = [self.model] + if hasattr(self, 'optim_wrapper'): + return_items.append(self.optim_wrapper) + + if hasattr(self, 'param_schedulers'): + return_items.append(self.param_schedulers) + + return return_items[0] if len(return_items) == 1 else return_items diff --git a/head_extractor/build/lib/mmengine/_strategy/colossalai.py b/head_extractor/build/lib/mmengine/_strategy/colossalai.py new file mode 100644 index 0000000000000000000000000000000000000000..cfbb925c670efa7c708dfaa7b395fb874a1ed187 --- /dev/null +++ b/head_extractor/build/lib/mmengine/_strategy/colossalai.py @@ -0,0 +1,565 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import os.path as osp +import time +from contextlib import contextmanager +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +try: + import colossalai + import colossalai.booster.mixed_precision as colo_precision + import colossalai.booster.plugin as colo_plugin + import colossalai.nn.optimizer as colo_optimizer + from colossalai.booster import Booster + from colossalai.interface import ModelWrapper +except Exception as e: # noqa: F841 + colossalai = None + colo_precision = None + colo_plugin = None + colo_optimizer = None + Booster = None + ModelWrapper = None + +import torch +import torch.nn as nn + +import mmengine +from mmengine import mkdir_or_exist +from mmengine._strategy import BaseStrategy +from mmengine.device import get_device +from mmengine.dist import init_dist, is_main_process +from mmengine.fileio import join_path +from mmengine.model import BaseDataPreprocessor +from mmengine.optim import BaseOptimWrapper, OptimWrapper, _ParamScheduler +from mmengine.registry import STRATEGIES, Registry +from mmengine.registry.root import MODEL_WRAPPERS, OPTIM_WRAPPERS, OPTIMIZERS +from mmengine.runner.checkpoint import _load_checkpoint, save_checkpoint +from mmengine.utils import get_git_hash + +# Component for colossalai `plugins` and `mixed_precisions` +PLUGINS = Registry('plugin') +MIXED_PRECISIONS = Registry('mixed_precision') + + +def register_plugins(): + _plugins = inspect.getmembers( + colo_plugin, + lambda x: inspect.isclass(x) and issubclass(x, colo_plugin.Plugin)) + + for name, plugin in _plugins: + PLUGINS.register_module(name=name, module=plugin) + + +def register_optimizers(): + _colo_optimizer = inspect.getmembers( + colo_optimizer, + lambda x: inspect.isclass(x) and issubclass(x, torch.optim.Optimizer)) + for name, optim_type in _colo_optimizer: + OPTIMIZERS.register_module(name=name, module=optim_type, force=True) + + +def register_mixed_precisions(): + _mixed_precisions = inspect.getmembers( + colo_precision, lambda x: inspect.isclass(x) and issubclass( + x, colo_precision.MixedPrecision)) + + for name, mixed_precision in _mixed_precisions: + MIXED_PRECISIONS.register_module(name=name, module=mixed_precision) + + +@OPTIM_WRAPPERS.register_module() +class ColossalAIOptimWrapper(OptimWrapper): + """OptimWrapper for ColossalAI. 
+
+    The available optimizers are:
+
+    - CPUAdam
+    - FusedAdam
+    - FusedLAMB
+    - FusedSGD
+    - HybridAdam
+    - Lamb
+    - Lars
+
+    You can find more details in the `colossalai tutorial`_
+
+    Args:
+        optimizer (dict or torch.optim.Optimizer): The optimizer to be
+            wrapped.
+        accumulative_counts (int): The number of iterations to accumulate
+            gradients. The parameters will be updated per
+            ``accumulative_counts``.
+
+    .. _colossalai tutorial: https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/optimizer
+    """  # noqa: E501
+
+    def __init__(self,
+                 optimizer: torch.optim.Optimizer,
+                 booster: Optional[Booster] = None,
+                 accumulative_counts: int = 1):
+        super().__init__(optimizer, accumulative_counts=accumulative_counts)
+        self.booster = booster
+
+    @contextmanager
+    def optim_context(self, model: nn.Module):
+        assert isinstance(self.booster, Booster), \
+            'Please set the booster attribute before using ' \
+            '`ColossalAIOptimWrapper`.'
+        if self.booster.plugin.support_no_sync():
+            no_sync_context = self.booster.no_sync(model, self.optimizer)
+        else:
+            yield
+            return
+        if self.should_sync():
+            yield
+        else:
+            with no_sync_context:
+                yield
+
+    def backward(self, loss: torch.Tensor, **kwargs) -> None:
+        self._inner_count += 1
+        self.optimizer.backward(loss, **kwargs)
+
+
+@MODEL_WRAPPERS.register_module(
+    name=['ColossalAIModelWrapper', 'CollosalAIModelWrapper'])
+class ColossalAIModelWrapper:
+
+    def __init__(self, model_wrapper: ModelWrapper, model: nn.Module):
+        self.model_wrapper = model_wrapper
+        self.model = model
+
+    def __call__(self, *args, **kwargs) -> Any:
+        return self.model_wrapper(*args, **kwargs)
+
+    def train_step(
+        self,
+        data: Union[dict, tuple, list],
+        optim_wrapper: ColossalAIOptimWrapper,
+    ) -> Dict[str, torch.Tensor]:
+        data = self.model.data_preprocessor(data, training=True)
+        with optim_wrapper.optim_context(self.model):
+            losses = self._run_forward(data, mode='loss')
+            parsed_loss, log_vars = self.model.parse_losses(losses)
+            optim_wrapper.update_params(parsed_loss)
+        return log_vars
+
+    def val_step(self, data: Union[dict, tuple, list]) -> list:
+        """Gets the prediction of module during validation process.
+
+        Args:
+            data (dict or tuple or list): Data sampled from dataset.
+
+        Returns:
+            list: The predictions of given data.
+        """
+        data = self.model.data_preprocessor(data, False)
+        return self._run_forward(data, mode='predict')
+
+    test_step = val_step
+
+    def _run_forward(self, data: Union[dict, tuple, list], mode: str) -> Any:
+        """Unpacks data for :meth:`forward`
+
+        Args:
+            data (dict or tuple or list): Data sampled from dataset.
+            mode (str): Mode of forward.
+
+        Returns:
+            dict or list: Results of training or testing mode.
+        """
+        if isinstance(data, dict):
+            results = self.model_wrapper(**data, mode=mode)
+        elif isinstance(data, (list, tuple)):
+            results = self.model_wrapper(*data, mode=mode)
+        else:
+            raise TypeError('Output of `data_preprocessor` should be '
+                            f'list, tuple or dict, but got {type(data)}')
+        return results
+
+    def __getattr__(self, name):
+        if hasattr(self.model_wrapper, name):
+            return getattr(self.model_wrapper, name)
+        elif hasattr(self.model, name):
+            return getattr(self.model, name)
+        else:
+            raise AttributeError(
+                f'{self.model_wrapper} and {self.model} have no '
+                f'attribute {name}')
+
+
+@STRATEGIES.register_module()
+class ColossalAIStrategy(BaseStrategy):
+    """
+    Args:
+        config (str or dict): The colossalai config file to setup distributed
+            environment. See more details in the `colossalai config tutorial`_.
+        mixed_precision (str or MixedPrecision): The mixed precision to run
+            the training. Defaults to None. If the argument is a string, it
+            can be 'fp16', 'fp16_apex', 'bf16', or 'fp8'. 'fp16' would use
+            PyTorch AMP while 'fp16_apex' would use Nvidia Apex.
+        plugin (Plugin): The plugin to run the training. The type of `plugin`
+            could be:
+
+            - str: The available plugins are ``gemini`` and ``lowlevel-zero``.
+
+              ``gemini`` means a `ZeRO`_ implementation with chunk-based
+              memory management. You could find more details in the
+              `colossalai gemini tutorial`_. ``lowlevel-zero`` means a
+              ZeRO-1 and ZeRO-2 implementation. Although gemini is more
+              memory-efficient, unexpected errors could happen for some
+              special model structures. lowlevel-zero is more stable.
+
+            - dict: **dict-type style config to build a colossalai plugin**.
+
+              See the `booster plugin tutorial`_ for more details.
+
+        model_wrapper (dict, optional): Dict for model wrapper. Defaults to
+            None.
+        work_dir (str): The working directory to save checkpoints. The logs
+            will be saved in the subdirectory of `work_dir` named
+            :attr:`timestamp`. Defaults to 'work_dirs'.
+        experiment_name (str, optional): Name of current experiment. If not
+            specified, timestamp will be used as :attr:`experiment_name`.
+            Defaults to None.
+        env_kwargs (dict, optional): Environment config passed in
+            :meth:`setup_env`. Defaults to None.
+        log_kwargs (dict, optional): Logger config passed in
+            :meth:`build_logger`. Defaults to None.
+        auto_scale_lr (dict, optional): Config to scale the learning rate
+            automatically. It includes ``base_batch_size`` and ``enable``.
+            ``base_batch_size`` is the batch size that the optimizer lr is
+            based on. ``enable`` is the switch to turn on and off the feature.
+
+    .. _colossalai config tutorial: https://colossalai.org/docs/basics/configure_parallelization
+    .. _ZeRO: https://arxiv.org/abs/1910.02054
+    .. _colossalai gemini tutorial: https://colossalai.org/docs/features/zero_with_chunk/#geminiddp
+    .. _booster plugin tutorial: https://colossalai.org/docs/basics/booster_plugins
+    """  # noqa: E501
+    OPTIMIZER_DIR = 'optimizer'  # directory to save optimizer state.
+    MODEL_DIR = 'model'  # directory to save model
+    SCHEDULER_DIR = 'scheduler'  # directory to save schedulers
+    model: ColossalAIModelWrapper  # type: ignore
+    optim_wrapper: ColossalAIOptimWrapper  # type: ignore
+
+    def __init__(
+        self,
+        *,
+        config: Union[str, dict, None] = None,
+        mixed_precision: Union[str, dict, None] = None,
+        plugin: str = 'gemini',
+        model_wrapper: Optional[dict] = None,
+        **kwargs,
+    ):
+        if colossalai is None:
+            raise ModuleNotFoundError(
+                'Please install colossalai by `pip install -U colossalai`')
+        register_plugins()
+        register_mixed_precisions()
+        register_optimizers()
+
+        self.config = config or {}
+        super().__init__(**kwargs)
+        if mixed_precision is not None:
+            mixed_precision = self._build_mixed_precision(mixed_precision)
+
+        if plugin is not None:
+            plugin = self._build_plugin(plugin)
+        self.booster = Booster(mixed_precision=mixed_precision, plugin=plugin)
+        self.model_wrapper = model_wrapper
+
+    def prepare(
+        self,
+        model: Union[nn.Module, dict],
+        *,
+        optim_wrapper: Union[BaseOptimWrapper, dict, None] = None,
+        param_scheduler: Union[_ParamScheduler, Dict, List, None] = None,
+        compile: Union[dict, bool] = False,
+        dispatch_kwargs: Optional[dict] = None,
+    ):
+        """Prepare model and some components.
+
+        Args:
+            model (:obj:`torch.nn.Module` or dict): The model to be run. It
+                can be a dict used to build a model.
+
+        Keyword Args:
+            optim_wrapper (BaseOptimWrapper or dict, optional): Computing the
+                gradient of model parameters and updating them.
+                Defaults to None.
+                See :meth:`build_optim_wrapper` for examples.
+            param_scheduler (_ParamScheduler or dict or list, optional):
+                Parameter scheduler for updating optimizer parameters. If
+                specified, :attr:`optim_wrapper` should also be specified.
+                Defaults to None.
+                See :meth:`build_param_scheduler` for examples.
+            compile (dict, optional): Config to compile model.
+                Defaults to False. Requires PyTorch>=2.0.
+            dispatch_kwargs (dict, optional): Kwargs to be passed to other
+                methods of Strategy. Defaults to None.
+                If ``accumulative_counts`` is set in ``optim_wrapper``, you
+                need to provide ``max_iters`` in ``dispatch_kwargs``.
+        """
+        if self._prepared:
+            return self._prepared_components()
+        if dispatch_kwargs is not None:
+            self.dispatch_kwargs.update(dispatch_kwargs)
+
+        model = self.build_model(model)
+        model = self._init_model_weights(model)
+
+        # optim_wrapper is required by booster
+        if optim_wrapper is not None and isinstance(optim_wrapper, dict):
+            optim_wrapper.setdefault('type', 'ColossalAIOptimWrapper')
+            optim_wrapper_type = OPTIM_WRAPPERS.get(optim_wrapper['type'])
+            if optim_wrapper_type is None:
+                raise ValueError(f'Failed to find {optim_wrapper["type"]} in '
+                                 '`OPTIM_WRAPPERS`.')
+            if 'clip_grad' in optim_wrapper:
+                raise ValueError('Please configure `clip_grad` in `plugin`')
+            if not issubclass(optim_wrapper_type, ColossalAIOptimWrapper):
+                raise ValueError(
+                    'The type of `optim_wrapper` must be '
+                    '`ColossalAIOptimWrapper` (or subclass), but got '
+                    f'{optim_wrapper_type}')
+            optim_wrapper = self.build_optim_wrapper(optim_wrapper, model)
+            optim_wrapper.booster = self.booster  # type: ignore
+
+        if optim_wrapper is not None:
+            self.model, self.optim_wrapper = self._wrap(
+                model, optim_wrapper)  # type: ignore
+        else:
+            self.model = self._wrap(model)  # type: ignore
+        # TODO: Check whether `compile` is compatible with colossalai.
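+        # For reference, a dict-type ``optim_wrapper`` config handled above
+        # could look like this (hypothetical values; ``HybridAdam`` is one of
+        # the optimizers registered from colossalai):
+        #
+        #   optim_wrapper = dict(
+        #       type='ColossalAIOptimWrapper',
+        #       optimizer=dict(type='HybridAdam', lr=1e-3))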
+
+        if param_scheduler is not None:
+            self.param_schedulers = self.build_param_scheduler(
+                param_scheduler, optim_wrapper)  # type: ignore
+
+        if optim_wrapper is not None:
+            self._scale_lr()
+            accumulative_counts = getattr(self.optim_wrapper,
+                                          '_accumulative_counts', 1)
+            if accumulative_counts > 1:
+                if 'max_iters' not in self.dispatch_kwargs:
+                    raise ValueError(
+                        '"max_iters" must be specified because '
+                        '"accumulative_counts" was set as '
+                        f'{accumulative_counts} which is greater than 1.')
+
+                self.optim_wrapper.initialize_count_status(  # type: ignore
+                    self.model, 0, self.dispatch_kwargs['max_iters'])
+        self._prepared = True
+        return self._prepared_components()
+
+    def resume(
+        self,
+        filename: str,
+        *,
+        resume_optimizer: bool = True,
+        resume_param_scheduler: bool = True,
+        map_location: Union[str, Callable] = 'default',
+        callback: Optional[Callable] = None,
+    ) -> dict:
+        """Override this method since colossalai resumes the optimizer from
+        ``filename`` directly."""
+        self.logger.info(f'Resume checkpoint from {filename}')
+
+        extra_ckpt = self.load_checkpoint(
+            filename, map_location=map_location, callback=callback)
+
+        if resume_optimizer:
+            self.booster.load_optimizer(
+                self.optim_wrapper.optimizer,
+                join_path(filename, self.OPTIMIZER_DIR))
+
+        if resume_param_scheduler:
+            schedulers_dir = join_path(filename, self.SCHEDULER_DIR)
+            for i, scheduler in enumerate(self.param_schedulers):
+                self.booster.load_lr_scheduler(
+                    scheduler, f'{schedulers_dir}/scheduler_{i}.pth')
+
+        # resume random seed
+        resumed_seed = extra_ckpt['meta'].get('seed', None)
+        current_seed = self._randomness.get('seed')
+        if resumed_seed is not None and resumed_seed != current_seed:
+            if current_seed is not None:
+                self.logger.warning(f'The value of random seed in the '
+                                    f'checkpoint "{resumed_seed}" is '
+                                    f'different from the value in '
+                                    f'`randomness` config "{current_seed}"')
+            self._randomness.update(seed=resumed_seed)
+            self._set_randomness(**self._randomness)
+
+        # resume iter
+        self.dispatch_kwargs['cur_iter'] = extra_ckpt['meta']['iter']
+
+        return extra_ckpt
+
+    def load_checkpoint(
+        self,
+        filename: str,
+        *,
+        map_location: Union[str, Callable] = 'cpu',
+        strict: bool = False,
+        revise_keys: list = [(r'^module.', '')],
+        callback: Optional[Callable] = None,
+    ) -> dict:
+        """Load checkpoint from given ``filename``.
+
+        Warning:
+            `map_location` and `callback` parameters are not supported yet.
+
+        Args:
+            filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+                ``open-mmlab://xxx``.
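+
+        Example:
+            >>> # a hedged sketch; the directory layout below mirrors what
+            >>> # ``save_checkpoint`` produces (model/, optimizer/,
+            >>> # scheduler/, meta.pth); the path is hypothetical
+            >>> meta = strategy.load_checkpoint('work_dirs/epoch_1.pth')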
+ """ + self.logger.info(f'Load checkpoint from {filename}') + self.booster.load_model(self.model.model_wrapper, + join_path(filename, self.MODEL_DIR)) + meta = _load_checkpoint(osp.join(filename, 'meta.pth')) + return meta + + def save_checkpoint( + self, + filename: str, + *, + save_optimizer: bool = True, + save_param_scheduler: bool = True, + extra_ckpt: Optional[dict] = None, + callback: Optional[Callable] = None, + ) -> None: + # The checkpoint directory will be: + # |--epoch_0.pth + # |---model/ + # |---optimizer/ + # |---scheduler/ + if extra_ckpt is None: + extra_ckpt = dict() + if 'meta' not in extra_ckpt: + extra_ckpt['meta'] = dict() + extra_ckpt['meta'].update( + seed=self.seed, + time=time.strftime('%Y%m%d_%H%M%S', time.localtime()), + mmengine=mmengine.__version__ + get_git_hash()) + + model_dir = join_path(filename, self.MODEL_DIR) + optimizer_dir = join_path(filename, self.OPTIMIZER_DIR) + schedulers_dir = join_path(filename, self.SCHEDULER_DIR) + mkdir_or_exist(model_dir) + mkdir_or_exist(optimizer_dir) + mkdir_or_exist(schedulers_dir) + + self.booster.save_model( + self.model.model_wrapper, checkpoint=model_dir, shard=True) + + if save_optimizer: + self.booster.save_optimizer( + self.optim_wrapper.optimizer, + checkpoint=optimizer_dir, + shard=True) + + if is_main_process() and save_param_scheduler: + for i, scheduler in enumerate(self.param_schedulers): + self.booster.save_lr_scheduler( + scheduler, f'{schedulers_dir}/scheduler_{i}.pth') + + save_checkpoint(extra_ckpt, join_path(filename, 'meta.pth')) + + def _build_plugin(self, plugin: Union[str, dict]): + if isinstance(plugin, str): + if plugin == 'gemini': + try: + plugin = colo_plugin.GeminiPlugin( + precision='bf16', placement_policy='auto') + except AssertionError: + from colossalai.zero.gemini.placement_policy import \ + PlacementPolicyFactory as colo_placement + raise ValueError('placement policy must be one of ' + + f'{list(colo_placement.policies.keys())}') + elif plugin == 'lowlevel-zero': + plugin = colo_plugin.LowLevelZeroPlugin() + else: + raise ValueError('`plugin` must be "gemini" or ' + '"lowlevel-zero"') + elif isinstance(plugin, dict): + plugin = PLUGINS.build(plugin) + else: + raise ValueError('`plugin` must be dict or str, but got a ' + f'{type(plugin)} object)') + return plugin + + def _build_mixed_precision(self, mixed_precision: Union[str, dict]): + if isinstance(mixed_precision, str): + if mixed_precision == 'fp16': + mixed_precision = colo_precision.FP16TorchMixedPrecision() + elif mixed_precision == 'fp16_apex': + mixed_precision = colo_precision.FP16ApexMixedPrecision() + elif mixed_precision == 'bf16': + mixed_precision = colo_precision.BF16MixedPrecision() + elif mixed_precision == 'fp8': + mixed_precision = colo_precision.FP8MixedPrecision() + else: + raise ValueError( + 'If `mixed_precision` is a string, it must be one of ' + '"fp16", "fp16_apex", "bf16" and "fp8", but got ' + f'{mixed_precision}') + elif isinstance(mixed_precision, dict): + mixed_precision = MIXED_PRECISIONS.build(mixed_precision) + else: + raise ValueError('mixed precision should be dict or str, but got ' + f'a {type(mixed_precision)} object') + return mixed_precision + + def _wrap( + self, + model: nn.Module, + optim_wrapper: Optional[OptimWrapper] = None, + ) -> Union[Tuple[ColossalAIModelWrapper, ColossalAIOptimWrapper], + ColossalAIModelWrapper]: # type: ignore + """Wrap model with :class:`ModelWrapper`.""" + if self.model_wrapper is None: + self.model_wrapper = {'type': 'ColossalAIModelWrapper'} + + # For zero 
+        # current device is reasonable. We need to call
+        # `BaseDataPreprocessor.to` manually since frameworks like colossalai
+        # and deepspeed cannot handle it, which would leave
+        # `data_preprocessor` moving data to the CPU.
+        for module in model.modules():
+            if isinstance(module, BaseDataPreprocessor):
+                module.to(get_device())
+
+        if optim_wrapper is not None:
+            optimizer = optim_wrapper.optimizer
+            if not hasattr(optimizer, '_hook_for_profile'):
+                # PyTorch 2.0 removes the `_hook_for_profile` in
+                # `torch.optim.Optimizer`. We maintain this function here to
+                # keep compatibility.
+                # TODO: Remove this hardcode when ColossalAI supports
+                # PyTorch 2.0
+                optimizer.__class__._hook_for_profile = object
+
+            # We do not pass `scheduler` and `Dataloader` here for:
+            # 1. `Booster.boost` cannot accept a list of schedulers.
+            # 2. `Strategy` cannot accept a dataloader now.
+            model_wrapper, optimizer, *_ = self.booster.boost(model, optimizer)
+            optim_wrapper.optimizer = optimizer
+            default_args = {'model_wrapper': model_wrapper, 'model': model}
+            model_wrapper = MODEL_WRAPPERS.build(
+                self.model_wrapper, default_args=default_args)
+            return model_wrapper, optim_wrapper  # type: ignore
+        else:
+            model_wrapper, *_ = self.booster.boost(model)
+            default_args = {'model_wrapper': model_wrapper, 'model': model}
+            model_wrapper = MODEL_WRAPPERS.build(
+                self.model_wrapper, default_args=default_args)
+            return model_wrapper
+
+    def _setup_distributed(  # type: ignore
+        self,
+        launcher: Optional[str] = None,
+        backend: str = 'nccl',
+        **kwargs,
+    ):
+        init_dist(
+            launcher, backend, init_backend='colossalai', config=self.config)
diff --git a/head_extractor/build/lib/mmengine/_strategy/deepspeed.py b/head_extractor/build/lib/mmengine/_strategy/deepspeed.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f89ff760d6bbe5a17a6c3afddc4879026d5f429
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/_strategy/deepspeed.py
@@ -0,0 +1,581 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os.path as osp
+import time
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+
+from mmengine.logging import print_log
+
+try:
+    import deepspeed
+except ImportError:
+    deepspeed = None
+
+import logging
+
+import torch.nn as nn
+
+import mmengine
+from mmengine.dist import init_dist, is_main_process
+from mmengine.optim import BaseOptimWrapper, _ParamScheduler
+from mmengine.registry import (MODEL_WRAPPERS, OPTIM_WRAPPERS, OPTIMIZERS,
+                               STRATEGIES)
+from mmengine.runner.checkpoint import save_checkpoint, weights_to_cpu
+from mmengine.utils import apply_to, digit_version, get_git_hash
+from .base import BaseStrategy
+
+
+def register_deepspeed_optimizers() -> List[str]:
+    """Register optimizers in ``deepspeed`` to the ``OPTIMIZERS`` registry.
+
+    Returns:
+        List[str]: A list of registered optimizers' names.
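+
+    Example:
+        >>> # a minimal sketch; requires deepspeed to be installed
+        >>> names = register_deepspeed_optimizers()
+        >>> 'FusedAdam' in names
+        True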
+ """ + deepspeed_optimizers = [] + try: + import deepspeed # noqa: F401 + except ImportError: + pass + else: + from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam + from deepspeed.ops.lamb import FusedLamb + from deepspeed.runtime.fp16.onebit import (OnebitAdam, OnebitLamb, + ZeroOneAdam) + + OPTIMIZERS.register_module(module=DeepSpeedCPUAdam) + deepspeed_optimizers.append('DeepSpeedCPUAdam') + OPTIMIZERS.register_module(module=FusedAdam) + deepspeed_optimizers.append('FusedAdam') + OPTIMIZERS.register_module(module=FusedLamb) + deepspeed_optimizers.append('FusedLamb') + OPTIMIZERS.register_module(module=OnebitAdam) + deepspeed_optimizers.append('OnebitAdam') + OPTIMIZERS.register_module(module=OnebitLamb) + deepspeed_optimizers.append('OnebitLamb') + OPTIMIZERS.register_module(module=ZeroOneAdam) + deepspeed_optimizers.append('ZeroOneAdam') + + return deepspeed_optimizers + + +@OPTIM_WRAPPERS.register_module() +class DeepSpeedOptimWrapper(BaseOptimWrapper): + + def __init__(self, optimizer): + super().__init__(optimizer) + self._model = None + + @property + def model(self): + if self._model is None: + raise ValueError('model attribute should be set before accessing.') + return self._model + + @model.setter + def model(self, value): + self._model = value + + def update_params(self, loss) -> None: # type: ignore + """Update parameters in :attr:`optimizer`.""" + self.backward(loss) + self.step() + + def backward(self, loss: torch.Tensor, **kwargs) -> None: + """"Perform gradient back propagation.""" + self.model.backward(loss) + + def zero_grad(self, **kwargs) -> None: + raise NotImplementedError( + 'DeepSpeedOptimWrapper does not support zero_grad method ' + 'currently.') + + def step(self, **kwargs): + self.model.step() + + def state_dict(self) -> dict: + state_dict = {} + if self.base_param_settings is not None: + state_dict['base_param_settings'] = self.base_param_settings + + return state_dict + + def load_state_dict(self, state_dict: dict) -> None: + base_param_settings = state_dict.pop('base_param_settings', None) + + if base_param_settings is not None: + self.base_param_settings = base_param_settings + + +@MODEL_WRAPPERS.register_module() +class MMDeepSpeedEngineWrapper: + + def __init__( + self, + *, + model: 'deepspeed.DeepSpeedEngine', + inputs_to_half: Optional[List[Union[int, str]]] = None, + ): + self.model = model + self._inputs_to_half = inputs_to_half + + def __getattr__(self, name): + return getattr(self.model, name) + + def train_step( + self, + data: Union[dict, tuple, list], + optim_wrapper: DeepSpeedOptimWrapper, + ) -> Dict[str, torch.Tensor]: + data = self.model.module.data_preprocessor(data, training=True) + data = self._cast_inputs_half(data) + losses = self._run_forward(data, mode='loss') + parsed_loss, log_vars = self.model.module.parse_losses(losses) + optim_wrapper.update_params(parsed_loss) + + return log_vars + + def val_step(self, data: Union[dict, tuple, list]) -> list: + """Gets the prediction of module during validation process. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. + """ + data = self.model.module.data_preprocessor(data, False) + data = self._cast_inputs_half(data) + return self._run_forward(data, mode='predict') + + def test_step(self, data: Union[dict, tuple, list]) -> list: + """Gets the predictions of module during testing process. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. 
+ """ + data = self.model.module.data_preprocessor(data, False) + data = self._cast_inputs_half(data) + return self._run_forward(data, mode='predict') + + def _run_forward(self, data: Union[dict, tuple, list], mode: str) -> Any: + """Unpacks data for :meth:`forward` + + Args: + data (dict or tuple or list): Data sampled from dataset. + mode (str): Mode of forward. + + Returns: + dict or list: Results of training or testing mode. + """ + if isinstance(data, dict): + results = self.model(**data, mode=mode) + elif isinstance(data, (list, tuple)): + results = self.model(*data, mode=mode) + else: + raise TypeError('Output of `data_preprocessor` should be ' + f'list, tuple or dict, but got {type(data)}') + return results + + def _cast_inputs_half(self, inputs: Union[list, tuple, dict, None]): + """Cast inputs to half precision if needed. + + Args: + inputs (list or tuple or dict or None): Inputs to be casted. + + Returns: + list or tuple or dict or None: Casted inputs. + """ + if self._inputs_to_half is None: + return inputs + + dtype = next(self.model.parameters()).dtype + if isinstance(inputs, (list, tuple)): + new_inputs = [] + for i, v in enumerate(inputs): + if i in self._inputs_to_half: + new_inputs.append( + apply_to(v, lambda x: hasattr(x, 'to'), + lambda x: x.to(dtype))) + else: + new_inputs.append(v) + return inputs.__class__(new_inputs) + elif isinstance(inputs, dict): + for k, v in inputs.items(): + if k in self._inputs_to_half: + inputs[k] = apply_to(v, lambda x: hasattr(x, 'to'), + lambda x: x.to(dtype)) + return inputs + else: + raise TypeError('inputs should be list, tuple or dict, ' + f'but got {type(inputs)}') + + +@STRATEGIES.register_module() +class DeepSpeedStrategy(BaseStrategy): + """Support training models with DeepSpeed. + + Note: + The detailed usage of parameters can be found at + https://www.deepspeed.ai/docs/config-json/. + + Args: + config (str or dict, optional): If it is a string, it is a path to load + config for deepspeed. Defaults to None. + zero_optimization (dict, optional): Enabling and configuring ZeRO + memory optimizations. Defaults to None. + gradient_clipping (float, optional): Enable gradient clipping with + value. Defaults to None. + fp16 (dict, optional): Configuration for using mixed precision/FP16 + training that leverages NVIDIA's Apex package. Defaults to None. + inputs_to_half (list[int or str], optional): Which inputs are to + converted to half precision. Defaults to None. + If ``fp16`` is enabled, it also should be set. + bf16 (dict, optional): Configuration for using bfloat16 floating-point + format as an alternative to FP16. Defaults to None. + amp (dict, optional): Configuration for using automatic mixed + precision (AMP) training that leverages NVIDIA's Apex AMP package. + Defaults to None. + activation_checkpointing (dict, optional): Reduce memory usage by + clearing activations of certain layers and recomputing them + during a backward pass. + Defaults to None. + aio (dict, optional): Configuring the asynchronous I/O module for + offloading parameter and optimizer states to persistent (NVMe) + storage. This module uses Linux native asynchronous I/O (libaio). + Defaults to None. + train_micro_batch_size_per_gpu (int, optional): Batch size to be + processed by one GPU in one step (without gradient accumulation). + Defaults to None. + gradient_accumulation_steps (int, optional): Number of training steps + to accumulate gradients before averaging and applying them. + Defaults to None. 
+        exclude_frozen_parameters (bool, optional): Exclude frozen parameters
+            from saved checkpoint.
+    """
+
+    def __init__(
+        self,
+        *,
+        # the following args are for deepspeed
+        config: Union[str, dict, None] = None,
+        zero_optimization: Optional[dict] = None,
+        gradient_clipping: Optional[float] = None,
+        fp16: Optional[dict] = None,
+        inputs_to_half: Optional[List[Union[int, str]]] = None,
+        bf16: Optional[dict] = None,
+        amp: Optional[dict] = None,
+        activation_checkpointing: Optional[dict] = None,
+        aio: Optional[dict] = None,
+        train_micro_batch_size_per_gpu: Optional[int] = None,
+        gradient_accumulation_steps: Optional[int] = None,
+        # disable the log printed by deepspeed
+        steps_per_print: int = 10000000000000,
+        # the following args are for BaseStrategy
+        exclude_frozen_parameters: Optional[bool] = None,
+        **kwargs,
+    ):
+        assert deepspeed is not None, \
+            'DeepSpeed is not installed. Please check ' \
+            'https://github.com/microsoft/DeepSpeed#installation.'
+
+        super().__init__(**kwargs)
+
+        self.config = self._parse_config(config)
+        if zero_optimization is not None:
+            self.config['zero_optimization'] = zero_optimization
+        if gradient_clipping is not None:
+            self.config['gradient_clipping'] = gradient_clipping
+        if fp16 is not None:
+            self.config['fp16'] = fp16
+        if bf16 is not None:
+            self.config['bf16'] = bf16
+        if amp is not None:
+            self.config['amp'] = amp
+        if activation_checkpointing is not None:
+            self.config['activation_checkpointing'] = activation_checkpointing
+        if aio is not None:
+            self.config['aio'] = aio
+        if train_micro_batch_size_per_gpu is not None:
+            self.config['train_micro_batch_size_per_gpu'] = \
+                train_micro_batch_size_per_gpu
+        if gradient_accumulation_steps is not None:
+            self.config['gradient_accumulation_steps'] = \
+                gradient_accumulation_steps
+        else:
+            self.config.setdefault('gradient_accumulation_steps', 1)
+        self.config['steps_per_print'] = steps_per_print
+        self._inputs_to_half = inputs_to_half
+        assert (exclude_frozen_parameters is None or
+                digit_version(deepspeed.__version__) >= digit_version('0.13.2')
+                ), ('DeepSpeed >= 0.13.2 is required to enable '
+                    'exclude_frozen_parameters')
+        self.exclude_frozen_parameters = exclude_frozen_parameters
+
+        register_deepspeed_optimizers()
+
+    def _parse_config(self, config):
+        if config is None:
+            config = dict()
+        elif isinstance(config, str):
+            with open(config) as f:
+                config = json.load(f)
+        return config
+
+    def _setup_distributed(  # type: ignore
+        self,
+        launcher: Optional[str] = None,
+        backend: str = 'nccl',
+        **kwargs,
+    ):
+        """Setup distributed environment.
+
+        Args:
+            launcher (str, optional): Way to launch multiple processes.
+                DeepSpeedStrategy does not support the launcher argument.
+            backend (str): Communication Backends. Supported backends are
+                'nccl', 'gloo' and 'mpi'. Defaults to 'nccl'.
+            **kwargs: Other arguments for :func:`deepspeed.init_distributed`.
+        """
+        init_dist(launcher, backend, init_backend='deepspeed', **kwargs)
+
+    def prepare(
+        self,
+        model: Union[nn.Module, dict],
+        *,
+        optim_wrapper: Union[BaseOptimWrapper, dict, None] = None,
+        param_scheduler: Union[_ParamScheduler, Dict, List, None] = None,
+        compile: Union[dict, bool] = False,
+        dispatch_kwargs: Optional[dict] = None,
+    ):
+        """Prepare model and some components.
+
+        Args:
+            model (:obj:`torch.nn.Module` or dict): The model to be run. It
+                can be a dict used to build a model.
+
+        Keyword Args:
+            optim_wrapper (BaseOptimWrapper or dict, optional): Computing the
+                gradient of model parameters and updating them.
+                Defaults to None.
+                See :meth:`build_optim_wrapper` for examples.
+            param_scheduler (_ParamScheduler or dict or list, optional):
+                Parameter scheduler for updating optimizer parameters. If
+                specified, :attr:`optim_wrapper` should also be specified.
+                Defaults to None.
+                See :meth:`build_param_scheduler` for examples.
+            compile (dict, optional): Config to compile model.
+                Defaults to False. Requires PyTorch>=2.0.
+            dispatch_kwargs (dict, optional): Kwargs to be passed to other
+                methods of Strategy. Defaults to None.
+        """
+        if self._prepared:
+            return self._prepared_components()
+        assert dispatch_kwargs is not None
+        self.dispatch_kwargs.update(dispatch_kwargs)
+
+        model = self.build_model(model)
+        model = self._init_model_weights(model)
+
+        if optim_wrapper is not None:
+            self.optim_wrapper = self.build_optim_wrapper(optim_wrapper, model)
+            self.model = self._wrap_model(model)
+
+            self.optim_wrapper.model = self.model  # type: ignore
+
+        else:
+            self.model = self._wrap_model(model)
+
+        if param_scheduler is not None:
+            self.param_schedulers = self.build_param_scheduler(
+                param_scheduler, self.optim_wrapper)
+        self._prepared = True
+        return self._prepared_components()
+
+    def _wrap_model(self, model: nn.Module) -> nn.Module:
+        if hasattr(self, 'optim_wrapper'):
+            engine, self.optim_wrapper.optimizer, *_ = deepspeed.initialize(
+                model=model,
+                optimizer=self.optim_wrapper.optimizer,
+                config=self.config)
+        else:
+            engine, *_ = deepspeed.initialize(model=model, config=self.config)
+
+        wrapper = MMDeepSpeedEngineWrapper(
+            model=engine, inputs_to_half=self._inputs_to_half)
+        return wrapper
+
+    def load_checkpoint(
+        self,
+        filename: str,
+        *,
+        map_location: Union[str, Callable] = 'cpu',
+        strict: bool = False,
+        revise_keys: list = [(r'^module.', '')],
+        callback: Optional[Callable] = None,
+    ) -> dict:
+        """Load checkpoint from given ``filename``.
+
+        Warning:
+            `map_location` and `callback` parameters are not supported yet.
+
+        Args:
+            filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+                ``open-mmlab://xxx``.
+        """
+        self.logger.info(f'Load checkpoint from {filename}')
+
+        dirname, basename = osp.split(filename)
+        if digit_version(deepspeed.__version__) >= digit_version('0.13.2'):
+            _, extra_ckpt = self.model.load_checkpoint(
+                dirname,
+                tag=basename,
+                load_optimizer_states=False,
+                load_module_strict=not self.exclude_frozen_parameters)
+        else:
+            _, extra_ckpt = self.model.load_checkpoint(
+                dirname, tag=basename, load_optimizer_states=False)
+
+        return extra_ckpt
+
+    def resume(
+        self,
+        filename: str,
+        *,
+        resume_optimizer: bool = True,
+        resume_param_scheduler: bool = True,
+        map_location: Union[str, Callable] = 'default',
+        callback: Optional[Callable] = None,
+    ) -> dict:
+        """Resume training from given ``filename``.
+
+        Warning:
+            `map_location` and `callback` parameters are not supported yet.
+
+        Args:
+            filename (str): Accept local filepath.
+
+        Keyword Args:
+            resume_optimizer (bool): Whether to resume optimizer state.
+                Defaults to True.
+            resume_param_scheduler (bool): Whether to resume param scheduler
+                state. Defaults to True.
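+
+        Example:
+            >>> # a minimal sketch; the path is hypothetical and ``strategy``
+            >>> # is assumed to be a prepared DeepSpeedStrategy
+            >>> extra_ckpt = strategy.resume('work_dirs/epoch_1.pth')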
+ """ + self.logger.info(f'Resume checkpoint from {filename}') + + dirname, basename = osp.split(filename) + if digit_version(deepspeed.__version__) >= digit_version('0.13.2'): + _, extra_ckpt = self.model.load_checkpoint( + dirname, + tag=basename, + load_optimizer_states=resume_optimizer, + load_module_strict=not self.exclude_frozen_parameters) + else: + _, extra_ckpt = self.model.load_checkpoint( + dirname, tag=basename, load_optimizer_states=resume_optimizer) + + if resume_optimizer: + self.load_optim_state_dict(extra_ckpt.pop('optim_wrapper')) + + if resume_param_scheduler and hasattr(self, 'param_schedulers'): + param_schedulers = extra_ckpt.pop('param_schedulers') + self.load_scheduler_state_dict(param_schedulers) + + # resume random seed + resumed_seed = extra_ckpt['meta'].get('seed', None) + current_seed = self._randomness.get('seed') + if resumed_seed is not None and resumed_seed != current_seed: + if current_seed is not None: + self.logger.warning(f'The value of random seed in the ' + f'checkpoint "{resumed_seed}" is ' + f'different from the value in ' + f'`randomness` config "{current_seed}"') + self._randomness.update(seed=resumed_seed) + self._set_randomness(**self._randomness) + + return extra_ckpt + + def save_checkpoint( + self, + filename: str, + *, + save_optimizer: bool = True, + save_param_scheduler: bool = True, + extra_ckpt: Optional[dict] = None, + callback: Optional[Callable] = None, + ) -> None: + """Save checkpoint to given ``filename``. + + Warning: + `callback` parameter is not supported yet. + + Args: + filename (str): Filename to save checkpoint. + + Keyword Args: + save_param_scheduler (bool): Whether to save the param_scheduler + to the checkpoint. Defaults to True. + extra_ckpt (dict, optional): Extra checkpoint to save. + Defaults to None. + """ + if extra_ckpt is None: + extra_ckpt = dict() + if 'meta' not in extra_ckpt: + extra_ckpt['meta'] = dict() + extra_ckpt['meta'].update( + seed=self.seed, + time=time.strftime('%Y%m%d_%H%M%S', time.localtime()), + mmengine=mmengine.__version__ + get_git_hash(), + ) + + if save_param_scheduler and hasattr(self, 'param_schedulers'): + extra_ckpt['param_schedulers'] = self.scheduler_state_dict() + + if (not save_optimizer + and self.model.zero_optimization_partition_weights() + and not self.model.zero_gather_16bit_weights_on_model_save()): + print_log( + 'Configured to `save_optimizer=False`, but currently using ' + "DeepSpeed's ZeRO stage 3 with " + '`gather_16bit_weights_on_model_save=False`. In ' + 'this configuration, the model cannot be saved properly ' + 'and will be saved with the optimizer state. ' + 'To support `save_optimizer=False`, please set ' + '`gather_16bit_weights_on_model_save=True` in your ' + 'DeepSpeed config.', + logger='current', + level=logging.WARNING) + save_optimizer = True + + state_dict_kwargs = {} + if digit_version(deepspeed.__version__) >= digit_version('0.13.2'): + state_dict_kwargs[ + 'exclude_frozen_parameters'] = self.exclude_frozen_parameters + + if save_optimizer: + if hasattr(self, 'optim_wrapper'): + # The key can not be 'optimizer', otherwise error will be + # thrown when loading or resuming checkpoint. 
+                extra_ckpt['optim_wrapper'] = self.optim_state_dict()
+
+            dirname, basename = osp.split(filename)
+            self.model.save_checkpoint(
+                dirname,
+                tag=basename,
+                client_state=extra_ckpt,
+                save_latest=False,
+                **state_dict_kwargs)
+        else:
+            if self.model.zero_optimization_partition_weights():
+                state_dict = self.model._zero3_consolidated_16bit_state_dict(
+                    **state_dict_kwargs)
+            else:
+                state_dict = self.model.module_state_dict(**state_dict_kwargs)
+
+            if is_main_process():
+                ckpt = {'state_dict': weights_to_cpu(state_dict), **extra_ckpt}
+                save_checkpoint(ckpt, filename)
diff --git a/head_extractor/build/lib/mmengine/_strategy/distributed.py b/head_extractor/build/lib/mmengine/_strategy/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c969b85b1d2bba27af6251c25eda94ca078340d
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/_strategy/distributed.py
@@ -0,0 +1,122 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from typing import Callable, Optional
+
+import torch.nn as nn
+from torch.nn.parallel import DistributedDataParallel
+
+from mmengine.device import get_device
+from mmengine.dist import init_dist, is_distributed, master_only
+from mmengine.model import convert_sync_batchnorm, is_model_wrapper
+from mmengine.registry import MODEL_WRAPPERS, STRATEGIES
+from .single_device import SingleDeviceStrategy
+
+
+@STRATEGIES.register_module()
+class DDPStrategy(SingleDeviceStrategy):
+    """Distribution strategy for distributed data parallel training.
+
+    Args:
+        model_wrapper (dict): Dict for model wrapper. Defaults to None.
+        sync_bn (str): Type of sync batch norm. Defaults to None.
+            Options are 'torch' and 'mmcv'.
+        **kwargs: Other arguments for :class:`BaseStrategy`.
+    """
+
+    def __init__(
+        self,
+        *,
+        model_wrapper: Optional[dict] = None,
+        sync_bn: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.model_wrapper = model_wrapper
+        self.sync_bn = sync_bn
+
+    def _setup_distributed(  # type: ignore
+        self,
+        launcher: str = 'pytorch',
+        backend: str = 'nccl',
+        **kwargs,
+    ):
+        """Setup distributed environment.
+
+        Args:
+            launcher (str): Way to launch multiple processes. Supported
+                launchers are 'pytorch', 'mpi' and 'slurm'.
+            backend (str): Communication Backends. Supported backends are
+                'nccl', 'gloo' and 'mpi'. Defaults to 'nccl'.
+            **kwargs: Other arguments for :func:`init_dist`.
+        """
+        if not is_distributed():
+            init_dist(launcher, backend, **kwargs)
+
+    def convert_model(self, model: nn.Module) -> nn.Module:
+        """Convert all ``BatchNorm`` layers in the model to ``SyncBatchNorm``
+        (SyncBN) or ``mmcv.ops.sync_bn.SyncBatchNorm`` (MMSyncBN) layers.
+
+        Args:
+            model (nn.Module): Model to be converted.
+
+        Returns:
+            nn.Module: Converted model.
+        """
+        if self.sync_bn is not None:
+            try:
+                model = convert_sync_batchnorm(model, self.sync_bn)
+            except ValueError as e:
+                self.logger.error('cfg.sync_bn should be "torch" or '
+                                  f'"mmcv", but got {self.sync_bn}')
+                raise e
+
+        return model
+
+    def _wrap_model(self, model: nn.Module) -> DistributedDataParallel:
+        """Wrap the model to :obj:`MMDistributedDataParallel` or other custom
+        distributed data-parallel module wrappers.
+
+        Args:
+            model (nn.Module): Model to be wrapped.
+
+        Returns:
+            nn.Module or DistributedDataParallel: nn.Module or subclass of
+            ``DistributedDataParallel``.
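+
+        Example:
+            >>> # a hypothetical override of the default wrapper config;
+            >>> # ``find_unused_parameters`` is a standard DDP argument
+            >>> strategy = DDPStrategy(
+            ...     model_wrapper=dict(
+            ...         type='MMDistributedDataParallel',
+            ...         broadcast_buffers=False,
+            ...         find_unused_parameters=True))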
+ """ + if is_model_wrapper(model): + return model + + model = model.to(get_device()) + + model = self.convert_model(model) + + if self.model_wrapper is None: + # set broadcast_buffers as False to keep compatibility with + # OpenMMLab repos + self.model_wrapper = dict( + type='MMDistributedDataParallel', broadcast_buffers=False) + + default_args = dict( + type='MMDistributedDataParallel', + module=model, + device_ids=[int(os.environ['LOCAL_RANK'])]) + model = MODEL_WRAPPERS.build( + self.model_wrapper, default_args=default_args) + return model + + @master_only + def save_checkpoint( + self, + filename: str, + *, + save_optimizer: bool = True, + save_param_scheduler: bool = True, + extra_ckpt: Optional[dict] = None, + callback: Optional[Callable] = None, + ) -> None: + super().save_checkpoint( + filename=filename, + save_optimizer=save_optimizer, + save_param_scheduler=save_param_scheduler, + extra_ckpt=extra_ckpt, + callback=callback) diff --git a/head_extractor/build/lib/mmengine/_strategy/fsdp.py b/head_extractor/build/lib/mmengine/_strategy/fsdp.py new file mode 100644 index 0000000000000000000000000000000000000000..0788fafdabb21824ed6c1fce4e5af5f75db03a54 --- /dev/null +++ b/head_extractor/build/lib/mmengine/_strategy/fsdp.py @@ -0,0 +1,643 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import inspect +import os +import os.path as osp +import time +from collections import OrderedDict +from functools import partial +from typing import Callable, Dict, List, Optional, Sequence, Union + +import torch.nn as nn +from torch.distributed.fsdp import (FullStateDictConfig, + FullyShardedDataParallel, + LocalStateDictConfig, StateDictType) +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + FullOptimStateDictConfig, LocalOptimStateDictConfig, OptimStateDictConfig, + StateDictConfig) +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LRScheduler + +import mmengine +from mmengine.config import Config, ConfigDict +from mmengine.device import get_device +from mmengine.dist import get_rank, is_main_process +from mmengine.model import BaseDataPreprocessor, is_model_wrapper +from mmengine.optim import (AmpOptimWrapper, BaseOptimWrapper, OptimWrapper, + OptimWrapperDict, _ParamScheduler, + build_optim_wrapper) +from mmengine.registry import (FUNCTIONS, MODEL_WRAPPERS, OPTIM_WRAPPERS, + PARAM_SCHEDULERS, STRATEGIES, Registry) +from mmengine.utils import get_git_hash, mkdir_or_exist +from .distributed import DDPStrategy +from .utils import MetaTensorContext + +FSDP = FullyShardedDataParallel +FSDP_CONFIGS = Registry('fsdp configs') +FSDP_CONFIGS.register_module(module=FullOptimStateDictConfig) +FSDP_CONFIGS.register_module(module=LocalOptimStateDictConfig) +FSDP_CONFIGS.register_module(module=FullStateDictConfig) +FSDP_CONFIGS.register_module(module=LocalStateDictConfig) + + +@STRATEGIES.register_module() +class FSDPStrategy(DDPStrategy): + """Support training model with FullyShardedDataParallel (FSDP). + + Keyword Args: + model_wrapper (dict, optional): Config dict for model wrapper. The + default configuration is: + + Examples: + >>> model_wrapper = dict( + >>> type='MMFullyShardedDataParallel', + >>> use_orig_params=True, + >>> ) + + See more configurable arguments in + :class:`MMFullyShardedDataParallel`. Defaults to None + skip_init_weights (bool, optional): Whether to skip initialization of + weights. Defaults to False. 
This is useful when the parameters of
+            the large model are loaded from a checkpoint, since skipping the
+            initialization of weights can save a lot of time.
+        state_dict_cfg (str or dict): Configuration for
+            how to save and load the state dict of the model, optimizer, and
+            scheduler.
+
+            - "local": save and load the sharded state dict in all ranks.
+            - "full": save and load the full state dict in rank 0.
+            - `dict` object: save and load the state dict more flexibly. For
+              example, you can first offload the state dict to the 'cpu' and
+              then save it to the disk. This can help you to load the
+              checkpoint in a non-gpu environment:
+
+              Examples:
+                >>> state_dict_cfg=dict(
+                >>>     state_dict_type='FULL_STATE_DICT',
+                >>>     state_dict_config=dict(type='FullStateDictConfig', offload_to_cpu=True),
+                >>>     optim_state_dict_config=dict(type='FullOptimStateDictConfig', offload_to_cpu=True),
+                >>> )
+
+              See more configurable arguments for ``state_dict_cfg``,
+              ``state_dict_config``, and ``optim_state_dict_config`` in
+              `FSDP official api documents`_
+        kwargs (dict): Additional arguments passed to :class:`DDPStrategy`:
+
+            - work_dir (str): The working directory to save checkpoints.
+              The logs will be saved in the subdirectory of `work_dir` named
+              :attr:`timestamp`. Defaults to 'work_dirs'.
+            - experiment_name (str, optional): Name of current experiment. If
+              not specified, timestamp will be used as :attr:`experiment_name`.
+              Defaults to None.
+            - env_kwargs (dict, optional): Environment config passed in
+              :meth:`setup_env`. Defaults to None.
+            - log_kwargs (dict, optional): Logger config passed in
+              :meth:`build_logger`. Defaults to None.
+        activation_checkpointing (dict, optional): Config dict for gradient
+            checkpoint.
+
+            Examples:
+                >>> activation_checkpointing = dict(check_fn='CustomCheckFn')
+                >>> activation_checkpointing = dict(
+                ...     check_fn=dict(type='CustomCheckFn', arg1=arg1))
+
+            ``check_fn`` field should behave consistently with
+            ``auto_wrap_policy`` defined in `model_wrapper`, and other
+            fields will be passed to ``apply_activation_checkpointing``
+
+            `New in version 0.9.0.`
+
+    .. _FSDP official api documents: https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.set_state_dict_type
+    """  # noqa: E501
+
+    def __init__(self,
+                 *,
+                 model_wrapper: Optional[dict] = None,
+                 skip_init_weights=False,
+                 state_dict_cfg: Union[str, dict] = 'local',
+                 activation_checkpointing: Optional[dict] = None,
+                 **kwargs):
+        super().__init__(model_wrapper=model_wrapper, **kwargs)
+        self._init_state_dict_cfg(state_dict_cfg)
+        if not isinstance(skip_init_weights, bool):
+            raise TypeError('skip_init_weights must be a boolean, but got '
+                            f'{type(skip_init_weights)}')
+        self.skip_init_weights = skip_init_weights
+        self.activation_checkpointing = activation_checkpointing
+
+    def _wrap_model(self, model: nn.Module) -> None:
+        """Wrap the model to :obj:`MMFullyShardedDataParallel` or other
+        custom fully sharded data parallel module wrappers.
+
+        Args:
+            model (nn.Module): Model to be wrapped.
+
+        Returns:
+            FullyShardedDataParallel: ``MMFullyShardedDataParallel``
+            or subclass of ``FullyShardedDataParallel``.
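+
+        Example:
+            >>> # a hedged sketch of a custom wrapper config;
+            >>> # ``use_orig_params`` is a standard FSDP argument
+            >>> strategy = FSDPStrategy(
+            ...     model_wrapper=dict(
+            ...         type='MMFullyShardedDataParallel',
+            ...         use_orig_params=True))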
+ """ + try: + from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import \ + apply_activation_checkpointing # noqa: E501 + except ImportError: + apply_activation_checkpointing = None + + for module in model.modules(): + if isinstance(module, BaseDataPreprocessor): + module.to(get_device()) + + if is_model_wrapper(model): + return + + if self.model_wrapper is None: + self.model_wrapper = dict(type='MMFullyShardedDataParallel') + + default_args = dict( + module=model, + device_id=int(os.environ['LOCAL_RANK']), + type='MMFullyShardedDataParallel') + model = MODEL_WRAPPERS.build( + self.model_wrapper, default_args=default_args) + model.set_state_dict_type(model, self.state_dict_type, + self.state_dict_config, + self.optim_state_dict_config) + + if self.activation_checkpointing is not None: + if apply_activation_checkpointing is None: + raise RuntimeError( + 'activation_checkpointing maybe deprecated by current ' + 'PyTorch version, maybe you could switch to PyTorch 2.0 ' + 'or 2.1 to use `activation_checkpointing`.') + cfg = copy.deepcopy(self.activation_checkpointing) + with FUNCTIONS.switch_scope_and_registry(None): + check_fn = cfg.pop('check_fn') + if isinstance(check_fn, str): + check_fn = FUNCTIONS.get(check_fn) + elif isinstance(check_fn, dict): + fn_type = check_fn.pop('type') + if isinstance(fn_type, str): + fn_type = FUNCTIONS.get(fn_type) + check_fn = partial(fn_type, **cfg) + + if not callable(check_fn): + raise TypeError('`check_fn` must be a callable function') + apply_activation_checkpointing(model, check_fn=check_fn, **cfg) + return model + + def _is_full_state_dict(self): + """Whether to save and load the full state_dict in rank 0.""" + return self.state_dict_type == StateDictType.FULL_STATE_DICT + + def build_model(self, model: Union[nn.Module, dict]) -> nn.Module: + """Build model. + + If skip_init_weights is True, the model will be built with an empty + weights. It means that :meth:`load_checkpoint` must be called to fill + the weights before training. + + Args: + model (nn.Module or dict): A ``nn.Module`` object or a dict to + build ``nn.Module`` object. If ``model`` is a ``nn.Module`` + object, just returns itself. + + Returns: + nn.Module: Model build from ``model``. + """ + if self.skip_init_weights: + if isinstance(model, dict): + # Accelerate initialization by skipping init weights + with MetaTensorContext(): + model = super().build_model(model) + model.to_empty(device='cpu') + else: + model = super().build_model(model) + + # `id_to_name` will be used to convert the `optim_state_dict` of the + # raw optimizer to the `optim_state_dict` + # returned by `FSDP.optim_state_dict` in + # `StateDictType.FULL_STATE_DICT` mode. + self.id_to_name = dict() + for name, param in model.named_parameters(): + self.id_to_name[id(param)] = name + return model + + def save_checkpoint(self, + filename: str, + *, + save_optimizer: bool = True, + save_param_scheduler: bool = True, + extra_ckpt: Optional[dict] = None, + callback: Optional[Callable] = None) -> None: + """Save checkpoint to given ``filename``. + + If ``state_dict_type`` is `full`, the checkpoint will only be saved in + rank0. The structure of the saved checkpoint is the same as the one + saved by ``DDPStrategy`` + + If ``state_dict_type`` is `local`, each rank will save the sharded + state dict to a directory, which means the saved structure will look + like this: + + .. code-block:: bash + + ── epoch_0.pth + ├── rank0.pth + ├── rank1.pth + ├── ... 
+            └── rank8.pth
+
+        Args:
+            filename (str): Filename to save checkpoint.
+
+        Keyword Args:
+            save_optimizer (bool): Whether to save the optimizer to
+                the checkpoint. Defaults to True.
+            save_param_scheduler (bool): Whether to save the param_scheduler
+                to the checkpoint. Defaults to True.
+            extra_ckpt (dict, optional): Extra checkpoint to save.
+                Defaults to None.
+            callback (callable, optional): Callback function to modify the
+                checkpoint before saving the checkpoint.
+                Defaults to None.
+        """
+        from mmengine.runner.checkpoint import save_checkpoint
+
+        state_dict: dict = dict()
+        state_dict['state_dict'] = self.model_state_dict()
+
+        # save optimizer state dict
+        if save_optimizer and hasattr(self, 'optim_wrapper'):
+            state_dict['optimizer'] = self.optim_state_dict()
+
+        # save param scheduler state dict
+        if save_param_scheduler and hasattr(self, 'param_schedulers'):
+            state_dict['param_schedulers'] = self.scheduler_state_dict()
+
+        # save extra checkpoint passed by users
+        if extra_ckpt is None:
+            extra_ckpt = dict()
+        if 'meta' not in extra_ckpt:
+            extra_ckpt['meta'] = dict()
+
+        extra_ckpt['meta'].update(
+            seed=self.seed,
+            time=time.strftime('%Y%m%d_%H%M%S', time.localtime()),
+            mmengine=mmengine.__version__ + get_git_hash(),
+        )
+        state_dict.update(extra_ckpt)
+
+        # users can do some modification before saving checkpoint
+        if callback is not None:
+            callback(state_dict)
+
+        # In non-FULL_STATE_DICT mode, FSDPStrategy will save the checkpoint
+        # of each rank in a separate file.
+        if not self._is_full_state_dict():
+            rank = get_rank()
+            mkdir_or_exist(filename)
+            ckpt_name = f'rank{rank}.pth'
+            filename = osp.join(filename, ckpt_name)
+            save_checkpoint(state_dict, filename)
+        elif is_main_process():
+            # In FULL_STATE_DICT mode the gathered state dict only lives on
+            # rank 0, so only the main process writes the checkpoint.
+            save_checkpoint(state_dict, filename)
+
+    def model_state_dict(self) -> dict:
+        """Get model state dict based on the ``state_dict_type``.
+
+        If ``state_dict_type`` is `full`, the model state dict will be the
+        same as the one of the original unsharded model.
+
+        If ``state_dict_type`` is ``local``, and ``use_orig_params`` is
+        ``True`` in ``model_wrapper``, the keys of the state dict will be the
+        same as the ones of the original unsharded model, but the values will
+        be the sharded ones.
+
+        If ``state_dict_type`` is `local`, and ``use_orig_params`` is
+        ``False`` in ``model_wrapper``, the flattened and sharded state dict
+        will be returned.
+
+        See more details in the `official api documents`_
+
+        .. _official api documents: https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.optim_state_dict
+        """  # noqa: E501
+        # We've set state_dict by `FSDP.set_state_dict_type`, therefore we
+        # should get model state dict by `FSDP.state_dict`
+        return self.model.state_dict()
+
+    def optim_state_dict(self) -> dict:
+        """Get optimizer state dict based on the ``state_dict_type``.
+
+        If ``state_dict_type`` is ``full``, the optimizer state dict can be
+        loaded by the original unsharded optimizer.
+
+        Otherwise, the optimizer state dict could only be loaded by the
+        optimizer with sharded parameters.
+
+        Note:
+            The optimizer state dict is not the same as the one of the
+            original optimizer even in ``full`` mode, although they can be
+            loaded correctly.
+
+        See more details in the `official api documents`_
+
+        .. _official api documents: https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.optim_state_dict
+        """  # noqa: E501
+        return FSDP.optim_state_dict(self.model, self.optim_wrapper)
+
+    def load_checkpoint(self, filename: str, **kwargs) -> dict:
+        """Load checkpoint from given ``filename``.
+
+        Note:
+            If ``state_dict_type`` is `local`, the filename should be a
+            directory containing ``rank{i}.pth``.
+
+        Args:
+            filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+                ``open-mmlab://xxx``.
+
+        Keyword Args:
+            map_location (str or callable): A string or a callable function
+                specifying how to remap storage locations.
+                Defaults to 'cpu'.
+            strict (bool): Whether to allow different params for the model
+                and checkpoint.
+            revise_keys (list): A list of customized keywords to modify the
+                state_dict in checkpoint. Each item is a (pattern, replacement)
+                pair of the regular expression operations. Defaults to strip
+                the prefix 'module.' by [(r'^module\\.', '')].
+            callback (callable, optional): Callback function to modify the
+                checkpoint after loading the checkpoint.
+                Defaults to None.
+        """
+        if self._is_full_state_dict():
+            return super(DDPStrategy,
+                         self).load_checkpoint(filename, **kwargs)
+        else:
+            rank = get_rank()
+            filename = osp.join(filename, f'rank{rank}.pth')
+            return super(DDPStrategy,
+                         self).load_checkpoint(filename, **kwargs)
+
+    def load_model_state_dict(
+        self,
+        state_dict: dict,
+        *,
+        strict: bool = False,
+        revise_keys: list = [(r'^module.', '')],
+    ) -> None:  # type: ignore
+        """Load model state from dict.
+
+        Warning:
+            `revise_keys` is not supported yet.
+
+        Args:
+            state_dict (dict): Model state dict returned by
+                :meth:`FSDPStrategy.model_state_dict`. If ``state_dict_type``
+                is ``full``, ``state_dict`` could be the result of
+                ``model.state_dict()``
+            strict (bool): Whether to load model state dict strictly.
+                Defaults to False.
+        """
+        # We should load state dict by `FSDP.load_state_dict`
+        self.model.load_state_dict(state_dict, strict=strict)
+
+    def load_optim_state_dict(self, state_dict: dict) -> None:
+        """Load optimizer state from dict.
+
+        Args:
+            state_dict (dict): The optimizer state dict. If ``state_dict_type``
+                is ``full``, ``state_dict`` could be the result of
+                ``optimizer.state_dict()``
+        """
+        optim_state_dict = FSDP.optim_state_dict_to_load(
+            state_dict, self.model, self.optim_wrapper.optimizer)
+        self.optim_wrapper.load_state_dict(optim_state_dict)
+
+    def _init_state_dict_cfg(self, state_dict_cfg: Union[str, dict]) -> None:
+        """Allow ``state_dict_type`` and ``state_dict_config`` to be
+        configured with strings."""
+        if isinstance(state_dict_cfg, str):
+            if state_dict_cfg == 'full':
+                self.state_dict_type = StateDictType.FULL_STATE_DICT
+                self.state_dict_config = FullStateDictConfig(
+                    rank0_only=True, offload_to_cpu=True)
+                self.optim_state_dict_config = FullOptimStateDictConfig(
+                    rank0_only=True, offload_to_cpu=True)
+            elif state_dict_cfg == 'local':
+                self.state_dict_type = StateDictType.LOCAL_STATE_DICT
+                self.state_dict_config = LocalStateDictConfig()
+                self.optim_state_dict_config = LocalOptimStateDictConfig()
+            else:
+                raise ValueError('FSDP only supports `full` and `local` '
+                                 f'state_dict_type, but got {state_dict_cfg}')
+        elif isinstance(state_dict_cfg, dict):
+            if 'state_dict_type' not in state_dict_cfg:
+                self.state_dict_type = StateDictType.LOCAL_STATE_DICT
+            else:
+                state_dict_type = state_dict_cfg['state_dict_type']
+                if isinstance(state_dict_type, str):
+                    self.state_dict_type = StateDictType[
+                        state_dict_cfg['state_dict_type']]
+                else:
+                    self.state_dict_type = state_dict_type
+            state_dict_config = state_dict_cfg.get('state_dict_config')
+            if state_dict_config is None:
+                self.state_dict_config = LocalStateDictConfig()
+            elif isinstance(state_dict_config, dict):
+                self.state_dict_config = FSDP_CONFIGS.build(
+                    state_dict_cfg['state_dict_config'])
+            else:
+                self.state_dict_config = state_dict_config
+
+            optim_state_dict_config = state_dict_cfg.get(
+                'optim_state_dict_config')
+            if optim_state_dict_config is None:
+                self.optim_state_dict_config = LocalOptimStateDictConfig()
+            elif isinstance(optim_state_dict_config, dict):
+                self.optim_state_dict_config = FSDP_CONFIGS.build(
+                    state_dict_cfg['optim_state_dict_config'])
+            else:
+                self.optim_state_dict_config = optim_state_dict_config
+        else:
+            raise TypeError('state_dict_cfg should be a `str` or a `dict`, '
+                            f'but got {type(state_dict_cfg)}')
+
+        if not isinstance(self.state_dict_type, StateDictType):
+            raise TypeError('state_dict_type must be StateDictType, but got '
+                            f'{type(self.state_dict_type)}')
+        if not isinstance(self.state_dict_config, StateDictConfig):
+            raise TypeError('state_dict_config must be StateDictConfig, but '
+                            f'got {type(self.state_dict_config)}')
+        if not isinstance(self.optim_state_dict_config, OptimStateDictConfig):
+            raise TypeError('optim_state_dict_config must be '
+                            'OptimStateDictConfig, but got '
+                            f'{type(self.optim_state_dict_config)}')
+
+    def build_optim_wrapper(
+        self,
+        optim_wrapper: Union[Optimizer, OptimWrapper, dict],
+        model: Optional[nn.Module] = None,
+    ) -> BaseOptimWrapper:
+        """Support sharding the optimizer state dict given a built optimizer
+        or optim_wrapper.
+
+        See specific usage in :meth:`BaseStrategy.build_optim_wrapper`.
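+
+        Example:
+            >>> # a minimal sketch; ``model`` is assumed to be built by
+            >>> # ``build_model`` so that ``id_to_name`` is populated, and
+            >>> # the optimizer state must still be empty
+            >>> from torch.optim import AdamW
+            >>> optim_wrapper = strategy.build_optim_wrapper(
+            ...     OptimWrapper(AdamW(model.parameters(), lr=1e-4)), model)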
+ """ + if isinstance(optim_wrapper, Optimizer): + optim_wrapper = OptimWrapper(optim_wrapper) + if isinstance(optim_wrapper, BaseOptimWrapper): + assert model is not None + # NOTE: The only difference is that FSDPStrategy will shard + # the the built OptimWrapper + optimizer = optim_wrapper.optimizer + param_groups = optimizer.param_groups + optim_state_dict = optimizer.state_dict() + assert not optim_state_dict['state'], ( + 'Optimizer state_dict should be empty when giving an built ' + 'optim_wrapper to FSDPStrategy') + # Align the state_dict with state_dict generated by + # FSDP.full_optim_state_dict + new_param_groups = [] + for group in param_groups: + new_group = { + key: value + for key, value in group.items() if key != 'param' + } + new_group['params'] = [ + self.id_to_name[id(param)] for param in group['params'] + ] + new_param_groups.append(new_group) + optim_state_dict['param_groups'] = new_param_groups + defaults = { + k: v + for k, v in optimizer.defaults.items() if k != 'differentiable' + } + + params_dict = {} + for k, v in model.named_parameters(): + if '_fsdp_wrapped_module' in k: + k = k.replace('_fsdp_wrapped_module.', '') + params_dict[k] = v + + params = [] + for param_group in new_param_groups: + _params = [] + for param_name in param_group['params']: + if param_name not in params_dict: + raise RuntimeError( + 'Failed to reconstruct the sharded optimizer. ' + 'You can try to set `use_orig_params=True` in ' + '`model_wrapper`') + _params.append(params_dict[param_name]) + param_group = { + k: v + for k, v in param_group.items() if k != 'param' + } + param_group['params'] = _params + params.append(param_group) + + new_optimizer = optimizer.__class__(params, **defaults) + + # Force to load the converted optim_state_dict in full mode. + with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT): + optim_state_dict = FSDP.optim_state_dict_to_load( + optim_state_dict, model, new_optimizer) + new_optimizer.load_state_dict(optim_state_dict) + optim_wrapper.optimizer = new_optimizer + return optim_wrapper + if isinstance(optim_wrapper, (dict, ConfigDict, Config)): + assert model is not None + # optimizer must be defined for single optimizer training. + optimizer = optim_wrapper.get('optimizer', None) + optim_wrapper.setdefault('type', 'OptimWrapper') + if optim_wrapper.get('type', + 'AmpOptimWrapper') in ('AmpOptimWrapper', + AmpOptimWrapper): + optim_wrapper.setdefault('use_fsdp', True) + + # If optimizer is a built `Optimizer` instance, the optimizer + # wrapper should be built by `OPTIM_WRAPPERS` registry. + if isinstance(optimizer, Optimizer): + return OPTIM_WRAPPERS.build(optim_wrapper) # type: ignore + + # If `optimizer` is not None or `constructor` is defined, it means, + # optimizer wrapper will be built by optimizer wrapper + # constructor. Therefore, `build_optim_wrapper` should be called. + if optimizer is not None or 'constructor' in optim_wrapper: + return build_optim_wrapper(model, optim_wrapper) + else: + # if `optimizer` is not defined, it should be the case of + # training with multiple optimizers. If `constructor` is not + # defined either, each value of `optim_wrapper` must be an + # `OptimWrapper` instance since `DefaultOptimizerConstructor` + # will not handle the case of training with multiple + # optimizers. 
`build_optim_wrapper` will directly build the + # `OptimWrapperDict` instance from `optim_wrapper`. + optim_wrappers = OrderedDict() + for name, optim in optim_wrapper.items(): + if not isinstance(optim, OptimWrapper): + raise ValueError( + 'each item must be an optimizer object when ' + '"type" and "constructor" are not in ' + f'optimizer, but got {name}={optim}') + optim_wrappers[name] = optim + return OptimWrapperDict(**optim_wrappers) + else: + raise TypeError('optimizer wrapper should be an OptimWrapper ' + f'object or dict, but got {optim_wrapper}') + + def _build_param_scheduler( + self, + scheduler: Union[_ParamScheduler, Dict, List], + optim_wrapper: BaseOptimWrapper, + default_args: dict, + ) -> List[_ParamScheduler]: + """Override this method to update the scheduler with the reconstructed + sharded optimizer.""" + if not isinstance(scheduler, Sequence): + schedulers = [scheduler] + else: + schedulers = scheduler + + max_epochs = default_args.pop('max_epochs', None) + max_iters = default_args.pop('max_iters', None) + + param_schedulers = [] + for scheduler in schedulers: + # Update the built scheduler with the sharded optimizer + if isinstance(scheduler, (_ParamScheduler, LRScheduler)): + parameter_keys = inspect.signature( + scheduler.__class__).parameters.keys() + kwargs = { + k: v + for k, v in scheduler.state_dict().items() + if k in parameter_keys + } + scheduler = scheduler.__class__(optim_wrapper, **kwargs) + elif isinstance(scheduler, dict): + _scheduler = copy.deepcopy(scheduler) + + # Set default end + if _scheduler.get('by_epoch', True): + if max_epochs is None: + raise ValueError( + 'max_epochs must be specified in default_args') + default_end = max_epochs + else: + if max_iters is None: + raise ValueError( + 'max_iters must be specified in default_args') + default_end = max_iters + _scheduler.setdefault('end', default_end) + self.logger.debug( + f'The `end` of {_scheduler["type"]} is not set. ' + 'Use the max epochs/iters of train loop as default.') + + param_schedulers.append( + PARAM_SCHEDULERS.build( + _scheduler, + default_args=dict( + optimizer=optim_wrapper, **default_args))) + else: + raise TypeError( + 'scheduler should be a _ParamScheduler object or dict, ' + f'but got {scheduler}') + return param_schedulers diff --git a/head_extractor/build/lib/mmengine/_strategy/single_device.py b/head_extractor/build/lib/mmengine/_strategy/single_device.py new file mode 100644 index 0000000000000000000000000000000000000000..c7d8accd5a73e02121f19c75c644202de6a62d1a --- /dev/null +++ b/head_extractor/build/lib/mmengine/_strategy/single_device.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time +from typing import Callable, Dict, List, Optional, Union + +import torch.nn as nn + +import mmengine +from mmengine.device import get_device +from mmengine.model import revert_sync_batchnorm +from mmengine.optim import BaseOptimWrapper, _ParamScheduler +from mmengine.registry import STRATEGIES +from mmengine.utils import get_git_hash +from .base import BaseStrategy + + +@STRATEGIES.register_module() +class SingleDeviceStrategy(BaseStrategy): + """Strategy for single device training.""" + + def prepare( + self, + model: Union[nn.Module, dict], + *, + optim_wrapper: Union[BaseOptimWrapper, dict, None] = None, + param_scheduler: Union[_ParamScheduler, Dict, List, None] = None, + compile: Union[dict, bool] = False, + dispatch_kwargs: Optional[dict] = None, + ): + """Prepare model and some components.
+ + Args: + model (:obj:`torch.nn.Module` or dict): The model to be run. It + can be a dict used to build a model. + + Keyword Args: + optim_wrapper (BaseOptimWrapper or dict, optional): Computing the + gradient of model parameters and updating them. + Defaults to None. + See :meth:`build_optim_wrapper` for examples. + param_scheduler (_ParamScheduler or dict or list, optional): + Parameter scheduler for updating optimizer parameters. If + specified, :attr:`optim_wrapper` should also be specified. + Defaults to None. + See :meth:`build_param_scheduler` for examples. + compile (dict, optional): Config to compile model. + Defaults to False. Requires PyTorch>=2.0. + dispatch_kwargs (dict, optional): Kwargs to be passed to other + methods of Strategy. Defaults to None. + If ``accumulative_counts`` is set in ``optim_wrapper``, you + need to provide ``max_iters`` in ``dispatch_kwargs``. + """ + if self._prepared: + return self._prepared_components() + if dispatch_kwargs is not None: + self.dispatch_kwargs.update(dispatch_kwargs) + + model = self.build_model(model) + model = self._init_model_weights(model) + model = self._wrap_model(model) + model = self.compile_model(model, compile=compile) + + self.model = model + + if optim_wrapper is not None: + self.optim_wrapper = self.build_optim_wrapper(optim_wrapper, model) + self._scale_lr() + + accumulative_counts = getattr(self.optim_wrapper, + '_accumulative_counts', 1) + if accumulative_counts > 1: + if 'max_iters' not in self.dispatch_kwargs: + raise ValueError( + '"max_iters" must be specified because ' + '"accumulative_counts" was set as ' + f'{accumulative_counts} which is greater than 1.') + + self.optim_wrapper.initialize_count_status( # type: ignore + self.model, 0, self.dispatch_kwargs['max_iters']) + + if param_scheduler is not None: + self.param_schedulers = self.build_param_scheduler( + param_scheduler, self.optim_wrapper) + + self._prepared = True + return self._prepared_components() + + def _wrap_model(self, model: nn.Module) -> nn.Module: + model = self.convert_model(model) + current_device = get_device() + return model.to(current_device) + + def convert_model(self, model: nn.Module) -> nn.Module: + """Convert layers of model. + + Convert all ``SyncBatchNorm`` (SyncBN) and + ``mmcv.ops.sync_bn.SyncBatchNorm`` (MMSyncBN) layers in the model to + ``BatchNormXd`` layers. + + Args: + model (nn.Module): Model to convert. + """ + self.logger.info( + 'Distributed training is not used, all SyncBatchNorm (SyncBN) ' + 'layers in the model will be automatically reverted to ' + 'BatchNormXd layers if they are used.') + model = revert_sync_batchnorm(model) + return model + + def load_checkpoint( + self, + filename: str, + *, + map_location: Union[str, Callable] = 'cpu', + strict: bool = False, + revise_keys: list = [(r'^module.', '')], + callback: Optional[Callable] = None, + ) -> dict: + """Load checkpoint from given ``filename``. + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + + Keyword Args: + map_location (str or callable): A string or a callable function + specifying how to remap storage locations. + Defaults to 'cpu'. + strict (bool): Whether to allow different params for the model and + checkpoint. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Defaults to strip + the prefix 'module.' by [(r'^module\\.', '')].
callback (callable, optional): Callback function to modify the + checkpoint after loading it. + Defaults to None. + """ + from mmengine.runner.checkpoint import _load_checkpoint + + self.logger.info(f'Load checkpoint from {filename}') + + if map_location == 'default': + device = get_device() + checkpoint = _load_checkpoint(filename, map_location=device) + else: + checkpoint = _load_checkpoint(filename, map_location=map_location) + + # users can do some modification after loading checkpoint + if callback is not None: + callback(checkpoint) + + state_dict = checkpoint.pop('state_dict') + self.load_model_state_dict( + state_dict, strict=strict, revise_keys=revise_keys) + + return checkpoint + + def resume( + self, + filename: str, + *, + resume_optimizer: bool = True, + resume_param_scheduler: bool = True, + map_location: Union[str, Callable] = 'default', + callback: Optional[Callable] = None, + ) -> dict: + """Resume training from given ``filename``. + + Four types of states will be resumed. + + - model state + - optimizer state + - scheduler state + - randomness state + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + + Keyword Args: + resume_optimizer (bool): Whether to resume optimizer state. + Defaults to True. + resume_param_scheduler (bool): Whether to resume param scheduler + state. Defaults to True. + map_location (str or callable): A string or a callable function + specifying how to remap storage locations. + Defaults to 'default'. + callback (callable, optional): Callback function to modify the + checkpoint after loading it. + Defaults to None. + """ + self.logger.info(f'Resume checkpoint from {filename}') + + checkpoint = self.load_checkpoint( + filename, map_location=map_location, callback=callback) + + if resume_optimizer: + self.load_optim_state_dict(checkpoint.pop('optimizer')) + + if resume_param_scheduler and hasattr(self, 'param_schedulers'): + self.load_scheduler_state_dict(checkpoint.pop('param_schedulers')) + + # resume random seed + resumed_seed = checkpoint['meta'].get('seed', None) + current_seed = self._randomness.get('seed') + if resumed_seed is not None and resumed_seed != current_seed: + if current_seed is not None: + self.logger.warning(f'The value of random seed in the ' + f'checkpoint "{resumed_seed}" is ' + f'different from the value in ' + f'`randomness` config "{current_seed}"') + self._randomness.update(seed=resumed_seed) + self._set_randomness(**self._randomness) + + # resume iter + cur_iter = checkpoint['meta']['iter'] + + if hasattr(self, 'optim_wrapper'): + accumulative_counts = getattr(self.optim_wrapper, + '_accumulative_counts', 1) + if accumulative_counts > 1: + if 'max_iters' not in self.dispatch_kwargs: + raise ValueError( + '"max_iters" must be specified because ' + '"accumulative_counts" was set as ' + f'{accumulative_counts} which is greater than 1.') + # Initiate inner count of `optim_wrapper`. + self.optim_wrapper.initialize_count_status( # type: ignore + self.model, cur_iter, self.dispatch_kwargs['max_iters']) + + return checkpoint + + def save_checkpoint( + self, + filename: str, + *, + save_optimizer: bool = True, + save_param_scheduler: bool = True, + extra_ckpt: Optional[dict] = None, + callback: Optional[Callable] = None, + ) -> None: + """Save checkpoint to given ``filename``. + + Args: + filename (str): Filename to save checkpoint. + + Keyword Args: + save_optimizer (bool): Whether to save the optimizer to + the checkpoint. Defaults to True.
+ save_param_scheduler (bool): Whether to save the param_scheduler + to the checkpoint. Defaults to True. + extra_ckpt (dict, optional): Extra checkpoint to save. + Defaults to None. + callback (callable, optional): Callback function to modify the + checkpoint before saving it. + Defaults to None. + """ + from mmengine.runner.checkpoint import save_checkpoint + + state_dict: dict = dict() + state_dict['state_dict'] = self.model_state_dict() + + # save optimizer state dict + if save_optimizer and hasattr(self, 'optim_wrapper'): + state_dict['optimizer'] = self.optim_state_dict() + + if save_param_scheduler and hasattr(self, 'param_schedulers'): + state_dict['param_schedulers'] = self.scheduler_state_dict() + + # save extra checkpoint passed by users + if extra_ckpt is None: + extra_ckpt = dict() + if 'meta' not in extra_ckpt: + extra_ckpt['meta'] = dict() + extra_ckpt['meta'].update( + seed=self.seed, + time=time.strftime('%Y%m%d_%H%M%S', time.localtime()), + mmengine=mmengine.__version__ + get_git_hash(), + ) + + state_dict.update(extra_ckpt) + + # users can do some modification before saving checkpoint + if callback is not None: + callback(state_dict) + + save_checkpoint(state_dict, filename) diff --git a/head_extractor/build/lib/mmengine/_strategy/utils.py b/head_extractor/build/lib/mmengine/_strategy/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c691bd602bf106fa83a16bb92757e96fde5366ba --- /dev/null +++ b/head_extractor/build/lib/mmengine/_strategy/utils.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch._subclasses.fake_tensor import _is_tensor_constructor +from torch.utils._python_dispatch import TorchDispatchMode + + +class MetaTensorContext(TorchDispatchMode): + """Dispatch mode that redirects tensor constructors to the ``meta`` + device, so a model can be instantiated without allocating real + storage.""" + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + if _is_tensor_constructor(func): + # Rewrite the `device` argument (positional or keyword) to 'meta' + device_idx = [arg.name + for arg in func._schema.arguments].index('device') + if len(args) > device_idx: + args = list(args) + args[device_idx] = 'meta' + else: + kwargs['device'] = 'meta' + return func(*args, **kwargs) diff --git a/head_extractor/build/lib/mmengine/analysis/__init__.py b/head_extractor/build/lib/mmengine/analysis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e51090c387d665f633d550aa222e06debe26357e --- /dev/null +++ b/head_extractor/build/lib/mmengine/analysis/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .complexity_analysis import (ActivationAnalyzer, FlopAnalyzer, + activation_count, flop_count, + parameter_count, parameter_count_table) +from .print_helper import get_model_complexity_info + +__all__ = [ + 'FlopAnalyzer', 'ActivationAnalyzer', 'flop_count', 'activation_count', + 'parameter_count', 'parameter_count_table', 'get_model_complexity_info' +] diff --git a/head_extractor/build/lib/mmengine/analysis/complexity_analysis.py b/head_extractor/build/lib/mmengine/analysis/complexity_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..435e5fe5d3a1af4cf2d94ab11ae52910b290d5fc --- /dev/null +++ b/head_extractor/build/lib/mmengine/analysis/complexity_analysis.py @@ -0,0 +1,357 @@ +# Copyright (c) OpenMMLab. All rights reserved.
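+# +# A quick usage sketch (illustrative, not executed here; ``model`` and +# ``inputs`` stand for a built nn.Module and a tuple of input tensors): +# +# total_flops = FlopAnalyzer(model, inputs).total() +# n_params = parameter_count(model)['']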
+ +import typing +from collections import defaultdict +from typing import Any, Counter, DefaultDict, Dict, Optional, Tuple, Union + +import torch.nn as nn +from rich import box +from rich.console import Console +from rich.table import Table +from torch import Tensor + +from .jit_analysis import JitModelAnalysis +from .jit_handles import (Handle, addmm_flop_jit, batchnorm_flop_jit, + bmm_flop_jit, conv_flop_jit, einsum_flop_jit, + elementwise_flop_counter, generic_activation_jit, + linear_flop_jit, matmul_flop_jit, norm_flop_counter) + +# A dictionary that maps supported operations to their flop count jit handles. +_DEFAULT_SUPPORTED_FLOP_OPS: Dict[str, Handle] = { + 'aten::addmm': addmm_flop_jit, + 'aten::bmm': bmm_flop_jit, + 'aten::_convolution': conv_flop_jit, + 'aten::einsum': einsum_flop_jit, + 'aten::matmul': matmul_flop_jit, + 'aten::mm': matmul_flop_jit, + 'aten::linear': linear_flop_jit, + # You might want to ignore BN flops due to inference-time fusion. + # Use `set_op_handle("aten::batch_norm", None)`. + 'aten::batch_norm': batchnorm_flop_jit, + 'aten::group_norm': norm_flop_counter(2), + 'aten::layer_norm': norm_flop_counter(2), + 'aten::instance_norm': norm_flop_counter(1), + 'aten::upsample_nearest2d': elementwise_flop_counter(0, 1), + 'aten::upsample_bilinear2d': elementwise_flop_counter(0, 4), + 'aten::adaptive_avg_pool2d': elementwise_flop_counter(1, 0), + 'aten::grid_sampler': elementwise_flop_counter(0, 4), # assume bilinear +} + +# A dictionary that maps supported operations to +# their activation count handles. +_DEFAULT_SUPPORTED_ACT_OPS: Dict[str, Handle] = { + 'aten::_convolution': generic_activation_jit('conv'), + 'aten::addmm': generic_activation_jit(), + 'aten::bmm': generic_activation_jit(), + 'aten::einsum': generic_activation_jit(), + 'aten::matmul': generic_activation_jit(), + 'aten::linear': generic_activation_jit(), +} + + +class FlopAnalyzer(JitModelAnalysis): + """Provides access to per-submodule model flop count obtained by tracing a + model with pytorch's jit tracing functionality. + + By default, comes with standard flop counters for a few common operators. + + Note: + - Flop is not a well-defined concept. We just produce our best + estimate. + - We count one fused multiply-add as one flop. + + Handles for additional operators may be added, or the default ones + overwritten, using the ``.set_op_handle(name, func)`` method. + See the method documentation for details. + Flop counts can be obtained as: + + - ``.total(module_name="")``: total flop count for the module + - ``.by_operator(module_name="")``: flop counts for the module, as a + Counter over different operator types + - ``.by_module()``: Counter of flop counts for all submodules + - ``.by_module_and_operator()``: dictionary mapping each submodule name + to a Counter over different operator types + + An operator is treated as within a module if it is executed inside the + module's ``__call__`` method. Note that this does not include calls to + other methods of the module or explicit calls to ``module.forward(...)``. + + Modified from + https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/flop_count.py + + Args: + model (nn.Module): The model to analyze. + inputs (Union[Tensor, Tuple[Tensor, ...]]): The input to the model. + + Examples: + >>> import torch.nn as nn + >>> import torch + >>> class TestModel(nn.Module): + ... def __init__(self): + ... super().__init__() + ... self.fc = nn.Linear(in_features=1000, out_features=10) + ... self.conv = nn.Conv2d(
in_channels=3, out_channels=10, kernel_size=1 + ... ) + ... self.act = nn.ReLU() + ... def forward(self, x): + ... return self.fc(self.act(self.conv(x)).flatten(1)) + >>> model = TestModel() + >>> inputs = (torch.randn((1,3,10,10)),) + >>> flops = FlopAnalyzer(model, inputs) + >>> flops.total() + 13000 + >>> flops.total("fc") + 10000 + >>> flops.by_operator() + Counter({"addmm" : 10000, "conv" : 3000}) + >>> flops.by_module() + Counter({"" : 13000, "fc" : 10000, "conv" : 3000, "act" : 0}) + >>> flops.by_module_and_operator() + {"" : Counter({"addmm" : 10000, "conv" : 3000}), + "fc" : Counter({"addmm" : 10000}), + "conv" : Counter({"conv" : 3000}), + "act" : Counter() + } + """ + + def __init__( + self, + model: nn.Module, + inputs: Union[Tensor, Tuple[Tensor, ...]], + ) -> None: + super().__init__(model=model, inputs=inputs) + self.set_op_handle(**_DEFAULT_SUPPORTED_FLOP_OPS) + + __init__.__doc__ = JitModelAnalysis.__init__.__doc__ + + +class ActivationAnalyzer(JitModelAnalysis): + """Provides access to per-submodule model activation count obtained by + tracing a model with pytorch's jit tracing functionality. + + By default, comes with standard activation counters for convolutional and + dot-product operators. Handles for additional operators may be added, or + the default ones overwritten, using the ``.set_op_handle(name, func)`` + method. See the method documentation for details. Activation counts can be + obtained as: + + - ``.total(module_name="")``: total activation count for a module + - ``.by_operator(module_name="")``: activation counts for the module, + as a Counter over different operator types + - ``.by_module()``: Counter of activation counts for all submodules + - ``.by_module_and_operator()``: dictionary mapping each submodule name + to a Counter over different operator types + + An operator is treated as within a module if it is executed inside the + module's ``__call__`` method. Note that this does not include calls to + other methods of the module or explicit calls to ``module.forward(...)``. + + Modified from + https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/activation_count.py + + Args: + model (nn.Module): The model to analyze. + inputs (Union[Tensor, Tuple[Tensor, ...]]): The input to the model. + + Examples: + >>> import torch.nn as nn + >>> import torch + >>> class TestModel(nn.Module): + ... def __init__(self): + ... super().__init__() + ... self.fc = nn.Linear(in_features=1000, out_features=10) + ... self.conv = nn.Conv2d( + ... in_channels=3, out_channels=10, kernel_size=1 + ... ) + ... self.act = nn.ReLU() + ... def forward(self, x): + ...
return self.fc(self.act(self.conv(x)).flatten(1)) + >>> model = TestModel() + >>> inputs = (torch.randn((1,3,10,10)),) + >>> acts = ActivationAnalyzer(model, inputs) + >>> acts.total() + 1010 + >>> acts.total("fc") + 10 + >>> acts.by_operator() + Counter({"conv" : 1000, "addmm" : 10}) + >>> acts.by_module() + Counter({"" : 1010, "fc" : 10, "conv" : 1000, "act" : 0}) + >>> acts.by_module_and_operator() + {"" : Counter({"conv" : 1000, "addmm" : 10}), + "fc" : Counter({"addmm" : 10}), + "conv" : Counter({"conv" : 1000}), + "act" : Counter() + } + """ + + def __init__( + self, + model: nn.Module, + inputs: Union[Tensor, Tuple[Tensor, ...]], + ) -> None: + super().__init__(model=model, inputs=inputs) + self.set_op_handle(**_DEFAULT_SUPPORTED_ACT_OPS) + + __init__.__doc__ = JitModelAnalysis.__init__.__doc__ + + +def flop_count( + model: nn.Module, + inputs: Tuple[Any, ...], + supported_ops: Optional[Dict[str, Handle]] = None, +) -> Tuple[DefaultDict[str, float], Counter[str]]: + """Given a model and an input to the model, compute the per-operator Gflops + of the given model. + + Adopted from + https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/flop_count.py + + Args: + model (nn.Module): The model to compute flop counts. + inputs (tuple): Inputs that are passed to `model` to count flops. + Inputs need to be in a tuple. + supported_ops (dict(str,Callable) or None): provide additional + handlers for extra ops, or overwrite the existing handlers for + convolution, matmul and einsum. The key is the operator name and + the value is a function that takes (inputs, outputs) of the op. + We count one Multiply-Add as one FLOP. + + Returns: + tuple[defaultdict, Counter]: A dictionary that records the number of + gflops for each operation and a Counter that records the number of + unsupported operations. + """ + if supported_ops is None: + supported_ops = {} + flop_counter = FlopAnalyzer(model, inputs).set_op_handle(**supported_ops) + giga_flops = defaultdict(float) + for op, flop in flop_counter.by_operator().items(): + giga_flops[op] = flop / 1e9 + return giga_flops, flop_counter.unsupported_ops() + + +def activation_count( + model: nn.Module, + inputs: Tuple[Any, ...], + supported_ops: Optional[Dict[str, Handle]] = None, +) -> Tuple[DefaultDict[str, float], Counter[str]]: + """Given a model and an input to the model, compute the total number of + activations of the model. + + Adopted from + https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/activation_count.py + + Args: + model (nn.Module): The model to compute activation counts. + inputs (tuple): Inputs that are passed to `model` to count activations. + Inputs need to be in a tuple. + supported_ops (dict(str,Callable) or None): provide additional + handlers for extra ops, or overwrite the existing handlers for + convolution and matmul. The key is the operator name and the value + is a function that takes (inputs, outputs) of the op. + + Returns: + tuple[defaultdict, Counter]: A dictionary that records the number of + activations (in units of 1e6) for each operation and a Counter that + records the number of unsupported operations.
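+ + Example (a sketch reusing the ``TestModel`` doctest class defined + above; exact numbers depend on the model): + + >>> mega_acts, unsupported = activation_count( + ... TestModel(), (torch.randn((1, 3, 10, 10)),))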
+ """ + if supported_ops is None: + supported_ops = {} + act_counter = ActivationAnalyzer(model, + inputs).set_op_handle(**supported_ops) + mega_acts = defaultdict(float) + for op, act in act_counter.by_operator().items(): + mega_acts[op] = act / 1e6 + return mega_acts, act_counter.unsupported_ops() + + +def parameter_count(model: nn.Module) -> typing.DefaultDict[str, int]: + """Count parameters of a model and its submodules. + + Adopted from + https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/parameter_count.py + + Args: + model (nn.Module): the model to count parameters. + + Returns: + dict[str, int]: the key is either a parameter name or a module name. + The value is the number of elements in the parameter, or in all + parameters of the module. The key "" corresponds to the total + number of parameters of the model. + """ + count = defaultdict(int) # type: typing.DefaultDict[str, int] + for name, param in model.named_parameters(): + size = param.numel() + name = name.split('.') + for k in range(0, len(name) + 1): + prefix = '.'.join(name[:k]) + count[prefix] += size + return count + + +def parameter_count_table(model: nn.Module, max_depth: int = 3) -> str: + """Format the parameter count of the model (and its submodules or + parameters) + + Adopted from + https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/parameter_count.py + + Args: + model (nn.Module): the model to count parameters. + max_depth (int): maximum depth to recursively print submodules or + parameters + + Returns: + str: the table to be printed + """ + count: typing.DefaultDict[str, int] = parameter_count(model) + # pyre-fixme[24]: Generic type `tuple` expects at least 1 type parameter. + param_shape: typing.Dict[str, typing.Tuple] = { + k: tuple(v.shape) + for k, v in model.named_parameters() + } + + # pyre-fixme[24]: Generic type `tuple` expects at least 1 type parameter. + rows: typing.List[typing.Tuple] = [] + + def format_size(x: int) -> str: + if x > 1e8: + return f'{x / 1e9:.1f}G' + if x > 1e5: + return f'{x / 1e6:.1f}M' + if x > 1e2: + return f'{x / 1e3:.1f}K' + return str(x) + + def fill(lvl: int, prefix: str) -> None: + if lvl >= max_depth: + return + for name, v in count.items(): + if name.count('.') == lvl and name.startswith(prefix): + indent = ' ' * (lvl + 1) + if name in param_shape: + rows.append( + (indent + name, indent + str(param_shape[name]))) + else: + rows.append((indent + name, indent + format_size(v))) + fill(lvl + 1, name + '.') + + rows.append(('model', format_size(count.pop('')))) + fill(0, '') + + table = Table( + title=f'parameter count of {model.__class__.__name__}', box=box.ASCII2) + table.add_column('name') + table.add_column('#elements or shape') + + for row in rows: + table.add_row(*row) + + console = Console() + with console.capture() as capture: + console.print(table, end='') + + return capture.get() diff --git a/head_extractor/build/lib/mmengine/analysis/jit_analysis.py b/head_extractor/build/lib/mmengine/analysis/jit_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..17b294863a507718716b06b447161f8bf660e14e --- /dev/null +++ b/head_extractor/build/lib/mmengine/analysis/jit_analysis.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +# Modified from +# https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/jit_analysis.py + +import logging +import typing +import warnings +from collections import Counter +from copy import copy +from dataclasses import dataclass +from numbers import Number +from typing import (Any, Dict, Iterable, Iterator, List, Optional, Set, Tuple, + TypeVar, Union) + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor +from torch.jit import TracerWarning, _get_trace_graph + +from mmengine.logging import print_log +from .jit_handles import Handle + +T = TypeVar('T', bound='JitModelAnalysis') + +# Only ignore ops that are technically truly 0 flops: +# shape-manipulation ops, integer ops, memory copy ops +_IGNORED_OPS: Set[str] = { + 'aten::Int', + 'aten::ScalarImplicit', + 'aten::__and__', + 'aten::arange', + 'aten::bitwise_not', + 'aten::cat', + 'aten::chunk', + 'aten::clamp', + 'aten::clamp_', + 'aten::constant_pad_nd', + 'aten::contiguous', + 'aten::copy_', + 'aten::detach', + 'aten::dropout', + 'aten::empty', + 'aten::eq', + 'aten::expand', + 'aten::flatten', + 'aten::floor', + 'aten::floor_divide', + 'aten::full', + 'aten::full_like', + 'aten::gather', + 'aten::ge', + 'aten::gt', + 'aten::index', + 'aten::index_put_', + 'aten::masked_fill', + 'aten::max', + 'aten::narrow', + 'aten::new_empty', + 'aten::new_full', + 'aten::new_zeros', + 'aten::nonzero', + 'aten::ones', + 'aten::permute', + 'aten::relu', + 'aten::relu_', + 'aten::remainder', + 'aten::reshape', + 'aten::roll', + 'aten::select', + 'aten::size', + 'aten::slice', + 'aten::split', + 'aten::split_with_sizes', + 'aten::squeeze', + 'aten::stack', + 'aten::t', + 'aten::to', + 'aten::transpose', + 'aten::type_as', + 'aten::unbind', + 'aten::unsqueeze', + 'aten::unsqueeze_', + 'aten::view', + 'aten::zeros', + 'aten::zeros_like', +} + + +@dataclass +class Statistics: + """For keeping track of the various model statistics recorded during + analysis.""" + + counts: Dict[str, typing.Counter[str]] + unsupported_ops: Dict[str, typing.Counter[str]] + uncalled_mods: Set[str] + + +def _named_modules_with_dup(model: nn.Module, + prefix: str = '' + ) -> Iterable[Tuple[str, nn.Module]]: + """The same as `model.named_modules()`, except that it includes duplicated + modules that have more than one name.""" + yield prefix, model + for name, module in model._modules.items(): + if module is None: + continue + submodule_prefix = prefix + ('.' if prefix else '') + name + yield from _named_modules_with_dup(module, submodule_prefix) + + +def _named_modules_without_dup( + model: nn.Module) -> Iterator[Tuple[str, nn.Module]]: + """Like .named_modules(), but the results are slightly different for some + wrapped models.""" + seen = set() + for name, mod in _named_modules_with_dup(model): + if mod not in seen: + seen.add(mod) + yield name, mod + + +def _get_scoped_trace_graph( + module: nn.Module, + inputs: Union[Tensor, Tuple[Tensor, ...]], + aliases: Dict[Union[str, nn.Module], str], +) -> torch._C.Graph: + """Traces the provided module using torch.jit._get_trace_graph, but adds + submodule scope information to each graph node. + + The resulting graph is in-lined and has all model parameters treated as + inputs. The input model has the scope name '', while its descendants + have names of the form 'child.grandchild.grandgrandchild...'. 
+ + Args: + module (nn.Module): The module to trace. + inputs (tuple): Inputs used during the trace of the model. + aliases (dict[str or nn.Module, str]): maps modules and module + names to the canonical name to be used as the scope for + that module. + + Returns: + graph (torch._C.Graph): The pytorch JIT trace of the model + """ + + # torch.jit._get_trace_graph can trace torch functions like + # `aten::linear`, `aten::add` etc. However, a traced node (function) + # cannot tell which module called it. `ScopePushHook` and + # `ScopePopHook` let traced nodes expose the module name through + # `node.scopeName()`. + class ScopePushHook: + + def __init__(self, name: str) -> None: + self.name = name + + def __call__(self, module: nn.Module, inputs: Any) -> Any: + tracing_state = torch._C._get_tracing_state() + if tracing_state: + tracing_state.push_scope(self.name) + return inputs + + class ScopePopHook: + + def __call__(self, module: nn.Module, inputs: Any, + outputs: Any) -> Any: + tracing_state = torch._C._get_tracing_state() + if tracing_state: + tracing_state.pop_scope() + return outputs + + hook_handles: List[Any] = [] + + def register_hooks(mod: nn.Module, name: str) -> None: + prehook = mod.register_forward_pre_hook(ScopePushHook(name)) + posthook = mod.register_forward_hook(ScopePopHook()) + hook_handles.append(prehook) + hook_handles.append(posthook) + + # Unwrap DDP, but correct the scope names for the root module. + module_list = (nn.parallel.distributed.DistributedDataParallel, + nn.DataParallel) + # Since DataParallel just wraps the model, add an extra set of hooks + # to the model it wraps to account for the wrapper. Then trace it. + if isinstance(module, module_list): + root_name = aliases[module] + module = module.module + register_hooks(module, root_name) + + for name, mod in _named_modules_without_dup(module): + name = aliases[mod] + register_hooks(mod, name) + + graph, _ = _get_trace_graph(module, inputs) + + for handle in hook_handles: + handle.remove() + + return graph + + +class JitModelAnalysis: + """Provides access to per-submodule model statistics obtained by tracing a + model with pytorch's jit tracing functionality. + + Calculates a statistic on a per-operator basis using the provided set of + functions that acts on the inputs and outputs to the operator, then + aggregates this over modules in the model. Can return the aggregate + statistic for any submodule in the model. Is lazily evaluated, and will + perform the trace when a statistic is first requested. Changing the + operator handles will cause the trace to be rerun on the next request. + + Submodules may be referred to using the module's name. The input model has + name "", while its descendants have names of the form + "child.grandchild.grandgrandchild...". + + An operator is treated as within the scope of a module if calling that + module directly resulted in that operator being run. In particular, this + means that calls to other functions owned by a module or explicit + calls to module.forward(...) will not register resulting operators as + contributing statistics to that module. + + We will trace the execution of `model.forward(inputs)`. This means + inputs have to be a tensor or a tuple of tensors (see + https://pytorch.org/docs/stable/generated/torch.jit.trace.html#torch.jit.trace). + In order to trace other methods or unsupported input types, + you may need to implement a wrapper module. + + Args: + model: The model to analyze. + inputs: The inputs to the model for analysis.
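+ + Example (a sketch; ``my_handle`` stands in for a user-defined + handle with the signature described in :meth:`set_op_handle`): + + >>> analysis = JitModelAnalysis(model, inputs) + >>> analysis.set_op_handle('aten::addmm', my_handle) + >>> total = analysis.total()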
+ """ + + def __init__( + self, + model: nn.Module, + inputs: Union[Tensor, Tuple[Tensor, ...]], + ) -> None: + self._model = model + self._inputs = inputs + self._op_handles: Dict[str, Handle] = {} + # Mapping from names to submodules + self._named_modules: Dict[str, nn.Module] = dict( + _named_modules_with_dup(model)) + # Mapping from submodules and their aliases to the canonical name + # of each submodule + self._aliases: Dict[Union[nn.Module, str], + str] = self._get_aliases(model) + self._stats: Optional[Statistics] = None + + self._ignored_ops: Set[str] = copy(_IGNORED_OPS) + self.unsupported_ops_warnings(True) + self.uncalled_modules_warnings(True) + self.tracer_warnings('no_tracer_warning') + self.ancestor_mode('owner') + + def total(self, module_name: str = '') -> int: + """Returns the total aggregated statistic across all operators for the + requested module. + + Args: + module_name (str): The submodule to get data for. Defaults to + the entire model. + + Returns: + int: The aggregated statistic. + """ + stats = self._analyze() + module_name = self.canonical_module_name(module_name) + total_count = sum(stats.counts[module_name].values()) + return total_count + + def by_operator(self, module_name: str = '') -> typing.Counter[str]: + """Returns the statistics for a requested module, grouped by operator + type. + + The operator handle determines the name associated with each + operator type. + + Args: + module_name (str): The submodule to get data for. Defaults + to the entire model. + + Returns: + Counter(str): The statistics for each operator. + """ + stats = self._analyze() + module_name = self.canonical_module_name(module_name) + return stats.counts[module_name] + + def by_module_and_operator(self) -> Dict[str, typing.Counter[str]]: + """Returns the statistics for all submodules, separated out by operator + type for each submodule. + + The operator handle determines the name associated with + each operator type. + + Returns: + dict[str, Counter(str)]: The statistics for each submodule + and each operator. Grouped by submodule names, then + by operator name. + """ + stats = self._analyze() + return stats.counts + + def by_module(self) -> typing.Counter[str]: + """Returns the statistics for all submodules, aggregated over all + operators. + + Returns: + Counter(str): statistics counter grouped by submodule names + """ + stats = self._analyze() + summed_counts = Counter() # type: Counter + for mod, results in stats.counts.items(): + summed_counts[mod] = sum(results.values()) + return summed_counts + + def unsupported_ops(self, module_name: str = '') -> typing.Counter[str]: + """Lists the number of operators that were encountered but unsupported + because no operator handle is available for them. + + Does not include operators that are explicitly ignored. + + Args: + module_name (str): The submodule to list unsupported ops. + Defaults to the entire model. + + Returns: + Counter(str): The number of occurrences each unsupported operator. + """ + if self._stats is None: + raise RuntimeError('Analysis results should be computed ' + 'before calling unsupported_ops()') + module_name = self.canonical_module_name(module_name) + return self._stats.unsupported_ops[module_name] # pyre-fixme + + def uncalled_modules(self) -> Set[str]: + """Returns a set of submodules that were never called during the trace + of the graph. + + This may be because they were unused, or because they were + accessed via direct calls .forward() or with other python methods. 
+ In the latter case, statistics will not be attributed to the submodule, + though the statistics will be included + in the parent module. + + Returns: + set[str]: The set of submodule names that were never called + during the trace of the model. + """ + stats = self._analyze() + return stats.uncalled_mods + + def set_op_handle(self, *args, + **kwargs: Optional[Handle]) -> 'JitModelAnalysis': + """Sets additional operator handles, or replaces existing ones. + + If a handle is ``None``, the op will be explicitly ignored. Otherwise, + handle should be a function that calculates the desirable statistic + from an operator. The function must take two arguments, which are the + inputs and outputs of the operator, in the form of + ``list(torch._C.Value)``. The function should return a counter object + with per-operator statistics. + + Args: + args: (str, Handle) pairs of operator names and handles. + kwargs: mapping from operator names to handles. + + Examples: + >>> handlers = {"aten::linear": my_handler} + >>> counter.set_op_handle("aten::matmul", None, + ... "aten::bmm", my_handler2).set_op_handle(**handlers) + """ + self._stats = None + if len(args) % 2 != 0: + raise TypeError( + 'set_op_handle should be called with pairs of names and ' + 'handles!') + for name, handle in zip(args[::2], args[1::2]): + kwargs[name] = handle + for name, handle in kwargs.items(): + if handle is None: + self._ignored_ops.add(name) + else: + self._op_handles[name] = handle + return self + + def clear_op_handles(self) -> 'JitModelAnalysis': + """Clears all operator handles currently set.""" + self._op_handles = {} + self._ignored_ops = copy(_IGNORED_OPS) + self._stats = None + return self + + def canonical_module_name(self, name: str) -> str: + """Returns the canonical module name of the given ``name``, which might + be different from the given ``name`` if the module is shared. + + This is the name that will be used as a key when statistics are + output using .by_module() and .by_module_and_operator(). + + Args: + name (str): The name of the module to find the canonical name for. + + Returns: + str: The canonical name of the module. + """ + # Blocks access by a direct module reference + assert isinstance(name, str), 'Module name must be a string.' + if name in self._aliases: + return self._aliases[name] + else: + raise KeyError('Requested module name is not among ' + 'the descendants of the analyzed model.') + + def copy( + self, + new_model: Optional[nn.Module] = None, + new_inputs: Union[None, Tensor, Tuple[Tensor, ...]] = None, + ) -> 'JitModelAnalysis': + """Returns a copy of the :class:`JitModelAnalysis` object, keeping all + settings, but on a new model or new inputs. + + Args: + new_model (nn.Module or None): a new model for the new + JitModelAnalysis. If None, uses the original model. + Defaults to None. + new_inputs (typing.Tuple[object, ...], optional): new inputs + for the new JitModelAnalysis. If None, uses the original + inputs. Defaults to None.
+ + Returns: + JitModelAnalysis: the new model analysis object + """ + model = self._model if new_model is None else new_model + inputs = self._inputs if new_inputs is None else new_inputs + return (JitModelAnalysis(model=model, inputs=inputs).set_op_handle( + **self._op_handles).unsupported_ops_warnings( + self._enable_warn_unsupported_ops).uncalled_modules_warnings( + self._enable_warn_uncalled_mods).tracer_warnings( + self._warn_trace)) + + def tracer_warnings(self: T, mode: str) -> T: + """Sets which warnings to print when tracing the graph to calculate + statistics. There are three modes. Defaults to 'no_tracer_warning'. + Allowed values are: + + * 'all' : keeps all warnings raised while tracing + * 'no_tracer_warning' : suppress torch.jit.TracerWarning only + * 'none' : suppress all warnings raised while tracing + + Args: + mode (str): warning mode in one of the above values. + """ + if mode not in ['all', 'no_tracer_warning', 'none']: + raise ValueError(f'Unrecognized tracer warning mode {mode}.') + self._warn_trace = mode + return self + + def ancestor_mode(self: T, mode: str) -> T: + """Sets how to determine the ancestor modules of an operator. Must be + one of "owner" or "caller". + + * "caller": an operator belongs to all modules that are currently + executing `forward()` at the time the operator is called. + * "owner": an operator belongs to the last module that's executing + `forward()` at the time the operator is called, plus this + module's recursive parents. If a module has multiple parents + (e.g. a shared module), only one will be picked. + + For most cases, a module only calls submodules it owns, so both + options would work identically. In certain edge cases, this option + will affect the hierarchy of results, but won't affect the total + count. + """ + if mode not in ['owner', 'caller']: + raise ValueError(f'Unrecognized ancestor mode: {mode}') + self._ancestor_mode = mode + return self + + def unsupported_ops_warnings(self: T, enabled: bool) -> T: + """Sets if warnings for unsupported operators are shown. + + Defaults to True. Counts of unsupported operators may be + obtained from :meth:`unsupported_ops` regardless of this setting. + + Args: + enabled (bool): Set to 'True' to show unsupported operator + warnings. + """ + self._enable_warn_unsupported_ops = enabled + return self + + def uncalled_modules_warnings(self: T, enabled: bool) -> T: + """Sets if warnings from uncalled submodules are shown. + + Defaults to True. A submodule is considered "uncalled" if it is never + called during tracing. This may be because it is actually unused, or + because it is accessed via calls to ``.forward()`` or other methods of + the module. The set of uncalled modules may be obtained from + :meth:`uncalled_modules` regardless of this setting. + + Args: + enabled (bool): Set to 'True' to show warnings. + """ + self._enable_warn_uncalled_mods = enabled + return self + + def _warn_unsupported_ops(self, ops: typing.Counter[str]) -> None: + if not self._enable_warn_unsupported_ops: + return + + for op, freq in ops.items(): + print_log( + 'Unsupported operator {} encountered {} time(s)'.format( + op, freq), + 'current', + logging.WARNING, + ) + + def _warn_uncalled_mods(self, uncalled_mods: Set[str]) -> None: + if not self._enable_warn_uncalled_mods: + return + uncalled_mods = {x for x in uncalled_mods if self._has_forward(x)} + if len(uncalled_mods) == 0: + return + + print_log( + 'The following submodules of the model were never ' + 'called during the trace of the graph.
They may be ' + 'unused, or they were accessed by direct calls to ' + '.forward() or via other python methods. In the latter ' + 'case they will have zeros for statistics, though their ' + 'statistics will still contribute to their parent calling ' + 'module.\n' + ', '.join(sorted(uncalled_mods)), 'current', + logging.WARNING) + + def _get_aliases(self, + model: nn.Module) -> Dict[Union[str, nn.Module], str]: + aliases = {} + for name, module in _named_modules_with_dup(model): + if module not in aliases: + aliases[module] = name + aliases[name] = aliases[module] + return aliases + + def _get_all_ancestors(self, module_name: str) -> Set[str]: + """Get all ancestors of the given module, defined by ownership. + + If the given module has multiple owners, use its canonical name. + """ + parts = self.canonical_module_name(module_name).split('.') + res = {''} + for k in range(len(parts) + 1): + res.add('.'.join(parts[:k])) + return res + + def _analyze(self) -> 'Statistics': + # Don't calculate if results are already stored. + stats = self._stats + if stats is not None: + return stats + + with warnings.catch_warnings(): + if self._warn_trace == 'none': + warnings.simplefilter('ignore') + elif self._warn_trace == 'no_tracer_warning': + warnings.filterwarnings('ignore', category=TracerWarning) + graph = _get_scoped_trace_graph(self._model, self._inputs, + self._aliases) + + # Assures even modules not in the trace graph are initialized to + # zero count + counts = {} # type: Dict + unsupported_ops = {} # type: Dict + # We don't need the duplication here, but self._model.named_modules() + # gives slightly different results for some wrapped models. + for _, mod in _named_modules_with_dup(self._model): + name = self._aliases[mod] + counts[name] = Counter() + unsupported_ops[name] = Counter() + + all_seen = set() + for node in graph.nodes(): + kind = node.kind() + if kind == 'prim::PythonOp': + # for PythonOp, pyname contains the actual name in Python + # pyre-fixme[16]: `Node` has no attribute `pyname`. + kind = kind + '.' + node.pyname() + scope_names = node.scopeName().split('/') + all_seen.update(scope_names) + # The result of node.scopeName() is like: `layer1/layer1.layer` + # Therefore, if no module is shared, the ancestors will have the + # same value. However, if layer1.layer is used by multiple modules, + # scopeName() will return + # `layer1/layer1.layer` + # `layer2/layer1.layer` respectively. + # If mode is `caller`, the ancestors will be: + # 'layer1', 'layer2', 'layer1.layer' + # else, the ancestors will be: + # 'layer1', 'layer1.layer' + # which means the flops will only be counted into `layer1`. + if self._ancestor_mode == 'caller': + ancestors = set(scope_names) + else: + ancestors = self._get_all_ancestors(scope_names[-1]) + all_seen.update(ancestors) + if kind not in self._op_handles: + if self._should_ignore_node(node): + continue + for name in ancestors: + unsupported_ops[name][kind] += 1 + else: + inputs, outputs = list(node.inputs()), list(node.outputs()) + op_counts = self._op_handles[kind](inputs, outputs) + if isinstance(op_counts, Number): + op_counts = Counter( + {self._simplify_op_name(kind): op_counts}) + for v in op_counts.values(): # type: ignore + if not isinstance(v, (int, float, np.float64, np.int64)): + raise ValueError( + f'Invalid type {type(v)} for the flop count!
' + 'Please use a wider type to avoid overflow.') + + # Assures an op contributes at most once to a module + for name in ancestors: + counts[name] += op_counts + + uncalled_mods = set(self._aliases.values()) - all_seen + stats = Statistics( + counts=counts, + unsupported_ops=unsupported_ops, + uncalled_mods=uncalled_mods) + self._stats = stats + self._warn_unsupported_ops(unsupported_ops['']) + self._warn_uncalled_mods(uncalled_mods) + return stats + + def _simplify_op_name(self, full_op_name: str) -> str: + """Get simplified name of the op without the preceding namespace, e.g. + aten::batch_norm -> batch_norm.""" + p = full_op_name.find('::') + if p != -1: + return full_op_name[p + 2:] + else: + return full_op_name + + def _has_forward(self, mod_name: str) -> bool: + # Whether the module has a valid forward method. + # Modules without forward are not expected to get called + # and therefore should not produce "uncalled" warnings + module = self._named_modules.get(mod_name) + if module is None: + return False + module_type = type(module) + # Containers are not meant to be called anyway (they don't have + # forward) + # NOTE: We add nn.Identity as well to silence the uncalled warning, + # but it's different from other containers: Identity has a forward + # but the forward does not contain ops, so it appears "uncalled" after + # tracing. A more proper way may be to use forward hooks (instead of + # the graph) to decide whether a module has been called. + no_forward_mods = { + nn.ModuleList, nn.ModuleDict, nn.Module, nn.Identity + } + for mod in no_forward_mods: + if module_type.forward is mod.forward: + return False + return True + + def _should_ignore_node(self, node) -> bool: + kind = node.kind() + if kind in self._ignored_ops: + return True + # Ignore all prim:: operators, with two exceptions: + # * prim::PythonOp can be a user-implemented `torch.autograd.Function` + # * prim::CallFunction can be a call to a scripted module/function. + if kind.startswith('prim::PythonOp') or kind.startswith( + 'prim::CallFunction'): + return False + if kind.startswith('prim::'): + return True + return False diff --git a/head_extractor/build/lib/mmengine/analysis/jit_handles.py b/head_extractor/build/lib/mmengine/analysis/jit_handles.py new file mode 100644 index 0000000000000000000000000000000000000000..d4b9155e8812fd59ac459cdbdd9571ab8d92c374 --- /dev/null +++ b/head_extractor/build/lib/mmengine/analysis/jit_handles.py @@ -0,0 +1,286 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Modified from +# https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/jit_handles.py + +import typing +from collections import Counter, OrderedDict +from typing import Any, Callable, List, Optional, Union + +import numpy as np + +try: + from math import prod # type: ignore +except ImportError: + from numpy import prod as _prod # type: ignore + + # Patch `numpy.prod` to avoid overflow on Windows by converting its result + # from `np.int32` to `int`. + def prod(*args, **kwargs): # type: ignore + return _prod(*args, **kwargs).item() + + +Handle = Callable[[List[Any], List[Any]], Union[typing.Counter[str], int]] + + +def get_shape(val: Any) -> Optional[List[int]]: + """Get the shapes from a jit value object. + + Args: + val (torch._C.Value): jit value object. + + Returns: + list(int): A list of ints.
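+ Returns ``None`` when ``val`` is not a complete tensor, i.e. when + its shape is not statically known to the tracer.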
+ """ + if val.isCompleteTensor(): + return val.type().sizes() + else: + return None # type: ignore + + +""" +Below are flop/activation counters for various ops. +Every counter has the following signature: + +Args: + inputs (list(torch._C.Value)): + The inputs of the op in the form of a list of jit object. + outputs (list(torch._C.Value)): + The outputs of the op in the form of a list of jit object. + +Returns: + number: The number of flops/activations for the operation. + or Counter[str] +""" + + +def generic_activation_jit(op_name: Optional[str] = None) -> Handle: + """This method returns a handle that counts the number of activation from + the output shape for the specified operation. + + Args: + op_name (str): The name of the operation. If given, the handle will + return a counter using this name. + + Returns: + Callable: An activation handle for the given operation. + """ + + def _generic_activation_jit( + i: Any, outputs: List[Any]) -> Union[typing.Counter[str], int]: + """This is a generic jit handle that counts the number of activations + for any operation given the output shape.""" + out_shape = get_shape(outputs[0]) + ac_count = prod(out_shape) # type: ignore + if op_name is None: + return ac_count # type: ignore + else: + return Counter({op_name: ac_count}) + + return _generic_activation_jit + + +def addmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Union[int, Any]: + """Count flops for fully connected layers.""" + # Count flop for nn.Linear + # inputs is a list of length 3. + input_shapes = [get_shape(v) for v in inputs[1:3]] + # input_shapes[0]: [batch size, input feature dimension] + # input_shapes[1]: [batch size, output feature dimension] + assert len(input_shapes[0]) == 2, input_shapes[0] # type: ignore + assert len(input_shapes[1]) == 2, input_shapes[1] # type: ignore + batch_size, input_dim = input_shapes[0] # type: ignore + output_dim = input_shapes[1][1] # type: ignore + flops = batch_size * input_dim * output_dim + return flops + + +def linear_flop_jit(inputs: List[Any], outputs: List[Any]) -> Union[int, Any]: + """Count flops for the aten::linear operator.""" + # Inputs is a list of length 3; unlike aten::addmm, it is the first + # two elements that are relevant. + input_shapes = [get_shape(v) for v in inputs[0:2]] + # input_shapes[0]: [dim0, dim1, ..., input_feature_dim] + # input_shapes[1]: [output_feature_dim, input_feature_dim] + assert input_shapes[0][-1] == input_shapes[1][-1] # type: ignore + flops = prod(input_shapes[0]) * input_shapes[1][0] # type: ignore + return flops + + +def bmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Union[int, Any]: + """Count flops for the bmm operation.""" + # Inputs should be a list of length 2. + # Inputs contains the shapes of two tensor. + assert len(inputs) == 2, len(inputs) + input_shapes = [get_shape(v) for v in inputs] + n, c, t = input_shapes[0] # type: ignore + d = input_shapes[-1][-1] # type: ignore + flop = n * c * t * d + return flop + + +def conv_flop_count( + x_shape: List[int], + w_shape: List[int], + out_shape: List[int], + transposed: bool = False, +) -> Union[int, Any]: + """Count flops for convolution. Note only multiplication is counted. + Computation for addition and bias is ignored. Flops for a transposed + convolution are calculated as. + + flops = (x_shape[2:] * prod(w_shape) * batch_size). + + Args: + x_shape (list(int)): The input shape before convolution. + w_shape (list(int)): The filter shape. + out_shape (list(int)): The output shape after convolution. 
+ transposed (bool): is the convolution transposed + + Returns: + int: the number of flops + """ + batch_size = x_shape[0] + conv_shape = (x_shape if transposed else out_shape)[2:] + flop = batch_size * prod(w_shape) * prod(conv_shape) + return flop + + +def conv_flop_jit(inputs: List[Any], + outputs: List[Any]) -> typing.Counter[str]: + """Count flops for convolution.""" + # Inputs of Convolution should be a list of length 12 or 13. + # They represent: + # 0) input tensor, 1) convolution filter, 2) bias, 3) stride, 4) padding, + # 5) dilation, 6) transposed, 7) out_pad, 8) groups, 9) benchmark_cudnn, + # 10) deterministic_cudnn and 11) user_enabled_cudnn. + # starting with #40737 it will be 12) user_enabled_tf32 + assert len(inputs) == 12 or len(inputs) == 13, len(inputs) + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), + get_shape(outputs[0])) + transposed = inputs[6].toIValue() + + # use a custom name instead of "_convolution" + return Counter({ + 'conv': + conv_flop_count( + x_shape, # type: ignore + w_shape, # type: ignore + out_shape, # type: ignore + transposed=transposed) # type: ignore + }) + + +def einsum_flop_jit(inputs: List[Any], outputs: List[Any]) -> Union[int, Any]: + """Count flops for the einsum operation.""" + # Inputs of einsum should be a list of length 2+. + # Inputs[0] stores the equation used for einsum. + # Inputs[1] stores the list of input shapes. + assert len(inputs) >= 2, len(inputs) + equation = inputs[0].toIValue() + # Get rid of white space in the equation string. + equation = equation.replace(' ', '') + input_shapes_jit = inputs[1].node().inputs() + input_shapes = [get_shape(v) for v in input_shapes_jit] + + # Re-map equation so that same equation with different alphabet + # representations will look the same. + letter_order = OrderedDict((k, 0) for k in equation if k.isalpha()).keys() + mapping = {ord(x): 97 + i for i, x in enumerate(letter_order)} + equation = equation.translate(mapping) + + if equation == 'abc,abd->acd': + n, c, t = input_shapes[0] # type: ignore + p = input_shapes[-1][-1] # type: ignore + flop = n * c * t * p + return flop + + elif equation == 'abc,adc->adb': + n, t, g = input_shapes[0] # type: ignore + c = input_shapes[-1][1] # type: ignore + flop = n * t * g * c + return flop + else: + np_arrs = [np.zeros(s) for s in input_shapes] + optim = np.einsum_path(equation, *np_arrs, optimize='optimal')[1] + for line in optim.split('\n'): + if 'optimized flop' in line.lower(): + # divided by 2 because we count MAC + # (multiply-add counted as one flop) + flop = float(np.floor(float(line.split(':')[-1]) / 2)) + return flop + raise NotImplementedError('Unsupported einsum operation.') + + +def matmul_flop_jit(inputs: List[Any], outputs: List[Any]) -> Union[int, Any]: + """Count flops for matmul.""" + # input_shapes is a list of length 2. + input_shapes: list = [get_shape(v) for v in inputs] + input1, input2 = input_shapes + if len(input1) == 1: + input1 = [1, input1[0]] + if len(input2) == 1: + input2 = [input2[0], 1] + + assert input1[-1] == input2[-2], input_shapes + flop = prod(input1) * input2[-1] + return flop + + +def norm_flop_counter(affine_arg_index: int) -> Handle: + """ + Args: + affine_arg_index: index of the affine argument in inputs + """ + + def norm_flop_jit(inputs: List[Any], + outputs: List[Any]) -> Union[int, Any]: + """Count flops for norm layers.""" + # Inputs[0] contains the shape of the input. 
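+ # Worked example (illustrative, not from the original source): for a
+ # LayerNorm over an (8, 32, 56, 56) input with affine parameters, the
+ # heuristic below estimates 8 * 32 * 56 * 56 * 5 = 4,014,080, i.e.
+ # roughly 4.01M flops.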
+ input_shape = get_shape(inputs[0])
+ has_affine = get_shape(inputs[affine_arg_index]) is not None
+ assert 2 <= len(input_shape) <= 5, input_shape # type: ignore
+ # 5 is just a rough estimate
+ flop = prod(input_shape) * (5 if has_affine else 4) # type: ignore
+ return flop
+
+ return norm_flop_jit
+
+
+def batchnorm_flop_jit(inputs: List[Any],
+ outputs: List[Any]) -> Union[int, Any]:
+ training = inputs[5].toIValue()
+ assert isinstance(training,
+ bool), 'Signature of aten::batch_norm has changed!'
+ if training:
+ return norm_flop_counter(1)(inputs, outputs) # pyre-ignore
+ has_affine = get_shape(inputs[1]) is not None
+ input_shape = prod(get_shape(inputs[0])) # type: ignore
+ return input_shape * (2 if has_affine else 1)
+
+
+def elementwise_flop_counter(input_scale: float = 1,
+ output_scale: float = 0) -> Handle:
+ """Count flops as:
+
+ input_tensor.numel() * input_scale +
+ output_tensor.numel() * output_scale
+
+ Args:
+ input_scale: scale of the input tensor (first argument)
+ output_scale: scale of the output tensor (first element in outputs)
+ """
+
+ def elementwise_flop(inputs: List[Any],
+ outputs: List[Any]) -> Union[int, Any]:
+ ret = 0
+ if input_scale != 0:
+ shape = get_shape(inputs[0])
+ ret += input_scale * prod(shape) # type: ignore
+ if output_scale != 0:
+ shape = get_shape(outputs[0])
+ ret += output_scale * prod(shape) # type: ignore
+ return ret
+
+ return elementwise_flop
diff --git a/head_extractor/build/lib/mmengine/analysis/print_helper.py b/head_extractor/build/lib/mmengine/analysis/print_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b87d42373dae5c59c7c4ed45aa27cfcb09d95e6
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/analysis/print_helper.py
@@ -0,0 +1,784 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+# Modified from
+# https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/print_model_statistics.py
+
+from collections import defaultdict
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
+
+import torch
+from rich import box
+from rich.console import Console
+from rich.table import Table
+from torch import nn
+
+from mmengine.utils import is_tuple_of
+from .complexity_analysis import (ActivationAnalyzer, FlopAnalyzer,
+ parameter_count)
+
+
+def _format_size(x: int, sig_figs: int = 3, hide_zero: bool = False) -> str:
+ """Formats an integer for printing in a table or model representation.
+
+ Expresses the number in terms of 'kilo', 'mega', etc., using
+ 'K', 'M', etc. as a suffix.
+
+ Args:
+ x (int): The integer to format.
+ sig_figs (int): The number of significant figures to keep.
+ Defaults to 3.
+ hide_zero (bool): If True, x=0 is replaced with an empty string
+ instead of '0'. Defaults to False.
+
+ Returns:
+ str: The formatted string.
+ """
+ if hide_zero and x == 0:
+ return ''
+
+ def fmt(x: float) -> str:
+ # use fixed point to avoid scientific notation
+ return f'{{:.{sig_figs}f}}'.format(x).rstrip('0').rstrip('.')
+
+ if abs(x) > 1e14:
+ return fmt(x / 1e15) + 'P'
+ if abs(x) > 1e11:
+ return fmt(x / 1e12) + 'T'
+ if abs(x) > 1e8:
+ return fmt(x / 1e9) + 'G'
+ if abs(x) > 1e5:
+ return fmt(x / 1e6) + 'M'
+ if abs(x) > 1e2:
+ return fmt(x / 1e3) + 'K'
+ return str(x)
+
+
+def _pretty_statistics(statistics: Dict[str, Dict[str, int]],
+ sig_figs: int = 3,
+ hide_zero: bool = False) -> Dict[str, Dict[str, str]]:
+ """Converts numeric statistics to strings with kilo/mega/giga/etc. labels.
+
+ Args:
+ statistics (dict[str, dict[str, int]]): the statistics to
+ format. Organized as a dictionary over modules, which are
+ each a dictionary over statistic types.
+ sig_figs (int): the number of significant figures for each stat.
+ Defaults to 3.
+ hide_zero (bool): if True, statistics that are zero will be
+ written as an empty string. Defaults to False.
+
+ Returns:
+ dict[str, dict[str, str]]: the input statistics as pretty strings
+ """
+ out_stats = {}
+ for mod, stats in statistics.items():
+ out_stats[mod] = {
+ s: _format_size(val, sig_figs, hide_zero)
+ for s, val in stats.items()
+ }
+ return out_stats
+
+
+def _group_by_module(
+ statistics: Dict[str, Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
+ """Converts statistics organized first by statistic type and then by module
+ to statistics organized first by module and then by statistic type.
+
+ Args:
+ statistics (dict[str, dict[str, any]]): the statistics to convert
+
+ Returns:
+ dict[str, dict[str, any]]: the reorganized statistics
+ """
+ out_stats = defaultdict(dict) # type: Dict[str, Dict[str, Any]]
+ for stat_name, stat in statistics.items():
+ for mod, val in stat.items():
+ out_stats[mod][stat_name] = val
+ return dict(out_stats)
+
+
+def _indicate_uncalled_modules(
+ statistics: Dict[str, Dict[str, str]],
+ stat_name: str,
+ uncalled_modules: Set[str],
+ uncalled_indicator: str = 'N/A',
+) -> Dict[str, Dict[str, str]]:
+ """If a module is in the set of uncalled modules, replace its statistics
+ with the specified indicator, instead of using the existing string.
+
+ Assumes the statistics have already been formatted in string form.
+
+ Args:
+ statistics (dict[str, dict[str, str]]): the statistics to
+ format. Organized as a dictionary over modules, which are
+ each a dictionary over statistic types. Expects statistics
+ to have already been converted to strings.
+ stat_name (str): the name of the statistic being modified
+ uncalled_modules (set[str]): a set of names of uncalled modules.
+ uncalled_indicator (str): the string that will be used to indicate
+ unused modules. Defaults to 'N/A'.
+
+ Returns:
+ dict[str, dict[str, str]]: the modified statistics
+ """
+
+ stats_out = {mod: stats.copy() for mod, stats in statistics.items()}
+ for mod in uncalled_modules:
+ if mod not in stats_out:
+ stats_out[mod] = {}
+ stats_out[mod][stat_name] = uncalled_indicator
+ return stats_out
+
+
+def _remove_zero_statistics(
+ statistics: Dict[str, Dict[str, int]],
+ force_keep: Optional[Set[str]] = None,
+ require_trivial_children: bool = False,
+) -> Dict[str, Dict[str, int]]:
+ """Any module that has zero for all available statistics is removed from
+ the set of statistics.
+
+ This can help declutter the reporting of statistics
+ if many submodules have zero statistics. Assumes the statistics have
+ a model hierarchy starting with a root that has name ''.
+
+ Args:
+ statistics (dict[str, dict[str, int]]): the statistics to
+ remove zeros from. Organized as a dictionary over modules,
+ which are each a dictionary over statistic types.
+ force_keep (set[str] or None): a set of modules to always keep, even
+ if they are all zero.
+ require_trivial_children (bool): If True, a statistic will only
+ be deleted if all its children are also deleted. Defaults to
+ False.
+
+ Returns:
+ dict[str, dict[str, int]]: the input statistics dictionary,
+ with submodules removed if they have zero for all statistics.
+ """ + out_stats: Dict[str, Dict[str, int]] = {} + _force_keep: Set[str] = force_keep if force_keep else set() | {''} + + def keep_stat(name: str) -> None: + prefix = name + ('.' if name else '') + trivial_children = True + for mod in statistics: + # 'if mod' excludes root = '', which is never a child + if mod and mod.count('.') == prefix.count('.') and mod.startswith( + prefix): + keep_stat(mod) + trivial_children &= mod not in out_stats + + if ((not all(val == 0 for val in statistics[name].values())) + or (name in _force_keep) + or (require_trivial_children and not trivial_children)): + out_stats[name] = statistics[name].copy() + + keep_stat('') + return out_stats + + +def _fill_missing_statistics( + model: nn.Module, + statistics: Dict[str, Dict[str, int]]) -> Dict[str, Dict[str, int]]: + """If, for a given submodule name in the model, a statistic is missing from + statistics, fills it in with zero. + + This visually uniformizes the reporting of statistics. + + Args: + model (nn.Module): the model whose submodule names will be + used to fill in statistics + statistics (dict[str, dict[str, int]]) : the statistics to + fill in missing values for. Organized as a dictionary + over statistics, which are each a dictionary over submodules' + names. The statistics are assumed to be formatted already + to the desired string format for printing. + + Returns: + dict[str, dict[str, int]]: the input statistics with missing + values filled with zero. + """ + out_stats = {name: stat.copy() for name, stat in statistics.items()} + for mod_name, _ in model.named_modules(): + for stat in out_stats.values(): + if mod_name not in stat: + stat[mod_name] = 0 + return out_stats + + +def _model_stats_str(model: nn.Module, + statistics: Dict[str, Dict[str, str]]) -> str: + """This produces a representation of the model much like 'str(model)' + would, except the provided statistics are written out as additional + information for each submodule. + + Args: + model (nn.Module): the model to form a representation of. + statistics (dict[str, dict[str, str]]): the statistics to + include in the model representations. Organized as a dictionary + over module names, which are each a dictionary over statistics. + The statistics are assumed to be formatted already to the + desired string format for printing. + + Returns: + str: the string representation of the model with the statistics + inserted. + """ + + # Copied from nn.Module._addindent + def _addindent(s_: str, numSpaces: int) -> str: + s = s_.split('\n') + # don't do anything for single-line stuff + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(numSpaces * ' ') + line for line in s] + s = '\n'.join(s) # type: ignore + s = first + '\n' + s # type: ignore + return s # type: ignore + + def print_statistics(name: str) -> str: + if name not in statistics: + return '' + printed_stats = [f'{k}: {v}' for k, v in statistics[name].items()] + return ', '.join(printed_stats) + + # This comes directly from nn.Module.__repr__ with small changes + # to include the statistics. + def repr_with_statistics(module: nn.Module, name: str) -> str: + # We treat the extra repr like the sub-module, one item per line + extra_lines = [] + extra_repr = module.extra_repr() + printed_stats = print_statistics(name) + # empty string will be split into list [''] + if extra_repr: + extra_lines.extend(extra_repr.split('\n')) + if printed_stats: + extra_lines.extend(printed_stats.split('\n')) + child_lines = [] + for key, submod in module._modules.items(): + submod_name = name + ('.' 
if name else '') + key
+ # pyre-fixme[6]: Expected `Module` for 1st param but got
+ # `Optional[nn.modules.module.Module]`.
+ submod_str = repr_with_statistics(submod, submod_name)
+ submod_str = _addindent(submod_str, 2)
+ child_lines.append('(' + key + '): ' + submod_str)
+ lines = extra_lines + child_lines
+
+ main_str = module._get_name() + '('
+ if lines:
+ # simple one-liner info, which most builtin Modules will use
+ if len(extra_lines) == 1 and not child_lines:
+ main_str += extra_lines[0]
+ else:
+ main_str += '\n ' + '\n '.join(lines) + '\n'
+
+ main_str += ')'
+ return main_str
+
+ return repr_with_statistics(model, '')
+
+
+def _get_input_sizes(iterable: Iterable[Any]) -> List[Any]: # type: ignore
+ """Gets the sizes of all torch tensors in an iterable.
+
+ If an element of the iterable is a non-torch tensor iterable, it recurses
+ into that iterable to continue calculating sizes. Any non-iterable is given
+ a size of None. The output consists of nested lists with the same nesting
+ structure as the input iterables.
+ """
+ out_list = []
+ for i in iterable:
+ if isinstance(i, torch.Tensor):
+ out_list.append(list(i.size()))
+ elif isinstance(i, Iterable):
+ sublist_sizes = _get_input_sizes(i)
+ if all(j is None for j in sublist_sizes):
+ out_list.append(None) # type: ignore
+ else:
+ out_list.append(sublist_sizes)
+ else:
+ out_list.append(None) # type: ignore
+ return out_list
+
+
+def _get_single_child(name: str,
+ statistics: Dict[str, Dict[str, str]]) -> Optional[str]:
+ """If the given module has only a single child in statistics, return it.
+
+ Otherwise, return None.
+ """
+ prefix = name + ('.' if name else '')
+ child = None
+ for mod in statistics:
+ # 'if mod' excludes root = '', which is never a child
+ if mod and mod.count('.') == prefix.count('.') and mod.startswith(
+ prefix):
+ if child is None:
+ child = mod
+ else:
+ return None # We found a second child, so return None
+ return child
+
+
+def _try_combine(stats1: Dict[str, str],
+ stats2: Dict[str, str]) -> Optional[Dict[str, str]]:
+ """Try to combine two statistics dicts to display in one row.
+
+ If they conflict, returns None.
+ """
+ ret = {}
+ if set(stats1.keys()) != set(stats2.keys()):
+ return None
+ for k, v1 in stats1.items():
+ v2 = stats2[k]
+ if v1 != v2 and len(v1) and len(v2):
+ return None
+ ret[k] = v1 if len(v1) else v2
+ return ret
+
+
+def _fastforward(
+ name: str,
+ statistics: Dict[str, Dict[str, str]]) -> Tuple[str, Dict[str, str]]:
+ """If the given module has only a single child and matches statistics with
+ that child, merge statistics and their names into one row.
+
+ Then repeat until the condition isn't met.
+
+ Returns:
+ tuple[str, dict]: the new name and the combined statistics of this row
+ """
+ single_child = _get_single_child(name, statistics)
+ if single_child is None:
+ return name, statistics[name]
+ combined = _try_combine(statistics[name], statistics[single_child])
+ if combined is None:
+ return name, statistics[name]
+ statistics[single_child] = combined
+ return _fastforward(single_child, statistics)
+
+
+def _stats_table_format(
+ statistics: Dict[str, Dict[str, str]],
+ max_depth: int = 3,
+ stat_columns: Optional[List[str]] = None,
+) -> str:
+ """Formats the statistics obtained from a model in a nice table.
+
+ Args:
+ statistics (dict[str, dict[str, str]]): The statistics to print.
+ Organized as a dictionary over modules, then as a dictionary
+ over statistics in the model. The statistics are assumed to
+ already be formatted for printing.
+ max_depth (int): The maximum submodule depth to recurse to.
+ Defaults to 3.
+ stat_columns (list[str]): Specify the order of the columns to print.
+ If None, columns are found automatically from the provided
+ statistics. Defaults to None.
+
+ Return:
+ str: The formatted table.
+ """
+ if stat_columns is None:
+ stat_columns = set() # type: ignore
+ for stats in statistics.values():
+ stat_columns.update(stats.keys()) # type: ignore
+ stat_columns = list(stat_columns) # type: ignore
+
+ headers = ['module'] + stat_columns
+ rows: List[List[str]] = []
+
+ def build_row(name: str, stats: Dict[str, str],
+ indent_lvl: int) -> List[str]:
+ indent = ' ' * indent_lvl
+ row = [indent + name]
+ for stat_name in stat_columns: # type: ignore
+ row_str = (indent + stats[stat_name]) if stat_name in stats else ''
+ row.append(row_str)
+ return row
+
+ def fill(indent_lvl: int, prefix: str) -> None:
+ if indent_lvl > max_depth:
+ return
+ for mod_name in statistics:
+ # 'if mod' excludes root = '', which is never a child
+ if (mod_name and mod_name.count('.') == prefix.count('.')
+ and mod_name.startswith(prefix)):
+ mod_name, curr_stats = _fastforward(mod_name, statistics)
+ if root_prefix and mod_name.startswith(root_prefix):
+ # Skip the root_prefix shared by all submodules as it
+ # carries 0 information
+ pretty_mod_name = mod_name[len(root_prefix):]
+ else:
+ pretty_mod_name = mod_name
+ row = build_row(pretty_mod_name, curr_stats, indent_lvl)
+ rows.append(row)
+ fill(indent_lvl + 1, mod_name + '.')
+
+ root_name, curr_stats = _fastforward('', statistics)
+ row = build_row(root_name or 'model', curr_stats, indent_lvl=0)
+ rows.append(row)
+ root_prefix = root_name + ('.' if root_name else '')
+ fill(indent_lvl=1, prefix=root_prefix)
+
+ table = Table(box=box.ASCII2)
+ for header in headers:
+ table.add_column(header)
+
+ for row in rows:
+ table.add_row(*row)
+
+ console = Console()
+ with console.capture() as capture:
+ console.print(table, end='')
+
+ return capture.get()
+
+
+def complexity_stats_str(
+ flops: FlopAnalyzer,
+ activations: Optional[ActivationAnalyzer] = None) -> str:
+ """Calculates the parameters and flops of the model with the given inputs
+ and returns a string representation of the model that includes the
+ parameters and flops of every submodule. The string is structured to be
+ similar to that given by str(model), though it is not guaranteed to be
+ identical in form if the default string representation of a module has been
+ overridden. If a module has zero parameters and flops, statistics will not
+ be reported for succinctness. The trace can only register the scope of a
+ module if it is called directly, which means flops (and activations)
+ arising from explicit calls to .forward() or to other python functions of
+ the module will not be attributed to that module. Modules that are never
+ called will have 'N/A' listed for their flops; this means they are either
+ unused or their statistics are missing for this reason. Any such flops are
+ still counted towards the parent.
+
+ Examples:
+ >>> import torch
+ >>> import torch.nn as nn
+ >>> class InnerNet(nn.Module):
+ ... def __init__(self):
+ ... super().__init__()
+ ... self.fc1 = nn.Linear(10,10)
+ ... self.fc2 = nn.Linear(10,10)
+ ... def forward(self, x):
+ ... return self.fc1(self.fc2(x))
+ >>> class TestNet(nn.Module):
+ ... def __init__(self):
+ ... super().__init__()
+ ... self.fc1 = nn.Linear(10,10)
+ ... self.fc2 = nn.Linear(10,10)
+ ... self.inner = InnerNet()
+ ... def forward(self, x):
+ ... return self.fc1(self.fc2(self.inner(x)))
+ >>> model = TestNet()
+ >>> inputs = torch.randn((1,10))
+ >>> print(complexity_stats_str(FlopAnalyzer(model, inputs)))
+ TestNet(
+ #params: 0.44K, #flops: 0.4K
+ (fc1): Linear(
+ in_features=10, out_features=10, bias=True
+ #params: 0.11K, #flops: 100
+ )
+ (fc2): Linear(
+ in_features=10, out_features=10, bias=True
+ #params: 0.11K, #flops: 100
+ )
+ (inner): InnerNet(
+ #params: 0.22K, #flops: 0.2K
+ (fc1): Linear(
+ in_features=10, out_features=10, bias=True
+ #params: 0.11K, #flops: 100
+ )
+ (fc2): Linear(
+ in_features=10, out_features=10, bias=True
+ #params: 0.11K, #flops: 100
+ )
+ )
+ )
+
+ Args:
+ flops (FlopAnalyzer): the flop counting object
+ activations (ActivationAnalyzer or None): If given, the activations of
+ each layer will also be calculated and included in the
+ representation. Defaults to None.
+
+ Returns:
+ str: a string representation of the model with the number of
+ parameters and flops included.
+ """
+ # cast to dict since pyre doesn't like the implicit defaultdict->dict
+ model = flops._model
+ params = dict(parameter_count(model))
+
+ flops.unsupported_ops_warnings(False)
+ flops.uncalled_modules_warnings(False)
+ flops.tracer_warnings('none')
+ stats = {'#params': params, '#flops': flops.by_module()}
+
+ if activations is not None:
+ activations.unsupported_ops_warnings(False)
+ activations.uncalled_modules_warnings(False)
+ activations.tracer_warnings('none')
+ stats['#acts'] = activations.by_module()
+
+ all_uncalled = flops.uncalled_modules() | (
+ activations.uncalled_modules() if activations is not None else set())
+ stats = _fill_missing_statistics(model, stats)
+ stats = _group_by_module(stats)
+ stats = _remove_zero_statistics(stats, force_keep=all_uncalled)
+ stats = _pretty_statistics(stats, sig_figs=2) # type: ignore
+ stats = _indicate_uncalled_modules( # type: ignore
+ stats, # type: ignore
+ '#flops', # type: ignore
+ flops.uncalled_modules()) # type: ignore
+ if activations is not None:
+ stats = _indicate_uncalled_modules( # type: ignore
+ stats, # type: ignore
+ '#acts', # type: ignore
+ activations.uncalled_modules()) # type: ignore
+
+ model_string = ''
+ if all_uncalled:
+ model_string += (
+ 'N/A indicates a possibly missing statistic due to how '
+ 'the module was called. Missing values are still included '
+ "in the parent's total.\n")
+ model_string += _model_stats_str(model, stats) # type: ignore
+ return model_string
+
+
+def complexity_stats_table(
+ flops: FlopAnalyzer,
+ max_depth: int = 3,
+ activations: Optional[ActivationAnalyzer] = None,
+ show_param_shapes: bool = True,
+) -> str:
+ """
+ Format the per-module parameters and flops of a model in a table.
+ It looks like this: + :: + | model | #parameters or shape| #flops | + |:---------------------------------|:--------------------|:----------| + | model | 34.6M | 65.7G | + | s1 | 15.4K | 4.32G | + | s1.pathway0_stem | 9.54K | 1.23G | + | s1.pathway0_stem.conv | 9.41K | 1.23G | + | s1.pathway0_stem.bn | 0.128K | | + | s1.pathway1_stem | 5.9K | 3.08G | + | s1.pathway1_stem.conv | 5.88K | 3.08G | + | s1.pathway1_stem.bn | 16 | | + | s1_fuse | 0.928K | 29.4M | + | s1_fuse.conv_f2s | 0.896K | 29.4M | + | s1_fuse.conv_f2s.weight | (16, 8, 7, 1, 1) | | + | s1_fuse.bn | 32 | | + | s1_fuse.bn.weight | (16,) | | + | s1_fuse.bn.bias | (16,) | | + | s2 | 0.226M | 7.73G | + | s2.pathway0_res0 | 80.1K | 2.58G | + | s2.pathway0_res0.branch1 | 20.5K | 0.671G | + | s2.pathway0_res0.branch1_bn | 0.512K | | + | s2.pathway0_res0.branch2 | 59.1K | 1.91G | + | s2.pathway0_res1.branch2 | 70.4K | 2.28G | + | s2.pathway0_res1.branch2.a | 16.4K | 0.537G | + | s2.pathway0_res1.branch2.a_bn | 0.128K | | + | s2.pathway0_res1.branch2.b | 36.9K | 1.21G | + | s2.pathway0_res1.branch2.b_bn | 0.128K | | + | s2.pathway0_res1.branch2.c | 16.4K | 0.537G | + | s2.pathway0_res1.branch2.c_bn | 0.512K | | + | s2.pathway0_res2.branch2 | 70.4K | 2.28G | + | s2.pathway0_res2.branch2.a | 16.4K | 0.537G | + | s2.pathway0_res2.branch2.a_bn | 0.128K | | + | s2.pathway0_res2.branch2.b | 36.9K | 1.21G | + | s2.pathway0_res2.branch2.b_bn | 0.128K | | + | s2.pathway0_res2.branch2.c | 16.4K | 0.537G | + | s2.pathway0_res2.branch2.c_bn | 0.512K | | + | ............................. | ...... | ...... | + + Args: + flops (FlopAnalyzer): the flop counting object + max_depth (int): The max depth of submodules to include in the + table. Defaults to 3. + activations (ActivationAnalyzer or None): If given, include + activation counts as an additional column in the table. + Defaults to None. + show_param_shapes (bool): If true, shapes for parameters will be + included in the table. Defaults to True. + + Returns: + str: The formatted table. 
+
+ Examples:
+ >>> print(complexity_stats_table(FlopAnalyzer(model, inputs)))
+ """
+ params_header = '#parameters' + (' or shape' if show_param_shapes else '')
+ flops_header, acts_header = '#flops', '#activations'
+
+ model = flops._model
+ # cast to dict since pyre doesn't like the implicit defaultdict->dict
+ params = dict(parameter_count(model))
+
+ flops.unsupported_ops_warnings(False)
+ flops.uncalled_modules_warnings(False)
+ flops.tracer_warnings('none')
+
+ stats = {params_header: params, flops_header: flops.by_module()}
+ stat_columns = [params_header, flops_header]
+
+ if activations is not None:
+ activations.unsupported_ops_warnings(False)
+ activations.uncalled_modules_warnings(False)
+ activations.tracer_warnings('none')
+ stats[acts_header] = activations.by_module()
+ stat_columns += [acts_header]
+
+ stats = _group_by_module(stats)
+ stats = _remove_zero_statistics(
+ stats, # type: ignore
+ require_trivial_children=True) # type: ignore
+ stats = _pretty_statistics(stats, hide_zero=False) # type: ignore
+ stats = _indicate_uncalled_modules( # type: ignore
+ stats, # type: ignore
+ flops_header, # type: ignore
+ flops.uncalled_modules() & stats.keys(), # type: ignore
+ uncalled_indicator='', # type: ignore
+ )
+ if activations:
+ stats = _indicate_uncalled_modules( # type: ignore
+ stats, # type: ignore
+ acts_header, # type: ignore
+ activations.uncalled_modules() & stats.keys(), # type: ignore
+ uncalled_indicator='', # type: ignore
+ )
+
+ # Swap in shapes for parameters or delete shapes from dict
+ param_shapes: Dict[str, Tuple[int, ...]] = {
+ k: tuple(v.shape)
+ for k, v in model.named_parameters()
+ }
+ to_delete = []
+ for mod in stats:
+ if mod in param_shapes:
+ if show_param_shapes:
+ stats[mod][params_header] = str( # type: ignore
+ param_shapes[mod]) # type: ignore
+ else:
+ to_delete.append(mod)
+ for mod in to_delete:
+ del stats[mod]
+
+ return _stats_table_format(
+ statistics=stats, # type: ignore
+ max_depth=max_depth,
+ stat_columns=stat_columns,
+ )
+
+
+def get_model_complexity_info(
+ model: nn.Module,
+ input_shape: Union[Tuple[int, ...], Tuple[Tuple[int, ...], ...],
+ None] = None,
+ inputs: Union[torch.Tensor, Tuple[torch.Tensor, ...], Tuple[Any, ...],
+ None] = None,
+ show_table: bool = True,
+ show_arch: bool = True,
+):
+ """Interface to get the complexity of a model.
+
+ The parameter `inputs` is fed to the forward method of the model.
+ If `inputs` is not specified, `input_shape` is required and
+ will be used to construct the dummy input fed to the model.
+ If the forward of the model requires two or more inputs, `inputs`
+ should be a tuple of tensors, or `input_shape` should be a tuple
+ of tuples, each of which will be used to construct a dummy input.
+
+ Examples:
+ >>> # the forward of model accepts only one input
+ >>> input_shape = (3, 224, 224)
+ >>> get_model_complexity_info(model, input_shape=input_shape)
+ >>> # the forward of model accepts two or more inputs
+ >>> input_shape = ((3, 224, 224), (3, 10))
+ >>> get_model_complexity_info(model, input_shape=input_shape)
+
+ Args:
+ model (nn.Module): The model to analyze.
+ input_shape (Union[Tuple[int, ...], Tuple[Tuple[int, ...]], None]):
+ The input shape of the model.
+ If "inputs" is not specified, the "input_shape" should be set.
+ Defaults to None.
+ inputs (torch.Tensor, tuple[torch.Tensor, ...] or Tuple[Any, ...],\
+ optional]):
+ The input tensor(s) of the model. If not given, the input tensor
+ will be generated automatically with the given input_shape.
+ Defaults to None.
+ show_table (bool): Whether to show the complexity table.
+ Defaults to True.
+ show_arch (bool): Whether to show the complexity arch.
+ Defaults to True.
+
+ Returns:
+ dict: The complexity information of the model.
+ """
+ if input_shape is None and inputs is None:
+ raise ValueError('One of "input_shape" and "inputs" should be set.')
+ elif input_shape is not None and inputs is not None:
+ raise ValueError('"input_shape" and "inputs" cannot be both set.')
+
+ if inputs is None:
+ device = next(model.parameters()).device
+ if is_tuple_of(input_shape, int): # tuple of int, construct one tensor
+ inputs = (torch.randn(1, *input_shape).to(device), )
+ elif is_tuple_of(input_shape, tuple) and all([
+ is_tuple_of(one_input_shape, int)
+ for one_input_shape in input_shape # type: ignore
+ ]): # tuple of tuple of int, construct multiple tensors
+ inputs = tuple([
+ torch.randn(1, *one_input_shape).to(device)
+ for one_input_shape in input_shape # type: ignore
+ ])
+ else:
+ raise ValueError(
+ '"input_shape" should be either a `tuple of int` (to construct '
+ 'one input tensor) or a `tuple of tuple of int` (to construct '
+ 'multiple input tensors).')
+
+ flop_handler = FlopAnalyzer(model, inputs)
+ activation_handler = ActivationAnalyzer(model, inputs)
+
+ flops = flop_handler.total()
+ activations = activation_handler.total()
+ params = parameter_count(model)['']
+
+ flops_str = _format_size(flops)
+ activations_str = _format_size(activations)
+ params_str = _format_size(params)
+
+ if show_table:
+ complexity_table = complexity_stats_table(
+ flops=flop_handler,
+ activations=activation_handler,
+ show_param_shapes=True,
+ )
+ complexity_table = '\n' + complexity_table
+ else:
+ complexity_table = ''
+
+ if show_arch:
+ complexity_arch = complexity_stats_str(
+ flops=flop_handler,
+ activations=activation_handler,
+ )
+ complexity_arch = '\n' + complexity_arch
+ else:
+ complexity_arch = ''
+
+ return {
+ 'flops': flops,
+ 'flops_str': flops_str,
+ 'activations': activations,
+ 'activations_str': activations_str,
+ 'params': params,
+ 'params_str': params_str,
+ 'out_table': complexity_table,
+ 'out_arch': complexity_arch
+ }
diff --git a/head_extractor/build/lib/mmengine/config/__init__.py b/head_extractor/build/lib/mmengine/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a1bc47db49d10ae5851211135f63107a175224b
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/config/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .config import Config, ConfigDict, DictAction, read_base
+
+__all__ = ['Config', 'ConfigDict', 'DictAction', 'read_base']
diff --git a/head_extractor/build/lib/mmengine/config/config.py b/head_extractor/build/lib/mmengine/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..f85795066a24b2ae0f4669cea37884701701626c
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/config/config.py
@@ -0,0 +1,1857 @@
+# Copyright (c) OpenMMLab. All rights reserved.
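Before the config module begins, here is a brief usage sketch for the `get_model_complexity_info` interface added above (hedged: the toy model and shapes are hypothetical; the function is exported by `mmengine.analysis`):

import torch.nn as nn
from mmengine.analysis import get_model_complexity_info

# A small hypothetical model; any nn.Module with a single-tensor
# forward works the same way.
model = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(16, 10))
info = get_model_complexity_info(model, input_shape=(3, 224, 224))
print(info['flops_str'], info['params_str'])
print(info['out_table'])  # per-module table (show_table=True by default)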
+import ast +import copy +import difflib +import os +import os.path as osp +import platform +import shutil +import sys +import tempfile +import types +import uuid +import warnings +from argparse import Action, ArgumentParser, Namespace +from collections import OrderedDict, abc +from contextlib import contextmanager +from pathlib import Path +from typing import Any, Optional, Sequence, Tuple, Union + +import yapf +from addict import Dict +from rich.console import Console +from rich.text import Text +from yapf.yapflib.yapf_api import FormatCode + +from mmengine.fileio import dump, load +from mmengine.logging import print_log +from mmengine.utils import (check_file_exist, digit_version, + get_installed_path, import_modules_from_strings, + is_installed) +from .lazy import LazyAttr, LazyObject +from .utils import (ConfigParsingError, ImportTransformer, RemoveAssignFromAST, + _gather_abs_import_lazyobj, _get_external_cfg_base_path, + _get_external_cfg_path, _get_package_and_cfg_path, + _is_builtin_module) + +BASE_KEY = '_base_' +DELETE_KEY = '_delete_' +DEPRECATION_KEY = '_deprecation_' +RESERVED_KEYS = ['filename', 'text', 'pretty_text', 'env_variables'] + +if platform.system() == 'Windows': + import regex as re +else: + import re # type: ignore + + +def _lazy2string(cfg_dict, dict_type=None): + if isinstance(cfg_dict, dict): + dict_type = dict_type or type(cfg_dict) + return dict_type( + {k: _lazy2string(v, dict_type) + for k, v in dict.items(cfg_dict)}) + elif isinstance(cfg_dict, (tuple, list)): + return type(cfg_dict)(_lazy2string(v, dict_type) for v in cfg_dict) + elif isinstance(cfg_dict, (LazyAttr, LazyObject)): + return f'{cfg_dict.module}.{str(cfg_dict)}' + else: + return cfg_dict + + +class ConfigDict(Dict): + """A dictionary for config which has the same interface as python's built- + in dictionary and can be used as a normal dictionary. + + The Config class would transform the nested fields (dictionary-like fields) + in config file into ``ConfigDict``. + + If the class attribute ``lazy`` is ``False``, users will get the + object built by ``LazyObject`` or ``LazyAttr``, otherwise users will get + the ``LazyObject`` or ``LazyAttr`` itself. + + The ``lazy`` should be set to ``True`` to avoid building the imported + object during configuration parsing, and it should be set to False outside + the Config to ensure that users do not experience the ``LazyObject``. + """ + lazy = False + + def __init__(__self, *args, **kwargs): + object.__setattr__(__self, '__parent', kwargs.pop('__parent', None)) + object.__setattr__(__self, '__key', kwargs.pop('__key', None)) + object.__setattr__(__self, '__frozen', False) + for arg in args: + if not arg: + continue + # Since ConfigDict.items will convert LazyObject to real object + # automatically, we need to call super().items() to make sure + # the LazyObject will not be converted. 
+ if isinstance(arg, ConfigDict): + for key, val in dict.items(arg): + __self[key] = __self._hook(val) + elif isinstance(arg, dict): + for key, val in arg.items(): + __self[key] = __self._hook(val) + elif isinstance(arg, tuple) and (not isinstance(arg[0], tuple)): + __self[arg[0]] = __self._hook(arg[1]) + else: + for key, val in iter(arg): + __self[key] = __self._hook(val) + + for key, val in dict.items(kwargs): + __self[key] = __self._hook(val) + + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + try: + value = super().__getattr__(name) + if isinstance(value, (LazyAttr, LazyObject)) and not self.lazy: + value = value.build() + except KeyError: + raise AttributeError(f"'{self.__class__.__name__}' object has no " + f"attribute '{name}'") + except Exception as e: + raise e + else: + return value + + @classmethod + def _hook(cls, item): + # avoid to convert user defined dict to ConfigDict. + if type(item) in (dict, OrderedDict): + return cls(item) + elif isinstance(item, (list, tuple)): + return type(item)(cls._hook(elem) for elem in item) + return item + + def __setattr__(self, name, value): + value = self._hook(value) + return super().__setattr__(name, value) + + def __setitem__(self, name, value): + value = self._hook(value) + return super().__setitem__(name, value) + + def __getitem__(self, key): + return self.build_lazy(super().__getitem__(key)) + + def __deepcopy__(self, memo): + other = self.__class__() + memo[id(self)] = other + for key, value in super().items(): + other[copy.deepcopy(key, memo)] = copy.deepcopy(value, memo) + return other + + def __copy__(self): + other = self.__class__() + for key, value in super().items(): + other[key] = value + return other + + copy = __copy__ + + def __iter__(self): + # Implement `__iter__` to overwrite the unpacking operator `**cfg_dict` + # to get the built lazy object + return iter(self.keys()) + + def get(self, key: str, default: Optional[Any] = None) -> Any: + """Get the value of the key. If class attribute ``lazy`` is True, the + LazyObject will be built and returned. + + Args: + key (str): The key. + default (any, optional): The default value. Defaults to None. + + Returns: + Any: The value of the key. + """ + return self.build_lazy(super().get(key, default)) + + def pop(self, key, default=None): + """Pop the value of the key. If class attribute ``lazy`` is True, the + LazyObject will be built and returned. + + Args: + key (str): The key. + default (any, optional): The default value. Defaults to None. + + Returns: + Any: The value of the key. + """ + return self.build_lazy(super().pop(key, default)) + + def update(self, *args, **kwargs) -> None: + """Override this method to make sure the LazyObject will not be built + during updating.""" + other = {} + if args: + if len(args) > 1: + raise TypeError('update only accept one positional argument') + # Avoid to used self.items to build LazyObject + for key, value in dict.items(args[0]): + other[key] = value + + for key, value in dict(kwargs).items(): + other[key] = value + for k, v in other.items(): + if ((k not in self) or (not isinstance(self[k], dict)) + or (not isinstance(v, dict))): + self[k] = self._hook(v) + else: + self[k].update(v) + + def build_lazy(self, value: Any) -> Any: + """If class attribute ``lazy`` is False, the LazyObject will be built + and returned. + + Args: + value (Any): The value to be built. + + Returns: + Any: The built value. 
+ """ + if isinstance(value, (LazyAttr, LazyObject)) and not self.lazy: + value = value.build() + return value + + def values(self): + """Yield the values of the dictionary. + + If class attribute ``lazy`` is False, the value of ``LazyObject`` or + ``LazyAttr`` will be built and returned. + """ + values = [] + for value in super().values(): + values.append(self.build_lazy(value)) + return values + + def items(self): + """Yield the keys and values of the dictionary. + + If class attribute ``lazy`` is False, the value of ``LazyObject`` or + ``LazyAttr`` will be built and returned. + """ + items = [] + for key, value in super().items(): + items.append((key, self.build_lazy(value))) + return items + + def merge(self, other: dict): + """Merge another dictionary into current dictionary. + + Args: + other (dict): Another dictionary. + """ + default = object() + + def _merge_a_into_b(a, b): + if isinstance(a, dict): + if not isinstance(b, dict): + a.pop(DELETE_KEY, None) + return a + if a.pop(DELETE_KEY, False): + b.clear() + all_keys = list(b.keys()) + list(a.keys()) + return { + key: + _merge_a_into_b(a.get(key, default), b.get(key, default)) + for key in all_keys if key != DELETE_KEY + } + else: + return a if a is not default else b + + merged = _merge_a_into_b(copy.deepcopy(other), copy.deepcopy(self)) + self.clear() + for key, value in merged.items(): + self[key] = value + + def __reduce_ex__(self, proto): + # Override __reduce_ex__ to avoid `self.items` will be + # called by CPython interpreter during pickling. See more details in + # https://github.com/python/cpython/blob/8d61a71f9c81619e34d4a30b625922ebc83c561b/Objects/typeobject.c#L6196 # noqa: E501 + if digit_version(platform.python_version()) < digit_version('3.8'): + return (self.__class__, ({k: v + for k, v in super().items()}, ), None, + None, None) + else: + return (self.__class__, ({k: v + for k, v in super().items()}, ), None, + None, None, None) + + def __eq__(self, other): + if isinstance(other, ConfigDict): + return other.to_dict() == self.to_dict() + elif isinstance(other, dict): + return {k: v for k, v in self.items()} == other + else: + return False + + def _to_lazy_dict(self): + """Convert the ConfigDict to a normal dictionary recursively, and keep + the ``LazyObject`` or ``LazyAttr`` object not built.""" + + def _to_dict(data): + if isinstance(data, ConfigDict): + return { + key: _to_dict(value) + for key, value in Dict.items(data) + } + elif isinstance(data, dict): + return {key: _to_dict(value) for key, value in data.items()} + elif isinstance(data, (list, tuple)): + return type(data)(_to_dict(item) for item in data) + else: + return data + + return _to_dict(self) + + def to_dict(self): + """Convert the ConfigDict to a normal dictionary recursively, and + convert the ``LazyObject`` or ``LazyAttr`` to string.""" + return _lazy2string(self, dict_type=dict) + + +def add_args(parser: ArgumentParser, + cfg: dict, + prefix: str = '') -> ArgumentParser: + """Add config fields into argument parser. + + Args: + parser (ArgumentParser): Argument parser. + cfg (dict): Config dictionary. + prefix (str, optional): Prefix of parser argument. + Defaults to ''. + + Returns: + ArgumentParser: Argument parser containing config fields. 
+ """ + for k, v in cfg.items(): + if isinstance(v, str): + parser.add_argument('--' + prefix + k) + elif isinstance(v, bool): + parser.add_argument('--' + prefix + k, action='store_true') + elif isinstance(v, int): + parser.add_argument('--' + prefix + k, type=int) + elif isinstance(v, float): + parser.add_argument('--' + prefix + k, type=float) + elif isinstance(v, dict): + add_args(parser, v, prefix + k + '.') + elif isinstance(v, abc.Iterable): + parser.add_argument( + '--' + prefix + k, type=type(next(iter(v))), nargs='+') + else: + print_log( + f'cannot parse key {prefix + k} of type {type(v)}', + logger='current') + return parser + + +class Config: + """A facility for config and config files. + + It supports common file formats as configs: python/json/yaml. + ``Config.fromfile`` can parse a dictionary from a config file, then + build a ``Config`` instance with the dictionary. + The interface is the same as a dict object and also allows access config + values as attributes. + + Args: + cfg_dict (dict, optional): A config dictionary. Defaults to None. + cfg_text (str, optional): Text of config. Defaults to None. + filename (str or Path, optional): Name of config file. + Defaults to None. + format_python_code (bool): Whether to format Python code by yapf. + Defaults to True. + + Here is a simple example: + + Examples: + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> cfg.a + 1 + >>> cfg.b + {'b1': [0, 1]} + >>> cfg.b.b1 + [0, 1] + >>> cfg = Config.fromfile('tests/data/config/a.py') + >>> cfg.filename + "/home/username/projects/mmengine/tests/data/config/a.py" + >>> cfg.item4 + 'test' + >>> cfg + "Config [path: /home/username/projects/mmengine/tests/data/config/a.py] + :" + "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" + + You can find more advance usage in the `config tutorial`_. + + .. _config tutorial: https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html + """ # noqa: E501 + + def __init__( + self, + cfg_dict: dict = None, + cfg_text: Optional[str] = None, + filename: Optional[Union[str, Path]] = None, + env_variables: Optional[dict] = None, + format_python_code: bool = True, + ): + filename = str(filename) if isinstance(filename, Path) else filename + if cfg_dict is None: + cfg_dict = dict() + elif not isinstance(cfg_dict, dict): + raise TypeError('cfg_dict must be a dict, but ' + f'got {type(cfg_dict)}') + for key in cfg_dict: + if key in RESERVED_KEYS: + raise KeyError(f'{key} is reserved for config file') + + if not isinstance(cfg_dict, ConfigDict): + cfg_dict = ConfigDict(cfg_dict) + super().__setattr__('_cfg_dict', cfg_dict) + super().__setattr__('_filename', filename) + super().__setattr__('_format_python_code', format_python_code) + if not hasattr(self, '_imported_names'): + super().__setattr__('_imported_names', set()) + + if cfg_text: + text = cfg_text + elif filename: + with open(filename, encoding='utf-8') as f: + text = f.read() + else: + text = '' + super().__setattr__('_text', text) + if env_variables is None: + env_variables = dict() + super().__setattr__('_env_variables', env_variables) + + @staticmethod + def fromfile(filename: Union[str, Path], + use_predefined_variables: bool = True, + import_custom_modules: bool = True, + use_environment_variables: bool = True, + lazy_import: Optional[bool] = None, + format_python_code: bool = True) -> 'Config': + """Build a Config instance from config file. + + Args: + filename (str or Path): Name of config file. 
+ use_predefined_variables (bool, optional): Whether to use
+ predefined variables. Defaults to True.
+ import_custom_modules (bool, optional): Whether to support
+ importing custom modules in config. Defaults to True.
+ use_environment_variables (bool, optional): Whether to use
+ environment variables. Defaults to True.
+ lazy_import (bool): Whether to load config in `lazy_import` mode.
+ If it is `None`, it will be deduced by the content of the
+ config file. Defaults to None.
+ format_python_code (bool): Whether to format Python code by yapf.
+ Defaults to True.
+
+ Returns:
+ Config: Config instance built from config file.
+ """
+ filename = str(filename) if isinstance(filename, Path) else filename
+ if lazy_import is False or \
+ lazy_import is None and not Config._is_lazy_import(filename):
+ cfg_dict, cfg_text, env_variables = Config._file2dict(
+ filename, use_predefined_variables, use_environment_variables,
+ lazy_import)
+ if import_custom_modules and cfg_dict.get('custom_imports', None):
+ try:
+ import_modules_from_strings(**cfg_dict['custom_imports'])
+ except ImportError as e:
+ err_msg = (
+ 'Failed to import custom modules from '
+ f"{cfg_dict['custom_imports']}, the current sys.path "
+ 'is: ')
+ for p in sys.path:
+ err_msg += f'\n {p}'
+ err_msg += (
+ '\nYou should set `PYTHONPATH` to make `sys.path` '
+ 'include the directory which contains your custom '
+ 'module')
+ raise ImportError(err_msg) from e
+ return Config(
+ cfg_dict,
+ cfg_text=cfg_text,
+ filename=filename,
+ env_variables=env_variables,
+ )
+ else:
+ # Enable lazy import when parsing the config.
+ # Using try-except to make sure ``ConfigDict.lazy`` will be reset
+ # to False. See more details about lazy in the docstring of
+ # ConfigDict
+ ConfigDict.lazy = True
+ try:
+ cfg_dict, imported_names = Config._parse_lazy_import(filename)
+ except Exception as e:
+ raise e
+ finally:
+ # disable lazy import to get the real type. See more details
+ # about lazy in the docstring of ConfigDict
+ ConfigDict.lazy = False
+
+ cfg = Config(
+ cfg_dict,
+ filename=filename,
+ format_python_code=format_python_code)
+ object.__setattr__(cfg, '_imported_names', imported_names)
+ return cfg
+
+ @staticmethod
+ def fromstring(cfg_str: str, file_format: str) -> 'Config':
+ """Build a Config instance from config text.
+
+ Args:
+ cfg_str (str): Config text.
+ file_format (str): Config file format corresponding to the
+ config str. Only py/yml/yaml/json type are supported now!
+
+ Returns:
+ Config: Config object generated from ``cfg_str``.
+ """
+ if file_format not in ['.py', '.json', '.yaml', '.yml']:
+ raise OSError('Only py/yml/yaml/json type are supported now!')
+ if file_format != '.py' and 'dict(' in cfg_str:
+ # check if users specify a wrong suffix for python
+ warnings.warn(
+ 'Please check "file_format", the file format may be .py')
+
+ # A temporary file can not be opened a second time on Windows.
+ # See https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile for more details. # noqa
+ # `temp_file` is opened first in `tempfile.NamedTemporaryFile` and
+ # second in `Config.fromfile`.
+ # In addition, a named temporary file will be removed after closed.
+ # As a workaround we set `delete=False` and close the temporary file
+ # before opening again.
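+ # A hedged illustration of the round-trip this enables (not part of
+ # the original source):
+ # >>> cfg = Config.fromstring('a = 1\nb = dict(c=2)', '.py')
+ # >>> cfg.b.c
+ # 2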
+
+ with tempfile.NamedTemporaryFile(
+ 'w', encoding='utf-8', suffix=file_format,
+ delete=False) as temp_file:
+ temp_file.write(cfg_str)
+
+ cfg = Config.fromfile(temp_file.name)
+ os.remove(temp_file.name) # manually delete the temporary file
+ return cfg
+
+ @staticmethod
+ def _get_base_modules(nodes: list) -> list:
+ """Get base module name from parsed code.
+
+ Args:
+ nodes (list): Parsed code of the config file.
+
+ Returns:
+ list: Name of base modules.
+ """
+
+ def _get_base_module_from_with(with_nodes: list) -> list:
+ """Get base module names from a `with` statement in a python file.
+
+ Args:
+ with_nodes (list): Body of the `with` statement.
+
+ Returns:
+ list: Name of base modules.
+ """
+ base_modules = []
+ for node in with_nodes:
+ assert isinstance(node, ast.ImportFrom), (
+ 'Illegal syntax in config file! Only '
+ '`from ... import ...` can be used in '
+ '`with read_base()`')
+ assert node.module is not None, (
+ 'Illegal syntax in config file! Syntax like '
+ '`from . import xxx` is not allowed in `with read_base()`')
+ base_modules.append(node.level * '.' + node.module)
+ return base_modules
+
+ for idx, node in enumerate(nodes):
+ if (isinstance(node, ast.Assign)
+ and isinstance(node.targets[0], ast.Name)
+ and node.targets[0].id == BASE_KEY):
+ raise ConfigParsingError(
+ 'The configuration file type in the inheritance chain '
+ 'must match the current configuration file type, either '
+ '"lazy_import" or non-"lazy_import". You got this error '
+ f'since you use the syntax like `_base_ = "{node.targets[0].id}"` ' # noqa: E501
+ 'in your config. You should use `with read_base(): ...` to ' # noqa: E501
+ 'mark the inherited config file. See more information '
+ 'in https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html' # noqa: E501
+ )
+
+ if not isinstance(node, ast.With):
+ continue
+
+ expr = node.items[0].context_expr
+ if (not isinstance(expr, ast.Call)
+ or not expr.func.id == 'read_base' or # type: ignore
+ len(node.items) > 1):
+ raise ConfigParsingError(
+ 'Only `read_base` context manager can be used in the '
+ 'config')
+
+ # The original code:
+ # ```
+ # with read_base():
+ # from .._base_.default_runtime import *
+ # ```
+ # The processed code:
+ # ```
+ # from .._base_.default_runtime import *
+ # ```
+ # As you can see, the `with` statement is removed and the
+ # `from ... import` statement is unindented
+ for nested_idx, nested_node in enumerate(node.body):
+ nodes.insert(idx + nested_idx + 1, nested_node)
+ nodes.pop(idx)
+ return _get_base_module_from_with(node.body)
+ return []
+
+ @staticmethod
+ def _validate_py_syntax(filename: str):
+ """Validate syntax of python config.
+
+ Args:
+ filename (str): Filename of python config file.
+ """
+ with open(filename, encoding='utf-8') as f:
+ content = f.read()
+ try:
+ ast.parse(content)
+ except SyntaxError as e:
+ raise SyntaxError('There are syntax errors in config '
+ f'file {filename}: {e}')
+
+ @staticmethod
+ def _substitute_predefined_vars(filename: str, temp_config_name: str):
+ """Substitute predefined variables in config with actual values.
+
+ Sometimes we want some variables in the config to be related to the
+ current path or file name, etc.
+
+ Here is an example of a typical usage scenario. When training a model,
+ we define a working directory in the config that stores the models and
+ logs. For different configs, we expect to define different working
+ directories. A common way for users is to use the config file name
+ directly as part of the working directory name, e.g.
+ for the config ``config_setting1.py``, the working directory is
+ ``./work_dir/config_setting1``.
+
+ This can be easily achieved using predefined variables, which can be
+ written in the config `config_setting1.py` as follows
+
+ .. code-block:: python
+
+ work_dir = './work_dir/{{ fileBasenameNoExtension }}'
+
+
+ Here `{{ fileBasenameNoExtension }}` indicates the file name of the
+ config (without the extension), and when the config class reads the
+ config file, it will automatically parse this double-bracketed string
+ to the corresponding actual value.
+
+ .. code-block:: python
+
+ cfg = Config.fromfile('./config_setting1.py')
+ cfg.work_dir # "./work_dir/config_setting1"
+
+
+ For details, please refer to docs/zh_cn/advanced_tutorials/config.md .
+
+ Args:
+ filename (str): Filename of config.
+ temp_config_name (str): Temporary filename to save substituted
+ config.
+ """
+ file_dirname = osp.dirname(filename)
+ file_basename = osp.basename(filename)
+ file_basename_no_extension = osp.splitext(file_basename)[0]
+ file_extname = osp.splitext(filename)[1]
+ support_templates = dict(
+ fileDirname=file_dirname,
+ fileBasename=file_basename,
+ fileBasenameNoExtension=file_basename_no_extension,
+ fileExtname=file_extname)
+ with open(filename, encoding='utf-8') as f:
+ config_file = f.read()
+ for key, value in support_templates.items():
+ regexp = r'\{\{\s*' + str(key) + r'\s*\}\}'
+ value = value.replace('\\', '/')
+ config_file = re.sub(regexp, value, config_file)
+ with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file:
+ tmp_config_file.write(config_file)
+
+ @staticmethod
+ def _substitute_env_variables(filename: str, temp_config_name: str):
+ """Substitute environment variables in config with actual values.
+
+ Sometimes, we want to change some items in the config with environment
+ variables. For example, we expect to change the dataset root by setting
+ ``DATASET_ROOT=/dataset/root/path`` in the command line. This can be
+ easily achieved by writing lines in the config as follows
+
+ .. code-block:: python
+
+ data_root = '{{$DATASET_ROOT:/default/dataset}}/images'
+
+
+ Here, ``{{$DATASET_ROOT:/default/dataset}}`` indicates using the
+ environment variable ``DATASET_ROOT`` to replace the part between
+ ``{{}}``. If the ``DATASET_ROOT`` is not set, the default value
+ ``/default/dataset`` will be used.
+
+ Environment variables can not only replace items in a string, but
+ also substitute other types of data in the config. In this situation,
+ we can write the config as below
+
+ .. code-block:: python
+
+ model = dict(
+ bbox_head = dict(num_classes={{'$NUM_CLASSES:80'}}))
+
+
+ For details, please refer to docs/zh_cn/tutorials/config.md .
+
+ Args:
+ filename (str): Filename of config.
+ temp_config_name (str): Temporary filename to save substituted
+ config.
+ """
+ with open(filename, encoding='utf-8') as f:
+ config_file = f.read()
+ regexp = r'\{\{[\'\"]?\s*\$(\w+)\s*\:\s*(\S*?)\s*[\'\"]?\}\}'
+ keys = re.findall(regexp, config_file)
+ env_variables = dict()
+ for var_name, value in keys:
+ regexp = r'\{\{[\'\"]?\s*\$' + var_name + r'\s*\:\s*' \
+ + value + r'\s*[\'\"]?\}\}'
+ if var_name in os.environ:
+ value = os.environ[var_name]
+ env_variables[var_name] = value
+ print_log(
+ f'Using env variable `{var_name}` with value of '
+ f'{value} to replace item in config.',
+ logger='current')
+ if not value:
+ raise KeyError(f'`{var_name}` cannot be found in `os.environ`.'
+ f' Please set `{var_name}` in environment or '
+ 'give a default value.')
+ config_file = re.sub(regexp, value, config_file)
+
+ with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file:
+ tmp_config_file.write(config_file)
+ return env_variables
+
+ @staticmethod
+ def _pre_substitute_base_vars(filename: str,
+ temp_config_name: str) -> dict:
+ """Preceding step for substituting variables in base config with actual
+ values.
+
+ Args:
+ filename (str): Filename of config.
+ temp_config_name (str): Temporary filename to save substituted
+ config.
+
+ Returns:
+ dict: A dictionary containing variables in the base config.
+ """
+ with open(filename, encoding='utf-8') as f:
+ config_file = f.read()
+ base_var_dict = {}
+ regexp = r'\{\{\s*' + BASE_KEY + r'\.([\w\.]+)\s*\}\}'
+ base_vars = set(re.findall(regexp, config_file))
+ for base_var in base_vars:
+ randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}'
+ base_var_dict[randstr] = base_var
+ regexp = r'\{\{\s*' + BASE_KEY + r'\.' + base_var + r'\s*\}\}'
+ config_file = re.sub(regexp, f'"{randstr}"', config_file)
+ with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file:
+ tmp_config_file.write(config_file)
+ return base_var_dict
+
+ @staticmethod
+ def _substitute_base_vars(cfg: Any, base_var_dict: dict,
+ base_cfg: dict) -> Any:
+ """Substitute base variables from strings to their actual values.
+
+ Args:
+ cfg (Any): Config dictionary.
+ base_var_dict (dict): A dictionary containing variables in the
+ base config.
+ base_cfg (dict): Base config dictionary.
+
+ Returns:
+ Any: A config with the original base variables substituted with
+ their actual values.
+ """
+ cfg = copy.deepcopy(cfg)
+
+ if isinstance(cfg, dict):
+ for k, v in cfg.items():
+ if isinstance(v, str) and v in base_var_dict:
+ new_v = base_cfg
+ for new_k in base_var_dict[v].split('.'):
+ new_v = new_v[new_k]
+ cfg[k] = new_v
+ elif isinstance(v, (list, tuple, dict)):
+ cfg[k] = Config._substitute_base_vars(
+ v, base_var_dict, base_cfg)
+ elif isinstance(cfg, tuple):
+ cfg = tuple(
+ Config._substitute_base_vars(c, base_var_dict, base_cfg)
+ for c in cfg)
+ elif isinstance(cfg, list):
+ cfg = [
+ Config._substitute_base_vars(c, base_var_dict, base_cfg)
+ for c in cfg
+ ]
+ elif isinstance(cfg, str) and cfg in base_var_dict:
+ new_v = base_cfg
+ for new_k in base_var_dict[cfg].split('.'):
+ new_v = new_v[new_k]
+ cfg = new_v
+
+ return cfg
+
+ @staticmethod
+ def _file2dict(
+ filename: str,
+ use_predefined_variables: bool = True,
+ use_environment_variables: bool = True,
+ lazy_import: Optional[bool] = None) -> Tuple[dict, str, dict]:
+ """Transform file to variables dictionary.
+
+ Args:
+ filename (str): Name of config file.
+ use_predefined_variables (bool, optional): Whether to use
+ predefined variables. Defaults to True.
+ use_environment_variables (bool, optional): Whether to use
+ environment variables. Defaults to True.
+ lazy_import (bool): Whether to load config in `lazy_import` mode.
+ If it is `None`, it will be deduced by the content of the
+ config file. Defaults to None.
+
+ Returns:
+ Tuple[dict, str, dict]: Variables dictionary, text of the config,
+ and environment variables.
+ """
+ if lazy_import is None and Config._is_lazy_import(filename):
+ raise RuntimeError(
+ 'The configuration file type in the inheritance chain '
+ 'must match the current configuration file type, either '
+ '"lazy_import" or non-"lazy_import". You got this error '
+ 'since you use the syntax like `with read_base(): ...` '
+ f'or import non-builtin module in {filename}.
See more ' + 'information in https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html' # noqa: E501 + ) + + filename = osp.abspath(osp.expanduser(filename)) + check_file_exist(filename) + fileExtname = osp.splitext(filename)[1] + if fileExtname not in ['.py', '.json', '.yaml', '.yml']: + raise OSError('Only py/yml/yaml/json type are supported now!') + try: + with tempfile.TemporaryDirectory() as temp_config_dir: + temp_config_file = tempfile.NamedTemporaryFile( + dir=temp_config_dir, suffix=fileExtname, delete=False) + if platform.system() == 'Windows': + temp_config_file.close() + + # Substitute predefined variables + if use_predefined_variables: + Config._substitute_predefined_vars(filename, + temp_config_file.name) + else: + shutil.copyfile(filename, temp_config_file.name) + # Substitute environment variables + env_variables = dict() + if use_environment_variables: + env_variables = Config._substitute_env_variables( + temp_config_file.name, temp_config_file.name) + # Substitute base variables from placeholders to strings + base_var_dict = Config._pre_substitute_base_vars( + temp_config_file.name, temp_config_file.name) + + # Handle base files + base_cfg_dict = ConfigDict() + cfg_text_list = list() + for base_cfg_path in Config._get_base_files( + temp_config_file.name): + base_cfg_path, scope = Config._get_cfg_path( + base_cfg_path, filename) + _cfg_dict, _cfg_text, _env_variables = Config._file2dict( + filename=base_cfg_path, + use_predefined_variables=use_predefined_variables, + use_environment_variables=use_environment_variables, + lazy_import=lazy_import, + ) + cfg_text_list.append(_cfg_text) + env_variables.update(_env_variables) + duplicate_keys = base_cfg_dict.keys() & _cfg_dict.keys() + if len(duplicate_keys) > 0: + raise KeyError( + 'Duplicate key is not allowed among bases. ' + f'Duplicate keys: {duplicate_keys}') + + # _dict_to_config_dict will do the following things: + # 1. Recursively converts ``dict`` to :obj:`ConfigDict`. + # 2. Set `_scope_` for the outer dict variable for the base + # config. + # 3. Set `scope` attribute for each base variable. + # Different from `_scope_`, `scope` is not a key of base + # dict, `scope` attribute will be parsed to key `_scope_` + # by function `_parse_scope` only if the base variable is + # accessed by the current config. + _cfg_dict = Config._dict_to_config_dict(_cfg_dict, scope) + base_cfg_dict.update(_cfg_dict) + + if filename.endswith('.py'): + with open(temp_config_file.name, encoding='utf-8') as f: + parsed_codes = ast.parse(f.read()) + parsed_codes = RemoveAssignFromAST(BASE_KEY).visit( + parsed_codes) + codeobj = compile(parsed_codes, filename, mode='exec') + # Support load global variable in nested function of the + # config. + global_locals_var = {BASE_KEY: base_cfg_dict} + ori_keys = set(global_locals_var.keys()) + eval(codeobj, global_locals_var, global_locals_var) + cfg_dict = { + key: value + for key, value in global_locals_var.items() + if (key not in ori_keys and not key.startswith('__')) + } + elif filename.endswith(('.yml', '.yaml', '.json')): + cfg_dict = load(temp_config_file.name) + # close temp file + for key, value in list(cfg_dict.items()): + if isinstance(value, + (types.FunctionType, types.ModuleType)): + cfg_dict.pop(key) + temp_config_file.close() + + # If the current config accesses a base variable of base + # configs, The ``scope`` attribute of corresponding variable + # will be converted to the `_scope_`. 
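+                # A rough illustration (hypothetical values): if a base
+                # config defines ``model = dict(type='RetinaNet')`` and the
+                # base was loaded with scope 'mmdet', the dict carries a
+                # ``scope`` attribute only. Once the current config accesses
+                # it, ``_parse_scope`` below records the scope explicitly:
+                #     model = dict(type='RetinaNet', _scope_='mmdet')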
+                Config._parse_scope(cfg_dict)
+        except Exception as e:
+            if osp.exists(temp_config_dir):
+                shutil.rmtree(temp_config_dir)
+            raise e
+
+        # check deprecation information
+        if DEPRECATION_KEY in cfg_dict:
+            deprecation_info = cfg_dict.pop(DEPRECATION_KEY)
+            warning_msg = f'The config file {filename} will be deprecated ' \
+                'in the future.'
+            if 'expected' in deprecation_info:
+                warning_msg += f' Please use {deprecation_info["expected"]} ' \
+                    'instead.'
+            if 'reference' in deprecation_info:
+                warning_msg += ' More information can be found at ' \
+                    f'{deprecation_info["reference"]}'
+            warnings.warn(warning_msg, DeprecationWarning)
+
+        cfg_text = filename + '\n'
+        with open(filename, encoding='utf-8') as f:
+            # Setting encoding explicitly to resolve coding issue on windows
+            cfg_text += f.read()
+
+        # Substitute base variables from strings to their actual values
+        cfg_dict = Config._substitute_base_vars(cfg_dict, base_var_dict,
+                                                base_cfg_dict)
+        cfg_dict.pop(BASE_KEY, None)
+
+        cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict)
+        cfg_dict = {
+            k: v
+            for k, v in cfg_dict.items() if not k.startswith('__')
+        }
+
+        # merge cfg_text
+        cfg_text_list.append(cfg_text)
+        cfg_text = '\n'.join(cfg_text_list)
+
+        return cfg_dict, cfg_text, env_variables
+
+    @staticmethod
+    def _parse_lazy_import(filename: str) -> Tuple[ConfigDict, set]:
+        """Transform file to variables dictionary.
+
+        Args:
+            filename (str): Name of config file.
+
+        Returns:
+            Tuple[ConfigDict, set]: ``cfg_dict`` and ``imported_names``.
+
+            - cfg_dict (ConfigDict): Variables dictionary of parsed config.
+            - imported_names (set): Used to mark the names of
+              imported objects.
+        """
+        # In lazy import mode, users can use the Python syntax `import` to
+        # implement inheritance between configuration files, which is easier
+        # for users to understand the hierarchical relationships between
+        # different configuration files.
+
+        # Besides, users can also use the `import` syntax to import the
+        # module which will be filled in the `type` field. It means users
+        # can directly navigate to the source of the module in the
+        # configuration file by clicking the `type` field.
+
+        # To avoid actually importing third-party packages like `torch` when
+        # importing a `type` object, we use `_parse_lazy_import` to parse the
+        # configuration file, which will not trigger the real import
+        # process, but simply parse the imported `type`s as LazyObject
+        # objects.
+
+        # The overall pipeline of _parse_lazy_import is:
+        # 1. Parse the base module from the config file.
+        #                       ||
+        #                       \/
+        #       base_module = ['mmdet.configs.default_runtime']
+        #                       ||
+        #                       \/
+        # 2. recursively parse the base module and gather imported objects to
+        #    a dict.
+        #                       ||
+        #                       \/
+        #       The base_dict will be:
+        #       {
+        #           'mmdet.configs.default_runtime': {...}
+        #           'mmdet.configs.retinanet_r50_fpn_1x_coco': {...}
+        #           ...
+        #       }, each item in base_dict is a dict of `LazyObject`
+        # 3. parse the current config file, filling the imported variables
+        #    with the base_dict.
+        #
+        # 4. During the parsing process, all imported variables will be
+        #    recorded in the `imported_names` set. These variables can be
+        #    accessed, but will not be dumped by default.
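+        # As a rough sketch (hypothetical config content), a file such as
+        #
+        #     with read_base():
+        #         from .._base_.default_runtime import *
+        #     from torch.optim import SGD
+        #     optimizer = dict(type=SGD, lr=0.01)
+        #
+        # is parsed here without importing `torch`: `SGD` becomes
+        # LazyObject('torch.optim', 'SGD') and the real import only happens
+        # if `build()` is later called on that object.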
+
+        with open(filename, encoding='utf-8') as f:
+            global_dict = {'LazyObject': LazyObject, '__file__': filename}
+            base_dict = {}
+
+            parsed_codes = ast.parse(f.read())
+            # Get the names of base modules, and remove the
+            # `with read_base():` statement.
+            base_modules = Config._get_base_modules(parsed_codes.body)
+            base_imported_names = set()
+            for base_module in base_modules:
+                # If base_module is a relative import, e.g. with level 2 the
+                # module is imported like "from ..a.b import c". We must
+                # ensure that c is an object defined in module b, and that
+                # module b is a single Python file rather than a package
+                # with an `__init__` file.
+                level = len(re.match(r'\.*', base_module).group())
+                if level > 0:
+                    # Relative import
+                    base_dir = osp.dirname(filename)
+                    module_path = osp.join(
+                        base_dir, *(['..'] * (level - 1)),
+                        f'{base_module[level:].replace(".", "/")}.py')
+                else:
+                    # Absolute import
+                    module_list = base_module.split('.')
+                    if len(module_list) == 1:
+                        raise ConfigParsingError(
+                            'The imported configuration file should not be '
+                            f'an independent package {module_list[0]}. Here '
+                            'is an example: '
+                            '`with read_base(): from mmdet.configs.retinanet_r50_fpn_1x_coco import *`'  # noqa: E501
+                        )
+                    else:
+                        package = module_list[0]
+                        root_path = get_installed_path(package)
+                        module_path = f'{osp.join(root_path, *module_list[1:])}.py'  # noqa: E501
+                if not osp.isfile(module_path):
+                    raise ConfigParsingError(
+                        f'{module_path} not found! It means that incorrect '
+                        'module is defined in '
+                        f'`with read_base(): from {base_module} import ...`, please '  # noqa: E501
+                        'make sure the base config module is valid '
+                        'and is consistent with the prior import '
+                        'logic')
+                _base_cfg_dict, _base_imported_names = Config._parse_lazy_import(  # noqa: E501
+                    module_path)
+                base_imported_names |= _base_imported_names
+                # The base_dict will be:
+                # {
+                #     'mmdet.configs.default_runtime': {...}
+                #     'mmdet.configs.retinanet_r50_fpn_1x_coco': {...}
+                #     ...
+                # }
+                base_dict[base_module] = _base_cfg_dict
+
+            # `base_dict` contains all the modules imported from the base
+            # configs. In order to collect the specific objects imported from
+            # the base configs before parsing the current file, we use an AST
+            # transformer to traverse the imported modules and merge them
+            # into the global dict. After the AST transformation, most import
+            # statements are removed (except for built-in imports) and
+            # replaced with `LazyObject` assignments.
+            transform = ImportTransformer(
+                global_dict=global_dict,
+                base_dict=base_dict,
+                filename=filename)
+            modified_code = transform.visit(parsed_codes)
+            modified_code, abs_imported = _gather_abs_import_lazyobj(
+                modified_code, filename=filename)
+            imported_names = transform.imported_obj | abs_imported
+            imported_names |= base_imported_names
+            modified_code = ast.fix_missing_locations(modified_code)
+            exec(
+                compile(modified_code, filename, mode='exec'), global_dict,
+                global_dict)
+
+            ret: dict = {}
+            for key, value in global_dict.items():
+                if key.startswith('__') or key in ['LazyObject']:
+                    continue
+                ret[key] = value
+            # convert dict to ConfigDict
+            cfg_dict = Config._dict_to_config_dict_lazy(ret)
+
+            return cfg_dict, imported_names
+
+    @staticmethod
+    def _dict_to_config_dict_lazy(cfg: dict):
+        """Recursively converts ``dict`` to :obj:`ConfigDict`.
+        The only difference between ``_dict_to_config_dict_lazy`` and
+        ``_dict_to_config_dict`` is that the former does not consider the
+        scope, and will not trigger the building of ``LazyObject``.
+
+        Args:
+            cfg (dict): Config dict.
+
+        Returns:
+            ConfigDict: Converted dict.
+        """
+        # Only the outer dict with key `type` should have the key `_scope_`.
+        if isinstance(cfg, dict):
+            cfg_dict = ConfigDict()
+            for key, value in cfg.items():
+                cfg_dict[key] = Config._dict_to_config_dict_lazy(value)
+            return cfg_dict
+        if isinstance(cfg, (tuple, list)):
+            return type(cfg)(
+                Config._dict_to_config_dict_lazy(_cfg) for _cfg in cfg)
+        return cfg
+
+    @staticmethod
+    def _dict_to_config_dict(cfg: dict,
+                             scope: Optional[str] = None,
+                             has_scope=True):
+        """Recursively converts ``dict`` to :obj:`ConfigDict`.
+
+        Args:
+            cfg (dict): Config dict.
+            scope (str, optional): Scope of instance.
+            has_scope (bool): Whether to add `_scope_` key to config dict.
+
+        Returns:
+            ConfigDict: Converted dict.
+        """
+        # Only the outer dict with key `type` should have the key `_scope_`.
+        if isinstance(cfg, dict):
+            if has_scope and 'type' in cfg:
+                has_scope = False
+                if scope is not None and cfg.get('_scope_', None) is None:
+                    cfg._scope_ = scope  # type: ignore
+            cfg = ConfigDict(cfg)
+            dict.__setattr__(cfg, 'scope', scope)
+            for key, value in cfg.items():
+                cfg[key] = Config._dict_to_config_dict(
+                    value, scope=scope, has_scope=has_scope)
+        elif isinstance(cfg, tuple):
+            cfg = tuple(
+                Config._dict_to_config_dict(_cfg, scope, has_scope=has_scope)
+                for _cfg in cfg)
+        elif isinstance(cfg, list):
+            cfg = [
+                Config._dict_to_config_dict(_cfg, scope, has_scope=has_scope)
+                for _cfg in cfg
+            ]
+        return cfg
+
+    @staticmethod
+    def _parse_scope(cfg: dict) -> None:
+        """Adds ``_scope_`` to a :obj:`ConfigDict` instance, which marks it
+        as a base variable.
+
+        If the config dict already has the scope, the scope will not be
+        overwritten.
+
+        Args:
+            cfg (dict): Config that needs to be parsed with scope.
+        """
+        if isinstance(cfg, ConfigDict):
+            cfg._scope_ = cfg.scope
+        elif isinstance(cfg, (tuple, list)):
+            [Config._parse_scope(value) for value in cfg]
+        else:
+            return
+
+    @staticmethod
+    def _get_base_files(filename: str) -> list:
+        """Get the base config files.
+
+        Args:
+            filename (str): The config file.
+
+        Raises:
+            ConfigParsingError: If the config file type is not supported.
+
+        Returns:
+            list: A list of base config files.
+        """
+        file_format = osp.splitext(filename)[1]
+        if file_format == '.py':
+            Config._validate_py_syntax(filename)
+            with open(filename, encoding='utf-8') as f:
+                parsed_codes = ast.parse(f.read()).body
+
+                def is_base_line(c):
+                    return (isinstance(c, ast.Assign)
+                            and isinstance(c.targets[0], ast.Name)
+                            and c.targets[0].id == BASE_KEY)
+
+                base_code = next((c for c in parsed_codes if is_base_line(c)),
+                                 None)
+                if base_code is not None:
+                    base_code = ast.Expression(  # type: ignore
+                        body=base_code.value)  # type: ignore
+                    base_files = eval(compile(base_code, '', mode='eval'))
+                else:
+                    base_files = []
+        elif file_format in ('.yml', '.yaml', '.json'):
+            import mmengine
+            cfg_dict = mmengine.load(filename)
+            base_files = cfg_dict.get(BASE_KEY, [])
+        else:
+            raise ConfigParsingError(
+                'The config type should be py, json, yaml or '
+                f'yml, but got {file_format}')
+        base_files = base_files if isinstance(base_files,
+                                              list) else [base_files]
+        return base_files
+
+    @staticmethod
+    def _get_cfg_path(cfg_path: str,
+                      filename: str) -> Tuple[str, Optional[str]]:
+        """Get the config path from the current or external package.
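+
+        A rough sketch of the two forms (hypothetical paths; the external
+        form additionally requires the target package to be installed, so
+        the calls are only indicated in comments):
+
+        Examples:
+            >>> # external config, resolved inside the installed package:
+            >>> # Config._get_cfg_path(
+            >>> #     'mmdet::retinanet/retinanet_r50_fpn_1x_coco.py',
+            >>> #     'demo.py')  # -> ('.../mmdet/...', 'mmdet')
+            >>> # local config, joined with the directory of `filename`:
+            >>> # Config._get_cfg_path('./base.py', '/tmp/demo.py')
+            >>> # -> ('/tmp/./base.py', None)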
+ + Args: + cfg_path (str): Relative path of config. + filename (str): The config file being parsed. + + Returns: + Tuple[str, str or None]: Path and scope of config. If the config + is not an external config, the scope will be `None`. + """ + if '::' in cfg_path: + # `cfg_path` startswith '::' means an external config path. + # Get package name and relative config path. + scope = cfg_path.partition('::')[0] + package, cfg_path = _get_package_and_cfg_path(cfg_path) + + if not is_installed(package): + raise ModuleNotFoundError( + f'{package} is not installed, please install {package} ' + f'manually') + + # Get installed package path. + package_path = get_installed_path(package) + try: + # Get config path from meta file. + cfg_path = _get_external_cfg_path(package_path, cfg_path) + except ValueError: + # Since base config does not have a metafile, it should be + # concatenated with package path and relative config path. + cfg_path = _get_external_cfg_base_path(package_path, cfg_path) + except FileNotFoundError as e: + raise e + return cfg_path, scope + else: + # Get local config path. + cfg_dir = osp.dirname(filename) + cfg_path = osp.join(cfg_dir, cfg_path) + return cfg_path, None + + @staticmethod + def _merge_a_into_b(a: dict, + b: dict, + allow_list_keys: bool = False) -> dict: + """merge dict ``a`` into dict ``b`` (non-inplace). + + Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid + in-place modifications. + + Args: + a (dict): The source dict to be merged into ``b``. + b (dict): The origin dict to be fetch keys from ``a``. + allow_list_keys (bool): If True, int string keys (e.g. '0', '1') + are allowed in source ``a`` and will replace the element of the + corresponding index in b if b is a list. Defaults to False. + + Returns: + dict: The modified dict of ``b`` using ``a``. + + Examples: + # Normally merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # Delete b first and merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # b is a list + >>> Config._merge_a_into_b( + ... {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) + [{'a': 2}, {'b': 2}] + """ + b = b.copy() + for k, v in a.items(): + if allow_list_keys and k.isdigit() and isinstance(b, list): + k = int(k) + if len(b) <= k: + raise KeyError(f'Index {k} exceeds the length of list {b}') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + elif isinstance(v, dict): + if k in b and not v.pop(DELETE_KEY, False): + allowed_types: Union[Tuple, type] = ( + dict, list) if allow_list_keys else dict + if not isinstance(b[k], allowed_types): + raise TypeError( + f'{k}={v} in child config cannot inherit from ' + f'base because {k} is a dict in the child config ' + f'but is of type {type(b[k])} in base config. 
' + f'You may set `{DELETE_KEY}=True` to ignore the ' + f'base config.') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + else: + b[k] = ConfigDict(v) + else: + b[k] = v + return b + + @staticmethod + def auto_argparser(description=None): + """Generate argparser from config file automatically (experimental)""" + partial_parser = ArgumentParser(description=description) + partial_parser.add_argument('config', help='config file path') + cfg_file = partial_parser.parse_known_args()[0].config + cfg = Config.fromfile(cfg_file) + parser = ArgumentParser(description=description) + parser.add_argument('config', help='config file path') + add_args(parser, cfg) + return parser, cfg + + @property + def filename(self) -> str: + """get file name of config.""" + return self._filename + + @property + def text(self) -> str: + """get config text.""" + return self._text + + @property + def env_variables(self) -> dict: + """get used environment variables.""" + return self._env_variables + + @property + def pretty_text(self) -> str: + """get formatted python config text.""" + + indent = 4 + + def _indent(s_, num_spaces): + s = s_.split('\n') + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(num_spaces * ' ') + line for line in s] + s = '\n'.join(s) + s = first + '\n' + s + return s + + def _format_basic_types(k, v, use_mapping=False): + if isinstance(v, str): + v_str = repr(v) + else: + v_str = str(v) + + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + + return attr_str + + def _format_list_tuple(k, v, use_mapping=False): + if isinstance(v, list): + left = '[' + right = ']' + else: + left = '(' + right = ')' + + v_str = f'{left}\n' + # check if all items in the list are dict + for item in v: + if isinstance(item, dict): + v_str += f'dict({_indent(_format_dict(item), indent)}),\n' + elif isinstance(item, tuple): + v_str += f'{_indent(_format_list_tuple(None, item), indent)},\n' # noqa: 501 + elif isinstance(item, list): + v_str += f'{_indent(_format_list_tuple(None, item), indent)},\n' # noqa: 501 + elif isinstance(item, str): + v_str += f'{_indent(repr(item), indent)},\n' + else: + v_str += str(item) + ',\n' + if k is None: + return _indent(v_str, indent) + right + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + right + return attr_str + + def _contain_invalid_identifier(dict_str): + contain_invalid_identifier = False + for key_name in dict_str: + contain_invalid_identifier |= \ + (not str(key_name).isidentifier()) + return contain_invalid_identifier + + def _format_dict(input_dict, outest_level=False): + r = '' + s = [] + + use_mapping = _contain_invalid_identifier(input_dict) + if use_mapping: + r += '{' + for idx, (k, v) in enumerate( + sorted(input_dict.items(), key=lambda x: str(x[0]))): + is_last = idx >= len(input_dict) - 1 + end = '' if outest_level or is_last else ',' + if isinstance(v, dict): + v_str = '\n' + _format_dict(v) + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: dict({v_str}' + else: + attr_str = f'{str(k)}=dict({v_str}' + attr_str = _indent(attr_str, indent) + ')' + end + elif isinstance(v, (list, tuple)): + attr_str = _format_list_tuple(k, v, use_mapping) + end + else: + attr_str = _format_basic_types(k, v, use_mapping) + end + + s.append(attr_str) + r += 
'\n'.join(s) + if use_mapping: + r += '}' + return r + + cfg_dict = self.to_dict() + text = _format_dict(cfg_dict, outest_level=True) + if self._format_python_code: + # copied from setup.cfg + yapf_style = dict( + based_on_style='pep8', + blank_line_before_nested_class_or_def=True, + split_before_expression_after_opening_paren=True) + try: + if digit_version(yapf.__version__) >= digit_version('0.40.2'): + text, _ = FormatCode(text, style_config=yapf_style) + else: + text, _ = FormatCode( + text, style_config=yapf_style, verify=True) + except: # noqa: E722 + raise SyntaxError('Failed to format the config file, please ' + f'check the syntax of: \n{text}') + return text + + def __repr__(self): + return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}' + + def __len__(self): + return len(self._cfg_dict) + + def __getattr__(self, name: str) -> Any: + return getattr(self._cfg_dict, name) + + def __getitem__(self, name): + return self._cfg_dict.__getitem__(name) + + def __setattr__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setattr__(name, value) + + def __setitem__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setitem__(name, value) + + def __iter__(self): + return iter(self._cfg_dict) + + def __getstate__( + self + ) -> Tuple[dict, Optional[str], Optional[str], dict, bool, set]: + state = (self._cfg_dict, self._filename, self._text, + self._env_variables, self._format_python_code, + self._imported_names) + return state + + def __deepcopy__(self, memo): + cls = self.__class__ + other = cls.__new__(cls) + memo[id(self)] = other + + for key, value in self.__dict__.items(): + super(Config, other).__setattr__(key, copy.deepcopy(value, memo)) + + return other + + def __copy__(self): + cls = self.__class__ + other = cls.__new__(cls) + other.__dict__.update(self.__dict__) + super(Config, other).__setattr__('_cfg_dict', self._cfg_dict.copy()) + + return other + + copy = __copy__ + + def __setstate__(self, state: Tuple[dict, Optional[str], Optional[str], + dict, bool, set]): + super().__setattr__('_cfg_dict', state[0]) + super().__setattr__('_filename', state[1]) + super().__setattr__('_text', state[2]) + super().__setattr__('_env_variables', state[3]) + super().__setattr__('_format_python_code', state[4]) + super().__setattr__('_imported_names', state[5]) + + def dump(self, file: Optional[Union[str, Path]] = None): + """Dump config to file or return config text. + + Args: + file (str or Path, optional): If not specified, then the object + is dumped to a str, otherwise to a file specified by the filename. + Defaults to None. + + Returns: + str or None: Config text. + """ + file = str(file) if isinstance(file, Path) else file + cfg_dict = self.to_dict() + if file is None: + if self.filename is None or self.filename.endswith('.py'): + return self.pretty_text + else: + file_format = self.filename.split('.')[-1] + return dump(cfg_dict, file_format=file_format) + elif file.endswith('.py'): + with open(file, 'w', encoding='utf-8') as f: + f.write(self.pretty_text) + else: + file_format = file.split('.')[-1] + return dump(cfg_dict, file=file, file_format=file_format) + + def merge_from_dict(self, + options: dict, + allow_list_keys: bool = True) -> None: + """Merge list into cfg_dict. + + Merge the dict parsed by MultipleKVAction into this cfg. + + Args: + options (dict): dict of configs to merge from. + allow_list_keys (bool): If True, int string keys (e.g. 
'0', '1') + are allowed in ``options`` and will replace the element of the + corresponding index in the config if the config is a list. + Defaults to True. + + Examples: + >>> from mmengine import Config + >>> # Merge dictionary element + >>> options = {'model.backbone.depth': 50, 'model.backbone.with_cp': True} + >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet')))) + >>> cfg.merge_from_dict(options) + >>> cfg._cfg_dict + {'model': {'backbone': {'type': 'ResNet', 'depth': 50, 'with_cp': True}}} + >>> # Merge list element + >>> cfg = Config( + >>> dict(pipeline=[dict(type='LoadImage'), + >>> dict(type='LoadAnnotations')])) + >>> options = dict(pipeline={'0': dict(type='SelfLoadImage')}) + >>> cfg.merge_from_dict(options, allow_list_keys=True) + >>> cfg._cfg_dict + {'pipeline': [{'type': 'SelfLoadImage'}, {'type': 'LoadAnnotations'}]} + """ # noqa: E501 + option_cfg_dict: dict = {} + for full_key, v in options.items(): + d = option_cfg_dict + key_list = full_key.split('.') + for subkey in key_list[:-1]: + d.setdefault(subkey, ConfigDict()) + d = d[subkey] + subkey = key_list[-1] + d[subkey] = v + + cfg_dict = super().__getattribute__('_cfg_dict') + super().__setattr__( + '_cfg_dict', + Config._merge_a_into_b( + option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys)) + + @staticmethod + def diff(cfg1: Union[str, 'Config'], cfg2: Union[str, 'Config']) -> str: + if isinstance(cfg1, str): + cfg1 = Config.fromfile(cfg1) + + if isinstance(cfg2, str): + cfg2 = Config.fromfile(cfg2) + + res = difflib.unified_diff( + cfg1.pretty_text.split('\n'), cfg2.pretty_text.split('\n')) + + # Convert into rich format for better visualization + console = Console() + text = Text() + for line in res: + if line.startswith('+'): + color = 'bright_green' + elif line.startswith('-'): + color = 'bright_red' + else: + color = 'bright_white' + _text = Text(line + '\n') + _text.stylize(color) + text.append(_text) + + with console.capture() as capture: + console.print(text) + + return capture.get() + + @staticmethod + def _is_lazy_import(filename: str) -> bool: + if not filename.endswith('.py'): + return False + with open(filename, encoding='utf-8') as f: + codes_str = f.read() + parsed_codes = ast.parse(codes_str) + for node in ast.walk(parsed_codes): + if (isinstance(node, ast.Assign) + and isinstance(node.targets[0], ast.Name) + and node.targets[0].id == BASE_KEY): + return False + + if isinstance(node, ast.With): + expr = node.items[0].context_expr + if (not isinstance(expr, ast.Call) + or not expr.func.id == 'read_base'): # type: ignore + raise ConfigParsingError( + 'Only `read_base` context manager can be used in the ' + 'config') + return True + if isinstance(node, ast.ImportFrom): + # relative import -> lazy_import + if node.level != 0: + return True + # Skip checking when using `mmengine.config` in cfg file + if (node.module == 'mmengine' and len(node.names) == 1 + and node.names[0].name == 'Config'): + continue + if not isinstance(node.module, str): + continue + # non-builtin module -> lazy_import + if not _is_builtin_module(node.module): + return True + if isinstance(node, ast.Import): + for alias_node in node.names: + if not _is_builtin_module(alias_node.name): + return True + return False + + def _to_lazy_dict(self, keep_imported: bool = False) -> dict: + """Convert config object to dictionary with lazy object, and filter the + imported object.""" + res = self._cfg_dict._to_lazy_dict() + if hasattr(self, '_imported_names') and not keep_imported: + res = { + key: value + for key, value in 
res.items() + if key not in self._imported_names + } + return res + + def to_dict(self, keep_imported: bool = False): + """Convert all data in the config to a builtin ``dict``. + + Args: + keep_imported (bool): Whether to keep the imported field. + Defaults to False + + If you import third-party objects in the config file, all imported + objects will be converted to a string like ``torch.optim.SGD`` + """ + cfg_dict = self._cfg_dict.to_dict() + if hasattr(self, '_imported_names') and not keep_imported: + cfg_dict = { + key: value + for key, value in cfg_dict.items() + if key not in self._imported_names + } + return cfg_dict + + +class DictAction(Action): + """ + argparse action to split an argument into KEY=VALUE form + on the first = and append to a dictionary. List options can + be passed as comma separated values, i.e 'KEY=V1,V2,V3', or with explicit + brackets, i.e. 'KEY=[V1,V2,V3]'. It also support nested brackets to build + list/tuple values. e.g. 'KEY=[(V1,V2),(V3,V4)]' + """ + + @staticmethod + def _parse_int_float_bool(val: str) -> Union[int, float, bool, Any]: + """parse int/float/bool value in the string.""" + try: + return int(val) + except ValueError: + pass + try: + return float(val) + except ValueError: + pass + if val.lower() in ['true', 'false']: + return True if val.lower() == 'true' else False + if val == 'None': + return None + return val + + @staticmethod + def _parse_iterable(val: str) -> Union[list, tuple, Any]: + """Parse iterable values in the string. + + All elements inside '()' or '[]' are treated as iterable values. + + Args: + val (str): Value string. + + Returns: + list | tuple | Any: The expanded list or tuple from the string, + or single value if no iterable values are found. + + Examples: + >>> DictAction._parse_iterable('1,2,3') + [1, 2, 3] + >>> DictAction._parse_iterable('[a, b, c]') + ['a', 'b', 'c'] + >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]') + [(1, 2, 3), ['a', 'b'], 'c'] + """ + + def find_next_comma(string): + """Find the position of next comma in the string. + + If no ',' is found in the string, return the string length. All + chars inside '()' and '[]' are treated as one element and thus ',' + inside these brackets are ignored. + """ + assert (string.count('(') == string.count(')')) and ( + string.count('[') == string.count(']')), \ + f'Imbalanced brackets exist in {string}' + end = len(string) + for idx, char in enumerate(string): + pre = string[:idx] + # The string before this ',' is balanced + if ((char == ',') and (pre.count('(') == pre.count(')')) + and (pre.count('[') == pre.count(']'))): + end = idx + break + return end + + # Strip ' and " characters and replace whitespace. + val = val.strip('\'\"').replace(' ', '') + is_tuple = False + if val.startswith('(') and val.endswith(')'): + is_tuple = True + val = val[1:-1] + elif val.startswith('[') and val.endswith(']'): + val = val[1:-1] + elif ',' not in val: + # val is a single value + return DictAction._parse_int_float_bool(val) + + values = [] + while len(val) > 0: + comma_idx = find_next_comma(val) + element = DictAction._parse_iterable(val[:comma_idx]) + values.append(element) + val = val[comma_idx + 1:] + + if is_tuple: + return tuple(values) + + return values + + def __call__(self, + parser: ArgumentParser, + namespace: Namespace, + values: Union[str, Sequence[Any], None], + option_string: str = None): + """Parse Variables in string and add them into argparser. + + Args: + parser (ArgumentParser): Argument parser. + namespace (Namespace): Argument namespace. 
+ values (Union[str, Sequence[Any], None]): Argument string. + option_string (list[str], optional): Option string. + Defaults to None. + """ + # Copied behavior from `argparse._ExtendAction`. + options = copy.copy(getattr(namespace, self.dest, None) or {}) + if values is not None: + for kv in values: + key, val = kv.split('=', maxsplit=1) + options[key] = self._parse_iterable(val) + setattr(namespace, self.dest, options) + + +@contextmanager +def read_base(): + """Context manager to mark the base config. + + The pure Python-style configuration file allows you to use the import + syntax. However, it is important to note that you need to import the base + configuration file within the context of ``read_base``, and import other + dependencies outside of it. + + You can see more usage of Python-style configuration in the `tutorial`_ + + .. _tutorial: https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#a-pure-python-style-configuration-file-beta + """ # noqa: E501 + yield diff --git a/head_extractor/build/lib/mmengine/config/lazy.py b/head_extractor/build/lib/mmengine/config/lazy.py new file mode 100644 index 0000000000000000000000000000000000000000..e83cce7c89cbd919def774addaaac8186c10e989 --- /dev/null +++ b/head_extractor/build/lib/mmengine/config/lazy.py @@ -0,0 +1,241 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import importlib +from typing import Any, Optional, Union + +from mmengine.utils import is_seq_of + + +class LazyObject: + """LazyObject is used to lazily initialize the imported module during + parsing the configuration file. + + During parsing process, the syntax like: + + Examples: + >>> import torch.nn as nn + >>> from mmdet.models import RetinaNet + >>> import mmcls.models + >>> import mmcls.datasets + >>> import mmcls + + Will be parsed as: + + Examples: + >>> # import torch.nn as nn + >>> nn = lazyObject('torch.nn') + >>> # from mmdet.models import RetinaNet + >>> RetinaNet = lazyObject('mmdet.models', 'RetinaNet') + >>> # import mmcls.models; import mmcls.datasets; import mmcls + >>> mmcls = lazyObject(['mmcls', 'mmcls.datasets', 'mmcls.models']) + + ``LazyObject`` records all module information and will be further + referenced by the configuration file. + + Args: + module (str or list or tuple): The module name to be imported. + imported (str, optional): The imported module name. Defaults to None. + location (str, optional): The filename and line number of the imported + module statement happened. + """ + + def __init__(self, + module: Union[str, list, tuple], + imported: Optional[str] = None, + location: Optional[str] = None): + if not isinstance(module, str) and not is_seq_of(module, str): + raise TypeError('module should be `str`, `list`, or `tuple`' + f'but got {type(module)}, this might be ' + 'a bug of MMEngine, please report it to ' + 'https://github.com/open-mmlab/mmengine/issues') + self._module: Union[str, list, tuple] = module + + if not isinstance(imported, str) and imported is not None: + raise TypeError('imported should be `str` or None, but got ' + f'{type(imported)}, this might be ' + 'a bug of MMEngine, please report it to ' + 'https://github.com/open-mmlab/mmengine/issues') + self._imported = imported + self.location = location + + def build(self) -> Any: + """Return imported object. 
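+
+        A minimal doctest-style sketch using only the standard library, so
+        the deferred import really resolves:
+
+        Examples:
+            >>> lazy_join = LazyObject('os.path', 'join')
+            >>> lazy_join.build()('a', 'b')  # 'a/b' on POSIX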
+ + Returns: + Any: Imported object + """ + if isinstance(self._module, str): + try: + module = importlib.import_module(self._module) + except Exception as e: + raise type(e)(f'Failed to import {self._module} ' + f'in {self.location} for {e}') + + if self._imported is not None: + if hasattr(module, self._imported): + module = getattr(module, self._imported) + else: + raise ImportError( + f'Failed to import {self._imported} ' + f'from {self._module} in {self.location}') + + return module + else: + # import xxx.xxx + # import xxx.yyy + # import xxx.zzz + # return imported xxx + try: + for module in self._module: + importlib.import_module(module) # type: ignore + module_name = self._module[0].split('.')[0] + return importlib.import_module(module_name) + except Exception as e: + raise type(e)(f'Failed to import {self.module} ' + f'in {self.location} for {e}') + + @property + def module(self): + if isinstance(self._module, str): + return self._module + return self._module[0].split('.')[0] + + def __call__(self, *args, **kwargs): + raise RuntimeError() + + def __deepcopy__(self, memo): + return LazyObject(self._module, self._imported, self.location) + + def __getattr__(self, name): + # Cannot locate the line number of the getting attribute. + # Therefore only record the filename. + if self.location is not None: + location = self.location.split(', line')[0] + else: + location = self.location + return LazyAttr(name, self, location) + + def __str__(self) -> str: + if self._imported is not None: + return self._imported + return self.module + + __repr__ = __str__ + + # `pickle.dump` will try to get the `__getstate__` and `__setstate__` + # methods of the dumped object. If these two methods are not defined, + # LazyObject will return a `__getstate__` LazyObject` or `__setstate__` + # LazyObject. + def __getstate__(self): + return self.__dict__ + + def __setstate__(self, state): + self.__dict__ = state + + +class LazyAttr: + """The attribute of the LazyObject. + + When parsing the configuration file, the imported syntax will be + parsed as the assignment ``LazyObject``. During the subsequent parsing + process, users may reference the attributes of the LazyObject. + To ensure that these attributes also contain information needed to + reconstruct the attribute itself, LazyAttr was introduced. + + Examples: + >>> models = LazyObject(['mmdet.models']) + >>> model = dict(type=models.RetinaNet) + >>> print(type(model['type'])) # + >>> print(model['type'].build()) # + """ # noqa: E501 + + def __init__(self, + name: str, + source: Union['LazyObject', 'LazyAttr'], + location=None): + self.name = name + self.source: Union[LazyAttr, LazyObject] = source + + if isinstance(self.source, LazyObject): + if isinstance(self.source._module, str): + if self.source._imported is None: + # source code: + # from xxx.yyy import zzz + # equivalent code: + # zzz = LazyObject('xxx.yyy', 'zzz') + # The source code of get attribute: + # eee = zzz.eee + # Then, `eee._module` should be "xxx.yyy.zzz" + self._module = self.source._module + else: + # source code: + # import xxx.yyy as zzz + # equivalent code: + # zzz = LazyObject('xxx.yyy') + # The source code of get attribute: + # eee = zzz.eee + # Then, `eee._module` should be "xxx.yyy" + self._module = f'{self.source._module}.{self.source}' + else: + # The source code of LazyObject should be + # 1. import xxx.yyy + # 2. 
import xxx.zzz + # Equivalent to + # xxx = LazyObject(['xxx.yyy', 'xxx.zzz']) + + # The source code of LazyAttr should be + # eee = xxx.eee + # Then, eee._module = xxx + self._module = str(self.source) + elif isinstance(self.source, LazyAttr): + # 1. import xxx + # 2. zzz = xxx.yyy.zzz + + # Equivalent to: + # xxx = LazyObject('xxx') + # zzz = xxx.yyy.zzz + # zzz._module = xxx.yyy._module + zzz.name + self._module = f'{self.source._module}.{self.source.name}' + self.location = location + + @property + def module(self): + return self._module + + def __call__(self, *args, **kwargs: Any) -> Any: + raise RuntimeError() + + def __getattr__(self, name: str) -> 'LazyAttr': + return LazyAttr(name, self) + + def __deepcopy__(self, memo): + return LazyAttr(self.name, self.source) + + def build(self) -> Any: + """Return the attribute of the imported object. + + Returns: + Any: attribute of the imported object. + """ + obj = self.source.build() + try: + return getattr(obj, self.name) + except AttributeError: + raise ImportError(f'Failed to import {self.module}.{self.name} in ' + f'{self.location}') + except ImportError as e: + raise e + + def __str__(self) -> str: + return self.name + + __repr__ = __str__ + + # `pickle.dump` will try to get the `__getstate__` and `__setstate__` + # methods of the dumped object. If these two methods are not defined, + # LazyAttr will return a `__getstate__` LazyAttr` or `__setstate__` + # LazyAttr. + def __getstate__(self): + return self.__dict__ + + def __setstate__(self, state): + self.__dict__ = state diff --git a/head_extractor/build/lib/mmengine/config/utils.py b/head_extractor/build/lib/mmengine/config/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..81b58fb49a06090047e94f665ded8890e093ba1e --- /dev/null +++ b/head_extractor/build/lib/mmengine/config/utils.py @@ -0,0 +1,469 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import ast +import os.path as osp +import re +import sys +import warnings +from collections import defaultdict +from importlib.util import find_spec +from typing import List, Optional, Tuple, Union + +from mmengine.fileio import load +from mmengine.utils import check_file_exist + +PYTHON_ROOT_DIR = osp.dirname(osp.dirname(sys.executable)) +SYSTEM_PYTHON_PREFIX = '/usr/lib/python' + +MODULE2PACKAGE = { + 'mmcls': 'mmcls', + 'mmdet': 'mmdet', + 'mmdet3d': 'mmdet3d', + 'mmseg': 'mmsegmentation', + 'mmaction': 'mmaction2', + 'mmtrack': 'mmtrack', + 'mmpose': 'mmpose', + 'mmedit': 'mmedit', + 'mmocr': 'mmocr', + 'mmgen': 'mmgen', + 'mmfewshot': 'mmfewshot', + 'mmrazor': 'mmrazor', + 'mmflow': 'mmflow', + 'mmhuman3d': 'mmhuman3d', + 'mmrotate': 'mmrotate', + 'mmselfsup': 'mmselfsup', + 'mmyolo': 'mmyolo', + 'mmpretrain': 'mmpretrain', + 'mmagic': 'mmagic', +} + +# PKG2PROJECT is not a proper name to represent the mapping between module name +# (module import from) and package name (used by pip install). Therefore, +# PKG2PROJECT will be deprecated and this alias will only be kept until +# MMEngine v1.0.0 +PKG2PROJECT = MODULE2PACKAGE + + +class ConfigParsingError(RuntimeError): + """Raise error when failed to parse pure Python style config files.""" + + +def _get_cfg_metainfo(package_path: str, cfg_path: str) -> dict: + """Get target meta information from all 'metafile.yml' defined in `mode- + index.yml` of external package. + + Args: + package_path (str): Path of external package. + cfg_path (str): Name of experiment config. + + Returns: + dict: Meta information of target experiment. 
+ """ + meta_index_path = osp.join(package_path, '.mim', 'model-index.yml') + meta_index = load(meta_index_path) + cfg_dict = dict() + for meta_path in meta_index['Import']: + meta_path = osp.join(package_path, '.mim', meta_path) + cfg_meta = load(meta_path) + for model_cfg in cfg_meta['Models']: + if 'Config' not in model_cfg: + warnings.warn(f'There is not `Config` define in {model_cfg}') + continue + cfg_name = model_cfg['Config'].partition('/')[-1] + # Some config could have multiple weights, we only pick the + # first one. + if cfg_name in cfg_dict: + continue + cfg_dict[cfg_name] = model_cfg + if cfg_path not in cfg_dict: + raise ValueError(f'Expected configs: {cfg_dict.keys()}, but got ' + f'{cfg_path}') + return cfg_dict[cfg_path] + + +def _get_external_cfg_path(package_path: str, cfg_file: str) -> str: + """Get config path of external package. + + Args: + package_path (str): Path of external package. + cfg_file (str): Name of experiment config. + + Returns: + str: Absolute config path from external package. + """ + cfg_file = cfg_file.split('.')[0] + model_cfg = _get_cfg_metainfo(package_path, cfg_file) + cfg_path = osp.join(package_path, model_cfg['Config']) + check_file_exist(cfg_path) + return cfg_path + + +def _get_external_cfg_base_path(package_path: str, cfg_name: str) -> str: + """Get base config path of external package. + + Args: + package_path (str): Path of external package. + cfg_name (str): External relative config path with 'package::'. + + Returns: + str: Absolute config path from external package. + """ + cfg_path = osp.join(package_path, '.mim', 'configs', cfg_name) + check_file_exist(cfg_path) + return cfg_path + + +def _get_package_and_cfg_path(cfg_path: str) -> Tuple[str, str]: + """Get package name and relative config path. + + Args: + cfg_path (str): External relative config path with 'package::'. + + Returns: + Tuple[str, str]: Package name and config path. + """ + if re.match(r'\w*::\w*/\w*', cfg_path) is None: + raise ValueError( + '`_get_package_and_cfg_path` is used for get external package, ' + 'please specify the package name and relative config path, just ' + 'like `mmdet::faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py`') + package_cfg = cfg_path.split('::') + if len(package_cfg) > 2: + raise ValueError('`::` should only be used to separate package and ' + 'config name, but found multiple `::` in ' + f'{cfg_path}') + package, cfg_path = package_cfg + assert package in MODULE2PACKAGE, ( + f'mmengine does not support to load {package} config.') + package = MODULE2PACKAGE[package] + return package, cfg_path + + +class RemoveAssignFromAST(ast.NodeTransformer): + """Remove Assign node if the target's name match the key. + + Args: + key (str): The target name of the Assign node. + """ + + def __init__(self, key): + self.key = key + + def visit_Assign(self, node): + if (isinstance(node.targets[0], ast.Name) + and node.targets[0].id == self.key): + return None + else: + return node + + +def _is_builtin_module(module_name: str) -> bool: + """Check if a module is a built-in module. + + Arg: + module_name: name of module. 
+ """ + if module_name.startswith('.'): + return False + if module_name.startswith('mmengine.config'): + return True + if module_name in sys.builtin_module_names: + return True + spec = find_spec(module_name.split('.')[0]) + # Module not found + if spec is None: + return False + origin_path = getattr(spec, 'origin', None) + if origin_path is None: + return False + origin_path = osp.abspath(origin_path) + if ('site-package' in origin_path or 'dist-package' in origin_path + or not origin_path.startswith( + (PYTHON_ROOT_DIR, SYSTEM_PYTHON_PREFIX))): + return False + else: + return True + + +class ImportTransformer(ast.NodeTransformer): + """Convert the import syntax to the assignment of + :class:`mmengine.config.LazyObject` and preload the base variable before + parsing the configuration file. + + Since you are already looking at this part of the code, I believe you must + be interested in the mechanism of the ``lazy_import`` feature of + :class:`Config`. In this docstring, we will dive deeper into its + principles. + + Most of OpenMMLab users maybe bothered with that: + + * In most of popular IDEs, they cannot navigate to the source code in + configuration file + * In most of popular IDEs, they cannot jump to the base file in current + configuration file, which is much painful when the inheritance + relationship is complex. + + In order to solve this problem, we introduce the ``lazy_import`` mode. + + A very intuitive idea for solving this problem is to import the module + corresponding to the "type" field using the ``import`` syntax. Similarly, + we can also ``import`` base file. + + However, this approach has a significant drawback. It requires triggering + the import logic to parse the configuration file, which can be + time-consuming. Additionally, it implies downloading numerous dependencies + solely for the purpose of parsing the configuration file. + However, it's possible that only a portion of the config will actually be + used. For instance, the package used in the ``train_pipeline`` may not + be necessary for an evaluation task. Forcing users to download these + unused packages is not a desirable solution. + + To avoid this problem, we introduce :class:`mmengine.config.LazyObject` and + :class:`mmengine.config.LazyAttr`. Before we proceed with further + explanations, you may refer to the documentation of these two modules to + gain an understanding of their functionalities. + + Actually, one of the functions of ``ImportTransformer`` is to hack the + ``import`` syntax. It will replace the import syntax + (exclude import the base files) with the assignment of ``LazyObject``. + + As for the import syntax of the base file, we cannot lazy import it since + we're eager to merge the fields of current file and base files. Therefore, + another function of the ``ImportTransformer`` is to collaborate with + ``Config._parse_lazy_import`` to parse the base files. + + Args: + global_dict (dict): The global dict of the current configuration file. + If we divide ordinary Python syntax into two parts, namely the + import section and the non-import section (assuming a simple case + with imports at the beginning and the rest of the code following), + the variables generated by the import statements are stored in + global variables for subsequent code use. In this context, + the ``global_dict`` represents the global variables required when + executing the non-import code. ``global_dict`` will be filled + during visiting the parsed code. + base_dict (dict): All variables defined in base files. 
+ + Examples: + >>> from mmengine.config import read_base + >>> + >>> + >>> with read_base(): + >>> from .._base_.default_runtime import * + >>> from .._base_.datasets.coco_detection import dataset + + In this case, the base_dict will be: + + Examples: + >>> base_dict = { + >>> '.._base_.default_runtime': ... + >>> '.._base_.datasets.coco_detection': dataset} + + and `global_dict` will be updated like this: + + Examples: + >>> global_dict.update(base_dict['.._base_.default_runtime']) # `import *` means update all data + >>> global_dict.update(dataset=base_dict['.._base_.datasets.coco_detection']['dataset']) # only update `dataset` + """ # noqa: E501 + + def __init__(self, + global_dict: dict, + base_dict: Optional[dict] = None, + filename: Optional[str] = None): + self.base_dict = base_dict if base_dict is not None else {} + self.global_dict = global_dict + # In Windows, the filename could be like this: + # "C:\\Users\\runneradmin\\AppData\\Local\\" + # Although it has been an raw string, ast.parse will firstly escape + # it as the executed code: + # "C:\Users\runneradmin\AppData\Local\\\" + # As you see, the `\U` will be treated as a part of + # the escape sequence during code parsing, leading to an + # parsing error + # Here we use `encode('unicode_escape').decode()` for double escaping + if isinstance(filename, str): + filename = filename.encode('unicode_escape').decode() + self.filename = filename + self.imported_obj: set = set() + super().__init__() + + def visit_ImportFrom( + self, node: ast.ImportFrom + ) -> Optional[Union[List[ast.Assign], ast.ImportFrom]]: + """Hack the ``from ... import ...`` syntax and update the global_dict. + + Examples: + >>> from mmdet.models import RetinaNet + + Will be parsed as: + + Examples: + >>> RetinaNet = lazyObject('mmdet.models', 'RetinaNet') + + ``global_dict`` will also be updated by ``base_dict`` as the + class docstring says. + + Args: + node (ast.AST): The node of the current import statement. + + Returns: + Optional[List[ast.Assign]]: There three cases: + + * If the node is a statement of importing base files. + None will be returned. + * If the node is a statement of importing a builtin module, + node will be directly returned + * Otherwise, it will return the assignment statements of + ``LazyObject``. 
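+
+        For instance (``mmdet`` is a hypothetical example here), the
+        statement ``from mmdet.models import RetinaNet`` is rewritten into
+        roughly the following assignment before execution:
+
+        Examples:
+            >>> # RetinaNet = LazyObject(
+            >>> #     'mmdet.models', 'RetinaNet', '<filename>, line <n>')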
+ """ + # Built-in modules will not be parsed as LazyObject + module = f'{node.level*"."}{node.module}' + if _is_builtin_module(module): + # Make sure builtin module will be added into `self.imported_obj` + for alias in node.names: + if alias.asname is not None: + self.imported_obj.add(alias.asname) + elif alias.name == '*': + raise ConfigParsingError( + 'Cannot import * from non-base config') + else: + self.imported_obj.add(alias.name) + return node + + if module in self.base_dict: + for alias_node in node.names: + if alias_node.name == '*': + self.global_dict.update(self.base_dict[module]) + return None + if alias_node.asname is not None: + base_key = alias_node.asname + else: + base_key = alias_node.name + self.global_dict[base_key] = self.base_dict[module][ + alias_node.name] + return None + + nodes: List[ast.Assign] = [] + for alias_node in node.names: + # `ast.alias` has lineno attr after Python 3.10, + if hasattr(alias_node, 'lineno'): + lineno = alias_node.lineno + else: + lineno = node.lineno + if alias_node.name == '*': + # TODO: If users import * from a non-config module, it should + # fallback to import the real module and raise a warning to + # remind users the real module will be imported which will slow + # down the parsing speed. + raise ConfigParsingError( + 'Illegal syntax in config! `from xxx import *` is not ' + 'allowed to appear outside the `if base:` statement') + elif alias_node.asname is not None: + # case1: + # from mmengine.dataset import BaseDataset as Dataset -> + # Dataset = LazyObject('mmengine.dataset', 'BaseDataset') + code = f'{alias_node.asname} = LazyObject("{module}", "{alias_node.name}", "{self.filename}, line {lineno}")' # noqa: E501 + self.imported_obj.add(alias_node.asname) + else: + # case2: + # from mmengine.model import BaseModel + # BaseModel = LazyObject('mmengine.model', 'BaseModel') + code = f'{alias_node.name} = LazyObject("{module}", "{alias_node.name}", "{self.filename}, line {lineno}")' # noqa: E501 + self.imported_obj.add(alias_node.name) + try: + nodes.append(ast.parse(code).body[0]) # type: ignore + except Exception as e: + raise ConfigParsingError( + f'Cannot import {alias_node} from {module}' + '1. Cannot import * from 3rd party lib in the config ' + 'file\n' + '2. Please check if the module is a base config which ' + 'should be added to `_base_`\n') from e + return nodes + + def visit_Import(self, node) -> Union[ast.Assign, ast.Import]: + """Work with ``_gather_abs_import_lazyobj`` to hack the ``import ...`` + syntax. + + Examples: + >>> import mmcls.models + >>> import mmcls.datasets + >>> import mmcls + + Will be parsed as: + + Examples: + >>> # import mmcls.models; import mmcls.datasets; import mmcls + >>> mmcls = lazyObject(['mmcls', 'mmcls.datasets', 'mmcls.models']) + + Args: + node (ast.AST): The node of the current import statement. + + Returns: + ast.Assign: If the import statement is ``import ... as ...``, + ast.Assign will be returned, otherwise node will be directly + returned. + """ + # For absolute import like: `import mmdet.configs as configs`. + # It will be parsed as: + # configs = LazyObject('mmdet.configs') + # For absolute import like: + # `import mmdet.configs` + # `import mmdet.configs.default_runtime` + # This will be parsed as + # mmdet = LazyObject(['mmdet.configs.default_runtime', 'mmdet.configs]) + # However, visit_Import cannot gather other import information, so + # `_gather_abs_import_LazyObject` will gather all import information + # from the same module and construct the LazyObject. 
+ alias_list = node.names + assert len(alias_list) == 1, ( + 'Illegal syntax in config! import multiple modules in one line is ' + 'not supported') + # TODO Support multiline import + alias = alias_list[0] + if alias.asname is not None: + self.imported_obj.add(alias.asname) + if _is_builtin_module(alias.name.split('.')[0]): + return node + return ast.parse( # type: ignore + f'{alias.asname} = LazyObject(' + f'"{alias.name}",' + f'location="{self.filename}, line {node.lineno}")').body[0] + return node + + +def _gather_abs_import_lazyobj(tree: ast.Module, + filename: Optional[str] = None): + """Experimental implementation of gathering absolute import information.""" + if isinstance(filename, str): + filename = filename.encode('unicode_escape').decode() + imported = defaultdict(list) + abs_imported = set() + new_body: List[ast.stmt] = [] + # module2node is used to get lineno when Python < 3.10 + module2node: dict = dict() + for node in tree.body: + if isinstance(node, ast.Import): + for alias in node.names: + # Skip converting built-in module to LazyObject + if _is_builtin_module(alias.name): + new_body.append(node) + continue + module = alias.name.split('.')[0] + module2node.setdefault(module, node) + imported[module].append(alias) + continue + new_body.append(node) + + for key, value in imported.items(): + names = [_value.name for _value in value] + if hasattr(value[0], 'lineno'): + lineno = value[0].lineno + else: + lineno = module2node[key].lineno + lazy_module_assign = ast.parse( + f'{key} = LazyObject({names}, location="{filename}, line {lineno}")' # noqa: E501 + ) # noqa: E501 + abs_imported.add(key) + new_body.insert(0, lazy_module_assign.body[0]) + tree.body = new_body + return tree, abs_imported diff --git a/head_extractor/build/lib/mmengine/dataset/__init__.py b/head_extractor/build/lib/mmengine/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c58ef983f4af79aa1f29a24e42f4d20f1089b133 --- /dev/null +++ b/head_extractor/build/lib/mmengine/dataset/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_dataset import BaseDataset, Compose, force_full_init +from .dataset_wrapper import ClassBalancedDataset, ConcatDataset, RepeatDataset +from .sampler import DefaultSampler, InfiniteSampler +from .utils import (COLLATE_FUNCTIONS, default_collate, pseudo_collate, + worker_init_fn) + +__all__ = [ + 'BaseDataset', 'Compose', 'force_full_init', 'ClassBalancedDataset', + 'ConcatDataset', 'RepeatDataset', 'DefaultSampler', 'InfiniteSampler', + 'worker_init_fn', 'pseudo_collate', 'COLLATE_FUNCTIONS', 'default_collate' +] diff --git a/head_extractor/build/lib/mmengine/dataset/base_dataset.py b/head_extractor/build/lib/mmengine/dataset/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4622f146a5884367a8475494a9967d004aa89b8d --- /dev/null +++ b/head_extractor/build/lib/mmengine/dataset/base_dataset.py @@ -0,0 +1,826 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import functools +import gc +import logging +import pickle +from collections.abc import Mapping +from typing import Any, Callable, List, Optional, Sequence, Tuple, Union + +import numpy as np +from torch.utils.data import Dataset + +from mmengine.config import Config +from mmengine.fileio import join_path, list_from_file, load +from mmengine.logging import print_log +from mmengine.registry import TRANSFORMS +from mmengine.utils import is_abs + + +class Compose: + """Compose multiple transforms sequentially. 
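+
+    A minimal runnable sketch using plain callables, so no registry lookup
+    is involved:
+
+    Examples:
+        >>> def add_one(data):
+        ...     data['x'] += 1
+        ...     return data
+        >>> pipeline = Compose([add_one, add_one])
+        >>> pipeline(dict(x=0))
+        {'x': 2}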
+ + Args: + transforms (Sequence[dict, callable], optional): Sequence of transform + object or config dict to be composed. + """ + + def __init__(self, transforms: Optional[Sequence[Union[dict, Callable]]]): + self.transforms: List[Callable] = [] + + if transforms is None: + transforms = [] + + for transform in transforms: + # `Compose` can be built with config dict with type and + # corresponding arguments. + if isinstance(transform, dict): + transform = TRANSFORMS.build(transform) + if not callable(transform): + raise TypeError(f'transform should be a callable object, ' + f'but got {type(transform)}') + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError( + f'transform must be a callable object or dict, ' + f'but got {type(transform)}') + + def __call__(self, data: dict) -> Optional[dict]: + """Call function to apply transforms sequentially. + + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + for t in self.transforms: + data = t(data) + # The transform will return None when it failed to load images or + # cannot find suitable augmentation parameters to augment the data. + # Here we simply return None if the transform returns None and the + # dataset will handle it by randomly selecting another data sample. + if data is None: + return None + return data + + def __repr__(self): + """Print ``self.transforms`` in sequence. + + Returns: + str: Formatted string. + """ + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += f' {t}' + format_string += '\n)' + return format_string + + +def force_full_init(old_func: Callable) -> Any: + """Those methods decorated by ``force_full_init`` will be forced to call + ``full_init`` if the instance has not been fully initiated. + + Args: + old_func (Callable): Decorated function, make sure the first arg is an + instance with ``full_init`` method. + + Returns: + Any: Depends on old_func. + """ + + @functools.wraps(old_func) + def wrapper(obj: object, *args, **kwargs): + # The instance must have `full_init` method. + if not hasattr(obj, 'full_init'): + raise AttributeError(f'{type(obj)} does not have full_init ' + 'method.') + # If instance does not have `_fully_initialized` attribute or + # `_fully_initialized` is False, call `full_init` and set + # `_fully_initialized` to True + if not getattr(obj, '_fully_initialized', False): + print_log( + f'Attribute `_fully_initialized` is not defined in ' + f'{type(obj)} or `type(obj)._fully_initialized is ' + 'False, `full_init` will be called and ' + f'{type(obj)}._fully_initialized will be set to True', + logger='current', + level=logging.WARNING) + obj.full_init() # type: ignore + obj._fully_initialized = True # type: ignore + + return old_func(obj, *args, **kwargs) + + return wrapper + + +class BaseDataset(Dataset): + r"""BaseDataset for open source projects in OpenMMLab. + + The annotation format is shown as follows. + + .. 
+    .. code-block:: none
+
+        {
+            "metainfo":
+            {
+                "dataset_type": "test_dataset",
+                "task_name": "test_task"
+            },
+            "data_list":
+            [
+                {
+                    "img_path": "test_img.jpg",
+                    "height": 604,
+                    "width": 640,
+                    "instances":
+                    [
+                        {
+                            "bbox": [0, 0, 10, 20],
+                            "bbox_label": 1,
+                            "mask": [[0,0],[0,10],[10,20],[20,0]],
+                            "extra_anns": [1,2,3]
+                        },
+                        {
+                            "bbox": [10, 10, 110, 120],
+                            "bbox_label": 2,
+                            "mask": [[10,10],[10,110],[110,120],[120,10]],
+                            "extra_anns": [4,5,6]
+                        }
+                    ]
+                },
+            ]
+        }
+
+    Args:
+        ann_file (str, optional): Annotation file path. Defaults to ''.
+        metainfo (Mapping or Config, optional): Meta information for
+            dataset, such as class information. Defaults to None.
+        data_root (str, optional): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to ''.
+        data_prefix (dict): Prefix for training data. Defaults to
+            dict(img_path='').
+        filter_cfg (dict, optional): Config for filtering data. Defaults to
+            None.
+        indices (int or Sequence[int], optional): Support using the first few
+            data in the annotation file to facilitate training/testing on a
+            smaller dataset. Defaults to None.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects; when enabled, data loader workers can use
+            shared RAM from the master process instead of making a copy.
+            Defaults to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotations during
+            instantiation. In some cases, such as visualization, only the
+            meta information of the dataset is needed, so it is not necessary
+            to load the annotation file. ``BaseDataset`` can skip loading
+            annotations to save time by setting ``lazy_init=True``.
+            Defaults to False.
+        max_refetch (int, optional): If ``BaseDataset.prepare_data`` gets a
+            None image, the maximum number of extra cycles used to fetch a
+            valid image. Defaults to 1000.
+
+    Note:
+        BaseDataset collects meta information from the ``annotation file``
+        (the lowest priority), ``BaseDataset.METAINFO`` (medium) and the
+        ``metainfo`` parameter (highest) passed to the constructor. The lower
+        priority meta information will be overwritten by the higher one.
+
+    Note:
+        Dataset wrappers such as ``ConcatDataset``, ``RepeatDataset``, etc.
+        should not inherit from ``BaseDataset`` since ``get_subset`` and
+        ``get_subset_`` could produce a sub-dataset with ambiguous meaning,
+        which conflicts with the original dataset.
+
+    Examples:
+        >>> # Assume the annotation file is given above.
+        >>> class CustomDataset(BaseDataset):
+        >>>     METAINFO: dict = dict(task_name='custom_task',
+        >>>                           dataset_type='custom_type')
+        >>> metainfo = dict(task_name='custom_task_name')
+        >>> custom_dataset = CustomDataset(
+        >>>     'path/to/ann_file',
+        >>>     metainfo=metainfo)
+        >>> # Meta information of the annotation file will be overwritten by
+        >>> # `CustomDataset.METAINFO`. The merged meta information will
+        >>> # further be overwritten by the argument `metainfo`.
+ >>> custom_dataset.metainfo + {'task_name': custom_task_name, dataset_type: custom_type} + """ + + METAINFO: dict = dict() + _fully_initialized: bool = False + + def __init__(self, + ann_file: Optional[str] = '', + metainfo: Union[Mapping, Config, None] = None, + data_root: Optional[str] = '', + data_prefix: dict = dict(img_path=''), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + max_refetch: int = 1000): + self.ann_file = ann_file + self._metainfo = self._load_metainfo(copy.deepcopy(metainfo)) + self.data_root = data_root + self.data_prefix = copy.copy(data_prefix) + self.filter_cfg = copy.deepcopy(filter_cfg) + self._indices = indices + self.serialize_data = serialize_data + self.test_mode = test_mode + self.max_refetch = max_refetch + self.data_list: List[dict] = [] + self.data_bytes: np.ndarray + + # Join paths. + self._join_prefix() + + # Build pipeline. + self.pipeline = Compose(pipeline) + # Full initialize the dataset. + if not lazy_init: + self.full_init() + + @force_full_init + def get_data_info(self, idx: int) -> dict: + """Get annotation by index and automatically call ``full_init`` if the + dataset has not been fully initialized. + + Args: + idx (int): The index of data. + + Returns: + dict: The idx-th annotation of the dataset. + """ + if self.serialize_data: + start_addr = 0 if idx == 0 else self.data_address[idx - 1].item() + end_addr = self.data_address[idx].item() + bytes = memoryview( + self.data_bytes[start_addr:end_addr]) # type: ignore + data_info = pickle.loads(bytes) # type: ignore + else: + data_info = copy.deepcopy(self.data_list[idx]) + # Some codebase needs `sample_idx` of data information. Here we convert + # the idx to a positive number and save it in data information. + if idx >= 0: + data_info['sample_idx'] = idx + else: + data_info['sample_idx'] = len(self) + idx + + return data_info + + def full_init(self): + """Load annotation file and set ``BaseDataset._fully_initialized`` to + True. + + If ``lazy_init=False``, ``full_init`` will be called during the + instantiation and ``self._fully_initialized`` will be set to True. If + ``obj._fully_initialized=False``, the class method decorated by + ``force_full_init`` will call ``full_init`` automatically. + + Several steps to initialize annotation: + + - load_data_list: Load annotations from annotation file. + - filter data information: Filter annotations according to + filter_cfg. + - slice_data: Slice dataset according to ``self._indices`` + - serialize_data: Serialize ``self.data_list`` if + ``self.serialize_data`` is True. + """ + if self._fully_initialized: + return + # load data information + self.data_list = self.load_data_list() + # filter illegal data, such as data that has no annotations. + self.data_list = self.filter_data() + # Get subset data according to indices. + if self._indices is not None: + self.data_list = self._get_unserialized_subset(self._indices) + + # serialize data_list + if self.serialize_data: + self.data_bytes, self.data_address = self._serialize_data() + + self._fully_initialized = True + + @property + def metainfo(self) -> dict: + """Get meta information of dataset. + + Returns: + dict: meta information collected from ``BaseDataset.METAINFO``, + annotation file and metainfo argument during instantiation. 
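To make the construction path above concrete, here is a hedged, self-contained sketch that writes a minimal annotation file in the documented layout and then defers parsing with ``lazy_init``. ``CustomDataset`` is the hypothetical subclass from the docstring example, and the file name is made up:

import json

# Minimal instance of the annotation layout documented above.
ann = {
    'metainfo': {'dataset_type': 'test_dataset', 'task_name': 'test_task'},
    'data_list': [
        {'img_path': 'test_img.jpg', 'height': 604, 'width': 640,
         'instances': []},
    ],
}
with open('annotations.json', 'w') as f:
    json.dump(ann, f)

# Defer annotation parsing: only metainfo is resolved at construction time.
dataset = CustomDataset('annotations.json', lazy_init=True)
print(dataset.metainfo)  # available without touching data_list

# Parse annotations once in the main process, before dataloader workers
# fork, so they can share the serialized buffer instead of re-reading.
dataset.full_init()
print(len(dataset))  # 1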
+ """ + return copy.deepcopy(self._metainfo) + + def parse_data_info(self, raw_data_info: dict) -> Union[dict, List[dict]]: + """Parse raw annotation to target format. + + This method should return dict or list of dict. Each dict or list + contains the data information of a training sample. If the protocol of + the sample annotations is changed, this function can be overridden to + update the parsing logic while keeping compatibility. + + Args: + raw_data_info (dict): Raw data information load from ``ann_file`` + + Returns: + list or list[dict]: Parsed annotation. + """ + for prefix_key, prefix in self.data_prefix.items(): + assert prefix_key in raw_data_info, ( + f'raw_data_info: {raw_data_info} dose not contain prefix key' + f'{prefix_key}, please check your data_prefix.') + raw_data_info[prefix_key] = join_path(prefix, + raw_data_info[prefix_key]) + return raw_data_info + + def filter_data(self) -> List[dict]: + """Filter annotations according to filter_cfg. Defaults return all + ``data_list``. + + If some ``data_list`` could be filtered according to specific logic, + the subclass should override this method. + + Returns: + list[int]: Filtered results. + """ + return self.data_list + + def get_cat_ids(self, idx: int) -> List[int]: + """Get category ids by index. Dataset wrapped by ClassBalancedDataset + must implement this method. + + The ``ClassBalancedDataset`` requires a subclass which implements this + method. + + Args: + idx (int): The index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + raise NotImplementedError(f'{type(self)} must implement `get_cat_ids` ' + 'method') + + def __getitem__(self, idx: int) -> dict: + """Get the idx-th image and data information of dataset after + ``self.pipeline``, and ``full_init`` will be called if the dataset has + not been fully initialized. + + During training phase, if ``self.pipeline`` get ``None``, + ``self._rand_another`` will be called until a valid image is fetched or + the maximum limit of refetech is reached. + + Args: + idx (int): The index of self.data_list. + + Returns: + dict: The idx-th image and data information of dataset after + ``self.pipeline``. + """ + # Performing full initialization by calling `__getitem__` will consume + # extra memory. If a dataset is not fully initialized by setting + # `lazy_init=True` and then fed into the dataloader. Different workers + # will simultaneously read and parse the annotation. It will cost more + # time and memory, although this may work. Therefore, it is recommended + # to manually call `full_init` before dataset fed into dataloader to + # ensure all workers use shared RAM from master process. + if not self._fully_initialized: + print_log( + 'Please call `full_init()` method manually to accelerate ' + 'the speed.', + logger='current', + level=logging.WARNING) + self.full_init() + + if self.test_mode: + data = self.prepare_data(idx) + if data is None: + raise Exception('Test time pipline should not get `None` ' + 'data_sample') + return data + + for _ in range(self.max_refetch + 1): + data = self.prepare_data(idx) + # Broken images or random augmentations may cause the returned data + # to be None + if data is None: + idx = self._rand_another() + continue + return data + + raise Exception(f'Cannot find valid image after {self.max_refetch}! 
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotations from the annotation file named ``self.ann_file``.
+
+        If the annotation file does not follow the OpenMMLab 2.0 dataset
+        format, the subclass must override this method to load annotations.
+        The meta information of the annotation file will be overwritten by
+        :attr:`METAINFO` and the ``metainfo`` argument of the constructor.
+
+        Returns:
+            list[dict]: A list of annotations.
+        """
+        # `self.ann_file` denotes the absolute annotation file path if
+        # `self.data_root=None` or a relative path if
+        # `self.data_root=/path/to/data/`.
+        annotations = load(self.ann_file)
+        if not isinstance(annotations, dict):
+            raise TypeError(f'The annotations loaded from annotation file '
+                            f'should be a dict, but got {type(annotations)}!')
+        if 'data_list' not in annotations or 'metainfo' not in annotations:
+            raise ValueError('Annotation must have data_list and metainfo '
+                             'keys')
+        metainfo = annotations['metainfo']
+        raw_data_list = annotations['data_list']
+
+        # Meta information loaded from the annotation file will not override
+        # the existing meta information from `BaseDataset.METAINFO` and the
+        # `metainfo` argument of the constructor.
+        for k, v in metainfo.items():
+            self._metainfo.setdefault(k, v)
+
+        # Load and parse data_infos.
+        data_list = []
+        for raw_data_info in raw_data_list:
+            # Parse raw data information to target format.
+            data_info = self.parse_data_info(raw_data_info)
+            if isinstance(data_info, dict):
+                # For image tasks, `data_info` should contain the information
+                # of a single image, such as dict(img_path='xxx', width=360, ...)
+                data_list.append(data_info)
+            elif isinstance(data_info, list):
+                # For video tasks, `data_info` could contain image
+                # information of multiple frames, such as
+                # [dict(video_path='xxx', timestamps=...),
+                #  dict(video_path='xxx', timestamps=...)]
+                for item in data_info:
+                    if not isinstance(item, dict):
+                        raise TypeError('data_info must be a list of dict, '
+                                        f'but got {type(item)}')
+                data_list.extend(data_info)
+            else:
+                raise TypeError('data_info should be a dict or a list of '
+                                f'dict, but got {type(data_info)}')
+
+        return data_list
+
+    @classmethod
+    def _load_metainfo(cls,
+                       metainfo: Union[Mapping, Config, None] = None) -> dict:
+        """Collect meta information from the dictionary of meta.
+
+        Args:
+            metainfo (Mapping or Config, optional): Meta information dict.
+                If a value of ``metainfo`` is an existing filename, it will
+                be parsed by ``list_from_file``.
+
+        Returns:
+            dict: Parsed meta information.
+        """
+        # Avoid `cls.METAINFO` being overwritten by `metainfo`.
+        cls_metainfo = copy.deepcopy(cls.METAINFO)
+        if metainfo is None:
+            return cls_metainfo
+        if not isinstance(metainfo, (Mapping, Config)):
+            raise TypeError('metainfo should be a Mapping or Config, '
+                            f'but got {type(metainfo)}')
+
+        for k, v in metainfo.items():
+            if isinstance(v, str):
+                # If the value is a string that can be loaded from the
+                # corresponding backend, it is treated as the filename of a
+                # meta file.
+                try:
+                    cls_metainfo[k] = list_from_file(v)
+                except (TypeError, FileNotFoundError):
+                    print_log(
+                        f'{v} is not a meta file, simply parsed as meta '
+                        'information',
+                        logger='current',
+                        level=logging.WARNING)
+                    cls_metainfo[k] = v
+            else:
+                cls_metainfo[k] = v
+        return cls_metainfo
+
+    def _join_prefix(self):
+        """Join ``self.data_root`` with ``self.data_prefix`` and
+        ``self.ann_file``.
+ + Examples: + >>> # self.data_prefix contains relative paths + >>> self.data_root = 'a/b/c' + >>> self.data_prefix = dict(img='d/e/') + >>> self.ann_file = 'f' + >>> self._join_prefix() + >>> self.data_prefix + dict(img='a/b/c/d/e') + >>> self.ann_file + 'a/b/c/f' + >>> # self.data_prefix contains absolute paths + >>> self.data_root = 'a/b/c' + >>> self.data_prefix = dict(img='/d/e/') + >>> self.ann_file = 'f' + >>> self._join_prefix() + >>> self.data_prefix + dict(img='/d/e') + >>> self.ann_file + 'a/b/c/f' + """ + # Automatically join annotation file path with `self.root` if + # `self.ann_file` is not an absolute path. + if self.ann_file and not is_abs(self.ann_file) and self.data_root: + self.ann_file = join_path(self.data_root, self.ann_file) + # Automatically join data directory with `self.root` if path value in + # `self.data_prefix` is not an absolute path. + for data_key, prefix in self.data_prefix.items(): + if not isinstance(prefix, str): + raise TypeError('prefix should be a string, but got ' + f'{type(prefix)}') + if not is_abs(prefix) and self.data_root: + self.data_prefix[data_key] = join_path(self.data_root, prefix) + else: + self.data_prefix[data_key] = prefix + + @force_full_init + def get_subset_(self, indices: Union[Sequence[int], int]) -> None: + """The in-place version of ``get_subset`` to convert dataset to a + subset of original dataset. + + This method will convert the original dataset to a subset of dataset. + If type of indices is int, ``get_subset_`` will return a subdataset + which contains the first or last few data information according to + indices is positive or negative. If type of indices is a sequence of + int, the subdataset will extract the data information according to + the index given in indices. + + Examples: + >>> dataset = BaseDataset('path/to/ann_file') + >>> len(dataset) + 100 + >>> dataset.get_subset_(90) + >>> len(dataset) + 90 + >>> # if type of indices is sequence, extract the corresponding + >>> # index data information + >>> dataset.get_subset_([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + >>> len(dataset) + 10 + >>> dataset.get_subset_(-3) + >>> len(dataset) # Get the latest few data information. + 3 + + Args: + indices (int or Sequence[int]): If type of indices is int, indices + represents the first or last few data of dataset according to + indices is positive or negative. If type of indices is + Sequence, indices represents the target data information + index of dataset. + """ + # Get subset of data from serialized data or data information sequence + # according to `self.serialize_data`. + if self.serialize_data: + self.data_bytes, self.data_address = \ + self._get_serialized_subset(indices) + else: + self.data_list = self._get_unserialized_subset(indices) + + @force_full_init + def get_subset(self, indices: Union[Sequence[int], int]) -> 'BaseDataset': + """Return a subset of dataset. + + This method will return a subset of original dataset. If type of + indices is int, ``get_subset_`` will return a subdataset which + contains the first or last few data information according to + indices is positive or negative. If type of indices is a sequence of + int, the subdataset will extract the information according to the index + given in indices. 
+ + Examples: + >>> dataset = BaseDataset('path/to/ann_file') + >>> len(dataset) + 100 + >>> subdataset = dataset.get_subset(90) + >>> len(sub_dataset) + 90 + >>> # if type of indices is list, extract the corresponding + >>> # index data information + >>> subdataset = dataset.get_subset([0, 1, 2, 3, 4, 5, 6, 7, + >>> 8, 9]) + >>> len(sub_dataset) + 10 + >>> subdataset = dataset.get_subset(-3) + >>> len(subdataset) # Get the latest few data information. + 3 + + Args: + indices (int or Sequence[int]): If type of indices is int, indices + represents the first or last few data of dataset according to + indices is positive or negative. If type of indices is + Sequence, indices represents the target data information + index of dataset. + + Returns: + BaseDataset: A subset of dataset. + """ + # Get subset of data from serialized data or data information list + # according to `self.serialize_data`. Since `_get_serialized_subset` + # will recalculate the subset data information, + # `_copy_without_annotation` will copy all attributes except data + # information. + sub_dataset = self._copy_without_annotation() + # Get subset of dataset with serialize and unserialized data. + if self.serialize_data: + data_bytes, data_address = \ + self._get_serialized_subset(indices) + sub_dataset.data_bytes = data_bytes.copy() + sub_dataset.data_address = data_address.copy() + else: + data_list = self._get_unserialized_subset(indices) + sub_dataset.data_list = copy.deepcopy(data_list) + return sub_dataset + + def _get_serialized_subset(self, indices: Union[Sequence[int], int]) \ + -> Tuple[np.ndarray, np.ndarray]: + """Get subset of serialized data information list. + + Args: + indices (int or Sequence[int]): If type of indices is int, + indices represents the first or last few data of serialized + data information list. If type of indices is Sequence, indices + represents the target data information index which consist of + subset data information. + + Returns: + Tuple[np.ndarray, np.ndarray]: subset of serialized data + information. + """ + sub_data_bytes: Union[List, np.ndarray] + sub_data_address: Union[List, np.ndarray] + if isinstance(indices, int): + if indices >= 0: + assert indices < len(self.data_address), \ + f'{indices} is out of dataset length({len(self)}' + # Return the first few data information. + end_addr = self.data_address[indices - 1].item() \ + if indices > 0 else 0 + # Slicing operation of `np.ndarray` does not trigger a memory + # copy. + sub_data_bytes = self.data_bytes[:end_addr] + # Since the buffer size of first few data information is not + # changed, + sub_data_address = self.data_address[:indices] + else: + assert -indices <= len(self.data_address), \ + f'{indices} is out of dataset length({len(self)}' + # Return the last few data information. + ignored_bytes_size = self.data_address[indices - 1] + start_addr = self.data_address[indices - 1].item() + sub_data_bytes = self.data_bytes[start_addr:] + sub_data_address = self.data_address[indices:] + sub_data_address = sub_data_address - ignored_bytes_size + elif isinstance(indices, Sequence): + sub_data_bytes = [] + sub_data_address = [] + for idx in indices: + assert len(self) > idx >= -len(self) + start_addr = 0 if idx == 0 else \ + self.data_address[idx - 1].item() + end_addr = self.data_address[idx].item() + # Get data information by address. + sub_data_bytes.append(self.data_bytes[start_addr:end_addr]) + # Get data information size. + sub_data_address.append(end_addr - start_addr) + # Handle indices is an empty list. 
+ if sub_data_bytes: + sub_data_bytes = np.concatenate(sub_data_bytes) + sub_data_address = np.cumsum(sub_data_address) + else: + sub_data_bytes = np.array([]) + sub_data_address = np.array([]) + else: + raise TypeError('indices should be a int or sequence of int, ' + f'but got {type(indices)}') + return sub_data_bytes, sub_data_address # type: ignore + + def _get_unserialized_subset(self, indices: Union[Sequence[int], + int]) -> list: + """Get subset of data information list. + + Args: + indices (int or Sequence[int]): If type of indices is int, + indices represents the first or last few data of data + information. If type of indices is Sequence, indices represents + the target data information index which consist of subset data + information. + + Returns: + Tuple[np.ndarray, np.ndarray]: subset of data information. + """ + if isinstance(indices, int): + if indices >= 0: + # Return the first few data information. + sub_data_list = self.data_list[:indices] + else: + # Return the last few data information. + sub_data_list = self.data_list[indices:] + elif isinstance(indices, Sequence): + # Return the data information according to given indices. + sub_data_list = [] + for idx in indices: + sub_data_list.append(self.data_list[idx]) + else: + raise TypeError('indices should be a int or sequence of int, ' + f'but got {type(indices)}') + return sub_data_list + + def _serialize_data(self) -> Tuple[np.ndarray, np.ndarray]: + """Serialize ``self.data_list`` to save memory when launching multiple + workers in data loading. This function will be called in ``full_init``. + + Hold memory using serialized objects, and data loader workers can use + shared RAM from master process instead of making a copy. + + Returns: + Tuple[np.ndarray, np.ndarray]: Serialized result and corresponding + address. + """ + + def _serialize(data): + buffer = pickle.dumps(data, protocol=4) + return np.frombuffer(buffer, dtype=np.uint8) + + # Serialize data information list avoid making multiple copies of + # `self.data_list` when iterate `import torch.utils.data.dataloader` + # with multiple workers. + data_list = [_serialize(x) for x in self.data_list] + address_list = np.asarray([len(x) for x in data_list], dtype=np.int64) + data_address: np.ndarray = np.cumsum(address_list) + # TODO Check if np.concatenate is necessary + data_bytes = np.concatenate(data_list) + # Empty cache for preventing making multiple copies of + # `self.data_info` when loading data multi-processes. + self.data_list.clear() + gc.collect() + return data_bytes, data_address + + def _rand_another(self) -> int: + """Get random index. + + Returns: + int: Random index from 0 to ``len(self)-1`` + """ + return np.random.randint(0, len(self)) + + def prepare_data(self, idx) -> Any: + """Get data processed by ``self.pipeline``. + + Args: + idx (int): The index of ``data_info``. + + Returns: + Any: Depends on ``self.pipeline``. + """ + data_info = self.get_data_info(idx) + return self.pipeline(data_info) + + @force_full_init + def __len__(self) -> int: + """Get the length of filtered dataset and automatically call + ``full_init`` if the dataset has not been fully init. + + Returns: + int: The length of filtered dataset. + """ + if self.serialize_data: + return len(self.data_address) + else: + return len(self.data_list) + + def _copy_without_annotation(self, memo=dict()) -> 'BaseDataset': + """Deepcopy for all attributes other than ``data_list``, + ``data_address`` and ``data_bytes``. 
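The serialization scheme of ``_serialize_data`` and the address arithmetic of ``get_data_info`` can be reproduced in isolation. A self-contained sketch (the sample records are made up):

import pickle

import numpy as np

# Each record is pickled into a byte blob; blobs are concatenated and the
# cumulative lengths serve as per-record end addresses.
records = [{'img_path': 'a.jpg'}, {'img_path': 'b.jpg', 'width': 640}]
blobs = [np.frombuffer(pickle.dumps(r, protocol=4), dtype=np.uint8)
         for r in records]
data_address = np.cumsum([len(b) for b in blobs])
data_bytes = np.concatenate(blobs)

# Recover record idx without deserializing anything else, exactly as
# get_data_info does when serialize_data=True.
idx = 1
start = 0 if idx == 0 else data_address[idx - 1].item()
end = data_address[idx].item()
print(pickle.loads(memoryview(data_bytes[start:end])))
# -> {'img_path': 'b.jpg', 'width': 640}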
+ + Args: + memo: Memory dict which used to reconstruct complex object + correctly. + """ + cls = self.__class__ + other = cls.__new__(cls) + memo[id(self)] = other + + for key, value in self.__dict__.items(): + if key in ['data_list', 'data_address', 'data_bytes']: + continue + super(BaseDataset, other).__setattr__(key, + copy.deepcopy(value, memo)) + + return other diff --git a/head_extractor/build/lib/mmengine/dataset/dataset_wrapper.py b/head_extractor/build/lib/mmengine/dataset/dataset_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..e63860bee00f8bbd4e8ce6c96bbb3ee514e656e6 --- /dev/null +++ b/head_extractor/build/lib/mmengine/dataset/dataset_wrapper.py @@ -0,0 +1,529 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import bisect +import copy +import logging +import math +from collections import defaultdict +from typing import List, Sequence, Tuple, Union + +import numpy as np +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + +from mmengine.logging import print_log +from mmengine.registry import DATASETS +from .base_dataset import BaseDataset, force_full_init + + +@DATASETS.register_module() +class ConcatDataset(_ConcatDataset): + """A wrapper of concatenated dataset. + + Same as ``torch.utils.data.dataset.ConcatDataset`` and support lazy_init. + + Note: + ``ConcatDataset`` should not inherit from ``BaseDataset`` since + ``get_subset`` and ``get_subset_`` could produce ambiguous meaning + sub-dataset which conflicts with original dataset. If you want to use + a sub-dataset of ``ConcatDataset``, you should set ``indices`` + arguments for wrapped dataset which inherit from ``BaseDataset``. + + Args: + datasets (Sequence[BaseDataset] or Sequence[dict]): A list of datasets + which will be concatenated. + lazy_init (bool, optional): Whether to load annotation during + instantiation. Defaults to False. + ignore_keys (List[str] or str): Ignore the keys that can be + unequal in `dataset.metainfo`. Defaults to None. + `New in version 0.3.0.` + """ + + def __init__(self, + datasets: Sequence[Union[BaseDataset, dict]], + lazy_init: bool = False, + ignore_keys: Union[str, List[str], None] = None): + self.datasets: List[BaseDataset] = [] + for i, dataset in enumerate(datasets): + if isinstance(dataset, dict): + self.datasets.append(DATASETS.build(dataset)) + elif isinstance(dataset, BaseDataset): + self.datasets.append(dataset) + else: + raise TypeError( + 'elements in datasets sequence should be config or ' + f'`BaseDataset` instance, but got {type(dataset)}') + if ignore_keys is None: + self.ignore_keys = [] + elif isinstance(ignore_keys, str): + self.ignore_keys = [ignore_keys] + elif isinstance(ignore_keys, list): + self.ignore_keys = ignore_keys + else: + raise TypeError('ignore_keys should be a list or str, ' + f'but got {type(ignore_keys)}') + + meta_keys: set = set() + for dataset in self.datasets: + meta_keys |= dataset.metainfo.keys() + # Only use metainfo of first dataset. 
+        self._metainfo = self.datasets[0].metainfo
+        for i, dataset in enumerate(self.datasets, 1):
+            for key in meta_keys:
+                if key in self.ignore_keys:
+                    continue
+                if key not in dataset.metainfo:
+                    raise ValueError(
+                        f'{key} is not in the meta information of '
+                        f'the {i}-th dataset')
+                first_type = type(self._metainfo[key])
+                cur_type = type(dataset.metainfo[key])
+                if first_type is not cur_type:  # type: ignore
+                    raise TypeError(
+                        f'The type {cur_type} of {key} in the {i}-th dataset '
+                        'should be the same as that in the first dataset '
+                        f'({first_type})')
+                if (isinstance(self._metainfo[key], np.ndarray)
+                        and not np.array_equal(self._metainfo[key],
+                                               dataset.metainfo[key])
+                        or (not isinstance(self._metainfo[key], np.ndarray)
+                            and self._metainfo[key] != dataset.metainfo[key])):
+                    raise ValueError(
+                        f'The meta information of the {i}-th dataset does '
+                        'not match the meta information of the first dataset')
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
+
+    @property
+    def metainfo(self) -> dict:
+        """Get the meta information of the first dataset in ``self.datasets``.
+
+        Returns:
+            dict: Meta information of the first dataset.
+        """
+        # Prevent `self._metainfo` from being modified by outside code.
+        return copy.deepcopy(self._metainfo)
+
+    def full_init(self):
+        """Loop to ``full_init`` each dataset."""
+        if self._fully_initialized:
+            return
+        for d in self.datasets:
+            d.full_init()
+        # Compute the cumulative sizes of `self.datasets`. For example, if
+        # the lengths of `self.datasets` are [2, 3, 4], the cumulative sizes
+        # are [2, 5, 9].
+        super().__init__(self.datasets)
+        self._fully_initialized = True
+
+    @force_full_init
+    def _get_ori_dataset_idx(self, idx: int) -> Tuple[int, int]:
+        """Convert a global index to a local index.
+
+        Args:
+            idx (int): Global index of ``ConcatDataset``.
+
+        Returns:
+            Tuple[int, int]: The index of ``self.datasets`` and the local
+            index of data.
+        """
+        if idx < 0:
+            if -idx > len(self):
+                raise ValueError(
+                    f'absolute value of index({idx}) should not exceed '
+                    f'dataset length({len(self)}).')
+            idx = len(self) + idx
+        # Get `dataset_idx` to tell which dataset idx belongs to.
+        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
+        # Get the inner index of a single dataset.
+        if dataset_idx == 0:
+            sample_idx = idx
+        else:
+            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
+
+        return dataset_idx, sample_idx
+
+    @force_full_init
+    def get_data_info(self, idx: int) -> dict:
+        """Get annotation by index.
+
+        Args:
+            idx (int): Global index of ``ConcatDataset``.
+
+        Returns:
+            dict: The idx-th annotation of the datasets.
+        """
+        dataset_idx, sample_idx = self._get_ori_dataset_idx(idx)
+        return self.datasets[dataset_idx].get_data_info(sample_idx)
+
+    @force_full_init
+    def __len__(self):
+        return super().__len__()
+
+    def __getitem__(self, idx):
+        if not self._fully_initialized:
+            print_log(
+                'Please call `full_init` method manually to '
+                'accelerate the speed.',
+                logger='current',
+                level=logging.WARNING)
+            self.full_init()
+        dataset_idx, sample_idx = self._get_ori_dataset_idx(idx)
+        return self.datasets[dataset_idx][sample_idx]
+
+    def get_subset_(self, indices: Union[List[int], int]) -> None:
+        """Not supported in ``ConcatDataset`` for the ambiguous meaning of
+        sub-dataset."""
+        raise NotImplementedError(
+            '`ConcatDataset` does not support `get_subset` and '
+            '`get_subset_` interfaces because this will lead to ambiguous '
+            'implementation of some methods. If you want to use `get_subset` '
+            'or `get_subset_` interfaces, please use them in the wrapped '
+            'dataset first and then use `ConcatDataset`.')
+
+    def get_subset(self, indices: Union[List[int], int]) -> 'BaseDataset':
+        """Not supported in ``ConcatDataset`` for the ambiguous meaning of
+        sub-dataset."""
+        raise NotImplementedError(
+            '`ConcatDataset` does not support `get_subset` and '
+            '`get_subset_` interfaces because this will lead to ambiguous '
+            'implementation of some methods. If you want to use `get_subset` '
+            'or `get_subset_` interfaces, please use them in the wrapped '
+            'dataset first and then use `ConcatDataset`.')
+
+
+@DATASETS.register_module()
+class RepeatDataset:
+    """A wrapper of repeated dataset.
+
+    The length of the repeated dataset will be ``times`` times the length of
+    the original dataset. This is useful when the data loading time is long
+    but the dataset is small. Using RepeatDataset can reduce the data loading
+    time between epochs.
+
+    Note:
+        ``RepeatDataset`` should not inherit from ``BaseDataset`` since
+        ``get_subset`` and ``get_subset_`` could produce a sub-dataset with
+        ambiguous meaning, which conflicts with the original dataset. If you
+        want to use a sub-dataset of ``RepeatDataset``, you should set the
+        ``indices`` argument of the wrapped dataset, which inherits from
+        ``BaseDataset``.
+
+    Args:
+        dataset (BaseDataset or dict): The dataset to be repeated.
+        times (int): Repeat times.
+        lazy_init (bool): Whether to load annotations during instantiation.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 dataset: Union[BaseDataset, dict],
+                 times: int,
+                 lazy_init: bool = False):
+        self.dataset: BaseDataset
+        if isinstance(dataset, dict):
+            self.dataset = DATASETS.build(dataset)
+        elif isinstance(dataset, BaseDataset):
+            self.dataset = dataset
+        else:
+            raise TypeError(
+                'dataset should be a config dict or a `BaseDataset` '
+                f'instance, but got {type(dataset)}')
+        self.times = times
+        self._metainfo = self.dataset.metainfo
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
+
+    @property
+    def metainfo(self) -> dict:
+        """Get the meta information of the repeated dataset.
+
+        Returns:
+            dict: The meta information of the repeated dataset.
+        """
+        return copy.deepcopy(self._metainfo)
+
+    def full_init(self):
+        """``full_init`` the wrapped dataset."""
+        if self._fully_initialized:
+            return
+
+        self.dataset.full_init()
+        self._ori_len = len(self.dataset)
+        self._fully_initialized = True
+
+    @force_full_init
+    def _get_ori_dataset_idx(self, idx: int) -> int:
+        """Convert a global index to a local index.
+
+        Args:
+            idx (int): Global index of ``RepeatDataset``.
+
+        Returns:
+            int: Local index of data.
+        """
+        return idx % self._ori_len
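The global-to-local index mapping used by the two wrappers can be checked by hand. A small sketch (the dataset lengths are made up):

import bisect

# ConcatDataset: cumulative sizes [2, 5, 9] for datasets of length 2, 3, 4.
cumulative_sizes = [2, 5, 9]
idx = 6
dataset_idx = bisect.bisect_right(cumulative_sizes, idx)   # 2
sample_idx = idx - cumulative_sizes[dataset_idx - 1]       # 1
print(dataset_idx, sample_idx)  # third dataset, second sample

# RepeatDataset: a plain modulo over the original length.
ori_len, times = 3, 2
print([i % ori_len for i in range(ori_len * times)])  # [0, 1, 2, 0, 1, 2]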
+ """ + sample_idx = self._get_ori_dataset_idx(idx) + return self.dataset.get_data_info(sample_idx) + + def __getitem__(self, idx): + if not self._fully_initialized: + print_log( + 'Please call `full_init` method manually to accelerate the ' + 'speed.', + logger='current', + level=logging.WARNING) + self.full_init() + + sample_idx = self._get_ori_dataset_idx(idx) + return self.dataset[sample_idx] + + @force_full_init + def __len__(self): + return self.times * self._ori_len + + def get_subset_(self, indices: Union[List[int], int]) -> None: + """Not supported in ``RepeatDataset`` for the ambiguous meaning of sub- + dataset.""" + raise NotImplementedError( + '`RepeatDataset` dose not support `get_subset` and ' + '`get_subset_` interfaces because this will lead to ambiguous ' + 'implementation of some methods. If you want to use `get_subset` ' + 'or `get_subset_` interfaces, please use them in the wrapped ' + 'dataset first and then use `RepeatDataset`.') + + def get_subset(self, indices: Union[List[int], int]) -> 'BaseDataset': + """Not supported in ``RepeatDataset`` for the ambiguous meaning of sub- + dataset.""" + raise NotImplementedError( + '`RepeatDataset` dose not support `get_subset` and ' + '`get_subset_` interfaces because this will lead to ambiguous ' + 'implementation of some methods. If you want to use `get_subset` ' + 'or `get_subset_` interfaces, please use them in the wrapped ' + 'dataset first and then use `RepeatDataset`.') + + +@DATASETS.register_module() +class ClassBalancedDataset: + """A wrapper of class balanced dataset. + + Suitable for training on class imbalanced datasets like LVIS. Following + the sampling strategy in the `paper `_, + in each epoch, an image may appear multiple times based on its + "repeat factor". + The repeat factor for an image is a function of the frequency the rarest + category labeled in that image. The "frequency of category c" in [0, 1] + is defined by the fraction of images in the training set (without repeats) + in which category c appears. + The dataset needs to instantiate :meth:`get_cat_ids` to support + ClassBalancedDataset. + + The repeat factor is computed as followed. + + 1. For each category c, compute the fraction # of images + that contain it: :math:`f(c)` + 2. For each category c, compute the category-level repeat factor: + :math:`r(c) = max(1, sqrt(t/f(c)))` + 3. For each image I, compute the image-level repeat factor: + :math:`r(I) = max_{c in I} r(c)` + + Note: + ``ClassBalancedDataset`` should not inherit from ``BaseDataset`` + since ``get_subset`` and ``get_subset_`` could produce ambiguous + meaning sub-dataset which conflicts with original dataset. If you + want to use a sub-dataset of ``ClassBalancedDataset``, you should set + ``indices`` arguments for wrapped dataset which inherit from + ``BaseDataset``. + + Args: + dataset (BaseDataset or dict): The dataset to be repeated. + oversample_thr (float): frequency threshold below which data is + repeated. For categories with ``f_c >= oversample_thr``, there is + no oversampling. For categories with ``f_c < oversample_thr``, the + degree of oversampling following the square-root inverse frequency + heuristic above. + lazy_init (bool, optional): whether to load annotation during + instantiation. 
+
+    def __init__(self,
+                 dataset: Union[BaseDataset, dict],
+                 oversample_thr: float,
+                 lazy_init: bool = False):
+        if isinstance(dataset, dict):
+            self.dataset = DATASETS.build(dataset)
+        elif isinstance(dataset, BaseDataset):
+            self.dataset = dataset
+        else:
+            raise TypeError(
+                'dataset should be a config dict or a `BaseDataset` '
+                f'instance, but got {type(dataset)}')
+        self.oversample_thr = oversample_thr
+        self._metainfo = self.dataset.metainfo
+
+        self._fully_initialized = False
+        if not lazy_init:
+            self.full_init()
+
+    @property
+    def metainfo(self) -> dict:
+        """Get the meta information of the wrapped dataset.
+
+        Returns:
+            dict: The meta information of the wrapped dataset.
+        """
+        return copy.deepcopy(self._metainfo)
+
+    def full_init(self):
+        """``full_init`` the wrapped dataset and compute repeat indices."""
+        if self._fully_initialized:
+            return
+
+        self.dataset.full_init()
+        # Get repeat factors for each image.
+        repeat_factors = self._get_repeat_factors(self.dataset,
+                                                  self.oversample_thr)
+        # Repeat the dataset's indices according to repeat_factors. For
+        # example, if `repeat_factors = [1, 2, 3]` and `len(dataset) == 3`,
+        # the repeated indices will be [0, 1, 1, 2, 2, 2].
+        repeat_indices = []
+        for dataset_index, repeat_factor in enumerate(repeat_factors):
+            repeat_indices.extend([dataset_index] * math.ceil(repeat_factor))
+        self.repeat_indices = repeat_indices
+
+        self._fully_initialized = True
+
+    def _get_repeat_factors(self, dataset: BaseDataset,
+                            repeat_thr: float) -> List[float]:
+        """Get the repeat factor for each image in the dataset.
+
+        Args:
+            dataset (BaseDataset): The dataset.
+            repeat_thr (float): The threshold of frequency. If an image
+                contains categories whose frequency is below the threshold,
+                it will be repeated.
+
+        Returns:
+            List[float]: The repeat factor for each image in the dataset.
+        """
+        # 1. For each category c, compute the fraction of images
+        #    that contain it: f(c)
+        category_freq: defaultdict = defaultdict(float)
+        num_images = len(dataset)
+        for idx in range(num_images):
+            cat_ids = set(self.dataset.get_cat_ids(idx))
+            for cat_id in cat_ids:
+                category_freq[cat_id] += 1
+        for k, v in category_freq.items():
+            assert v > 0, f'category {k} does not contain any images'
+            category_freq[k] = v / num_images
+
+        # 2. For each category c, compute the category-level repeat factor:
+        #    r(c) = max(1, sqrt(t/f(c)))
+        category_repeat = {
+            cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq))
+            for cat_id, cat_freq in category_freq.items()
+        }
+
+        # 3. For each image I and its labels L(I), compute the image-level
+        #    repeat factor:
+        #    r(I) = max_{c in L(I)} r(c)
+        repeat_factors = []
+        for idx in range(num_images):
+            # The length of `repeat_factors` needs to equal the length of
+            # the dataset; hence, if `cat_ids` is empty, the repeat factor
+            # is 1.
+            repeat_factor: float = 1.
+            cat_ids = set(self.dataset.get_cat_ids(idx))
+            if len(cat_ids) != 0:
+                repeat_factor = max(
+                    {category_repeat[cat_id]
+                     for cat_id in cat_ids})
+            repeat_factors.append(repeat_factor)
+
+        return repeat_factors
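A worked numeric instance of the three steps above (the category frequencies are made up):

import math

# Step 1: category frequencies f(c) over the (unrepeated) training set.
category_freq = {'person': 0.5, 'rare_bird': 0.01}
oversample_thr = 0.1  # the threshold t

# Step 2: category-level repeat factors r(c) = max(1, sqrt(t / f(c))).
category_repeat = {
    cat: max(1.0, math.sqrt(oversample_thr / freq))
    for cat, freq in category_freq.items()
}
# {'person': 1.0, 'rare_bird': ~3.16}

# Step 3: an image labeled with both categories takes the max, and
# full_init repeats its index ceil(3.16) = 4 times.
r_image = max(category_repeat['person'], category_repeat['rare_bird'])
print(math.ceil(r_image))  # 4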
+
+    @force_full_init
+    def _get_ori_dataset_idx(self, idx: int) -> int:
+        """Convert a global index to a local index.
+
+        Args:
+            idx (int): Global index of ``ClassBalancedDataset``.
+
+        Returns:
+            int: Local index of data.
+        """
+        return self.repeat_indices[idx]
+
+    @force_full_init
+    def get_cat_ids(self, idx: int) -> List[int]:
+        """Get category ids of class balanced dataset by index.
+
+        Args:
+            idx (int): Index of data.
+
+        Returns:
+            List[int]: All categories in the image of specified index.
+        """
+        sample_idx = self._get_ori_dataset_idx(idx)
+        return self.dataset.get_cat_ids(sample_idx)
+
+    @force_full_init
+    def get_data_info(self, idx: int) -> dict:
+        """Get annotation by index.
+
+        Args:
+            idx (int): Global index of ``ClassBalancedDataset``.
+
+        Returns:
+            dict: The idx-th annotation of the dataset.
+        """
+        sample_idx = self._get_ori_dataset_idx(idx)
+        return self.dataset.get_data_info(sample_idx)
+
+    def __getitem__(self, idx):
+        if not self._fully_initialized:
+            print_log(
+                'Please call `full_init` method manually to accelerate '
+                'the speed.',
+                logger='current',
+                level=logging.WARNING)
+            self.full_init()
+
+        ori_index = self._get_ori_dataset_idx(idx)
+        return self.dataset[ori_index]
+
+    @force_full_init
+    def __len__(self):
+        return len(self.repeat_indices)
+
+    def get_subset_(self, indices: Union[List[int], int]) -> None:
+        """Not supported in ``ClassBalancedDataset`` for the ambiguous
+        meaning of sub-dataset."""
+        raise NotImplementedError(
+            '`ClassBalancedDataset` does not support `get_subset` and '
+            '`get_subset_` interfaces because this will lead to ambiguous '
+            'implementation of some methods. If you want to use `get_subset` '
+            'or `get_subset_` interfaces, please use them in the wrapped '
+            'dataset first and then use `ClassBalancedDataset`.')
+
+    def get_subset(self, indices: Union[List[int], int]) -> 'BaseDataset':
+        """Not supported in ``ClassBalancedDataset`` for the ambiguous
+        meaning of sub-dataset."""
+        raise NotImplementedError(
+            '`ClassBalancedDataset` does not support `get_subset` and '
+            '`get_subset_` interfaces because this will lead to ambiguous '
+            'implementation of some methods. If you want to use `get_subset` '
+            'or `get_subset_` interfaces, please use them in the wrapped '
+            'dataset first and then use `ClassBalancedDataset`.')
diff --git a/head_extractor/build/lib/mmengine/dataset/sampler.py b/head_extractor/build/lib/mmengine/dataset/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..95e8e2da6b3d8c44d9423eeedef8a168df9742ca
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/dataset/sampler.py
@@ -0,0 +1,165 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import itertools
+import math
+from typing import Iterator, Optional, Sized
+
+import torch
+from torch.utils.data import Sampler
+
+from mmengine.dist import get_dist_info, sync_random_seed
+from mmengine.registry import DATA_SAMPLERS
+
+
+@DATA_SAMPLERS.register_module()
+class DefaultSampler(Sampler):
+    """The default data sampler for both distributed and non-distributed
+    environments.
+
+    It has several differences from the PyTorch ``DistributedSampler``, as
+    below:
+
+    1. This sampler supports non-distributed environments.
+
+    2. The round-up behaviors are a little different.
+
+       - If ``round_up=True``, this sampler will add extra samples to make
+         the number of samples evenly divisible by the world size. This
+         behavior is the same as the ``DistributedSampler`` with
+         ``drop_last=False``.
+       - If ``round_up=False``, this sampler won't remove or add any samples,
+         while the ``DistributedSampler`` with ``drop_last=True`` will remove
+         tail samples.
+
+    Args:
+        dataset (Sized): The dataset.
+        shuffle (bool): Whether to shuffle the dataset or not. Defaults to
+            True.
+        seed (int, optional): Random seed used to shuffle the sampler if
+            :attr:`shuffle=True`. This number should be identical across all
+            processes in the distributed group. Defaults to None.
+ round_up (bool): Whether to add extra samples to make the number of + samples evenly divisible by the world size. Defaults to True. + """ + + def __init__(self, + dataset: Sized, + shuffle: bool = True, + seed: Optional[int] = None, + round_up: bool = True) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.shuffle = shuffle + if seed is None: + seed = sync_random_seed() + self.seed = seed + self.epoch = 0 + self.round_up = round_up + + if self.round_up: + self.num_samples = math.ceil(len(self.dataset) / world_size) + self.total_size = self.num_samples * self.world_size + else: + self.num_samples = math.ceil( + (len(self.dataset) - rank) / world_size) + self.total_size = len(self.dataset) + + def __iter__(self) -> Iterator[int]: + """Iterate the indices.""" + # deterministically shuffle based on epoch and seed + if self.shuffle: + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + if self.round_up: + indices = ( + indices * + int(self.total_size / len(indices) + 1))[:self.total_size] + + # subsample + indices = indices[self.rank:self.total_size:self.world_size] + + return iter(indices) + + def __len__(self) -> int: + """The number of samples in this rank.""" + return self.num_samples + + def set_epoch(self, epoch: int) -> None: + """Sets the epoch for this sampler. + + When :attr:`shuffle=True`, this ensures all replicas use a different + random ordering for each epoch. Otherwise, the next iteration of this + sampler will yield the same ordering. + + Args: + epoch (int): Epoch number. + """ + self.epoch = epoch + + +@DATA_SAMPLERS.register_module() +class InfiniteSampler(Sampler): + """It's designed for iteration-based runner and yields a mini-batch indices + each time. + + The implementation logic is referred to + https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/samplers/distributed_sampler.py + + Args: + dataset (Sized): The dataset. + shuffle (bool): Whether shuffle the dataset or not. Defaults to True. + seed (int, optional): Random seed. If None, set a random seed. + Defaults to None. 
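The round-up arithmetic above can be checked by hand. A small sketch for 10 samples on 4 ranks (the numbers are illustrative):

import math

dataset_len, world_size = 10, 4
num_samples = math.ceil(dataset_len / world_size)   # 3 samples per rank
total_size = num_samples * world_size               # 12: two indices repeat

# The padding trick from __iter__: tile the index list, then truncate.
indices = list(range(dataset_len))
indices = (indices * (total_size // len(indices) + 1))[:total_size]
# indices == [0, 1, ..., 9, 0, 1]

# Each rank then takes a strided slice.
for rank in range(world_size):
    print(rank, indices[rank:total_size:world_size])
# 0 [0, 4, 8]   1 [1, 5, 9]   2 [2, 6, 0]   3 [3, 7, 1]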
+ """ # noqa: W605 + + def __init__(self, + dataset: Sized, + shuffle: bool = True, + seed: Optional[int] = None) -> None: + rank, world_size = get_dist_info() + self.rank = rank + self.world_size = world_size + + self.dataset = dataset + self.world_size = world_size + self.rank = rank + self.shuffle = shuffle + if seed is None: + seed = sync_random_seed() + self.seed = seed + self.size = len(dataset) + self.indices = self._indices_of_rank() + + def _infinite_indices(self) -> Iterator[int]: + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(self.size, generator=g).tolist() + + else: + yield from torch.arange(self.size).tolist() + + def _indices_of_rank(self) -> Iterator[int]: + """Slice the infinite indices by rank.""" + yield from itertools.islice(self._infinite_indices(), self.rank, None, + self.world_size) + + def __iter__(self) -> Iterator[int]: + """Iterate the indices.""" + yield from self.indices + + def __len__(self) -> int: + """Length of base dataset.""" + return self.size + + def set_epoch(self, epoch: int) -> None: + """Not supported in iteration-based runner.""" + pass diff --git a/head_extractor/build/lib/mmengine/dataset/utils.py b/head_extractor/build/lib/mmengine/dataset/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2c9cf964970dc33e3b14ab3c343ef7a16067bd90 --- /dev/null +++ b/head_extractor/build/lib/mmengine/dataset/utils.py @@ -0,0 +1,165 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import random +import warnings +from typing import Any, Mapping, Sequence + +import numpy as np +import torch +from torch.utils.data._utils.collate import \ + default_collate as torch_default_collate + +from mmengine.registry import FUNCTIONS +from mmengine.structures import BaseDataElement + +# FUNCTIONS is new in MMEngine v0.7.0. Reserve the `COLLATE_FUNCTIONS` to keep +# the compatibility. +COLLATE_FUNCTIONS = FUNCTIONS + + +def worker_init_fn(worker_id: int, + num_workers: int, + rank: int, + seed: int, + disable_subprocess_warning: bool = False) -> None: + """This function will be called on each worker subprocess after seeding and + before data loading. + + Args: + worker_id (int): Worker id in [0, num_workers - 1]. + num_workers (int): How many subprocesses to use for data loading. + rank (int): Rank of process in distributed environment. If in + non-distributed environment, it is a constant number `0`. + seed (int): Random seed. + """ + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + torch.manual_seed(worker_seed) + if disable_subprocess_warning and worker_id != 0: + warnings.simplefilter('ignore') + + +@FUNCTIONS.register_module() +def pseudo_collate(data_batch: Sequence) -> Any: + """Convert list of data sampled from dataset into a batch of data, of which + type consistent with the type of each data_itement in ``data_batch``. + + The default behavior of dataloader is to merge a list of samples to form + a mini-batch of Tensor(s). However, in MMEngine, ``pseudo_collate`` + will not stack tensors to batch tensors, and convert int, float, ndarray to + tensors. + + This code is referenced from: + `Pytorch default_collate `_. + + Args: + data_batch (Sequence): Batch of data from dataloader. + + Returns: + Any: Transversed Data in the same format as the data_itement of + ``data_batch``. 
+ """ # noqa: E501 + data_item = data_batch[0] + data_item_type = type(data_item) + if isinstance(data_item, (str, bytes)): + return data_batch + elif isinstance(data_item, tuple) and hasattr(data_item, '_fields'): + # named tuple + return data_item_type(*(pseudo_collate(samples) + for samples in zip(*data_batch))) + elif isinstance(data_item, Sequence): + # check to make sure that the data_itements in batch have + # consistent size + it = iter(data_batch) + data_item_size = len(next(it)) + if not all(len(data_item) == data_item_size for data_item in it): + raise RuntimeError( + 'each data_itement in list of batch should be of equal size') + transposed = list(zip(*data_batch)) + + if isinstance(data_item, tuple): + return [pseudo_collate(samples) + for samples in transposed] # Compat with Pytorch. + else: + try: + return data_item_type( + [pseudo_collate(samples) for samples in transposed]) + except TypeError: + # The sequence type may not support `__init__(iterable)` + # (e.g., `range`). + return [pseudo_collate(samples) for samples in transposed] + elif isinstance(data_item, Mapping): + return data_item_type({ + key: pseudo_collate([d[key] for d in data_batch]) + for key in data_item + }) + else: + return data_batch + + +@FUNCTIONS.register_module() +def default_collate(data_batch: Sequence) -> Any: + """Convert list of data sampled from dataset into a batch of data, of which + type consistent with the type of each data_itement in ``data_batch``. + + Different from :func:`pseudo_collate`, ``default_collate`` will stack + tensor contained in ``data_batch`` into a batched tensor with the + first dimension batch size, and then move input tensor to the target + device. + + Different from ``default_collate`` in pytorch, ``default_collate`` will + not process ``BaseDataElement``. + + This code is referenced from: + `Pytorch default_collate `_. + + Note: + ``default_collate`` only accept input tensor with the same shape. + + Args: + data_batch (Sequence): Data sampled from dataset. + + Returns: + Any: Data in the same format as the data_itement of ``data_batch``, of which + tensors have been stacked, and ndarray, int, float have been + converted to tensors. + """ # noqa: E501 + data_item = data_batch[0] + data_item_type = type(data_item) + + if isinstance(data_item, (BaseDataElement, str, bytes)): + return data_batch + elif isinstance(data_item, tuple) and hasattr(data_item, '_fields'): + # named_tuple + return data_item_type(*(default_collate(samples) + for samples in zip(*data_batch))) + elif isinstance(data_item, Sequence): + # check to make sure that the data_itements in batch have + # consistent size + it = iter(data_batch) + data_item_size = len(next(it)) + if not all(len(data_item) == data_item_size for data_item in it): + raise RuntimeError( + 'each data_itement in list of batch should be of equal size') + transposed = list(zip(*data_batch)) + + if isinstance(data_item, tuple): + return [default_collate(samples) + for samples in transposed] # Compat with Pytorch. + else: + try: + return data_item_type( + [default_collate(samples) for samples in transposed]) + except TypeError: + # The sequence type may not support `__init__(iterable)` + # (e.g., `range`). 
+ return [default_collate(samples) for samples in transposed] + elif isinstance(data_item, Mapping): + return data_item_type({ + key: default_collate([d[key] for d in data_batch]) + for key in data_item + }) + else: + return torch_default_collate(data_batch) diff --git a/head_extractor/build/lib/mmengine/device/__init__.py b/head_extractor/build/lib/mmengine/device/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..88937d55922cbf90394a07c4e896f4ce328bf976 --- /dev/null +++ b/head_extractor/build/lib/mmengine/device/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .utils import (get_device, get_max_cuda_memory, get_max_musa_memory, + is_cuda_available, is_dipu_available, is_mlu_available, + is_mps_available, is_musa_available, is_npu_available, + is_npu_support_full_precision) + +__all__ = [ + 'get_max_cuda_memory', 'get_device', 'is_cuda_available', + 'is_mlu_available', 'is_mps_available', 'is_npu_available', + 'is_dipu_available', 'get_max_musa_memory', 'is_musa_available', + 'is_npu_support_full_precision' +] diff --git a/head_extractor/build/lib/mmengine/device/utils.py b/head_extractor/build/lib/mmengine/device/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe6e0c15644f60f6c88036ff0e3cdfe0d40baff --- /dev/null +++ b/head_extractor/build/lib/mmengine/device/utils.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from typing import Optional + +import torch + +try: + import torch_npu # noqa: F401 + import torch_npu.npu.utils as npu_utils + + # Enable operator support for dynamic shape and + # binary operator support on the NPU. + npu_jit_compile = bool(os.getenv('NPUJITCompile', False)) + torch.npu.set_compile_mode(jit_compile=npu_jit_compile) + IS_NPU_AVAILABLE = hasattr(torch, 'npu') and torch.npu.is_available() +except Exception: + IS_NPU_AVAILABLE = False + +try: + import torch_mlu # noqa: F401 + IS_MLU_AVAILABLE = hasattr(torch, 'mlu') and torch.mlu.is_available() +except Exception: + IS_MLU_AVAILABLE = False + +try: + import torch_dipu # noqa: F401 + IS_DIPU_AVAILABLE = True +except Exception: + IS_DIPU_AVAILABLE = False + +try: + import torch_musa # noqa: F401 + IS_MUSA_AVAILABLE = True +except Exception: + IS_MUSA_AVAILABLE = False + + +def get_max_cuda_memory(device: Optional[torch.device] = None) -> int: + """Returns the maximum GPU memory occupied by tensors in megabytes (MB) for + a given device. By default, this returns the peak allocated memory since + the beginning of this program. + + Args: + device (torch.device, optional): selected device. Returns + statistic for the current device, given by + :func:`~torch.cuda.current_device`, if ``device`` is None. + Defaults to None. + + Returns: + int: The maximum GPU memory occupied by tensors in megabytes + for a given device. + """ + mem = torch.cuda.max_memory_allocated(device=device) + mem_mb = torch.tensor([int(mem) // (1024 * 1024)], + dtype=torch.int, + device=device) + torch.cuda.reset_peak_memory_stats() + return int(mem_mb.item()) + + +def is_cuda_available() -> bool: + """Returns True if cuda devices exist.""" + return torch.cuda.is_available() + + +def is_npu_available() -> bool: + """Returns True if Ascend PyTorch and npu devices exist.""" + return IS_NPU_AVAILABLE + + +def is_mlu_available() -> bool: + """Returns True if Cambricon PyTorch and mlu devices exist.""" + return IS_MLU_AVAILABLE + + +def is_mps_available() -> bool: + """Return True if mps devices exist. 
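To make the contrast between the two collate functions concrete, here is a hedged sketch of both on the same two-sample batch (the shapes and keys are made up):

import torch

batch = [
    {'inputs': torch.ones(3, 2), 'label': 1},
    {'inputs': torch.zeros(3, 2), 'label': 0},
]

out = pseudo_collate(batch)
# out['inputs'] is still a list of two (3, 2) tensors; out['label'] == [1, 0]

out = default_collate(batch)
# out['inputs'].shape == torch.Size([2, 3, 2]); out['label'] == tensor([1, 0])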
+ + It's specialized for mac m1 chips and require torch version 1.12 or higher. + """ + return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() + + +def is_dipu_available() -> bool: + return IS_DIPU_AVAILABLE + + +def get_max_musa_memory(device: Optional[torch.device] = None) -> int: + """Returns the maximum GPU memory occupied by tensors in megabytes (MB) for + a given device. By default, this returns the peak allocated memory since + the beginning of this program. + + Args: + device (torch.device, optional): selected device. Returns + statistic for the current device, given by + :func:`~torch.musa.current_device`, if ``device`` is None. + Defaults to None. + + Returns: + int: The maximum GPU memory occupied by tensors in megabytes + for a given device. + """ + mem = torch.musa.max_memory_allocated(device=device) + mem_mb = torch.tensor([int(mem) // (1024 * 1024)], + dtype=torch.int, + device=device) + # TODO:haowen.han@mthreads.com: This function is not supported by musa yet. + # torch.musa.reset_peak_memory_stats() + return int(mem_mb.item()) + + +def is_musa_available() -> bool: + return IS_MUSA_AVAILABLE + + +def is_npu_support_full_precision() -> bool: + """Returns True if npu devices support full precision training.""" + version_of_support_full_precision = 220 + return IS_NPU_AVAILABLE and npu_utils.get_soc_version( + ) >= version_of_support_full_precision + + +DEVICE = 'cpu' +if is_npu_available(): + DEVICE = 'npu' +elif is_cuda_available(): + DEVICE = 'cuda' +elif is_mlu_available(): + DEVICE = 'mlu' +elif is_mps_available(): + DEVICE = 'mps' +elif is_dipu_available(): + DEVICE = 'dipu' +elif is_musa_available(): + DEVICE = 'musa' + + +def get_device() -> str: + """Returns the currently existing device type. + + Returns: + str: cuda | npu | mlu | mps | musa | cpu. + """ + return DEVICE diff --git a/head_extractor/build/lib/mmengine/dist/__init__.py b/head_extractor/build/lib/mmengine/dist/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c70e181d5d6ba076ebb1d3fcfbd16f17f09952f1 --- /dev/null +++ b/head_extractor/build/lib/mmengine/dist/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
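Typical use of the device helpers above is to resolve the accelerator string once and reuse it; a hedged usage sketch (non-CUDA backends such as 'musa' or 'dipu' additionally require their vendor plugins to be installed):

import torch

device = get_device()  # e.g. 'cuda' on a CUDA box, otherwise 'npu', 'mps', ..., 'cpu'
model_input = torch.zeros(2, 2)
if device != 'cpu':
    model_input = model_input.to(device)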
+from .dist import (all_gather_object, all_reduce, all_gather, all_reduce_dict,
+                   collect_results, gather, broadcast, gather_object,
+                   sync_random_seed, broadcast_object_list,
+                   collect_results_cpu, collect_results_gpu, all_reduce_params)
+from .utils import (get_dist_info, init_dist, init_local_group, get_backend,
+                    get_world_size, get_rank, get_local_size, get_local_rank,
+                    is_main_process, master_only, barrier, get_local_group,
+                    is_distributed, get_default_group, get_data_device,
+                    get_comm_device, cast_data_device, infer_launcher)
+
+__all__ = [
+    'all_gather_object', 'all_reduce', 'all_gather', 'all_reduce_dict',
+    'collect_results', 'collect_results_cpu', 'collect_results_gpu', 'gather',
+    'broadcast', 'gather_object', 'sync_random_seed', 'broadcast_object_list',
+    'get_dist_info', 'init_dist', 'init_local_group', 'get_backend',
+    'get_world_size', 'get_rank', 'get_local_size', 'get_local_group',
+    'get_local_rank', 'is_main_process', 'master_only', 'barrier',
+    'is_distributed', 'get_default_group', 'all_reduce_params',
+    'get_data_device', 'get_comm_device', 'cast_data_device', 'infer_launcher'
+]
diff --git a/head_extractor/build/lib/mmengine/dist/dist.py b/head_extractor/build/lib/mmengine/dist/dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..f70cc3ef46cf7c8ff93de464faadbf2547e4e267
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/dist/dist.py
@@ -0,0 +1,1184 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+from collections import OrderedDict
+from typing import Any, Dict, Generator, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch import Tensor
+from torch import distributed as torch_dist
+from torch._utils import (_flatten_dense_tensors, _take_tensors,
+                          _unflatten_dense_tensors)
+from torch.distributed import ProcessGroup
+from itertools import zip_longest, chain
+import mmengine
+from .utils import (get_world_size, get_rank, get_backend, get_dist_info,
+                    get_default_group, barrier, get_data_device,
+                    get_comm_device, cast_data_device)
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils import TORCH_VERSION
+from mmengine.device import is_npu_available
+
+
+def _get_reduce_op(name: str) -> torch_dist.ReduceOp:
+    op_mappings = {
+        'sum': torch_dist.ReduceOp.SUM,
+        'product': torch_dist.ReduceOp.PRODUCT,
+        'min': torch_dist.ReduceOp.MIN,
+        'max': torch_dist.ReduceOp.MAX,
+        'band': torch_dist.ReduceOp.BAND,
+        'bor': torch_dist.ReduceOp.BOR,
+        'bxor': torch_dist.ReduceOp.BXOR,
+    }
+
+    if name.lower() not in op_mappings:
+        raise ValueError(
+            f'reduce op should be one of {op_mappings.keys()}, but got {name}')
+
+    return op_mappings[name.lower()]
+
+
+def all_reduce(data: Tensor,
+               op: str = 'sum',
+               group: Optional[ProcessGroup] = None) -> None:
+    """Reduces the tensor data across all machines in such a way that all get
+    the final result.
+
+    After the call ``data`` is going to be bitwise identical in all
+    processes.
+
+    Note:
+        Calling ``all_reduce`` in non-distributed environment does nothing.
+
+    Args:
+        data (Tensor): Input and output of the collective. The function
+            operates in-place.
+        op (str): Operation to reduce data. Defaults to 'sum'. Optional values
+            are 'sum', 'mean', 'product', 'min', 'max', 'band', 'bor' and
+            'bxor'.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+ + Examples: + >>> import torch + >>> import mmengine.dist as dist + + >>> # non-distributed environment + >>> data = torch.arange(2, dtype=torch.int64) + >>> dist.all_reduce(data) + >>> data + tensor([0, 1]) + + >>> # distributed environment + >>> # We have 2 process groups, 2 ranks. + >>> data = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank + >>> data + tensor([1, 2]) # Rank 0 + tensor([3, 4]) # Rank 1 + >>> dist.all_reduce(data, op=dist.ReduceOp.SUM) + >>> data + tensor([4, 6]) # Rank 0 + tensor([4, 6]) # Rank 1 + """ + world_size = get_world_size(group) + if world_size > 1: + if group is None: + group = get_default_group() + + input_device = get_data_device(data) + backend_device = get_comm_device(group) + data_on_device = cast_data_device(data, backend_device) + + # pytorch does not support 'mean' operation so we fall back to support + # it with 'sum' operation. + if op.lower() == 'mean': + torch_dist.all_reduce(data_on_device, _get_reduce_op('sum'), group) + + # use true_divide to handle torch1.6.0 throws an RuntimeError when + # the type of `data_on_device` is int64 + data_on_device = torch.true_divide(data_on_device, world_size) + else: + torch_dist.all_reduce(data_on_device, _get_reduce_op(op), group) + + cast_data_device(data_on_device, input_device, out=data) + + +def all_gather(data: Tensor, + group: Optional[ProcessGroup] = None) -> List[Tensor]: + """Gather data from the whole group in a list. + + Note: + Calling ``all_gather`` in non-distributed environment does nothing + and just returns a list containing :attr:`data` itself. + + Note: + Unlike PyTorch ``torch.distributed.all_gather``, :meth:`all_gather` in + MMEngine does not pass in an empty list ``gather_list`` and returns + the ``gather_list`` directly, which is more convenient. The difference + between their interfaces is as below: + + - MMEngine: all_gather(data, group) -> gather_list + - PyTorch: all_gather(gather_list, data, group) -> None + + Args: + data (Tensor): Tensor to be gathered. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Defaults to None. + + Returns: + list[Tensor]: Return a list containing data from the whole group if + in distributed environment, otherwise a list only containing + :attr:`data` itself. + + Examples: + >>> import torch + >>> import mmengine.dist as dist + + >>> # non-distributed environment + >>> data = torch.arange(2, dtype=torch.int64) + >>> data + tensor([0, 1]) + >>> output = dist.all_gather(data) + >>> output + [tensor([0, 1])] + + >>> # distributed environment + >>> # We have 2 process groups, 2 ranks. 
+        >>> data = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
+        >>> data
+        tensor([1, 2])  # Rank 0
+        tensor([3, 4])  # Rank 1
+        >>> output = dist.all_gather(data)
+        >>> output
+        [tensor([1, 2]), tensor([3, 4])]  # Rank 0
+        [tensor([1, 2]), tensor([3, 4])]  # Rank 1
+    """
+    world_size = get_world_size(group)
+    if world_size == 1:
+        return [data]
+
+    if group is None:
+        group = get_default_group()
+
+    input_device = get_data_device(data)
+    backend_device = get_comm_device(group)
+    data_on_device = cast_data_device(data, backend_device)
+
+    gather_list = [
+        torch.empty_like(data, device=backend_device)
+        for _ in range(world_size)
+    ]
+
+    torch_dist.all_gather(gather_list, data_on_device, group)
+
+    return cast_data_device(gather_list, input_device)  # type: ignore
+
+
+def gather(data: Tensor,
+           dst: int = 0,
+           group: Optional[ProcessGroup] = None) -> List[Optional[Tensor]]:
+    """Gather data from the whole group to ``dst`` process.
+
+    Note:
+        Calling ``gather`` in non-distributed environment does nothing
+        and just returns a list containing :attr:`data` itself.
+
+    Note:
+        ``NCCL`` backend does not support ``gather``.
+
+    Note:
+        Unlike PyTorch ``torch.distributed.gather``, :meth:`gather` in
+        MMEngine does not pass in an empty list ``gather_list`` and returns
+        the ``gather_list`` directly, which is more convenient. The difference
+        between their interfaces is as below:
+
+        - MMEngine: gather(data, dst, group) -> gather_list
+        - PyTorch: gather(data, gather_list, dst, group) -> None
+
+    Args:
+        data (Tensor): Tensor to be gathered. CUDA tensor is not supported.
+        dst (int): Destination rank. Defaults to 0.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+
+    Returns:
+        list[Tensor]: ``dst`` process will get a list of tensors gathered from
+        the whole group. Other processes will get an empty list. If in
+        non-distributed environment, just return a list containing
+        :attr:`data` itself.
+
+    Examples:
+        >>> import torch
+        >>> import mmengine.dist as dist
+
+        >>> # non-distributed environment
+        >>> data = torch.arange(2, dtype=torch.int64)
+        >>> data
+        tensor([0, 1])
+        >>> output = dist.gather(data)
+        >>> output
+        [tensor([0, 1])]
+
+        >>> # distributed environment
+        >>> # We have 2 process groups, 2 ranks.
+        >>> data = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
+        >>> data
+        tensor([1, 2])  # Rank 0
+        tensor([3, 4])  # Rank 1
+        >>> output = dist.gather(data)
+        >>> output
+        [tensor([1, 2]), tensor([3, 4])]  # Rank 0
+        []  # Rank 1
+    """
+    world_size = get_world_size(group)
+    if world_size == 1:
+        return [data]
+
+    if group is None:
+        group = get_default_group()
+
+    input_device = get_data_device(data)
+    backend_device = get_comm_device(group)
+
+    if get_rank(group) == dst:
+        gather_list = [
+            torch.empty_like(data, device=backend_device)
+            for _ in range(world_size)
+        ]
+    else:
+        gather_list = []
+
+    torch_dist.gather(data, gather_list, dst, group)
+
+    if get_rank(group) == dst:
+        return cast_data_device(gather_list, input_device)  # type: ignore
+    else:
+        return gather_list
+
+
+def broadcast(data: Tensor,
+              src: int = 0,
+              group: Optional[ProcessGroup] = None) -> None:
+    """Broadcast the data from ``src`` process to the whole group.
+
+    ``data`` must have the same number of elements in all processes
+    participating in the collective.
+
+    Note:
+        Calling ``broadcast`` in non-distributed environment does nothing.
+
+    Args:
+        data (Tensor): Data to be sent if ``src`` is the rank of current
+            process, and data to be used to save received data otherwise.
+        src (int): Source rank. Defaults to 0.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+
+    Examples:
+        >>> import torch
+        >>> import mmengine.dist as dist
+
+        >>> # non-distributed environment
+        >>> data = torch.arange(2, dtype=torch.int64)
+        >>> data
+        tensor([0, 1])
+        >>> dist.broadcast(data)
+        >>> data
+        tensor([0, 1])
+
+        >>> # distributed environment
+        >>> # We have 2 process groups, 2 ranks.
+        >>> data = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank
+        >>> data
+        tensor([1, 2])  # Rank 0
+        tensor([3, 4])  # Rank 1
+        >>> dist.broadcast(data)
+        >>> data
+        tensor([1, 2])  # Rank 0
+        tensor([1, 2])  # Rank 1
+    """
+    if get_world_size(group) > 1:
+        if group is None:
+            group = get_default_group()
+
+        input_device = get_data_device(data)
+        backend_device = get_comm_device(group)
+        data_on_device = cast_data_device(data, backend_device)
+        # broadcast requires tensor is contiguous
+        data_on_device = data_on_device.contiguous()  # type: ignore
+        torch_dist.broadcast(data_on_device, src, group)
+
+        if get_rank(group) != src:
+            cast_data_device(data_on_device, input_device, data)
+
+
+def sync_random_seed(group: Optional[ProcessGroup] = None) -> int:
+    """Synchronize a random seed to all processes.
+
+    In distributed sampling, different ranks should sample non-overlapped
+    data in the dataset. Therefore, this function is used to make sure that
+    each rank shuffles the data indices in the same order based
+    on the same seed. Then different ranks could use different indices
+    to select non-overlapped data from the same data list.
+
+    Args:
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+
+    Returns:
+        int: Random seed.
+
+    Examples:
+        >>> import torch
+        >>> import mmengine.dist as dist
+
+        >>> # non-distributed environment
+        >>> seed = dist.sync_random_seed()
+        >>> seed  # which is a random number
+        587791752
+
+        >>> # distributed environment
+        >>> # We have 2 process groups, 2 ranks.
+        >>> seed = dist.sync_random_seed()
+        >>> seed
+        587791752  # Rank 0
+        587791752  # Rank 1
+    """
+    seed = np.random.randint(2**31)
+    if get_world_size(group) == 1:
+        return seed
+
+    if group is None:
+        group = get_default_group()
+
+    backend_device = get_comm_device(group)
+
+    if get_rank(group) == 0:
+        random_num = torch.tensor(seed, dtype=torch.int32).to(backend_device)
+    else:
+        random_num = torch.tensor(0, dtype=torch.int32).to(backend_device)
+
+    torch_dist.broadcast(random_num, src=0, group=group)
+
+    return random_num.item()
+
+
+def _object_to_tensor(obj: Any) -> Tuple[Tensor, Tensor]:
+    """Serialize picklable python object to tensor."""
+    byte_storage = torch.ByteStorage.from_buffer(pickle.dumps(obj))
+    # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor
+    # and specifying dtype. Otherwise, it will cause 100X slowdown.
+ # See: https://github.com/pytorch/pytorch/issues/65696 + byte_tensor = torch.ByteTensor(byte_storage) + local_size = torch.LongTensor([byte_tensor.numel()]) + return byte_tensor, local_size + + +def _tensor_to_object(tensor: Tensor, tensor_size: int) -> Any: + """Deserialize tensor to picklable python object.""" + buf = tensor.cpu().numpy().tobytes()[:tensor_size] + return pickle.loads(buf) + + +def _broadcast_object_list(object_list: List[Any], + src: int = 0, + group: Optional[ProcessGroup] = None) -> None: + """Broadcast picklable objects in ``object_list`` to the whole group. + + Similar to :func:`broadcast`, but Python objects can be passed in. Note + that all objects in ``object_list`` must be picklable in order to be + broadcasted. + """ + if torch_dist.distributed_c10d._rank_not_in_group(group): + return + + my_rank = get_rank() + # Serialize object_list elements to tensors on src rank. + if my_rank == src: + tensor_list, size_list = zip( + *[_object_to_tensor(obj) for obj in object_list]) + object_sizes_tensor = torch.cat(size_list) + else: + object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long) + + # Current device selection. + # To preserve backwards compatibility, ``device`` is ``None`` by default. + # in which case we run current logic of device selection, i.e. + # ``current_device`` is CUDA if backend is NCCL otherwise CPU device. In + # the case it is not ``None`` we move the size and object tensors to be + # broadcasted to this device. + group_backend = get_backend(group) + is_nccl_backend = group_backend == torch_dist.Backend.NCCL + current_device = torch.device('cpu') + is_hccl_backend = group_backend == 'hccl' + is_cncl_backend = group_backend == 'cncl' + is_mccl_backend = group_backend == 'mccl' + if is_hccl_backend: + current_device = torch.device('npu', torch.npu.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + elif is_cncl_backend: + current_device = torch.device('mlu', torch.mlu.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + elif is_mccl_backend: + current_device = torch.device('musa', torch.musa.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + elif is_nccl_backend: + # See note about using torch.cuda.current_device() here in + # docstring. We cannot simply use my_rank since rank == device is + # not necessarily true. + current_device = torch.device('cuda', torch.cuda.current_device()) + object_sizes_tensor = object_sizes_tensor.to(current_device) + + # Broadcast object sizes + torch_dist.broadcast(object_sizes_tensor, src=src, group=group) + + # Concatenate and broadcast serialized object tensors + if my_rank == src: + object_tensor = torch.cat(tensor_list) + else: + object_tensor = torch.empty( + torch.sum(object_sizes_tensor).int().item(), + dtype=torch.uint8, + ) + + if is_nccl_backend or is_hccl_backend or is_cncl_backend: + object_tensor = object_tensor.to(current_device) + torch_dist.broadcast(object_tensor, src=src, group=group) + # Deserialize objects using their stored sizes. 
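+    # On non-src ranks, walk the flat byte buffer: object i occupies
+    # object_sizes_tensor[i] bytes starting at the running offset and is
+    # unpickled on CPU before being written back into object_list.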
+    offset = 0
+    if my_rank != src:
+        for i, obj_size in enumerate(object_sizes_tensor):
+            obj_view = object_tensor[offset:offset + obj_size]
+            obj_view = obj_view.type(torch.uint8)
+            if obj_view.device != torch.device('cpu'):
+                obj_view = obj_view.cpu()
+            offset += obj_size
+            object_list[i] = _tensor_to_object(obj_view, obj_size)
+
+
+def broadcast_object_list(data: List[Any],
+                          src: int = 0,
+                          group: Optional[ProcessGroup] = None) -> None:
+    """Broadcasts picklable objects in ``data`` to the whole group.
+    Similar to :func:`broadcast`, but Python objects can be passed in. Note
+    that all objects in ``data`` must be picklable in order to be
+    broadcasted.
+
+    Note:
+        Calling ``broadcast_object_list`` in non-distributed environment does
+        nothing.
+
+    Args:
+        data (List[Any]): List of input objects to broadcast.
+            Each object must be picklable. Only objects on the ``src`` rank
+            will be broadcast, but each rank must provide lists of equal sizes.
+        src (int): Source rank from which to broadcast ``data``.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Default is ``None``.
+
+    Note:
+        For NCCL-based process groups, internal tensor representations of
+        objects must be moved to the GPU device before communication starts.
+        In this case, the used device is given by
+        ``torch.cuda.current_device()`` and it is the user's responsibility to
+        ensure that this is correctly set so that each rank has an individual
+        GPU, via ``torch.cuda.set_device()``.
+
+    Examples:
+        >>> import torch
+        >>> import mmengine.dist as dist
+
+        >>> # non-distributed environment
+        >>> data = ['foo', 12, {1: 2}]
+        >>> dist.broadcast_object_list(data)
+        >>> data
+        ['foo', 12, {1: 2}]
+
+        >>> # distributed environment
+        >>> # We have 2 process groups, 2 ranks.
+        >>> if dist.get_rank() == 0:
+        >>>     data = ["foo", 12, {1: 2}]  # any picklable object
+        >>> else:
+        >>>     data = [None, None, None]
+        >>> dist.broadcast_object_list(data)
+        >>> data
+        ["foo", 12, {1: 2}]  # Rank 0
+        ["foo", 12, {1: 2}]  # Rank 1
+    """
+    assert isinstance(data, list)
+
+    if get_world_size(group) > 1:
+        if group is None:
+            group = get_default_group()
+
+        if digit_version(TORCH_VERSION) >= digit_version(
+                '1.8.0') and not is_npu_available():
+            torch_dist.broadcast_object_list(data, src, group)
+        else:
+            _broadcast_object_list(data, src, group)
+
+
+def all_reduce_dict(data: Dict[str, Tensor],
+                    op: str = 'sum',
+                    group: Optional[ProcessGroup] = None) -> None:
+    """Reduces the dict across all machines in such a way that all get the
+    final result.
+
+    The code is modified from https://github.com/Megvii-
+    BaseDetection/YOLOX/blob/main/yolox/utils/allreduce_norm.py.
+
+    Args:
+        data (dict[str, Tensor]): Data to be reduced.
+        op (str): Operation to reduce data. Defaults to 'sum'. Optional values
+            are 'sum', 'mean', 'product', 'min', 'max', 'band', 'bor' and
+            'bxor'.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+ + Examples: + >>> import torch + >>> import mmengine.dist as dist + + >>> # non-distributed environment + >>> data = { + 'key1': torch.arange(2, dtype=torch.int64), + 'key2': torch.arange(3, dtype=torch.int64) + } + >>> dist.all_reduce_dict(data) + >>> data + {'key1': tensor([0, 1]), 'key2': tensor([0, 1, 2])} + + >>> # distributed environment + >>> # We have 2 process groups, 2 ranks. + >>> data = { + 'key1': torch.arange(2, dtype=torch.int64), + 'key2': torch.arange(3, dtype=torch.int64) + } + >>> dist.all_reduce_dict(data) + >>> data + {'key1': tensor([0, 2]), 'key2': tensor([0, 2, 4])} # Rank 0 + {'key1': tensor([0, 2]), 'key2': tensor([0, 2, 4])} # Rank 1 + """ + assert isinstance(data, dict) + + world_size = get_world_size(group) + if world_size > 1: + + if group is None: + group = get_default_group() + + # ensure keys are consistent across processes + keys = sorted(data.keys()) + tensor_shapes = [data[k].shape for k in keys] + tensor_sizes = [data[k].numel() for k in keys] + + if digit_version(TORCH_VERSION) == digit_version('1.5.0'): + # `torch.cat` in torch1.5 can not concatenate different types so + # we fallback to convert them all to float type. + flatten_tensor = torch.cat( + [data[k].flatten().float() for k in keys]) + else: + flatten_tensor = torch.cat([data[k].flatten() for k in keys]) + + all_reduce(flatten_tensor, op=op, group=group) + + split_tensors = [ + x.reshape(shape) for x, shape in zip( + torch.split(flatten_tensor, tensor_sizes), tensor_shapes) + ] + + for k, v in zip(keys, split_tensors): + data[k] = v + + +def _all_gather_object(object_list: List[Any], + obj: Any, + group: Optional[ProcessGroup] = None) -> None: + """Gather picklable objects from the whole group into a list. + + Similar to :func:`all_gather`, but Python objects can be passed in. + Note that the object must be picklable in order to be gathered. + + Args: + object_list (list[Any]): Output list. It should be correctly sized as + the size of the group for this collective and will contain the + output. + object (Any): Pickable Python object to be broadcast from current + process. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Defaults to None. + + Returns: + None. If the calling rank is part of this group, the output of the + collective will be populated into the input ``object_list``. If the + calling rank is not part of the group, the passed in ``object_list`` + will be unmodified. + """ + if torch_dist.distributed_c10d._rank_not_in_group(group): + return + + input_tensor, local_size = _object_to_tensor(obj) + group_backend = get_backend(group) + current_device = torch.device('cpu') + is_nccl_backend = group_backend == torch_dist.Backend.NCCL + is_mccl_backend = group_backend == 'mccl' + if is_nccl_backend: + # See note about using torch.cuda.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.device('cuda', torch.cuda.current_device()) + input_tensor = input_tensor.to(current_device) + local_size = local_size.to(current_device) + elif is_mccl_backend: + # See note about using torch.musa.current_device() here in docstring. + # We cannot simply use my_rank since rank == device is not necessarily + # true. + current_device = torch.device('musa', torch.musa.current_device()) + input_tensor = input_tensor.to(current_device) + local_size = local_size.to(current_device) + # Gather all local sizes. 
This is so that we can find the max size, and + # index until the correct size when deserializing the tensors. + group_size = get_world_size(group=group) + object_sizes_tensor = torch.zeros( + group_size, dtype=torch.long, device=current_device) + object_size_list = [ + object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size) + ] + # Allgather tensor sizes + torch_dist.all_gather(object_size_list, local_size, group=group) + max_object_size = int(max(object_size_list).item()) + # Resize tensor to max size across all ranks. + input_tensor.resize_(max_object_size) + coalesced_output_tensor = torch.empty( + max_object_size * group_size, dtype=torch.uint8, device=current_device) + # Output tensors are nonoverlapping views of coalesced_output_tensor + output_tensors = [ + coalesced_output_tensor[max_object_size * i:max_object_size * (i + 1)] + for i in range(group_size) + ] + torch_dist.all_gather(output_tensors, input_tensor, group=group) + # Deserialize outputs back to object. + for i, tensor in enumerate(output_tensors): + tensor = tensor.type(torch.uint8) + if tensor.device != torch.device('cpu'): + tensor = tensor.cpu() + tensor_size = object_size_list[i] + object_list[i] = _tensor_to_object(tensor, tensor_size) + + +def all_gather_object(data: Any, + group: Optional[ProcessGroup] = None) -> List[Any]: + """Gather picklable objects from the whole group into a list. Similar to + :func:`all_gather`, but Python objects can be passed in. Note that the + object must be picklable in order to be gathered. + + Note: + Calling ``all_gather_object`` in non-distributed environment does + nothing and just returns a list containing :attr:`data` itself. + + Note: + Unlike PyTorch ``torch.distributed.all_gather_object``, + :meth:`all_gather_object` in MMEngine does not pass in an empty list + ``gather_list`` and returns the ``gather_list`` directly, which is + more convenient. The difference between their interfaces is as below: + + - MMEngine: all_gather_object(data, group) -> gather_list + - PyTorch: all_gather_object(gather_list, data, group) -> None + + Args: + data (Any): Pickable Python object to be broadcast from current + process. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Defaults to None. + + Returns: + list[Tensor]: Return a list containing data from the whole group if + in distributed environment, otherwise a list only containing + :attr:`data` itself. + + Note: + For NCCL-based process groups, internal tensor representations + of objects must be moved to the GPU device before communication starts. + In this case, the used device is given by + ``torch.cuda.current_device()`` and it is the user's responsibility to + ensure that this is correctly set so that each rank has an individual + GPU, via ``torch.cuda.set_device()``. + + Examples: + >>> import torch + >>> import mmengine.dist as dist + + >>> # non-distributed environment + >>> data = ['foo', 12, {1: 2}] # any picklable object + >>> gather_objects = dist.all_gather_object(data[dist.get_rank()]) + >>> output + ['foo'] + + >>> # distributed environment + >>> # We have 3 process groups, 3 ranks. 
+        >>> output = dist.all_gather_object(data[dist.get_rank()])
+        >>> output
+        ['foo', 12, {1: 2}]  # Rank 0
+        ['foo', 12, {1: 2}]  # Rank 1
+        ['foo', 12, {1: 2}]  # Rank 2
+    """
+    world_size = get_world_size(group)
+    if world_size == 1:
+        return [data]
+
+    if group is None:
+        group = get_default_group()
+
+    gather_list = [None] * world_size
+
+    if digit_version(TORCH_VERSION) >= digit_version('1.8.0'):
+        torch_dist.all_gather_object(gather_list, data, group)
+    else:
+        _all_gather_object(gather_list, data, group)
+
+    return gather_list
+
+
+def _validate_output_list_for_rank(my_rank: int, dst: int,
+                                   gather_list: Optional[list]) -> None:
+    """Validate whether ``gather_list`` is None in non-dst ranks."""
+    if dst == my_rank:
+        if not gather_list:
+            raise ValueError(
+                'Argument ``gather_list`` must be specified on destination '
+                'rank.')
+    elif gather_list:
+        raise ValueError('Argument ``gather_list`` must NOT be specified '
+                         'on non-destination ranks.')
+
+
+def _gather_object(obj: Any,
+                   object_gather_list=None,
+                   dst: int = 0,
+                   group: Optional[ProcessGroup] = None) -> None:
+    """Gathers picklable objects from the whole group in a single process.
+
+    Similar to :func:`gather`, but Python objects can be passed in. Note that
+    the object must be picklable in order to be gathered.
+
+    Args:
+        obj (Any): Input object. Must be picklable.
+        object_gather_list (list[Any], optional): Output list. On the ``dst``
+            rank, it should be correctly sized as the size of the group for
+            this collective and will contain the output. Must be ``None`` on
+            non-dst ranks. Defaults to None.
+        dst (int): Destination rank. Defaults to 0.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+    """
+    if torch_dist.distributed_c10d._rank_not_in_group(group):
+        return
+
+    # Ensure object_gather_list is specified appropriately.
+    my_rank = get_rank()
+    _validate_output_list_for_rank(my_rank, dst, object_gather_list)
+    input_tensor, local_size = _object_to_tensor(obj)
+    group_backend = get_backend(group)
+    current_device = torch.device('cpu')
+    is_nccl_backend = group_backend == torch_dist.Backend.NCCL
+    is_mccl_backend = group_backend == 'mccl'
+    if is_nccl_backend:
+        current_device = torch.device('cuda', torch.cuda.current_device())
+        input_tensor = input_tensor.to(current_device)
+        local_size = local_size.to(current_device)
+    elif is_mccl_backend:
+        current_device = torch.device('musa', torch.musa.current_device())
+        input_tensor = input_tensor.to(current_device)
+        local_size = local_size.to(current_device)
+    # Gather all local sizes. This is so that we can find the max size, and
+    # index until the correct size when deserializing the tensors.
+    group_size = get_world_size(group=group)
+    object_sizes_tensor = torch.zeros(
+        group_size, dtype=torch.long, device=current_device)
+    object_size_list = [
+        object_sizes_tensor[i].unsqueeze(dim=0) for i in range(group_size)
+    ]
+    # Allgather tensor sizes. An all-gather is needed here despite this being a
+    # gather, since each rank needs to broadcast a tensor of the same (maximal)
+    # size.
+    torch_dist.all_gather(object_size_list, local_size, group=group)
+    max_object_size = int(max(object_size_list).item())
+    # Resize tensor to max size across all ranks.
+    input_tensor.resize_(max_object_size)
+    # Avoid populating output tensors if the result won't be gathered on this
+    # rank.
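+    # Only the destination rank allocates the coalesced output buffer;
+    # every other rank passes gather_list=None below and just contributes
+    # its (padded) input tensor to the collective.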
+    if my_rank == dst:
+        coalesced_output_tensor = torch.empty(
+            max_object_size * group_size,
+            dtype=torch.uint8,
+            device=current_device)
+        # Output tensors are nonoverlapping views of coalesced_output_tensor
+        output_tensors = [
+            coalesced_output_tensor[max_object_size * i:max_object_size *
+                                    (i + 1)] for i in range(group_size)
+        ]
+    # All ranks call gather with equal-sized tensors.
+    torch_dist.gather(
+        input_tensor,
+        gather_list=output_tensors if my_rank == dst else None,
+        dst=dst,
+        group=group,
+    )
+    if my_rank != dst:
+        return
+    for i, tensor in enumerate(output_tensors):
+        tensor = tensor.type(torch.uint8)
+        tensor_size = object_size_list[i]
+        object_gather_list[i] = _tensor_to_object(tensor, tensor_size)
+
+
+def gather_object(data: Any,
+                  dst: int = 0,
+                  group: Optional[ProcessGroup] = None) -> Optional[List[Any]]:
+    """Gathers picklable objects from the whole group in a single process.
+    Similar to :func:`gather`, but Python objects can be passed in. Note that
+    the object must be picklable in order to be gathered.
+
+    Note:
+        ``NCCL backend`` does not support ``gather_object``.
+
+    Note:
+        Unlike PyTorch ``torch.distributed.gather_object``,
+        :meth:`gather_object` in MMEngine does not pass in an empty list
+        ``gather_list`` and returns the ``gather_list`` directly, which is
+        more convenient. The difference between their interfaces is as below:
+
+        - MMEngine: gather_object(data, dst, group) -> gather_list
+        - PyTorch: gather_object(data, gather_list, dst, group) -> None
+
+    Args:
+        data (Any): Input object. Must be picklable.
+        dst (int): Destination rank. Defaults to 0.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+
+    Returns:
+        list[Any]. On the ``dst`` rank, return ``gather_list`` which contains
+        the output of the collective.
+
+    Examples:
+        >>> import torch
+        >>> import mmengine.dist as dist
+
+        >>> # non-distributed environment
+        >>> data = ['foo', 12, {1: 2}]  # any picklable object
+        >>> output = dist.gather_object(data[dist.get_rank()])
+        >>> output
+        ['foo']
+
+        >>> # distributed environment
+        >>> # We have 3 process groups, 3 ranks.
+        >>> output = dist.gather_object(data[dist.get_rank()], dst=0)
+        >>> output
+        ['foo', 12, {1: 2}]  # Rank 0
+        None  # Rank 1
+        None  # Rank 2
+    """
+    world_size = get_world_size(group)
+    if world_size == 1:
+        return [data]
+
+    if group is None:
+        group = get_default_group()
+
+    gather_list = [None] * world_size if get_rank(group) == dst else None
+
+    if digit_version(TORCH_VERSION) >= digit_version('1.8.0'):
+        torch_dist.gather_object(data, gather_list, dst, group)
+    else:
+        _gather_object(data, gather_list, dst, group)
+
+    return gather_list
+
+
+def collect_results(results: list,
+                    size: int,
+                    device: str = 'cpu',
+                    tmpdir: Optional[str] = None) -> Optional[list]:
+    """Collect results in distributed environments.
+
+    Args:
+        results (list[object]): Result list containing result parts to be
+            collected. Each item of ``results`` should be a picklable
+            object.
+        size (int): Size of the results, commonly equal to length of
+            the results.
+        device (str): Device name. Optional values are 'cpu', 'gpu' or 'npu'.
+        tmpdir (str | None): Temporary directory for collected results to
+            store. If set to None, it will create a temporary directory for
+            it. ``tmpdir`` should be None when device is 'gpu' or 'npu'.
+            Defaults to None.
+
+    Returns:
+        list or None: The collected results.
+
+    Examples:
+        >>> # distributed environment
+        >>> # We have 2 process groups, 2 ranks.
+        >>> import mmengine.dist as dist
+        >>> if dist.get_rank() == 0:
+                data = ['foo', {1: 2}]
+            else:
+                data = [24, {'a': 'b'}]
+        >>> size = 4
+        >>> output = dist.collect_results(data, size, device='cpu')
+        >>> output
+        ['foo', 24, {1: 2}, {'a': 'b'}]  # rank 0
+        None  # rank 1
+    """
+    if device not in ['gpu', 'cpu', 'npu']:
+        raise NotImplementedError(
+            f"device must be 'cpu', 'gpu' or 'npu', but got {device}")
+
+    if device == 'gpu' or device == 'npu':
+        assert tmpdir is None, f'tmpdir should be None when device is {device}'
+        return _collect_results_device(results, size)
+    else:
+        return collect_results_cpu(results, size, tmpdir)
+
+
+def collect_results_cpu(result_part: list,
+                        size: int,
+                        tmpdir: Optional[str] = None) -> Optional[list]:
+    """Collect results under cpu mode.
+
+    On cpu mode, this function will save the results on different gpus to
+    ``tmpdir`` and collect them by the rank 0 worker.
+
+    Args:
+        result_part (list): Result list containing result parts
+            to be collected. Each item of ``result_part`` should be a picklable
+            object.
+        size (int): Size of the results, commonly equal to length of
+            the results.
+        tmpdir (str | None): Temporary directory for collected results to
+            store. If set to None, it will create a random temporary directory
+            for it. Defaults to None.
+
+    Returns:
+        list or None: The collected results.
+
+    Examples:
+        >>> # distributed environment
+        >>> # We have 2 process groups, 2 ranks.
+        >>> import mmengine.dist as dist
+        >>> if dist.get_rank() == 0:
+                data = ['foo', {1: 2}]
+            else:
+                data = [24, {'a': 'b'}]
+        >>> size = 4
+        >>> output = dist.collect_results_cpu(data, size)
+        >>> output
+        ['foo', 24, {1: 2}, {'a': 'b'}]  # rank 0
+        None  # rank 1
+    """
+    rank, world_size = get_dist_info()
+    if world_size == 1:
+        return result_part[:size]
+
+    # create a tmp dir if it is not specified
+    if tmpdir is None:
+        MAX_LEN = 512
+        # 32 is whitespace
+        dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8)
+        if rank == 0:
+            mmengine.mkdir_or_exist('.dist_test')
+            tmpdir = tempfile.mkdtemp(dir='.dist_test')
+            tmpdir = torch.tensor(
+                bytearray(tmpdir.encode()), dtype=torch.uint8)
+            dir_tensor[:len(tmpdir)] = tmpdir
+        broadcast(dir_tensor, 0)
+        tmpdir = dir_tensor.numpy().tobytes().decode().rstrip()
+    else:
+        mmengine.mkdir_or_exist(tmpdir)
+
+    # dump the part result to the dir
+    with open(osp.join(tmpdir, f'part_{rank}.pkl'), 'wb') as f:  # type: ignore
+        pickle.dump(result_part, f, protocol=2)
+
+    barrier()
+
+    # collect all parts
+    if rank != 0:
+        return None
+    else:
+        # load results of all parts from tmp dir
+        part_list = []
+        for i in range(world_size):
+            path = osp.join(tmpdir, f'part_{i}.pkl')  # type: ignore
+            if not osp.exists(path):
+                raise FileNotFoundError(
+                    f'{tmpdir} is not a shared directory for '
+                    f'rank {i}, please make sure {tmpdir} is a shared '
+                    'directory for all ranks!')
+            with open(path, 'rb') as f:
+                part_list.append(pickle.load(f))
+        # sort the results
+        ordered_results = []
+        zipped_results = zip_longest(*part_list)
+        ordered_results = [
+            i for i in chain.from_iterable(zipped_results) if i is not None
+        ]
+        # the dataloader may pad some samples
+        ordered_results = ordered_results[:size]
+        # remove tmp dir
+        shutil.rmtree(tmpdir)  # type: ignore
+        return ordered_results
+
+
+def _collect_results_device(result_part: list, size: int) -> Optional[list]:
+    """Collect results under gpu or npu mode."""
+    rank, world_size = get_dist_info()
+    if world_size == 1:
+        return result_part[:size]
+
+    # gather all result part. Note that NCCL does not support gather so use
+    # all_gather_object instead.
+    part_list = all_gather_object(result_part)
+
+    if rank == 0:
+        # sort the results
+        ordered_results = []
+        zipped_results = zip_longest(*part_list)
+        ordered_results = [
+            i for i in chain.from_iterable(zipped_results) if i is not None
+        ]
+        # the dataloader may pad some samples
+        ordered_results = ordered_results[:size]
+        return ordered_results
+    else:
+        return None
+
+
+def collect_results_gpu(result_part: list, size: int) -> Optional[list]:
+    """Collect results under gpu mode.
+
+    On gpu mode, this function will encode results to gpu tensors and use gpu
+    communication for results collection.
+
+    Args:
+        result_part (list[object]): Result list containing result parts
+            to be collected. Each item of ``result_part`` should be a picklable
+            object.
+        size (int): Size of the results, commonly equal to length of
+            the results.
+
+    Returns:
+        list or None: The collected results.
+
+    Examples:
+        >>> # distributed environment
+        >>> # We have 2 process groups, 2 ranks.
+        >>> import mmengine.dist as dist
+        >>> if dist.get_rank() == 0:
+                data = ['foo', {1: 2}]
+            else:
+                data = [24, {'a': 'b'}]
+        >>> size = 4
+        >>> output = dist.collect_results_gpu(data, size)
+        >>> output
+        ['foo', 24, {1: 2}, {'a': 'b'}]  # rank 0
+        None  # rank 1
+    """
+    return _collect_results_device(result_part, size)
+
+
+def _all_reduce_coalesced(tensors: List[torch.Tensor],
+                          bucket_size_mb: int = -1,
+                          op: str = 'sum',
+                          group: Optional[ProcessGroup] = None) -> None:
+    """All-reduce a sequence of tensors as a whole.
+
+    Args:
+        tensors (List[torch.Tensor]): A sequence of tensors to be
+            all-reduced.
+        bucket_size_mb (int): The limit of each chunk in megabytes
+            for grouping tensors into chunks. Defaults to -1.
+        op (str): Operation to reduce data. Defaults to 'sum'. Optional values
+            are 'sum', 'mean', 'product', 'min', 'max', 'band', 'bor' and
+            'bxor'.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+    """
+    if bucket_size_mb > 0:
+        bucket_size_bytes = bucket_size_mb * 1024 * 1024
+        buckets = _take_tensors(tensors, bucket_size_bytes)
+    else:
+        buckets = OrderedDict()
+        for tensor in tensors:
+            tp = tensor.type()
+            if tp not in buckets:
+                buckets[tp] = []
+            buckets[tp].append(tensor)
+        buckets = buckets.values()
+
+    for bucket in buckets:
+        flat_tensors = _flatten_dense_tensors(bucket)
+        all_reduce(flat_tensors, op=op, group=group)
+        for tensor, synced in zip(
+                bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
+            tensor.copy_(synced)
+
+
+def all_reduce_params(params: Union[List, Generator[torch.Tensor, None, None]],
+                      coalesce: bool = True,
+                      bucket_size_mb: int = -1,
+                      op: str = 'sum',
+                      group: Optional[ProcessGroup] = None) -> None:
+    """All-reduce parameters.
+
+    Args:
+        params (List or Generator[torch.Tensor, None, None]): List of
+            parameters or buffers of a model.
+        coalesce (bool, optional): Whether to reduce parameters as a whole.
+            Defaults to True.
+        bucket_size_mb (int, optional): Size of bucket, the unit is MB.
+            Defaults to -1.
+        op (str): Operation to reduce data. Defaults to 'sum'. Optional values
+            are 'sum', 'mean', 'product', 'min', 'max', 'band', 'bor' and
+            'bxor'.
+        group (ProcessGroup, optional): The process group to work on. If None,
+            the default process group will be used. Defaults to None.
+ + Examples: + >>> import torch + >>> import mmengine.dist as dist + + >>> # non-distributed environment + >>> data = [torch.arange(2), torch.arange(3)] + >>> dist.all_reduce_params(data) + >>> data + [tensor([0, 1]), tensor([0, 1, 2])] + + >>> # distributed environment + >>> # We have 2 process groups, 2 ranks. + >>> if dist.get_rank() == 0: + ... data = [torch.tensor([1, 2]), torch.tensor([3, 4])] + ... else: + ... data = [torch.tensor([2, 3]), torch.tensor([4, 5])] + + >>> dist.all_reduce_params(data) + >>> data + [torch.tensor([3, 5]), torch.tensor([7, 9])] + """ + world_size = get_world_size(group) + if world_size == 1: + return + params_data = [param.data for param in params] + if coalesce: + _all_reduce_coalesced(params_data, bucket_size_mb, op=op, group=group) + else: + for tensor in params_data: + all_reduce(tensor, op=op, group=group) diff --git a/head_extractor/build/lib/mmengine/dist/utils.py b/head_extractor/build/lib/mmengine/dist/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5d32cec36b90e9a082e5542319a5bda6e6f5cfa3 --- /dev/null +++ b/head_extractor/build/lib/mmengine/dist/utils.py @@ -0,0 +1,623 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import functools +import os +import subprocess +from typing import Callable, Optional, Tuple, Union + +import numpy as np +import torch +import torch.multiprocessing as mp +from torch import Tensor +from torch import distributed as torch_dist +from torch.distributed import ProcessGroup +from mmengine.device import (is_mlu_available, is_npu_available, + is_musa_available) + +from collections.abc import Iterable, Mapping + +_LOCAL_PROCESS_GROUP = None + + +def is_distributed() -> bool: + """Return True if distributed environment has been initialized.""" + return torch_dist.is_available() and torch_dist.is_initialized() + + +def get_local_group() -> Optional[ProcessGroup]: + """Return local process group.""" + if not is_distributed(): + return None + + if _LOCAL_PROCESS_GROUP is None: + raise RuntimeError('Local process group is not created, please use ' + '`init_local_group` to setup local process group.') + + return _LOCAL_PROCESS_GROUP + + +def get_default_group() -> Optional[ProcessGroup]: + """Return default process group.""" + + return torch_dist.distributed_c10d._get_default_group() + + +def infer_launcher(): + if 'WORLD_SIZE' in os.environ: + return 'pytorch' + elif 'SLURM_NTASKS' in os.environ: + return 'slurm' + elif 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ: + return 'mpi' + else: + return 'none' + + +def init_dist(launcher, + backend='nccl', + init_backend='torch', + **kwargs) -> None: + """Initialize distributed environment. + + Args: + launcher (str): Way to launcher multi processes. Supported launchers + are 'pytorch', 'mpi' and 'slurm'. + backend (str): Communication Backends. Supported backends are 'nccl', + 'gloo' and 'mpi'. Defaults to 'nccl'. + **kwargs: keyword arguments are passed to ``init_process_group``. + """ + timeout = kwargs.get('timeout', None) + if timeout is not None: + # If a timeout (in seconds) is specified, it must be converted + # to a timedelta object before forwarding the call to + # the respective backend, because they expect a timedelta object. + try: + kwargs['timeout'] = datetime.timedelta(seconds=timeout) + except TypeError as exception: + raise TypeError( + f'Timeout for distributed training must be provided as ' + f"timeout in seconds, but we've received the type " + f'{type(timeout)}. 
Please specify the timeout like this: ' + f"dist_cfg=dict(backend='nccl', timeout=1800)") from exception + if mp.get_start_method(allow_none=True) is None: + mp.set_start_method('spawn') + if launcher == 'pytorch': + _init_dist_pytorch(backend, init_backend=init_backend, **kwargs) + elif launcher == 'mpi': + _init_dist_mpi(backend, **kwargs) + elif launcher == 'slurm': + _init_dist_slurm(backend, init_backend=init_backend, **kwargs) + else: + raise ValueError(f'Invalid launcher type: {launcher}') + + +def _init_dist_pytorch(backend, init_backend='torch', **kwargs) -> None: + """Initialize distributed environment with PyTorch launcher. + + Args: + backend (str): Backend of torch.distributed. Supported backends are + 'nccl', 'gloo' and 'mpi'. Defaults to 'nccl'. + **kwargs: keyword arguments are passed to ``init_process_group``. + """ + rank = int(os.environ['RANK']) + # LOCAL_RANK is set by `torch.distributed.launch` since PyTorch 1.1 + local_rank = int(os.environ['LOCAL_RANK']) + if is_mlu_available(): + import torch_mlu # noqa: F401 + torch.mlu.set_device(local_rank) + torch_dist.init_process_group( + backend='cncl', + rank=rank, + world_size=int(os.environ['WORLD_SIZE']), + **kwargs) + elif is_npu_available(): + import torch_npu # noqa: F401 + torch.npu.set_device(local_rank) + torch_dist.init_process_group( + backend='hccl', + rank=rank, + world_size=int(os.environ['WORLD_SIZE']), + **kwargs) + elif is_musa_available(): + import torch_musa # noqa: F401 + torch.musa.set_device(rank) + torch_dist.init_process_group( + backend='mccl', + rank=rank, + world_size=int(os.environ['WORLD_SIZE']), + **kwargs) + else: + torch.cuda.set_device(local_rank) + + if init_backend == 'torch': + torch_dist.init_process_group(backend=backend, **kwargs) + elif init_backend == 'deepspeed': + import deepspeed + deepspeed.init_distributed(dist_backend=backend, **kwargs) + elif init_backend == 'colossalai': + import colossalai + colossalai.launch_from_torch(backend=backend, **kwargs) + else: + raise ValueError( + 'supported "init_backend" is "torch" or "deepspeed", ' + f'but got {init_backend}') + + +def _init_dist_mpi(backend, **kwargs) -> None: + """Initialize distributed environment with MPI launcher. + + Args: + backend (str): Backend of torch.distributed. Supported backends are + 'nccl', 'gloo' and 'mpi'. Defaults to 'nccl'. + **kwargs: keyword arguments are passed to ``init_process_group``. + """ + if backend == 'smddp': + try: + import smdistributed.dataparallel.torch.torch_smddp # noqa: F401 + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + 'Please use an Amazon SageMaker DLC to access smdistributed: ' + 'https://github.com/aws/deep-learning-containers/blob/master' + '/available_images.md#sagemaker-framework-containers' + '-sm-support-only') from e + local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) + torch.cuda.set_device(local_rank) + if 'MASTER_PORT' not in os.environ: + # 29500 is torch.distributed default port + os.environ['MASTER_PORT'] = '29500' + if 'MASTER_ADDR' not in os.environ: + raise KeyError('The environment variable MASTER_ADDR is not set') + os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] + os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] + torch_dist.init_process_group(backend=backend, **kwargs) + + +def _init_dist_slurm(backend, + port=None, + init_backend='torch', + **kwargs) -> None: + """Initialize slurm distributed training environment. 
+ + If argument ``port`` is not specified, then the master port will be system + environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system + environment variable, then a default port ``29500`` will be used. + + Args: + backend (str): Backend of torch.distributed. + port (int, optional): Master port. Defaults to None. + """ + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + # Not sure when this environment variable could be None, so use a fallback + local_rank_env = os.environ.get('SLURM_LOCALID', None) + if local_rank_env is not None: + local_rank = int(local_rank_env) + else: + num_gpus = torch.cuda.device_count() + local_rank = proc_id % num_gpus + addr = subprocess.getoutput( + f'scontrol show hostname {node_list} | head -n1') + # specify master port + if port is not None: + os.environ['MASTER_PORT'] = str(port) + elif 'MASTER_PORT' in os.environ: + pass # use MASTER_PORT in the environment variable + else: + # 29500 is torch.distributed default port + os.environ['MASTER_PORT'] = '29500' + # use MASTER_ADDR in the environment variable if it already exists + if 'MASTER_ADDR' not in os.environ: + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['LOCAL_RANK'] = str(local_rank) + os.environ['RANK'] = str(proc_id) + + if is_mlu_available(): + import torch_mlu # noqa: F401 + torch.mlu.set_device(local_rank) + torch_dist.init_process_group(backend='cncl', **kwargs) + else: + torch.cuda.set_device(local_rank) + + if init_backend == 'torch': + torch_dist.init_process_group(backend=backend, **kwargs) + elif init_backend == 'deepspeed': + import deepspeed + deepspeed.init_distributed(dist_backend=backend, **kwargs) + elif init_backend == 'colossalai': + import colossalai + colossalai.launch_from_slurm( + backend=backend, + host=os.environ['MASTER_ADDR'], + port=os.environ['MASTER_PORT'], + **kwargs, + ) + else: + raise ValueError( + 'supported "init_backend" is "torch" or "deepspeed", ' + f'but got {init_backend}') + + +def init_local_group(node_rank: int, num_gpus_per_node: int): + """Setup the local process group. + + Setup a process group which only includes processes that on the same + machine as the current process. + + The code is modified from + https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py + + Args: + node_rank (int): Rank of machines used for training. + num_gpus_per_node (int): Number of gpus used for training in a single + machine. + """ # noqa: W501 + global _LOCAL_PROCESS_GROUP + assert _LOCAL_PROCESS_GROUP is None + + ranks = list( + range(node_rank * num_gpus_per_node, + (node_rank + 1) * num_gpus_per_node)) + _LOCAL_PROCESS_GROUP = torch_dist.new_group(ranks) + + +def get_backend(group: Optional[ProcessGroup] = None) -> Optional[str]: + """Return the backend of the given process group. + + Note: + Calling ``get_backend`` in non-distributed environment will return + None. + + Args: + group (ProcessGroup, optional): The process group to work on. The + default is the general main process group. If another specific + group is specified, the calling process must be part of + :attr:`group`. Defaults to None. + + Returns: + str or None: Return the backend of the given process group as a lower + case string if in distributed environment, otherwise None. 
+ """ + if is_distributed(): + # handle low versions of torch like 1.5.0 which does not support + # passing in None for group argument + if group is None: + group = get_default_group() + return torch_dist.get_backend(group) + else: + return None + + +def get_world_size(group: Optional[ProcessGroup] = None) -> int: + """Return the number of the given process group. + + Note: + Calling ``get_world_size`` in non-distributed environment will return + 1. + + Args: + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Defaults to None. + + Returns: + int: Return the number of processes of the given process group if in + distributed environment, otherwise 1. + """ + if is_distributed(): + # handle low versions of torch like 1.5.0 which does not support + # passing in None for group argument + if group is None: + group = get_default_group() + return torch_dist.get_world_size(group) + else: + return 1 + + +def get_rank(group: Optional[ProcessGroup] = None) -> int: + """Return the rank of the given process group. + + Rank is a unique identifier assigned to each process within a distributed + process group. They are always consecutive integers ranging from 0 to + ``world_size``. + + Note: + Calling ``get_rank`` in non-distributed environment will return 0. + + Args: + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Defaults to None. + + Returns: + int: Return the rank of the process group if in distributed + environment, otherwise 0. + """ + + if is_distributed(): + # handle low versions of torch like 1.5.0 which does not support + # passing in None for group argument + if group is None: + group = get_default_group() + return torch_dist.get_rank(group) + else: + return 0 + + +def get_local_size() -> int: + """Return the number of the current node. + + Returns: + int: Return the number of processes in the current node if in + distributed environment, otherwise 1. + """ + if not is_distributed(): + return 1 + + if _LOCAL_PROCESS_GROUP is None: + raise RuntimeError('Local process group is not created, please use ' + '`init_local_group` to setup local process group.') + + return torch_dist.get_world_size(_LOCAL_PROCESS_GROUP) + + +def get_local_rank() -> int: + """Return the rank of current process in the current node. + + Returns: + int: Return the rank of current process in the current node if in + distributed environment, otherwise 0 + """ + if not is_distributed(): + return 0 + + if _LOCAL_PROCESS_GROUP is None: + raise RuntimeError('Local process group is not created, please use ' + '`init_local_group` to setup local process group.') + + return torch_dist.get_rank(_LOCAL_PROCESS_GROUP) + + +def get_dist_info(group: Optional[ProcessGroup] = None) -> Tuple[int, int]: + """Get distributed information of the given process group. + + Note: + Calling ``get_dist_info`` in non-distributed environment will return + (0, 1). + + Args: + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Defaults to None. + + Returns: + tuple[int, int]: Return a tuple containing the ``rank`` and + ``world_size``. + """ + world_size = get_world_size(group) + rank = get_rank(group) + return rank, world_size + + +def is_main_process(group: Optional[ProcessGroup] = None) -> bool: + """Whether the current rank of the given process group is equal to 0. + + Args: + group (ProcessGroup, optional): The process group to work on. 
If None, + the default process group will be used. Defaults to None. + + Returns: + bool: Return True if the current rank of the given process group is + equal to 0, otherwise False. + """ + return get_rank(group) == 0 + + +def master_only(func: Callable) -> Callable: + """Decorate those methods which should be executed in master process. + + Args: + func (callable): Function to be decorated. + + Returns: + callable: Return decorated function. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if is_main_process(): + return func(*args, **kwargs) + + return wrapper + + +def barrier(group: Optional[ProcessGroup] = None) -> None: + """Synchronize all processes from the given process group. + + This collective blocks processes until the whole group enters this + function. + + Note: + Calling ``barrier`` in non-distributed environment will do nothing. + + Args: + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. Defaults to None. + """ + if is_distributed(): + # handle low versions of torch like 1.5.0 which does not support + # passing in None for group argument + if group is None: + group = get_default_group() + torch_dist.barrier(group) + + +def get_data_device(data: Union[Tensor, Mapping, Iterable]) -> torch.device: + """Return the device of ``data``. + + If ``data`` is a sequence of Tensor, all items in ``data`` should have a + same device type. + + If ``data`` is a dict whose values are Tensor, all values should have a + same device type. + + Args: + data (Tensor or Sequence or dict): Inputs to be inferred the device. + + Returns: + torch.device: The device of ``data``. + + Examples: + >>> import torch + >>> from mmengine.dist import cast_data_device + >>> # data is a Tensor + >>> data = torch.tensor([0, 1]) + >>> get_data_device(data) + device(type='cpu') + >>> # data is a list of Tensor + >>> data = [torch.tensor([0, 1]), torch.tensor([2, 3])] + >>> get_data_device(data) + device(type='cpu') + >>> # data is a dict + >>> data = {'key1': torch.tensor([0, 1]), 'key2': torch.tensor([0, 1])} + >>> get_data_device(data) + device(type='cpu') + """ + if isinstance(data, Tensor): + return data.device + elif isinstance(data, Mapping): + pre = None + for v in data.values(): + cur = get_data_device(v) + if pre is None: + pre = cur + else: + if cur != pre: + raise ValueError( + 'device type in data should be consistent, but got ' + f'{cur} and {pre}') + if pre is None: + raise ValueError('data should not be empty.') + return pre + elif isinstance(data, Iterable) and not isinstance(data, str): + pre = None + for item in data: + cur = get_data_device(item) + if pre is None: + pre = cur + else: + if cur != pre: + raise ValueError( + 'device type in data should be consistent, but got ' + f'{cur} and {pre}') + if pre is None: + raise ValueError('data should not be empty.') + return pre + else: + raise TypeError('data should be a Tensor, sequence of tensor or dict, ' + f'but got {data}') + + +def get_comm_device(group: Optional[ProcessGroup] = None) -> torch.device: + """Return the device for communication among groups. + + Args: + group (ProcessGroup, optional): The process group to work on. + + Returns: + torch.device: The device of backend. 
+ """ + backend = get_backend(group) + if backend == 'hccl': + import torch_npu # noqa: F401 + return torch.device('npu', torch.npu.current_device()) + elif backend == torch_dist.Backend.NCCL: + return torch.device('cuda', torch.cuda.current_device()) + elif backend == 'cncl': + import torch_mlu # noqa: F401 + return torch.device('mlu', torch.mlu.current_device()) + elif backend == 'smddp': + return torch.device('cuda', torch.cuda.current_device()) + elif backend == 'mccl': + import torch_musa + return torch.device('musa', torch_musa.current_device()) + else: + # GLOO and MPI backends use cpu device by default + return torch.device('cpu') + + +def cast_data_device( + data: Union[Tensor, Mapping, Iterable], + device: torch.device, + out: Optional[Union[Tensor, Mapping, Iterable]] = None +) -> Union[Tensor, Mapping, Iterable]: + """Recursively convert Tensor in ``data`` to ``device``. + + If ``data`` has already on the ``device``, it will not be casted again. + + Args: + data (Tensor or list or dict): Inputs to be casted. + device (torch.device): Destination device type. + out (Tensor or list or dict, optional): If ``out`` is specified, its + value will be equal to ``data``. Defaults to None. + + Returns: + Tensor or list or dict: ``data`` was casted to ``device``. + """ + if out is not None: + if type(data) is not type(out): + raise TypeError( + 'out should be the same type with data, but got data is ' + f'{type(data)} and out is {type(data)}') + + if isinstance(out, set): + raise TypeError('out should not be a set') + + if isinstance(data, Tensor): + if get_data_device(data) == device: + data_on_device = data + else: + data_on_device = data.to(device) + + if out is not None: + # modify the value of out inplace + out.copy_(data_on_device) # type: ignore + + return data_on_device + elif isinstance(data, Mapping): + data_on_device = {} + if out is not None: + data_len = len(data) + out_len = len(out) # type: ignore + if data_len != out_len: + raise ValueError('length of data and out should be same, ' + f'but got {data_len} and {out_len}') + + for k, v in data.items(): + data_on_device[k] = cast_data_device(v, device, + out[k]) # type: ignore + else: + for k, v in data.items(): + data_on_device[k] = cast_data_device(v, device) + + if len(data_on_device) == 0: + raise ValueError('data should not be empty') + + # To ensure the type of output as same as input, we use `type(data)` + # to wrap the output + return type(data)(data_on_device) # type: ignore + elif isinstance(data, Iterable) and not isinstance( + data, str) and not isinstance(data, np.ndarray): + data_on_device = [] + if out is not None: + for v1, v2 in zip(data, out): + data_on_device.append(cast_data_device(v1, device, v2)) + else: + for v in data: + data_on_device.append(cast_data_device(v, device)) + + if len(data_on_device) == 0: + raise ValueError('data should not be empty') + + return type(data)(data_on_device) # type: ignore + else: + raise TypeError('data should be a Tensor, list of tensor or dict, ' + f'but got {data}') diff --git a/head_extractor/build/lib/mmengine/evaluator/__init__.py b/head_extractor/build/lib/mmengine/evaluator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e6bc78425e2a3194bdaa4da29e6b3e238237fafa --- /dev/null +++ b/head_extractor/build/lib/mmengine/evaluator/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .evaluator import Evaluator
+from .metric import BaseMetric, DumpResults
+from .utils import get_metric_value
+
+__all__ = ['BaseMetric', 'Evaluator', 'get_metric_value', 'DumpResults']
diff --git a/head_extractor/build/lib/mmengine/evaluator/evaluator.py b/head_extractor/build/lib/mmengine/evaluator/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..930ce930286a6bdcf4d0eac7cb79a961f3587a14
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/evaluator/evaluator.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, Iterator, List, Optional, Sequence, Union
+
+from mmengine.dataset import pseudo_collate
+from mmengine.registry import EVALUATOR, METRICS
+from mmengine.structures import BaseDataElement
+from .metric import BaseMetric
+
+
+@EVALUATOR.register_module()
+class Evaluator:
+    """Wrapper class to compose multiple :class:`BaseMetric` instances.
+
+    Args:
+        metrics (dict or BaseMetric or Sequence): The config of metrics.
+    """
+
+    def __init__(self, metrics: Union[dict, BaseMetric, Sequence]):
+        self._dataset_meta: Optional[dict] = None
+        if not isinstance(metrics, Sequence):
+            metrics = [metrics]
+        self.metrics: List[BaseMetric] = []
+        for metric in metrics:
+            if isinstance(metric, dict):
+                self.metrics.append(METRICS.build(metric))
+            else:
+                self.metrics.append(metric)
+
+    @property
+    def dataset_meta(self) -> Optional[dict]:
+        """Optional[dict]: Meta info of the dataset."""
+        return self._dataset_meta
+
+    @dataset_meta.setter
+    def dataset_meta(self, dataset_meta: dict) -> None:
+        """Set the dataset meta info to the evaluator and its metrics."""
+        self._dataset_meta = dataset_meta
+        for metric in self.metrics:
+            metric.dataset_meta = dataset_meta
+
+    def process(self,
+                data_samples: Sequence[BaseDataElement],
+                data_batch: Optional[Any] = None):
+        """Convert ``BaseDataElement`` to dict and invoke process method of
+        each metric.
+
+        Args:
+            data_samples (Sequence[BaseDataElement]): Predictions of the
+                model, and the ground truth of the validation set.
+            data_batch (Any, optional): A batch of data from the dataloader.
+        """
+        _data_samples = []
+        for data_sample in data_samples:
+            if isinstance(data_sample, BaseDataElement):
+                _data_samples.append(data_sample.to_dict())
+            else:
+                _data_samples.append(data_sample)
+
+        for metric in self.metrics:
+            metric.process(data_batch, _data_samples)
+
+    def evaluate(self, size: int) -> dict:
+        """Invoke ``evaluate`` method of each metric and collect the metrics
+        dictionary.
+
+        Args:
+            size (int): Length of the entire validation dataset. When batch
+                size > 1, the dataloader may pad some data samples to make
+                sure all ranks have the same length of dataset slice. The
+                ``collect_results`` function will drop the padded data based on
+                this size.
+
+        Returns:
+            dict: Evaluation results of all metrics. The keys are the names
+            of the metrics, and the values are corresponding results.
+        """
+        metrics = {}
+        for metric in self.metrics:
+            _results = metric.evaluate(size)
+
+            # Check metric name conflicts
+            for name in _results.keys():
+                if name in metrics:
+                    raise ValueError(
+                        'There are multiple evaluation results with the same '
+                        f'metric name {name}. Please make sure all metrics '
+                        'have different prefixes.')
+
+            metrics.update(_results)
+        return metrics
+
+    def offline_evaluate(self,
+                         data_samples: Sequence,
+                         data: Optional[Sequence] = None,
+                         chunk_size: int = 1):
+        """Offline evaluate the dumped predictions on the given data.
+ + Args: + data_samples (Sequence): All predictions and ground truth of the + model and the validation set. + data (Sequence, optional): All data of the validation set. + chunk_size (int): The number of data samples and predictions to be + processed in a batch. + """ + + # support chunking iterable objects + def get_chunks(seq: Iterator, chunk_size=1): + stop = False + while not stop: + chunk = [] + for _ in range(chunk_size): + try: + chunk.append(next(seq)) + except StopIteration: + stop = True + break + if chunk: + yield chunk + + if data is not None: + assert len(data_samples) == len(data), ( + 'data_samples and data should have the same length, but got ' + f'data_samples length: {len(data_samples)} ' + f'data length: {len(data)}') + data = get_chunks(iter(data), chunk_size) + + size = 0 + for output_chunk in get_chunks(iter(data_samples), chunk_size): + if data is not None: + data_chunk = pseudo_collate(next(data)) # type: ignore + else: + data_chunk = None + size += len(output_chunk) + self.process(output_chunk, data_chunk) + return self.evaluate(size) diff --git a/head_extractor/build/lib/mmengine/evaluator/metric.py b/head_extractor/build/lib/mmengine/evaluator/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..6e6d40bee3da6d2f1b8a0a4546a80e2ac6ca527c --- /dev/null +++ b/head_extractor/build/lib/mmengine/evaluator/metric.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from abc import ABCMeta, abstractmethod +from typing import Any, List, Optional, Sequence, Union + +from torch import Tensor + +from mmengine.dist import (broadcast_object_list, collect_results, + is_main_process) +from mmengine.fileio import dump +from mmengine.logging import print_log +from mmengine.registry import METRICS +from mmengine.structures import BaseDataElement + + +class BaseMetric(metaclass=ABCMeta): + """Base class for a metric. + + The metric first processes each batch of data_samples and predictions, + and appends the processed results to the results list. Then it + collects all results together from all ranks if distributed training + is used. Finally, it computes the metrics of the entire dataset. + + A subclass of class:`BaseMetric` should assign a meaningful value to the + class attribute `default_prefix`. See the argument `prefix` for details. + + Args: + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Default: None + collect_dir: (str, optional): Synchronize directory for collecting data + from different ranks. This argument should only be configured when + ``collect_device`` is 'cpu'. Defaults to None. 
+            `New in version 0.7.3.`
+    """
+
+    default_prefix: Optional[str] = None
+
+    def __init__(self,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 collect_dir: Optional[str] = None) -> None:
+        if collect_dir is not None and collect_device != 'cpu':
+            raise ValueError('`collect_dir` can only be configured when '
+                             "`collect_device='cpu'`")
+
+        self._dataset_meta: Union[None, dict] = None
+        self.collect_device = collect_device
+        self.results: List[Any] = []
+        self.prefix = prefix or self.default_prefix
+        self.collect_dir = collect_dir
+
+        if self.prefix is None:
+            print_log(
+                'The prefix is not set in metric class '
+                f'{self.__class__.__name__}.',
+                logger='current',
+                level=logging.WARNING)
+
+    @property
+    def dataset_meta(self) -> Optional[dict]:
+        """Optional[dict]: Meta info of the dataset."""
+        return self._dataset_meta
+
+    @dataset_meta.setter
+    def dataset_meta(self, dataset_meta: dict) -> None:
+        """Set the dataset meta info to the metric."""
+        self._dataset_meta = dataset_meta
+
+    @abstractmethod
+    def process(self, data_batch: Any, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data samples and predictions. The processed
+        results should be stored in ``self.results``, which will be used to
+        compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (Any): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from
+                the model.
+        """
+
+    @abstractmethod
+    def compute_metrics(self, results: list) -> dict:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            dict: The computed metrics. The keys are the names of the metrics,
+            and the values are corresponding results.
+        """
+
+    def evaluate(self, size: int) -> dict:
+        """Evaluate the model performance of the whole dataset after
+        processing all batches.
+
+        Args:
+            size (int): Length of the entire validation dataset. When batch
+                size > 1, the dataloader may pad some data samples to make
+                sure all ranks have the same length of dataset slice. The
+                ``collect_results`` function will drop the padded data based on
+                this size.
+
+        Returns:
+            dict: Evaluation metrics dict on the val dataset. The keys are the
+            names of the metrics, and the values are corresponding results.
+        """
+        if len(self.results) == 0:
+            print_log(
+                f'{self.__class__.__name__} got empty `self.results`. Please '
+                'ensure that the processed results are properly added into '
+                '`self.results` in `process` method.',
+                logger='current',
+                level=logging.WARNING)
+
+        if self.collect_device == 'cpu':
+            results = collect_results(
+                self.results,
+                size,
+                self.collect_device,
+                tmpdir=self.collect_dir)
+        else:
+            results = collect_results(self.results, size, self.collect_device)
+
+        if is_main_process():
+            # cast all tensors in results list to cpu
+            results = _to_cpu(results)
+            _metrics = self.compute_metrics(results)  # type: ignore
+            # Add prefix to metric names
+            if self.prefix:
+                _metrics = {
+                    '/'.join((self.prefix, k)): v
+                    for k, v in _metrics.items()
+                }
+            metrics = [_metrics]
+        else:
+            metrics = [None]  # type: ignore
+
+        broadcast_object_list(metrics)
+
+        # reset the results list
+        self.results.clear()
+        return metrics[0]
+
+
+@METRICS.register_module()
+class DumpResults(BaseMetric):
+    """Dump model predictions to a pickle file for offline evaluation.
+
+    Args:
+        out_file_path (str): Path of the dumped file. Must end with '.pkl'
+            or '.pickle'.
+ collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + collect_dir: (str, optional): Synchronize directory for collecting data + from different ranks. This argument should only be configured when + ``collect_device`` is 'cpu'. Defaults to None. + `New in version 0.7.3.` + """ + + def __init__(self, + out_file_path: str, + collect_device: str = 'cpu', + collect_dir: Optional[str] = None) -> None: + super().__init__( + collect_device=collect_device, collect_dir=collect_dir) + if not out_file_path.endswith(('.pkl', '.pickle')): + raise ValueError('The output file must be a pkl file.') + self.out_file_path = out_file_path + + def process(self, data_batch: Any, predictions: Sequence[dict]) -> None: + """transfer tensors in predictions to CPU.""" + self.results.extend(_to_cpu(predictions)) + + def compute_metrics(self, results: list) -> dict: + """dump the prediction results to a pickle file.""" + dump(results, self.out_file_path) + print_log( + f'Results has been saved to {self.out_file_path}.', + logger='current') + return {} + + +def _to_cpu(data: Any) -> Any: + """transfer all tensors and BaseDataElement to cpu.""" + if isinstance(data, (Tensor, BaseDataElement)): + return data.to('cpu') + elif isinstance(data, list): + return [_to_cpu(d) for d in data] + elif isinstance(data, tuple): + return tuple(_to_cpu(d) for d in data) + elif isinstance(data, dict): + return {k: _to_cpu(v) for k, v in data.items()} + else: + return data diff --git a/head_extractor/build/lib/mmengine/evaluator/utils.py b/head_extractor/build/lib/mmengine/evaluator/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6981c881b95e1b6c6c859958a3b6f73049e2f2af --- /dev/null +++ b/head_extractor/build/lib/mmengine/evaluator/utils.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict + + +def get_metric_value(indicator: str, metrics: Dict) -> Any: + """Get the metric value specified by an indicator, which can be either a + metric name or a full name with evaluator prefix. + + Args: + indicator (str): The metric indicator, which can be the metric name + (e.g. 'AP') or the full name with prefix (e.g. 'COCO/AP') + metrics (dict): The evaluation results output by the evaluator + + Returns: + Any: The specified metric value + """ + + if '/' in indicator: + # The indicator is a full name + if indicator in metrics: + return metrics[indicator] + else: + raise ValueError( + f'The indicator "{indicator}" can not match any metric in ' + f'{list(metrics.keys())}') + else: + # The indicator is metric name without prefix + matched = [k for k in metrics.keys() if k.split('/')[-1] == indicator] + + if not matched: + raise ValueError( + f'The indicator {indicator} can not match any metric in ' + f'{list(metrics.keys())}') + elif len(matched) > 1: + raise ValueError(f'The indicator "{indicator}" matches multiple ' + f'metrics {matched}') + else: + return metrics[matched[0]] diff --git a/head_extractor/build/lib/mmengine/fileio/__init__.py b/head_extractor/build/lib/mmengine/fileio/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..81adcd4c02c96715825bf220794ac9e0b08019b9 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
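Before the fileio package, here is a minimal end-to-end sketch of the evaluator API defined above: a toy metric subclass, an `Evaluator` wrapping it, and `get_metric_value` pulling a score back out. The `ToyAccuracy` class and its `pred`/`gt` keys are invented for illustration, and a single-process run is assumed (so `collect_results` and `broadcast_object_list` degrade to no-ops).

```python
from mmengine.evaluator import BaseMetric, Evaluator, get_metric_value


class ToyAccuracy(BaseMetric):
    default_prefix = 'toy'  # the final metric key becomes 'toy/acc'

    def process(self, data_batch, data_samples):
        # Append one (is_correct, count) pair per prediction dict.
        for sample in data_samples:
            self.results.append((int(sample['pred'] == sample['gt']), 1))

    def compute_metrics(self, results):
        correct = sum(c for c, _ in results)
        total = sum(n for _, n in results)
        return {'acc': correct / total}


evaluator = Evaluator(ToyAccuracy())
samples = [{'pred': 1, 'gt': 1}, {'pred': 0, 'gt': 1}]
scores = evaluator.offline_evaluate(samples, chunk_size=2)
print(scores)                           # {'toy/acc': 0.5}
print(get_metric_value('acc', scores))  # 0.5
```

Note how `offline_evaluate` drives `process` in chunks and then calls `evaluate`, which applies `default_prefix` so that homonymous metrics from different evaluators stay distinguishable.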
+from .backends import (BaseStorageBackend, HTTPBackend, LmdbBackend, + LocalBackend, MemcachedBackend, PetrelBackend, + register_backend) +from .file_client import FileClient, HardDiskBackend +from .handlers import (BaseFileHandler, JsonHandler, PickleHandler, + YamlHandler, register_handler) +from .io import (copy_if_symlink_fails, copyfile, copyfile_from_local, + copyfile_to_local, copytree, copytree_from_local, + copytree_to_local, dump, exists, generate_presigned_url, get, + get_file_backend, get_local_path, get_text, isdir, isfile, + join_path, list_dir_or_file, load, put, put_text, remove, + rmtree) +from .parse import dict_from_file, list_from_file + +__all__ = [ + 'BaseStorageBackend', 'FileClient', 'PetrelBackend', 'MemcachedBackend', + 'LmdbBackend', 'HardDiskBackend', 'LocalBackend', 'HTTPBackend', + 'copy_if_symlink_fails', 'copyfile', 'copyfile_from_local', + 'copyfile_to_local', 'copytree', 'copytree_from_local', + 'copytree_to_local', 'exists', 'generate_presigned_url', 'get', + 'get_file_backend', 'get_local_path', 'get_text', 'isdir', 'isfile', + 'join_path', 'list_dir_or_file', 'put', 'put_text', 'remove', 'rmtree', + 'load', 'dump', 'register_handler', 'BaseFileHandler', 'JsonHandler', + 'PickleHandler', 'YamlHandler', 'list_from_file', 'dict_from_file', + 'register_backend' +] diff --git a/head_extractor/build/lib/mmengine/fileio/backends/__init__.py b/head_extractor/build/lib/mmengine/fileio/backends/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0008977f765f059c2f727885b57716979c2f05 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/backends/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseStorageBackend +from .http_backend import HTTPBackend +from .lmdb_backend import LmdbBackend +from .local_backend import LocalBackend +from .memcached_backend import MemcachedBackend +from .petrel_backend import PetrelBackend +from .registry_utils import backends, prefix_to_backends, register_backend + +__all__ = [ + 'BaseStorageBackend', 'LocalBackend', 'HTTPBackend', 'LmdbBackend', + 'MemcachedBackend', 'PetrelBackend', 'register_backend', 'backends', + 'prefix_to_backends' +] diff --git a/head_extractor/build/lib/mmengine/fileio/backends/base.py b/head_extractor/build/lib/mmengine/fileio/backends/base.py new file mode 100644 index 0000000000000000000000000000000000000000..9331edf598f9b649cfb21e380b441f82a06d5372 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/backends/base.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from abc import ABCMeta, abstractmethod + +from mmengine.logging import print_log + + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. + + All backends need to implement two apis: :meth:`get()` and + :meth:`get_text()`. + + - :meth:`get()` reads the file as a byte stream. + - :meth:`get_text()` reads the file as texts. + """ + + # a flag to indicate whether the backend can create a symlink for a file + # This attribute will be deprecated in future. 
+    _allow_symlink = False
+
+    @property
+    def allow_symlink(self):
+        print_log(
+            'allow_symlink will be deprecated in future',
+            logger='current',
+            level=logging.WARNING)
+        return self._allow_symlink
+
+    @property
+    def name(self):
+        return self.__class__.__name__
+
+    @abstractmethod
+    def get(self, filepath):
+        pass
+
+    @abstractmethod
+    def get_text(self, filepath):
+        pass
diff --git a/head_extractor/build/lib/mmengine/fileio/backends/http_backend.py b/head_extractor/build/lib/mmengine/fileio/backends/http_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3e65bbdbb6e4cb93b324951d9d2dd18c07bae64
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/fileio/backends/http_backend.py
@@ -0,0 +1,78 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import tempfile
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Generator, Union
+from urllib.request import urlopen
+
+from .base import BaseStorageBackend
+
+
+class HTTPBackend(BaseStorageBackend):
+    """HTTP and HTTPS storage backend."""
+
+    def get(self, filepath: str) -> bytes:
+        """Read bytes from a given ``filepath``.
+
+        Args:
+            filepath (str): Path to read data.
+
+        Returns:
+            bytes: Expected bytes object.
+
+        Examples:
+            >>> backend = HTTPBackend()
+            >>> backend.get('http://path/of/file')
+            b'hello world'
+        """
+        return urlopen(filepath).read()
+
+    def get_text(self, filepath, encoding='utf-8') -> str:
+        """Read text from a given ``filepath``.
+
+        Args:
+            filepath (str): Path to read data.
+            encoding (str): The encoding format used to open the ``filepath``.
+                Defaults to 'utf-8'.
+
+        Returns:
+            str: Expected text reading from ``filepath``.
+
+        Examples:
+            >>> backend = HTTPBackend()
+            >>> backend.get_text('http://path/of/file')
+            'hello world'
+        """
+        return urlopen(filepath).read().decode(encoding)
+
+    @contextmanager
+    def get_local_path(
+            self, filepath: str) -> Generator[Union[str, Path], None, None]:
+        """Download a file from ``filepath`` to a local temporary directory,
+        and return the temporary path.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+        It can be called with a ``with`` statement, and when exiting from the
+        ``with`` statement, the temporary path will be released.
+
+        Args:
+            filepath (str): Download a file from ``filepath``.
+
+        Yields:
+            Iterable[str]: Only yield one temporary path.
+
+        Examples:
+            >>> backend = HTTPBackend()
+            >>> # After exiting from the ``with`` clause,
+            >>> # the path will be removed
+            >>> with backend.get_local_path('http://path/of/file') as path:
+            ...     # do something here
+        """
+        try:
+            f = tempfile.NamedTemporaryFile(delete=False)
+            f.write(self.get(filepath))
+            f.close()
+            yield f.name
+        finally:
+            os.remove(f.name)
diff --git a/head_extractor/build/lib/mmengine/fileio/backends/lmdb_backend.py b/head_extractor/build/lib/mmengine/fileio/backends/lmdb_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb47923e56a43529cd28ce7aa3bc9404875c2fd7
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/fileio/backends/lmdb_backend.py
@@ -0,0 +1,82 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from pathlib import Path
+from typing import Union
+
+from .base import BaseStorageBackend
+
+
+class LmdbBackend(BaseStorageBackend):
+    """Lmdb storage backend.
+
+    Args:
+        db_path (str): Lmdb database path.
+        readonly (bool): Lmdb environment parameter. If True, disallow any
+            write operations. Defaults to True.
+        lock (bool): Lmdb environment parameter.
If False, when concurrent + access occurs, do not lock the database. Defaults to False. + readahead (bool): Lmdb environment parameter. If False, disable the OS + filesystem readahead mechanism, which may improve random read + performance when a database is larger than RAM. Defaults to False. + **kwargs: Keyword arguments passed to `lmdb.open`. + + Attributes: + db_path (str): Lmdb database path. + """ + + def __init__(self, + db_path, + readonly=True, + lock=False, + readahead=False, + **kwargs): + try: + import lmdb # noqa: F401 + except ImportError: + raise ImportError( + 'Please run "pip install lmdb" to enable LmdbBackend.') + + self.db_path = str(db_path) + self.readonly = readonly + self.lock = lock + self.readahead = readahead + self.kwargs = kwargs + self._client = None + + def get(self, filepath: Union[str, Path]) -> bytes: + """Get values according to the filepath. + + Args: + filepath (str or Path): Here, filepath is the lmdb key. + + Returns: + bytes: Expected bytes object. + + Examples: + >>> backend = LmdbBackend('path/to/lmdb') + >>> backend.get('key') + b'hello world' + """ + if self._client is None: + self._client = self._get_client() + + filepath = str(filepath) + with self._client.begin(write=False) as txn: + value_buf = txn.get(filepath.encode('ascii')) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + def _get_client(self): + import lmdb + + return lmdb.open( + self.db_path, + readonly=self.readonly, + lock=self.lock, + readahead=self.readahead, + **self.kwargs) + + def __del__(self): + if self._client is not None: + self._client.close() diff --git a/head_extractor/build/lib/mmengine/fileio/backends/local_backend.py b/head_extractor/build/lib/mmengine/fileio/backends/local_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..c7d5f0462132ce80b3e3d6f32ad98047723d0219 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/backends/local_backend.py @@ -0,0 +1,543 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import shutil +from contextlib import contextmanager +from pathlib import Path +from typing import Generator, Iterator, Optional, Tuple, Union + +import mmengine +from .base import BaseStorageBackend + + +class LocalBackend(BaseStorageBackend): + """Raw local storage backend.""" + + _allow_symlink = True + + def get(self, filepath: Union[str, Path]) -> bytes: + """Read bytes from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes: Expected bytes object. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/file' + >>> backend.get(filepath) + b'hello world' + """ + with open(filepath, 'rb') as f: + value = f.read() + return value + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read text from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Defaults to 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/file' + >>> backend.get_text(filepath) + 'hello world' + """ + with open(filepath, encoding=encoding) as f: + text = f.read() + return text + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write bytes to a given ``filepath`` with 'wb' mode. 
+ + Note: + ``put`` will create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/file' + >>> backend.put(b'hello world', filepath) + """ + mmengine.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'wb') as f: + f.write(obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Write text to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` will create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to open the ``filepath``. + Defaults to 'utf-8'. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/file' + >>> backend.put_text('hello world', filepath) + """ + mmengine.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'w', encoding=encoding) as f: + f.write(obj) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/file' + >>> backend.exists(filepath) + True + """ + return osp.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/dir' + >>> backend.isdir(filepath) + True + """ + return osp.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/file' + >>> backend.isfile(filepath) + True + """ + return osp.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + r"""Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of \*filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + + Examples: + >>> backend = LocalBackend() + >>> filepath1 = '/path/of/dir1' + >>> filepath2 = 'dir2' + >>> filepath3 = 'path/of/file' + >>> backend.join_path(filepath1, filepath2, filepath3) + '/path/of/dir/dir2/path/of/file' + """ + # TODO, if filepath or filepaths are Path, should return Path + return osp.join(filepath, *filepaths) + + @contextmanager + def get_local_path( + self, + filepath: Union[str, Path], + ) -> Generator[Union[str, Path], None, None]: + """Only for unified API and do nothing. + + Args: + filepath (str or Path): Path to be read data. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Examples: + >>> backend = LocalBackend() + >>> with backend.get_local_path('s3://bucket/abc.jpg') as path: + ... 
# do something here + """ + yield filepath + + def copyfile( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Copy a file src to dst and return the destination file. + + src and dst should have the same prefix. If dst specifies a directory, + the file will be copied into dst using the base filename from src. If + dst specifies a file that already exists, it will be replaced. + + Args: + src (str or Path): A file to be copied. + dst (str or Path): Copy file to dst. + + Returns: + str: The destination file. + + Raises: + SameFileError: If src and dst are the same file, a SameFileError + will be raised. + + Examples: + >>> backend = LocalBackend() + >>> # dst is a file + >>> src = '/path/of/file' + >>> dst = '/path1/of/file1' + >>> # src will be copied to '/path1/of/file1' + >>> backend.copyfile(src, dst) + '/path1/of/file1' + + >>> # dst is a directory + >>> dst = '/path1/of/dir' + >>> # src will be copied to '/path1/of/dir/file' + >>> backend.copyfile(src, dst) + '/path1/of/dir/file' + """ + return shutil.copy(src, dst) + + def copytree( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Recursively copy an entire directory tree rooted at src to a + directory named dst and return the destination directory. + + src and dst should have the same prefix and dst must not already exist. + + TODO: Whether to support dirs_exist_ok parameter. + + Args: + src (str or Path): A directory to be copied. + dst (str or Path): Copy directory to dst. + + Returns: + str: The destination directory. + + Raises: + FileExistsError: If dst had already existed, a FileExistsError will + be raised. + + Examples: + >>> backend = LocalBackend() + >>> src = '/path/of/dir1' + >>> dst = '/path/of/dir2' + >>> backend.copytree(src, dst) + '/path/of/dir2' + """ + return shutil.copytree(src, dst) + + def copyfile_from_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Copy a local file src to dst and return the destination file. Same + as :meth:`copyfile`. + + Args: + src (str or Path): A local file to be copied. + dst (str or Path): Copy file to dst. + + Returns: + str: If dst specifies a directory, the file will be copied into dst + using the base filename from src. + + Raises: + SameFileError: If src and dst are the same file, a SameFileError + will be raised. + + Examples: + >>> backend = LocalBackend() + >>> # dst is a file + >>> src = '/path/of/file' + >>> dst = '/path1/of/file1' + >>> # src will be copied to '/path1/of/file1' + >>> backend.copyfile_from_local(src, dst) + '/path1/of/file1' + + >>> # dst is a directory + >>> dst = '/path1/of/dir' + >>> # src will be copied to + >>> backend.copyfile_from_local(src, dst) + '/path1/of/dir/file' + """ + return self.copyfile(src, dst) + + def copytree_from_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Recursively copy an entire directory tree rooted at src to a + directory named dst and return the destination directory. Same as + :meth:`copytree`. + + Args: + src (str or Path): A local directory to be copied. + dst (str or Path): Copy directory to dst. + + Returns: + str: The destination directory. + + Examples: + >>> backend = LocalBackend() + >>> src = '/path/of/dir1' + >>> dst = '/path/of/dir2' + >>> backend.copytree_from_local(src, dst) + '/path/of/dir2' + """ + return self.copytree(src, dst) + + def copyfile_to_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Copy the file src to local dst and return the destination file. 
Same + as :meth:`copyfile`. + + If dst specifies a directory, the file will be copied into dst using + the base filename from src. If dst specifies a file that already + exists, it will be replaced. + + Args: + src (str or Path): A file to be copied. + dst (str or Path): Copy file to to local dst. + + Returns: + str: If dst specifies a directory, the file will be copied into dst + using the base filename from src. + + Examples: + >>> backend = LocalBackend() + >>> # dst is a file + >>> src = '/path/of/file' + >>> dst = '/path1/of/file1' + >>> # src will be copied to '/path1/of/file1' + >>> backend.copyfile_to_local(src, dst) + '/path1/of/file1' + + >>> # dst is a directory + >>> dst = '/path1/of/dir' + >>> # src will be copied to + >>> backend.copyfile_to_local(src, dst) + '/path1/of/dir/file' + """ + return self.copyfile(src, dst) + + def copytree_to_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Recursively copy an entire directory tree rooted at src to a local + directory named dst and return the destination directory. + + Args: + src (str or Path): A directory to be copied. + dst (str or Path): Copy directory to local dst. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + + Returns: + str: The destination directory. + + Examples: + >>> backend = LocalBackend() + >>> src = '/path/of/dir1' + >>> dst = '/path/of/dir2' + >>> backend.copytree_from_local(src, dst) + '/path/of/dir2' + """ + return self.copytree(src, dst) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + + Raises: + IsADirectoryError: If filepath is a directory, an IsADirectoryError + will be raised. + FileNotFoundError: If filepath does not exist, an FileNotFoundError + will be raised. + + Examples: + >>> backend = LocalBackend() + >>> filepath = '/path/of/file' + >>> backend.remove(filepath) + """ + if not self.exists(filepath): + raise FileNotFoundError(f'filepath {filepath} does not exist') + + if self.isdir(filepath): + raise IsADirectoryError('filepath should be a file') + + os.remove(filepath) + + def rmtree(self, dir_path: Union[str, Path]) -> None: + """Recursively delete a directory tree. + + Args: + dir_path (str or Path): A directory to be removed. + + Examples: + >>> dir_path = '/path/of/dir' + >>> backend.rmtree(dir_path) + """ + shutil.rmtree(dir_path) + + def copy_if_symlink_fails( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> bool: + """Create a symbolic link pointing to src named dst. + + If failed to create a symbolic link pointing to src, directly copy src + to dst instead. + + Args: + src (str or Path): Create a symbolic link pointing to src. + dst (str or Path): Create a symbolic link named dst. + + Returns: + bool: Return True if successfully create a symbolic link pointing + to src. Otherwise, return False. 
+ + Examples: + >>> backend = LocalBackend() + >>> src = '/path/of/file' + >>> dst = '/path1/of/file1' + >>> backend.copy_if_symlink_fails(src, dst) + True + >>> src = '/path/of/dir' + >>> dst = '/path1/of/dir1' + >>> backend.copy_if_symlink_fails(src, dst) + True + """ + try: + os.symlink(src, dst) + return True + except Exception: + if self.isfile(src): + self.copyfile(src, dst) + else: + self.copytree(src, dst) + return False + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str or Path): Path of the directory. + list_dir (bool): List the directories. Defaults to True. + list_file (bool): List the path of files. Defaults to True. + suffix (str or tuple[str], optional): File suffix that we are + interested in. Defaults to None. + recursive (bool): If set to True, recursively scan the directory. + Defaults to False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + + Examples: + >>> backend = LocalBackend() + >>> dir_path = '/path/of/dir' + >>> # list those files and directories in current directory + >>> for file_path in backend.list_dir_or_file(dir_path): + ... print(file_path) + >>> # only list files + >>> for file_path in backend.list_dir_or_file(dir_path, list_dir=False): + ... print(file_path) + >>> # only list directories + >>> for file_path in backend.list_dir_or_file(dir_path, list_file=False): + ... print(file_path) + >>> # only list files ending with specified suffixes + >>> for file_path in backend.list_dir_or_file(dir_path, suffix='.txt'): + ... print(file_path) + >>> # list all files and directory recursively + >>> for file_path in backend.list_dir_or_file(dir_path, recursive=True): + ... print(file_path) + """ # noqa: E501 + if list_dir and suffix is not None: + raise TypeError('`suffix` should be None when `list_dir` is True') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + rel_path = osp.relpath(entry.path, root) + if (suffix is None + or rel_path.endswith(suffix)) and list_file: + yield rel_path + elif osp.isdir(entry.path): + if list_dir: + rel_dir = osp.relpath(entry.path, root) + yield rel_dir + if recursive: + yield from _list_dir_or_file(entry.path, list_dir, + list_file, suffix, + recursive) + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) diff --git a/head_extractor/build/lib/mmengine/fileio/backends/memcached_backend.py b/head_extractor/build/lib/mmengine/fileio/backends/memcached_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..2458e7c6ec3525ba0425370c3529c4403f726716 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/backends/memcached_backend.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from pathlib import Path +from typing import Union + +from .base import BaseStorageBackend + + +class MemcachedBackend(BaseStorageBackend): + """Memcached storage backend. 
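Pausing here: the three backends defined so far (local, HTTP, LMDB) all present the same small `BaseStorageBackend` surface, so calling code can stay agnostic about where bytes actually live. A brief usage sketch follows; the paths and URL are placeholders, and the LMDB call assumes a database already exists at `db_path`.

```python
from mmengine.fileio.backends import HTTPBackend, LmdbBackend, LocalBackend

# LocalBackend: plain filesystem access; put_text creates missing parents.
local = LocalBackend()
local.put_text('hello world', '/tmp/demo/file.txt')  # placeholder path
assert local.get_text('/tmp/demo/file.txt') == 'hello world'

# HTTPBackend: read-only access over HTTP(S).
http = HTTPBackend()
page: bytes = http.get('https://example.com/')  # placeholder URL

# LmdbBackend: looks values up by key, not by path; requires
# `pip install lmdb` and an existing database at db_path.
# `get` returns None for a missing key.
db = LmdbBackend(db_path='/tmp/demo.lmdb')
value = db.get('some-key')
```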
+ + Attributes: + server_list_cfg (str): Config file for memcached server list. + client_cfg (str): Config file for memcached client. + sys_path (str, optional): Additional path to be appended to `sys.path`. + Defaults to None. + """ + + def __init__(self, server_list_cfg, client_cfg, sys_path=None): + if sys_path is not None: + import sys + sys.path.append(sys_path) + try: + import mc + except ImportError: + raise ImportError( + 'Please install memcached to enable MemcachedBackend.') + + self.server_list_cfg = server_list_cfg + self.client_cfg = client_cfg + self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, + self.client_cfg) + # mc.pyvector servers as a point which points to a memory cache + self._mc_buffer = mc.pyvector() + + def get(self, filepath: Union[str, Path]): + """Get values according to the filepath. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes: Expected bytes object. + + Examples: + >>> server_list_cfg = '/path/of/server_list.conf' + >>> client_cfg = '/path/of/mc.conf' + >>> backend = MemcachedBackend(server_list_cfg, client_cfg) + >>> backend.get('/path/of/file') + b'hello world' + """ + filepath = str(filepath) + import mc + self._client.Get(filepath, self._mc_buffer) + value_buf = mc.ConvertBuffer(self._mc_buffer) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError diff --git a/head_extractor/build/lib/mmengine/fileio/backends/petrel_backend.py b/head_extractor/build/lib/mmengine/fileio/backends/petrel_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..3994372f664b22535d1104b9b9032d5c3e7ac092 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/backends/petrel_backend.py @@ -0,0 +1,771 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import re +import tempfile +from contextlib import contextmanager +from pathlib import Path +from shutil import SameFileError +from typing import Generator, Iterator, Optional, Tuple, Union + +import mmengine +from mmengine.utils import has_method +from .base import BaseStorageBackend + + +class PetrelBackend(BaseStorageBackend): + """Petrel storage backend (for internal usage). + + PetrelBackend supports reading and writing data to multiple clusters. + If the file path contains the cluster name, PetrelBackend will read data + from specified cluster or write data to it. Otherwise, PetrelBackend will + access the default cluster. + + Args: + path_mapping (dict, optional): Path mapping dict from local path to + Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in + ``filepath`` will be replaced by ``dst``. Defaults to None. + enable_mc (bool, optional): Whether to enable memcached support. + Defaults to True. + conf_path (str, optional): Config path of Petrel client. Default: None. + `New in version 0.3.3`. 
+ + Examples: + >>> backend = PetrelBackend() + >>> filepath1 = 'petrel://path/of/file' + >>> filepath2 = 'cluster-name:petrel://path/of/file' + >>> backend.get(filepath1) # get data from default cluster + >>> client.get(filepath2) # get data from 'cluster-name' cluster + """ + + def __init__(self, + path_mapping: Optional[dict] = None, + enable_mc: bool = True, + conf_path: Optional[str] = None): + try: + from petrel_client import client + except ImportError: + raise ImportError('Please install petrel_client to enable ' + 'PetrelBackend.') + + self._client = client.Client(conf_path=conf_path, enable_mc=enable_mc) + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def _map_path(self, filepath: Union[str, Path]) -> str: + """Map ``filepath`` to a string path whose prefix will be replaced by + :attr:`self.path_mapping`. + + Args: + filepath (str or Path): Path to be mapped. + """ + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v, 1) + return filepath + + def _format_path(self, filepath: str) -> str: + """Convert a ``filepath`` to standard format of petrel oss. + + If the ``filepath`` is concatenated by ``os.path.join``, in a Windows + environment, the ``filepath`` will be the format of + 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the + above ``filepath`` will be converted to 's3://bucket_name/image.jpg'. + + Args: + filepath (str): Path to be formatted. + """ + return re.sub(r'\\+', '/', filepath) + + def _replace_prefix(self, filepath: Union[str, Path]) -> str: + filepath = str(filepath) + return filepath.replace('petrel://', 's3://') + + def get(self, filepath: Union[str, Path]) -> bytes: + """Read bytes from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes: Return bytes read from filepath. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.get(filepath) + b'hello world' + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + filepath = self._replace_prefix(filepath) + value = self._client.Get(filepath) + return value + + def get_text( + self, + filepath: Union[str, Path], + encoding: str = 'utf-8', + ) -> str: + """Read text from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Defaults to 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.get_text(filepath) + 'hello world' + """ + return str(self.get(filepath), encoding=encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write bytes to a given ``filepath``. + + Args: + obj (bytes): Data to be saved. + filepath (str or Path): Path to write data. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.put(b'hello world', filepath) + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + filepath = self._replace_prefix(filepath) + self._client.put(filepath, obj) + + def put_text( + self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8', + ) -> None: + """Write text to a given ``filepath``. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. 
+ encoding (str): The encoding format used to encode the ``obj``. + Defaults to 'utf-8'. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.put_text('hello world', filepath) + """ + self.put(bytes(obj, encoding=encoding), filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.exists(filepath) + True + """ + if not (has_method(self._client, 'contains') + and has_method(self._client, 'isdir')): + raise NotImplementedError( + 'Current version of Petrel Python SDK has not supported ' + 'the `contains` and `isdir` methods, please use a higher' + 'version or dev branch instead.') + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + filepath = self._replace_prefix(filepath) + return self._client.contains(filepath) or self._client.isdir(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/dir' + >>> backend.isdir(filepath) + True + """ + if not has_method(self._client, 'isdir'): + raise NotImplementedError( + 'Current version of Petrel Python SDK has not supported ' + 'the `isdir` method, please use a higher version or dev' + ' branch instead.') + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + filepath = self._replace_prefix(filepath) + return self._client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.isfile(filepath) + True + """ + if not has_method(self._client, 'contains'): + raise NotImplementedError( + 'Current version of Petrel Python SDK has not supported ' + 'the `contains` method, please use a higher version or ' + 'dev branch instead.') + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + filepath = self._replace_prefix(filepath) + return self._client.contains(filepath) + + def join_path( + self, + filepath: Union[str, Path], + *filepaths: Union[str, Path], + ) -> str: + r"""Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of \*filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result after concatenation. 
+ + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.join_path(filepath, 'another/path') + 'petrel://path/of/file/another/path' + >>> backend.join_path(filepath, '/another/path') + 'petrel://path/of/file/another/path' + """ + filepath = self._format_path(self._map_path(filepath)) + if filepath.endswith('/'): + filepath = filepath[:-1] + formatted_paths = [filepath] + for path in filepaths: + formatted_path = self._format_path(self._map_path(path)) + formatted_paths.append(formatted_path.lstrip('/')) + + return '/'.join(formatted_paths) + + @contextmanager + def get_local_path( + self, + filepath: Union[str, Path], + ) -> Generator[Union[str, Path], None, None]: + """Download a file from ``filepath`` to a local temporary directory, + and return the temporary path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str or Path): Download a file from ``filepath``. + + Yields: + Iterable[str]: Only yield one temporary path. + + Examples: + >>> backend = PetrelBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> filepath = 'petrel://path/of/file' + >>> with backend.get_local_path(filepath) as path: + ... # do something here + """ + assert self.isfile(filepath) + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + def copyfile( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Copy a file src to dst and return the destination file. + + src and dst should have the same prefix. If dst specifies a directory, + the file will be copied into dst using the base filename from src. If + dst specifies a file that already exists, it will be replaced. + + Args: + src (str or Path): A file to be copied. + dst (str or Path): Copy file to dst. + + Returns: + str: The destination file. + + Raises: + SameFileError: If src and dst are the same file, a SameFileError + will be raised. + + Examples: + >>> backend = PetrelBackend() + >>> # dst is a file + >>> src = 'petrel://path/of/file' + >>> dst = 'petrel://path/of/file1' + >>> backend.copyfile(src, dst) + 'petrel://path/of/file1' + + >>> # dst is a directory + >>> dst = 'petrel://path/of/dir' + >>> backend.copyfile(src, dst) + 'petrel://path/of/dir/file' + """ + src = self._format_path(self._map_path(src)) + dst = self._format_path(self._map_path(dst)) + if self.isdir(dst): + dst = self.join_path(dst, src.split('/')[-1]) + + if src == dst: + raise SameFileError('src and dst should not be same') + + self.put(self.get(src), dst) + return dst + + def copytree( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Recursively copy an entire directory tree rooted at src to a + directory named dst and return the destination directory. + + src and dst should have the same prefix. + + Args: + src (str or Path): A directory to be copied. + dst (str or Path): Copy directory to dst. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + + Returns: + str: The destination directory. + + Raises: + FileExistsError: If dst had already existed, a FileExistsError will + be raised. 
+ + Examples: + >>> backend = PetrelBackend() + >>> src = 'petrel://path/of/dir' + >>> dst = 'petrel://path/of/dir1' + >>> backend.copytree(src, dst) + 'petrel://path/of/dir1' + """ + src = self._format_path(self._map_path(src)) + dst = self._format_path(self._map_path(dst)) + + if self.exists(dst): + raise FileExistsError('dst should not exist') + + for path in self.list_dir_or_file(src, list_dir=False, recursive=True): + src_path = self.join_path(src, path) + dst_path = self.join_path(dst, path) + self.put(self.get(src_path), dst_path) + + return dst + + def copyfile_from_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Upload a local file src to dst and return the destination file. + + Args: + src (str or Path): A local file to be copied. + dst (str or Path): Copy file to dst. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + + Returns: + str: If dst specifies a directory, the file will be copied into dst + using the base filename from src. + + Examples: + >>> backend = PetrelBackend() + >>> # dst is a file + >>> src = 'path/of/your/file' + >>> dst = 'petrel://path/of/file1' + >>> backend.copyfile_from_local(src, dst) + 'petrel://path/of/file1' + + >>> # dst is a directory + >>> dst = 'petrel://path/of/dir' + >>> backend.copyfile_from_local(src, dst) + 'petrel://path/of/dir/file' + """ + dst = self._format_path(self._map_path(dst)) + if self.isdir(dst): + dst = self.join_path(dst, osp.basename(src)) + + with open(src, 'rb') as f: + self.put(f.read(), dst) + + return dst + + def copytree_from_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> str: + """Recursively copy an entire directory tree rooted at src to a + directory named dst and return the destination directory. + + Args: + src (str or Path): A local directory to be copied. + dst (str or Path): Copy directory to dst. + + Returns: + str: The destination directory. + + Raises: + FileExistsError: If dst had already existed, a FileExistsError will + be raised. + + Examples: + >>> backend = PetrelBackend() + >>> src = 'path/of/your/dir' + >>> dst = 'petrel://path/of/dir1' + >>> backend.copytree_from_local(src, dst) + 'petrel://path/of/dir1' + """ + dst = self._format_path(self._map_path(dst)) + if self.exists(dst): + raise FileExistsError('dst should not exist') + + src = str(src) + + for cur_dir, _, files in os.walk(src): + for f in files: + src_path = osp.join(cur_dir, f) + dst_path = self.join_path(dst, src_path.replace(src, '')) + self.copyfile_from_local(src_path, dst_path) + + return dst + + def copyfile_to_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> Union[str, Path]: + """Copy the file src to local dst and return the destination file. + + If dst specifies a directory, the file will be copied into dst using + the base filename from src. If dst specifies a file that already + exists, it will be replaced. + + Args: + src (str or Path): A file to be copied. + dst (str or Path): Copy file to to local dst. + + Returns: + str: If dst specifies a directory, the file will be copied into dst + using the base filename from src. 
+ + Examples: + >>> backend = PetrelBackend() + >>> # dst is a file + >>> src = 'petrel://path/of/file' + >>> dst = 'path/of/your/file' + >>> backend.copyfile_to_local(src, dst) + 'path/of/your/file' + + >>> # dst is a directory + >>> dst = 'path/of/your/dir' + >>> backend.copyfile_to_local(src, dst) + 'path/of/your/dir/file' + """ + if osp.isdir(dst): + basename = osp.basename(src) + if isinstance(dst, str): + dst = osp.join(dst, basename) + else: + assert isinstance(dst, Path) + dst = dst / basename + + with open(dst, 'wb') as f: + f.write(self.get(src)) + + return dst + + def copytree_to_local( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> Union[str, Path]: + """Recursively copy an entire directory tree rooted at src to a local + directory named dst and return the destination directory. + + Args: + src (str or Path): A directory to be copied. + dst (str or Path): Copy directory to local dst. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + + Returns: + str: The destination directory. + + Examples: + >>> backend = PetrelBackend() + >>> src = 'petrel://path/of/dir' + >>> dst = 'path/of/your/dir' + >>> backend.copytree_to_local(src, dst) + 'path/of/your/dir' + """ + for path in self.list_dir_or_file(src, list_dir=False, recursive=True): + dst_path = osp.join(dst, path) + mmengine.mkdir_or_exist(osp.dirname(dst_path)) + with open(dst_path, 'wb') as f: + f.write(self.get(self.join_path(src, path))) + + return dst + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + + Raises: + FileNotFoundError: If filepath does not exist, an FileNotFoundError + will be raised. + IsADirectoryError: If filepath is a directory, an IsADirectoryError + will be raised. + + Examples: + >>> backend = PetrelBackend() + >>> filepath = 'petrel://path/of/file' + >>> backend.remove(filepath) + """ + if not has_method(self._client, 'delete'): + raise NotImplementedError( + 'Current version of Petrel Python SDK has not supported ' + 'the `delete` method, please use a higher version or dev ' + 'branch instead.') + + if not self.exists(filepath): + raise FileNotFoundError(f'filepath {filepath} does not exist') + + if self.isdir(filepath): + raise IsADirectoryError('filepath should be a file') + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + filepath = self._replace_prefix(filepath) + self._client.delete(filepath) + + def rmtree(self, dir_path: Union[str, Path]) -> None: + """Recursively delete a directory tree. + + Args: + dir_path (str or Path): A directory to be removed. + + Examples: + >>> backend = PetrelBackend() + >>> dir_path = 'petrel://path/of/dir' + >>> backend.rmtree(dir_path) + """ + for path in self.list_dir_or_file( + dir_path, list_dir=False, recursive=True): + filepath = self.join_path(dir_path, path) + self.remove(filepath) + + def copy_if_symlink_fails( + self, + src: Union[str, Path], + dst: Union[str, Path], + ) -> bool: + """Create a symbolic link pointing to src named dst. + + Directly copy src to dst because PetrelBacekend does not support create + a symbolic link. + + Args: + src (str or Path): A file or directory to be copied. + dst (str or Path): Copy a file or directory to dst. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + + Returns: + bool: Return False because PetrelBackend does not support create + a symbolic link. 
+ + Examples: + >>> backend = PetrelBackend() + >>> src = 'petrel://path/of/file' + >>> dst = 'petrel://path/of/your/file' + >>> backend.copy_if_symlink_fails(src, dst) + False + >>> src = 'petrel://path/of/dir' + >>> dst = 'petrel://path/of/your/dir' + >>> backend.copy_if_symlink_fails(src, dst) + False + """ + if self.isfile(src): + self.copyfile(src, dst) + else: + self.copytree(src, dst) + return False + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + Petrel has no concept of directories but it simulates the directory + hierarchy in the filesystem through public prefixes. In addition, + if the returned path ends with '/', it means the path is a public + prefix which is a logical directory. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + In addition, the returned path of directory will not contains the + suffix '/' which is consistent with other backends. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Defaults to True. + list_file (bool): List the path of files. Defaults to True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Defaults to None. + recursive (bool): If set to True, recursively scan the + directory. Defaults to False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + + Examples: + >>> backend = PetrelBackend() + >>> dir_path = 'petrel://path/of/dir' + >>> # list those files and directories in current directory + >>> for file_path in backend.list_dir_or_file(dir_path): + ... print(file_path) + >>> # only list files + >>> for file_path in backend.list_dir_or_file(dir_path, list_dir=False): + ... print(file_path) + >>> # only list directories + >>> for file_path in backend.list_dir_or_file(dir_path, list_file=False): + ... print(file_path) + >>> # only list files ending with specified suffixes + >>> for file_path in backend.list_dir_or_file(dir_path, suffix='.txt'): + ... print(file_path) + >>> # list all files and directory recursively + >>> for file_path in backend.list_dir_or_file(dir_path, recursive=True): + ... 
print(file_path) + """ # noqa: E501 + if not has_method(self._client, 'list'): + raise NotImplementedError( + 'Current version of Petrel Python SDK has not supported ' + 'the `list` method, please use a higher version or dev' + ' branch instead.') + + dir_path = self._map_path(dir_path) + dir_path = self._format_path(dir_path) + dir_path = self._replace_prefix(dir_path) + if list_dir and suffix is not None: + raise TypeError( + '`list_dir` should be False when `suffix` is not None') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + # Petrel's simulated directory hierarchy assumes that directory paths + # should end with `/` + if not dir_path.endswith('/'): + dir_path += '/' + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for path in self._client.list(dir_path): + # the `self.isdir` is not used here to determine whether path + # is a directory, because `self.isdir` relies on + # `self._client.list` + if path.endswith('/'): # a directory path + next_dir_path = self.join_path(dir_path, path) + if list_dir: + # get the relative path and exclude the last + # character '/' + rel_dir = next_dir_path[len(root):-1] + yield rel_dir + if recursive: + yield from _list_dir_or_file(next_dir_path, list_dir, + list_file, suffix, + recursive) + else: # a file path + absolute_path = self.join_path(dir_path, path) + rel_path = absolute_path[len(root):] + if (suffix is None + or rel_path.endswith(suffix)) and list_file: + yield rel_path + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + def generate_presigned_url(self, + url: str, + client_method: str = 'get_object', + expires_in: int = 3600) -> str: + """Generate the presigned url of video stream which can be passed to + mmcv.VideoReader. Now only work on Petrel backend. + + Note: + Now only work on Petrel backend. + + Args: + url (str): Url of video stream. + client_method (str): Method of client, 'get_object' or + 'put_object'. Default: 'get_object'. + expires_in (int): expires, in seconds. Default: 3600. + + Returns: + str: Generated presigned url. + """ + return self._client.generate_presigned_url(url, client_method, + expires_in) diff --git a/head_extractor/build/lib/mmengine/fileio/backends/registry_utils.py b/head_extractor/build/lib/mmengine/fileio/backends/registry_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4578a4ca76fb3f867b87c088407399bc5c700153 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/backends/registry_utils.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +from typing import Optional, Type, Union + +from .base import BaseStorageBackend +from .http_backend import HTTPBackend +from .lmdb_backend import LmdbBackend +from .local_backend import LocalBackend +from .memcached_backend import MemcachedBackend +from .petrel_backend import PetrelBackend + +backends: dict = {} +prefix_to_backends: dict = {} + + +def _register_backend(name: str, + backend: Type[BaseStorageBackend], + force: bool = False, + prefixes: Union[str, list, tuple, None] = None): + """Register a backend. + + Args: + name (str): The name of the registered backend. + backend (BaseStorageBackend): The backend class to be registered, + which must be a subclass of :class:`BaseStorageBackend`. + force (bool): Whether to override the backend if the name has already + been registered. Defaults to False. 
+ prefixes (str or list[str] or tuple[str], optional): The prefix + of the registered storage backend. Defaults to None. + """ + global backends, prefix_to_backends + + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + + if not inspect.isclass(backend): + raise TypeError(f'backend should be a class, but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + + if name in backends and not force: + raise ValueError(f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + backends[name] = backend + + if prefixes is not None: + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + + for prefix in prefixes: + if prefix in prefix_to_backends and not force: + raise ValueError( + f'{prefix} is already registered as a storage backend,' + ' add "force=True" if you want to override it') + + prefix_to_backends[prefix] = backend + + +def register_backend(name: str, + backend: Optional[Type[BaseStorageBackend]] = None, + force: bool = False, + prefixes: Union[str, list, tuple, None] = None): + """Register a backend. + + Args: + name (str): The name of the registered backend. + backend (class, optional): The backend class to be registered, + which must be a subclass of :class:`BaseStorageBackend`. + When this method is used as a decorator, backend is None. + Defaults to None. + force (bool): Whether to override the backend if the name has already + been registered. Defaults to False. + prefixes (str or list[str] or tuple[str], optional): The prefix + of the registered storage backend. Defaults to None. + + This method can be used as a normal method or a decorator. + + Examples: + + >>> class NewBackend(BaseStorageBackend): + ... def get(self, filepath): + ... return filepath + ... + ... def get_text(self, filepath): + ... return filepath + >>> register_backend('new', NewBackend) + + >>> @register_backend('new') + ... class NewBackend(BaseStorageBackend): + ... def get(self, filepath): + ... return filepath + ... + ... def get_text(self, filepath): + ... return filepath + """ + if backend is not None: + _register_backend(name, backend, force=force, prefixes=prefixes) + return + + def _register(backend_cls): + _register_backend(name, backend_cls, force=force, prefixes=prefixes) + return backend_cls + + return _register + + +register_backend('local', LocalBackend, prefixes='') +register_backend('memcached', MemcachedBackend) +register_backend('lmdb', LmdbBackend) +# To avoid breaking backward Compatibility, 's3' is also used as a +# prefix for PetrelBackend +register_backend('petrel', PetrelBackend, prefixes=['petrel', 's3']) +register_backend('http', HTTPBackend, prefixes=['http', 'https']) diff --git a/head_extractor/build/lib/mmengine/fileio/file_client.py b/head_extractor/build/lib/mmengine/fileio/file_client.py new file mode 100644 index 0000000000000000000000000000000000000000..61551d3d1df953adee312d02624ac47c4cfb0df2 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/file_client.py @@ -0,0 +1,460 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
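+"""A deprecated facade over the storage backends, kept for backward
+compatibility.
+
+Editor's sketch -- illustrative only, with hypothetical paths::
+
+    from mmengine.fileio import FileClient
+
+    client = FileClient(backend='disk')   # or FileClient(prefix='s3')
+    data = client.get('demo.bin')         # bytes | memoryview
+    text = client.get_text('demo.txt')    # str
+"""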
+import inspect +import logging +from contextlib import contextmanager +from pathlib import Path +from typing import Any, Generator, Iterator, Optional, Tuple, Union + +from mmengine.logging import print_log +from mmengine.utils import is_filepath +from .backends import (BaseStorageBackend, HTTPBackend, LmdbBackend, + LocalBackend, MemcachedBackend, PetrelBackend) + + +class HardDiskBackend(LocalBackend): + """Raw hard disks storage backend.""" + + def __init__(self) -> None: + print_log( + '"HardDiskBackend" is the alias of "LocalBackend" ' + 'and the former will be deprecated in future.', + logger='current', + level=logging.WARNING) + + @property + def name(self): + return self.__class__.__name__ + + +class FileClient: + """A general file client to access files in different backends. + + The client loads a file or text in a specified backend from its path + and returns it as a binary or text file. There are two ways to choose a + backend, the name of backend and the prefix of path. Although both of them + can be used to choose a storage backend, ``backend`` has a higher priority + that is if they are all set, the storage backend will be chosen by the + backend argument. If they are all `None`, the disk backend will be chosen. + Note that It can also register other backend accessor with a given name, + prefixes, and backend class. In addition, We use the singleton pattern to + avoid repeated object creation. If the arguments are the same, the same + object will be returned. + + Warning: + `FileClient` will be deprecated in future. Please use io functions + in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io + + Args: + backend (str, optional): The storage backend type. Options are "disk", + "memcached", "lmdb", "http" and "petrel". Defaults to None. + prefix (str, optional): The prefix of the registered storage backend. + Options are "s3", "http", "https". Defaults to None. + + Examples: + >>> # only set backend + >>> file_client = FileClient(backend='petrel') + >>> # only set prefix + >>> file_client = FileClient(prefix='s3') + >>> # set both backend and prefix but use backend to choose client + >>> file_client = FileClient(backend='petrel', prefix='s3') + >>> # if the arguments are the same, the same object is returned + >>> file_client1 = FileClient(backend='petrel') + >>> file_client1 is file_client + True + + Attributes: + client (:obj:`BaseStorageBackend`): The backend object. + """ + + _backends = { + 'disk': HardDiskBackend, + 'memcached': MemcachedBackend, + 'lmdb': LmdbBackend, + 'petrel': PetrelBackend, + 'http': HTTPBackend, + } + + _prefix_to_backends: dict = { + 's3': PetrelBackend, + 'petrel': PetrelBackend, + 'http': HTTPBackend, + 'https': HTTPBackend, + } + + _instances: dict = {} + + client: Any + + def __new__(cls, backend=None, prefix=None, **kwargs): + print_log( + '"FileClient" will be deprecated in future. Please use io ' + 'functions in ' + 'https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io', # noqa: E501 + logger='current', + level=logging.WARNING) + if backend is None and prefix is None: + backend = 'disk' + if backend is not None and backend not in cls._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(cls._backends.keys())}') + if prefix is not None and prefix not in cls._prefix_to_backends: + raise ValueError( + f'prefix {prefix} is not supported. 
Currently supported ones ' + f'are {list(cls._prefix_to_backends.keys())}') + + # concatenate the arguments to a unique key for determining whether + # objects with the same arguments were created + arg_key = f'{backend}:{prefix}' + for key, value in kwargs.items(): + arg_key += f':{key}:{value}' + + # if a backend was overridden, it will create a new object + if arg_key in cls._instances: + _instance = cls._instances[arg_key] + else: + # create a new object and put it to _instance + _instance = super().__new__(cls) + if backend is not None: + _instance.client = cls._backends[backend](**kwargs) + else: + _instance.client = cls._prefix_to_backends[prefix](**kwargs) + + cls._instances[arg_key] = _instance + + return _instance + + @property + def name(self): + return self.client.name + + @property + def allow_symlink(self): + return self.client.allow_symlink + + @staticmethod + def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]: + """Parse the prefix of a uri. + + Args: + uri (str | Path): Uri to be parsed that contains the file prefix. + + Examples: + >>> FileClient.parse_uri_prefix('s3://path/of/your/file') + 's3' + + Returns: + str | None: Return the prefix of uri if the uri contains '://' else + ``None``. + """ + assert is_filepath(uri) + uri = str(uri) + if '://' not in uri: + return None + else: + prefix, _ = uri.split('://') + # In the case of PetrelBackend, the prefix may contains the cluster + # name like clusterName:s3 + if ':' in prefix: + _, prefix = prefix.split(':') + return prefix + + @classmethod + def infer_client(cls, + file_client_args: Optional[dict] = None, + uri: Optional[Union[str, Path]] = None) -> 'FileClient': + """Infer a suitable file client based on the URI and arguments. + + Args: + file_client_args (dict, optional): Arguments to instantiate a + FileClient. Defaults to None. + uri (str | Path, optional): Uri to be parsed that contains the file + prefix. Defaults to None. + + Examples: + >>> uri = 's3://path/of/your/file' + >>> file_client = FileClient.infer_client(uri=uri) + >>> file_client_args = {'backend': 'petrel'} + >>> file_client = FileClient.infer_client(file_client_args) + + Returns: + FileClient: Instantiated FileClient object. 
+ """ + assert file_client_args is not None or uri is not None + if file_client_args is None: + file_prefix = cls.parse_uri_prefix(uri) # type: ignore + return cls(prefix=file_prefix) + else: + return cls(**file_client_args) + + @classmethod + def _register_backend(cls, name, backend, force=False, prefixes=None): + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + if not inspect.isclass(backend): + raise TypeError( + f'backend should be a class but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + if not force and name in cls._backends: + raise KeyError( + f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + + if name in cls._backends and force: + for arg_key, instance in list(cls._instances.items()): + if isinstance(instance.client, cls._backends[name]): + cls._instances.pop(arg_key) + cls._backends[name] = backend + + if prefixes is not None: + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + for prefix in prefixes: + if prefix not in cls._prefix_to_backends: + cls._prefix_to_backends[prefix] = backend + elif (prefix in cls._prefix_to_backends) and force: + overridden_backend = cls._prefix_to_backends[prefix] + for arg_key, instance in list(cls._instances.items()): + if isinstance(instance.client, overridden_backend): + cls._instances.pop(arg_key) + else: + raise KeyError( + f'{prefix} is already registered as a storage backend,' + ' add "force=True" if you want to override it') + + @classmethod + def register_backend(cls, name, backend=None, force=False, prefixes=None): + """Register a backend to FileClient. + + This method can be used as a normal class method or a decorator. + + .. code-block:: python + + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + FileClient.register_backend('new', NewBackend) + + or + + .. code-block:: python + + @FileClient.register_backend('new') + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + Args: + name (str): The name of the registered backend. + backend (class, optional): The backend class to be registered, + which must be a subclass of :class:`BaseStorageBackend`. + When this method is used as a decorator, backend is None. + Defaults to None. + force (bool, optional): Whether to override the backend if the name + has already been registered. Defaults to False. + prefixes (str or list[str] or tuple[str], optional): The prefixes + of the registered storage backend. Defaults to None. + `New in version 1.3.15.` + """ + if backend is not None: + cls._register_backend( + name, backend, force=force, prefixes=prefixes) + return + + def _register(backend_cls): + cls._register_backend( + name, backend_cls, force=force, prefixes=prefixes) + return backend_cls + + return _register + + def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]: + """Read data from a given ``filepath`` with 'rb' mode. + + Note: + There are two types of return values for ``get``, one is ``bytes`` + and the other is ``memoryview``. The advantage of using memoryview + is that you can avoid copying, and if you want to convert it to + ``bytes``, you can use ``.tobytes()``. 
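+
+            As an editor's illustration (hypothetical file), the return
+            value can be normalized to ``bytes`` like this:
+
+            >>> file_client = FileClient(backend='disk')
+            >>> buf = file_client.get('demo.bin')
+            >>> data = buf.tobytes() if isinstance(buf, memoryview) else buf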
+ + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes | memoryview: Expected bytes object or a memory view of the + bytes object. + """ + return self.client.get(filepath) + + def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Defaults to 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return self.client.get_text(filepath, encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` should create a directory if the directory of ``filepath`` + does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + """ + self.client.put(obj, filepath) + + def put_text(self, obj: str, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` should create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str, optional): The encoding format used to open the + `filepath`. Defaults to 'utf-8'. + """ + self.client.put_text(obj, filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str, Path): Path to be removed. + """ + self.client.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return self.client.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + return self.client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return self.client.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + r"""Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of \*filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return self.client.join_path(filepath, *filepaths) + + @contextmanager + def get_local_path( + self, + filepath: Union[str, + Path]) -> Generator[Union[str, Path], None, None]: + """Download data from ``filepath`` and write the data to local path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Note: + If the ``filepath`` is a local path, just return itself. + + .. warning:: + ``get_local_path`` is an experimental interface that may change in + the future. 
+ + Args: + filepath (str or Path): Path to be read data. + + Examples: + >>> file_client = FileClient(prefix='s3') + >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path: + ... # do something here + + Yields: + Iterable[str]: Only yield one path. + """ + with self.client.get_local_path(str(filepath)) as local_path: + yield local_path + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Defaults to True. + list_file (bool): List the path of files. Defaults to True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Defaults to None. + recursive (bool): If set to True, recursively scan the + directory. Defaults to False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + yield from self.client.list_dir_or_file(dir_path, list_dir, list_file, + suffix, recursive) diff --git a/head_extractor/build/lib/mmengine/fileio/handlers/__init__.py b/head_extractor/build/lib/mmengine/fileio/handlers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..391a60c36b3cdf9070437c9d96e9c0bf23fac1a2 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/handlers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseFileHandler +from .json_handler import JsonHandler +from .pickle_handler import PickleHandler +from .registry_utils import file_handlers, register_handler +from .yaml_handler import YamlHandler + +__all__ = [ + 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', + 'register_handler', 'file_handlers' +] diff --git a/head_extractor/build/lib/mmengine/fileio/handlers/base.py b/head_extractor/build/lib/mmengine/fileio/handlers/base.py new file mode 100644 index 0000000000000000000000000000000000000000..288878bc57282fbb2f12b32290152ca8e9d3cab0 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/handlers/base.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + + +class BaseFileHandler(metaclass=ABCMeta): + # `str_like` is a flag to indicate whether the type of file object is + # str-like object or bytes-like object. Pickle only processes bytes-like + # objects but json only processes str-like object. If it is str-like + # object, `StringIO` will be used to process the buffer. 
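+    # (Editor's note) For example, ``JsonHandler`` keeps the default
+    # ``str_like = True`` and is fed through ``StringIO``, while
+    # ``PickleHandler`` sets ``str_like = False`` and is fed through
+    # ``BytesIO``.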
+ str_like = True + + @abstractmethod + def load_from_fileobj(self, file, **kwargs): + pass + + @abstractmethod + def dump_to_fileobj(self, obj, file, **kwargs): + pass + + @abstractmethod + def dump_to_str(self, obj, **kwargs): + pass + + def load_from_path(self, filepath, mode='r', **kwargs): + with open(filepath, mode) as f: + return self.load_from_fileobj(f, **kwargs) + + def dump_to_path(self, obj, filepath, mode='w', **kwargs): + with open(filepath, mode) as f: + self.dump_to_fileobj(obj, f, **kwargs) diff --git a/head_extractor/build/lib/mmengine/fileio/handlers/json_handler.py b/head_extractor/build/lib/mmengine/fileio/handlers/json_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..18d4f15f74139d20adff18b20be5529c592a66b6 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/handlers/json_handler.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +import numpy as np + +from .base import BaseFileHandler + + +def set_default(obj): + """Set default json values for non-serializable values. + + It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. + It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, + etc.) into plain numbers of plain python built-in types. + """ + if isinstance(obj, (set, range)): + return list(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.generic): + return obj.item() + raise TypeError(f'{type(obj)} is unsupported for json dump') + + +class JsonHandler(BaseFileHandler): + + def load_from_fileobj(self, file): + return json.load(file) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('default', set_default) + json.dump(obj, file, **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('default', set_default) + return json.dumps(obj, **kwargs) diff --git a/head_extractor/build/lib/mmengine/fileio/handlers/pickle_handler.py b/head_extractor/build/lib/mmengine/fileio/handlers/pickle_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..073856fd25a731b42f3cd19269ad95744b20598f --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/handlers/pickle_handler.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pickle + +from .base import BaseFileHandler + + +class PickleHandler(BaseFileHandler): + + str_like = False + + def load_from_fileobj(self, file, **kwargs): + return pickle.load(file, **kwargs) + + def load_from_path(self, filepath, **kwargs): + return super().load_from_path(filepath, mode='rb', **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('protocol', 2) + return pickle.dumps(obj, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + def dump_to_path(self, obj, filepath, **kwargs): + super().dump_to_path(obj, filepath, mode='wb', **kwargs) diff --git a/head_extractor/build/lib/mmengine/fileio/handlers/registry_utils.py b/head_extractor/build/lib/mmengine/fileio/handlers/registry_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..106fc881f2514dfcf5b31878e7eca34c7f1659ea --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/handlers/registry_utils.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
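+"""Registry of file-format handlers used by ``load``/``dump``.
+
+Editor's sketch -- registering a handler for a hypothetical ``.txt``
+format via the decorator defined below::
+
+    from mmengine.fileio.handlers import BaseFileHandler, register_handler
+
+    @register_handler('txt')
+    class TxtHandler(BaseFileHandler):
+
+        def load_from_fileobj(self, file, **kwargs):
+            return file.read()
+
+        def dump_to_fileobj(self, obj, file, **kwargs):
+            file.write(str(obj))
+
+        def dump_to_str(self, obj, **kwargs):
+            return str(obj)
+"""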
+from mmengine.utils import is_list_of +from .base import BaseFileHandler +from .json_handler import JsonHandler +from .pickle_handler import PickleHandler +from .yaml_handler import YamlHandler + +file_handlers = { + 'json': JsonHandler(), + 'yaml': YamlHandler(), + 'yml': YamlHandler(), + 'pickle': PickleHandler(), + 'pkl': PickleHandler(), +} + + +def _register_handler(handler, file_formats): + """Register a handler for some file extensions. + + Args: + handler (:obj:`BaseFileHandler`): Handler to be registered. + file_formats (str or list[str]): File formats to be handled by this + handler. + """ + if not isinstance(handler, BaseFileHandler): + raise TypeError( + f'handler must be a child of BaseFileHandler, not {type(handler)}') + if isinstance(file_formats, str): + file_formats = [file_formats] + if not is_list_of(file_formats, str): + raise TypeError('file_formats must be a str or a list of str') + for ext in file_formats: + file_handlers[ext] = handler + + +def register_handler(file_formats, **kwargs): + + def wrap(cls): + _register_handler(cls(**kwargs), file_formats) + return cls + + return wrap diff --git a/head_extractor/build/lib/mmengine/fileio/handlers/yaml_handler.py b/head_extractor/build/lib/mmengine/fileio/handlers/yaml_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..22c2607ae43734f334bbfa83445b1409ef855433 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/handlers/yaml_handler.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import yaml + +try: + from yaml import CDumper as Dumper # type: ignore + from yaml import CLoader as Loader # type: ignore +except ImportError: + from yaml import Loader, Dumper # type: ignore + +from .base import BaseFileHandler # isort:skip + + +class YamlHandler(BaseFileHandler): + + def load_from_fileobj(self, file, **kwargs): + kwargs.setdefault('Loader', Loader) + return yaml.load(file, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('Dumper', Dumper) + yaml.dump(obj, file, **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('Dumper', Dumper) + return yaml.dump(obj, **kwargs) diff --git a/head_extractor/build/lib/mmengine/fileio/io.py b/head_extractor/build/lib/mmengine/fileio/io.py new file mode 100644 index 0000000000000000000000000000000000000000..fdeb4dc6df9c7eebb0d4e5d76580c5199c341877 --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/io.py @@ -0,0 +1,940 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This module provides unified file I/O related functions, which support +operating I/O with different file backends based on the specified filepath or +backend_args. + +MMEngine currently supports five file backends: + +- LocalBackend +- PetrelBackend +- HTTPBackend +- LmdbBackend +- MemcacheBackend + +Note that this module provide a union of all of the above file backends so +NotImplementedError will be raised if the interface in the file backend is not +implemented. + +There are two ways to call a method of a file backend: + +- Initialize a file backend with ``get_file_backend`` and call its methods. +- Directory call unified I/O functions, which will call ``get_file_backend`` + first and then call the corresponding backend method. 
+
+Examples:
+    >>> # Initialize a file backend and call its methods
+    >>> import mmengine.fileio as fileio
+    >>> backend = fileio.get_file_backend(backend_args={'backend': 'petrel'})
+    >>> backend.get('s3://path/of/your/file')
+
+    >>> # Directly call unified I/O functions
+    >>> fileio.get('s3://path/of/your/file')
+"""
+import json
+import warnings
+from contextlib import contextmanager
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import Generator, Iterator, Optional, Tuple, Union
+
+from mmengine.utils import is_filepath, is_str
+from .backends import backends, prefix_to_backends
+from .file_client import FileClient
+# file_handlers and register_handler have been moved to
+# mmengine/fileio/handlers/registry_utils. Import them
+# in this file to keep backward compatibility.
+from .handlers import file_handlers, register_handler  # noqa: F401
+
+backend_instances: dict = {}
+
+
+def _parse_uri_prefix(uri: Union[str, Path]) -> str:
+    """Parse the prefix of uri.
+
+    Args:
+        uri (str or Path): Uri to be parsed that contains the file prefix.
+
+    Examples:
+        >>> _parse_uri_prefix('/home/path/of/your/file')
+        ''
+        >>> _parse_uri_prefix('s3://path/of/your/file')
+        's3'
+        >>> _parse_uri_prefix('clusterName:s3://path/of/your/file')
+        's3'
+
+    Returns:
+        str: Return the prefix of uri if the uri contains '://'. Otherwise,
+        return ''.
+    """
+    assert is_filepath(uri)
+    uri = str(uri)
+    # if uri does not contain '://', the uri will be handled by
+    # LocalBackend by default
+    if '://' not in uri:
+        return ''
+    else:
+        prefix, _ = uri.split('://')
+        # In the case of PetrelBackend, the prefix may contain the cluster
+        # name like clusterName:s3://path/of/your/file
+        if ':' in prefix:
+            _, prefix = prefix.split(':')
+        return prefix
+
+
+def _get_file_backend(prefix: str, backend_args: dict):
+    """Return a file backend based on the prefix or backend_args.
+
+    Args:
+        prefix (str): Prefix of uri.
+        backend_args (dict): Arguments to instantiate the corresponding
+            backend.
+    """
+    # backend name has a higher priority
+    if 'backend' in backend_args:
+        # backend_args should not be modified
+        backend_args_bak = backend_args.copy()
+        backend_name = backend_args_bak.pop('backend')
+        backend = backends[backend_name](**backend_args_bak)
+    else:
+        backend = prefix_to_backends[prefix](**backend_args)
+    return backend
+
+
+def get_file_backend(
+    uri: Union[str, Path, None] = None,
+    *,
+    backend_args: Optional[dict] = None,
+    enable_singleton: bool = False,
+):
+    """Return a file backend based on the prefix of uri or backend_args.
+
+    Args:
+        uri (str or Path): Uri to be parsed that contains the file prefix.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+        enable_singleton (bool): Whether to enable the singleton pattern.
+            If it is True, the backend created will be reused if the
+            signature is the same as the previous one. Defaults to False.
+
+    Returns:
+        BaseStorageBackend: Instantiated Backend object.
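+
+    Editor's note (illustrative): with ``enable_singleton=True``, repeated
+    calls that share a prefix and ``backend_args`` return the same backend
+    instance:
+
+    >>> b1 = get_file_backend('/path/of/file', enable_singleton=True)
+    >>> b2 = get_file_backend('/path/of/file', enable_singleton=True)
+    >>> b1 is b2
+    True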
+ + Examples: + >>> # get file backend based on the prefix of uri + >>> uri = 's3://path/of/your/file' + >>> backend = get_file_backend(uri) + >>> # get file backend based on the backend_args + >>> backend = get_file_backend(backend_args={'backend': 'petrel'}) + >>> # backend name has a higher priority if 'backend' in backend_args + >>> backend = get_file_backend(uri, backend_args={'backend': 'petrel'}) + """ + global backend_instances + + if backend_args is None: + backend_args = {} + + if uri is None and 'backend' not in backend_args: + raise ValueError( + 'uri should not be None when "backend" does not exist in ' + 'backend_args') + + if uri is not None: + prefix = _parse_uri_prefix(uri) + else: + prefix = '' + + if enable_singleton: + # TODO: whether to pass sort_key to json.dumps + unique_key = f'{prefix}:{json.dumps(backend_args)}' + if unique_key in backend_instances: + return backend_instances[unique_key] + + backend = _get_file_backend(prefix, backend_args) + backend_instances[unique_key] = backend + return backend + else: + backend = _get_file_backend(prefix, backend_args) + return backend + + +def get( + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> bytes: + """Read bytes from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + bytes: Expected bytes object. + + Examples: + >>> filepath = '/path/of/file' + >>> get(filepath) + b'hello world' + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + return backend.get(filepath) + + +def get_text( + filepath: Union[str, Path], + encoding='utf-8', + backend_args: Optional[dict] = None, +) -> str: + """Read text from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Defaults to 'utf-8'. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + str: Expected text reading from ``filepath``. + + Examples: + >>> filepath = '/path/of/file' + >>> get_text(filepath) + 'hello world' + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + return backend.get_text(filepath, encoding) + + +def put( + obj: bytes, + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> None: + """Write bytes to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` should create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Examples: + >>> filepath = '/path/of/file' + >>> put(b'hello world', filepath) + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + backend.put(obj, filepath) + + +def put_text( + obj: str, + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> None: + """Write text to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` should create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str, optional): The encoding format used to open the + ``filepath``. Defaults to 'utf-8'. 
+ backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Examples: + >>> filepath = '/path/of/file' + >>> put_text('hello world', filepath) + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + backend.put_text(obj, filepath) + + +def exists( + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + + Examples: + >>> filepath = '/path/of/file' + >>> exists(filepath) + True + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + return backend.exists(filepath) + + +def isdir( + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + + Examples: + >>> filepath = '/path/of/dir' + >>> isdir(filepath) + True + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + return backend.isdir(filepath) + + +def isfile( + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + + Examples: + >>> filepath = '/path/of/file' + >>> isfile(filepath) + True + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + return backend.isfile(filepath) + + +def join_path( + filepath: Union[str, Path], + *filepaths: Union[str, Path], + backend_args: Optional[dict] = None, +) -> Union[str, Path]: + r"""Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of \*filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + *filepaths (str or Path): Other paths to be concatenated. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + str: The result of concatenation. + + Examples: + >>> filepath1 = '/path/of/dir1' + >>> filepath2 = 'dir2' + >>> filepath3 = 'path/of/file' + >>> join_path(filepath1, filepath2, filepath3) + '/path/of/dir/dir2/path/of/file' + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + return backend.join_path(filepath, *filepaths) + + +@contextmanager +def get_local_path( + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> Generator[Union[str, Path], None, None]: + """Download data from ``filepath`` and write the data to local path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. 
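+
+    An editor's sketch (hypothetical remote path): the downloaded copy is
+    valid inside the ``with`` block and is cleaned up on exit:
+
+    >>> with get_local_path('s3://bucket/abc.jpg') as path:
+    ...     data = open(path, 'rb').read()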
+ + Note: + If the ``filepath`` is a local path, just return itself and it will + not be released (removed). + + Args: + filepath (str or Path): Path to be read data. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Yields: + Iterable[str]: Only yield one path. + + Examples: + >>> with get_local_path('s3://bucket/abc.jpg') as path: + ... # do something here + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + with backend.get_local_path(str(filepath)) as local_path: + yield local_path + + +def copyfile( + src: Union[str, Path], + dst: Union[str, Path], + backend_args: Optional[dict] = None, +) -> Union[str, Path]: + """Copy a file src to dst and return the destination file. + + src and dst should have the same prefix. If dst specifies a directory, + the file will be copied into dst using the base filename from src. If + dst specifies a file that already exists, it will be replaced. + + Args: + src (str or Path): A file to be copied. + dst (str or Path): Copy file to dst. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + str: The destination file. + + Raises: + SameFileError: If src and dst are the same file, a SameFileError will + be raised. + + Examples: + >>> # dst is a file + >>> src = '/path/of/file' + >>> dst = '/path1/of/file1' + >>> # src will be copied to '/path1/of/file1' + >>> copyfile(src, dst) + '/path1/of/file1' + + >>> # dst is a directory + >>> dst = '/path1/of/dir' + >>> # src will be copied to '/path1/of/dir/file' + >>> copyfile(src, dst) + '/path1/of/dir/file' + """ + backend = get_file_backend( + src, backend_args=backend_args, enable_singleton=True) + return backend.copyfile(src, dst) + + +def copytree( + src: Union[str, Path], + dst: Union[str, Path], + backend_args: Optional[dict] = None, +) -> Union[str, Path]: + """Recursively copy an entire directory tree rooted at src to a directory + named dst and return the destination directory. + + src and dst should have the same prefix and dst must not already exist. + + Args: + src (str or Path): A directory to be copied. + dst (str or Path): Copy directory to dst. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + str: The destination directory. + + Raises: + FileExistsError: If dst had already existed, a FileExistsError will be + raised. + + Examples: + >>> src = '/path/of/dir1' + >>> dst = '/path/of/dir2' + >>> copytree(src, dst) + '/path/of/dir2' + """ + backend = get_file_backend( + src, backend_args=backend_args, enable_singleton=True) + return backend.copytree(src, dst) + + +def copyfile_from_local( + src: Union[str, Path], + dst: Union[str, Path], + backend_args: Optional[dict] = None, +) -> Union[str, Path]: + """Copy a local file src to dst and return the destination file. + + Note: + If the backend is the instance of LocalBackend, it does the same + thing with :func:`copyfile`. + + Args: + src (str or Path): A local file to be copied. + dst (str or Path): Copy file to dst. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + str: If dst specifies a directory, the file will be copied into dst + using the base filename from src. 
+
+    Examples:
+        >>> # dst is a file
+        >>> src = '/path/of/file'
+        >>> dst = 's3://openmmlab/mmengine/file1'
+        >>> # src will be copied to 's3://openmmlab/mmengine/file1'
+        >>> copyfile_from_local(src, dst)
+        's3://openmmlab/mmengine/file1'
+
+        >>> # dst is a directory
+        >>> dst = 's3://openmmlab/mmengine'
+        >>> # src will be copied to 's3://openmmlab/mmengine/file'
+        >>> copyfile_from_local(src, dst)
+        's3://openmmlab/mmengine/file'
+    """
+    backend = get_file_backend(
+        dst, backend_args=backend_args, enable_singleton=True)
+    return backend.copyfile_from_local(src, dst)
+
+
+def copytree_from_local(
+    src: Union[str, Path],
+    dst: Union[str, Path],
+    backend_args: Optional[dict] = None,
+) -> Union[str, Path]:
+    """Recursively copy an entire directory tree rooted at src to a directory
+    named dst and return the destination directory.
+
+    Note:
+        If the backend is an instance of LocalBackend, it does the same
+        thing with :func:`copytree`.
+
+    Args:
+        src (str or Path): A local directory to be copied.
+        dst (str or Path): Copy directory to dst.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+
+    Returns:
+        str: The destination directory.
+
+    Examples:
+        >>> src = '/path/of/dir'
+        >>> dst = 's3://openmmlab/mmengine/dir'
+        >>> copytree_from_local(src, dst)
+        's3://openmmlab/mmengine/dir'
+    """
+    backend = get_file_backend(
+        dst, backend_args=backend_args, enable_singleton=True)
+    return backend.copytree_from_local(src, dst)
+
+
+def copyfile_to_local(
+    src: Union[str, Path],
+    dst: Union[str, Path],
+    backend_args: Optional[dict] = None,
+) -> Union[str, Path]:
+    """Copy the file src to local dst and return the destination file.
+
+    If dst specifies a directory, the file will be copied into dst using
+    the base filename from src. If dst specifies a file that already
+    exists, it will be replaced.
+
+    Note:
+        If the backend is an instance of LocalBackend, it does the same
+        thing with :func:`copyfile`.
+
+    Args:
+        src (str or Path): A file to be copied.
+        dst (str or Path): Copy file to local dst.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+
+    Returns:
+        str: If dst specifies a directory, the file will be copied into dst
+        using the base filename from src.
+
+    Examples:
+        >>> # dst is a file
+        >>> src = 's3://openmmlab/mmengine/file'
+        >>> dst = '/path/of/file'
+        >>> # src will be copied to '/path/of/file'
+        >>> copyfile_to_local(src, dst)
+        '/path/of/file'
+
+        >>> # dst is a directory
+        >>> dst = '/path/of/dir'
+        >>> # src will be copied to '/path/of/dir/file'
+        >>> copyfile_to_local(src, dst)
+        '/path/of/dir/file'
+    """
+    backend = get_file_backend(
+        dst, backend_args=backend_args, enable_singleton=True)
+    return backend.copyfile_to_local(src, dst)
+
+
+def copytree_to_local(
+    src: Union[str, Path],
+    dst: Union[str, Path],
+    backend_args: Optional[dict] = None,
+) -> Union[str, Path]:
+    """Recursively copy an entire directory tree rooted at src to a local
+    directory named dst and return the destination directory.
+
+    Note:
+        If the backend is an instance of LocalBackend, it does the same
+        thing with :func:`copytree`.
+
+    Args:
+        src (str or Path): A directory to be copied.
+        dst (str or Path): Copy directory to local dst.
+        backend_args (dict, optional): Arguments to instantiate the
+            corresponding backend. Defaults to None.
+
+    Returns:
+        str: The destination directory.
+ + Examples: + >>> src = 's3://openmmlab/mmengine/dir' + >>> dst = '/path/of/dir' + >>> copytree_to_local(src, dst) + '/path/of/dir' + """ + backend = get_file_backend( + dst, backend_args=backend_args, enable_singleton=True) + return backend.copytree_to_local(src, dst) + + +def remove( + filepath: Union[str, Path], + backend_args: Optional[dict] = None, +) -> None: + """Remove a file. + + Args: + filepath (str, Path): Path to be removed. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Raises: + FileNotFoundError: If filepath does not exist, an FileNotFoundError + will be raised. + IsADirectoryError: If filepath is a directory, an IsADirectoryError + will be raised. + + Examples: + >>> filepath = '/path/of/file' + >>> remove(filepath) + """ + backend = get_file_backend( + filepath, backend_args=backend_args, enable_singleton=True) + backend.remove(filepath) + + +def rmtree( + dir_path: Union[str, Path], + backend_args: Optional[dict] = None, +) -> None: + """Recursively delete a directory tree. + + Args: + dir_path (str or Path): A directory to be removed. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Examples: + >>> dir_path = '/path/of/dir' + >>> rmtree(dir_path) + """ + backend = get_file_backend( + dir_path, backend_args=backend_args, enable_singleton=True) + backend.rmtree(dir_path) + + +def copy_if_symlink_fails( + src: Union[str, Path], + dst: Union[str, Path], + backend_args: Optional[dict] = None, +) -> bool: + """Create a symbolic link pointing to src named dst. + + If failed to create a symbolic link pointing to src, directory copy src to + dst instead. + + Args: + src (str or Path): Create a symbolic link pointing to src. + dst (str or Path): Create a symbolic link named dst. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + bool: Return True if successfully create a symbolic link pointing to + src. Otherwise, return False. + + Examples: + >>> src = '/path/of/file' + >>> dst = '/path1/of/file1' + >>> copy_if_symlink_fails(src, dst) + True + >>> src = '/path/of/dir' + >>> dst = '/path1/of/dir1' + >>> copy_if_symlink_fails(src, dst) + True + """ + backend = get_file_backend( + src, backend_args=backend_args, enable_singleton=True) + return backend.copy_if_symlink_fails(src, dst) + + +def list_dir_or_file( + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False, + backend_args: Optional[dict] = None, +) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str or Path): Path of the directory. + list_dir (bool): List the directories. Defaults to True. + list_file (bool): List the path of files. Defaults to True. + suffix (str or tuple[str], optional): File suffix that we are + interested in. Defaults to None. + recursive (bool): If set to True, recursively scan the directory. + Defaults to False. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + + Examples: + >>> dir_path = '/path/of/dir' + >>> for file_path in list_dir_or_file(dir_path): + ... 
print(file_path) + >>> # list those files and directories in current directory + >>> for file_path in list_dir_or_file(dir_path): + ... print(file_path) + >>> # only list files + >>> for file_path in list_dir_or_file(dir_path, list_dir=False): + ... print(file_path) + >>> # only list directories + >>> for file_path in list_dir_or_file(dir_path, list_file=False): + ... print(file_path) + >>> # only list files ending with specified suffixes + >>> for file_path in list_dir_or_file(dir_path, suffix='.txt'): + ... print(file_path) + >>> # list all files and directory recursively + >>> for file_path in list_dir_or_file(dir_path, recursive=True): + ... print(file_path) + """ + backend = get_file_backend( + dir_path, backend_args=backend_args, enable_singleton=True) + yield from backend.list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +def generate_presigned_url( + url: str, + client_method: str = 'get_object', + expires_in: int = 3600, + backend_args: Optional[dict] = None, +) -> str: + """Generate the presigned url of video stream which can be passed to + mmcv.VideoReader. Now only work on Petrel backend. + + Note: + Now only work on Petrel backend. + + Args: + url (str): Url of video stream. + client_method (str): Method of client, 'get_object' or + 'put_object'. Defaults to 'get_object'. + expires_in (int): expires, in seconds. Defaults to 3600. + backend_args (dict, optional): Arguments to instantiate the + corresponding backend. Defaults to None. + + Returns: + str: Generated presigned url. + """ + backend = get_file_backend( + url, backend_args=backend_args, enable_singleton=True) + return backend.generate_presigned_url(url, client_method, expires_in) + + +def load(file, + file_format=None, + file_client_args=None, + backend_args=None, + **kwargs): + """Load data from json/yaml/pickle files. + + This method provides a unified api for loading data from serialized files. + + ``load`` supports loading data from serialized files those can be storaged + in different backends. + + Args: + file (str or :obj:`Path` or file-like object): Filename or a file-like + object. + file_format (str, optional): If not specified, the file format will be + inferred from the file extension, otherwise use the specified one. + Currently supported formats include "json", "yaml/yml" and + "pickle/pkl". + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + New in v0.2.0. + + Examples: + >>> load('/path/of/your/file') # file is storaged in disk + >>> load('https://path/of/your/file') # file is storaged in Internet + >>> load('s3://path/of/your/file') # file is storaged in petrel + + Returns: + The content from the file. + """ + if isinstance(file, Path): + file = str(file) + if file_format is None and is_str(file): + file_format = file.split('.')[-1] + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. 
' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args and "backend_args" cannot be set at the ' + 'same time.') + + handler = file_handlers[file_format] + if is_str(file): + if file_client_args is not None: + file_client = FileClient.infer_client(file_client_args, file) + file_backend = file_client + else: + file_backend = get_file_backend(file, backend_args=backend_args) + + if handler.str_like: + with StringIO(file_backend.get_text(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + else: + with BytesIO(file_backend.get(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + elif hasattr(file, 'read'): + obj = handler.load_from_fileobj(file, **kwargs) + else: + raise TypeError('"file" must be a filepath str or a file-object') + return obj + + +def dump(obj, + file=None, + file_format=None, + file_client_args=None, + backend_args=None, + **kwargs): + """Dump data to json/yaml/pickle strings or files. + + This method provides a unified api for dumping data as strings or to files, + and also supports custom arguments for each file format. + + ``dump`` supports dumping data as strings or to files which is saved to + different backends. + + Args: + obj (any): The python object to be dumped. + file (str or :obj:`Path` or file-like object, optional): If not + specified, then the object is dumped to a str, otherwise to a file + specified by the filename or file-like object. + file_format (str, optional): Same as :func:`load`. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + New in v0.2.0. + + Examples: + >>> dump('hello world', '/path/of/your/file') # disk + >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel + + Returns: + bool: True for success, False otherwise. + """ + if isinstance(file, Path): + file = str(file) + if file_format is None: + if is_str(file): + file_format = file.split('.')[-1] + elif file is None: + raise ValueError( + 'file_format must be specified since file is None') + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. 
' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at the ' + 'same time.') + + handler = file_handlers[file_format] + if file is None: + return handler.dump_to_str(obj, **kwargs) + elif is_str(file): + if file_client_args is not None: + file_client = FileClient.infer_client(file_client_args, file) + file_backend = file_client + else: + file_backend = get_file_backend(file, backend_args=backend_args) + + if handler.str_like: + with StringIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_backend.put_text(f.getvalue(), file) + else: + with BytesIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_backend.put(f.getvalue(), file) + elif hasattr(file, 'write'): + handler.dump_to_fileobj(obj, file, **kwargs) + else: + raise TypeError('"file" must be a filename str or a file-object') diff --git a/head_extractor/build/lib/mmengine/fileio/parse.py b/head_extractor/build/lib/mmengine/fileio/parse.py new file mode 100644 index 0000000000000000000000000000000000000000..781d899a0465a0041d7d18262ab8840fa06f5acd --- /dev/null +++ b/head_extractor/build/lib/mmengine/fileio/parse.py @@ -0,0 +1,133 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from io import StringIO + +from .file_client import FileClient +from .io import get_text + + +def list_from_file(filename, + prefix='', + offset=0, + max_num=0, + encoding='utf-8', + file_client_args=None, + backend_args=None): + """Load a text file and parse the content as a list of strings. + + ``list_from_file`` supports loading a text file which can be storaged in + different backends and parsing the content as a list for strings. + + Args: + filename (str): Filename. + prefix (str): The prefix to be inserted to the beginning of each item. + offset (int): The offset of lines. + max_num (int): The maximum number of lines to be read, + zeros and negatives mean no limitation. + encoding (str): Encoding used to open the file. Defaults to utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + New in v0.2.0. + + Examples: + >>> list_from_file('/path/of/your/file') # disk + ['hello', 'world'] + >>> list_from_file('s3://path/of/your/file') # ceph or petrel + ['hello', 'world'] + + Returns: + list[str]: A list of strings. + """ + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. 
' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at the ' + 'same time.') + cnt = 0 + item_list = [] + + if file_client_args is not None: + file_client = FileClient.infer_client(file_client_args, filename) + text = file_client.get_text(filename, encoding) + else: + text = get_text(filename, encoding, backend_args=backend_args) + + with StringIO(text) as f: + for _ in range(offset): + f.readline() + for line in f: + if 0 < max_num <= cnt: + break + item_list.append(prefix + line.rstrip('\n\r')) + cnt += 1 + return item_list + + +def dict_from_file(filename, + key_type=str, + encoding='utf-8', + file_client_args=None, + backend_args=None): + """Load a text file and parse the content as a dict. + + Each line of the text file will be two or more columns split by + whitespaces or tabs. The first column will be parsed as dict keys, and + the following columns will be parsed as dict values. + + ``dict_from_file`` supports loading a text file which can be storaged in + different backends and parsing the content as a dict. + + Args: + filename(str): Filename. + key_type(type): Type of the dict keys. str is user by default and + type conversion will be performed if specified. + encoding (str): Encoding used to open the file. Defaults to utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + ``backend_args`` instead. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + New in v0.2.0. + + Examples: + >>> dict_from_file('/path/of/your/file') # disk + {'key1': 'value1', 'key2': 'value2'} + >>> dict_from_file('s3://path/of/your/file') # ceph or petrel + {'key1': 'value1', 'key2': 'value2'} + + Returns: + dict: The parsed contents. + """ + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at the ' + 'same time.') + + mapping = {} + + if file_client_args is not None: + file_client = FileClient.infer_client(file_client_args, filename) + text = file_client.get_text(filename, encoding) + else: + text = get_text(filename, encoding, backend_args=backend_args) + + with StringIO(text) as f: + for line in f: + items = line.rstrip('\n').split() + assert len(items) >= 2 + key = key_type(items[0]) + val = items[1:] if len(items) > 2 else items[1] + mapping[key] = val + return mapping diff --git a/head_extractor/build/lib/mmengine/hooks/__init__.py b/head_extractor/build/lib/mmengine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..746be6b02a5afa130026b64e3653824e5576ea6a --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .checkpoint_hook import CheckpointHook +from .early_stopping_hook import EarlyStoppingHook +from .ema_hook import EMAHook +from .empty_cache_hook import EmptyCacheHook +from .hook import Hook +from .iter_timer_hook import IterTimerHook +from .logger_hook import LoggerHook +from .naive_visualization_hook import NaiveVisualizationHook +from .param_scheduler_hook import ParamSchedulerHook +from .profiler_hook import NPUProfilerHook, ProfilerHook +from .runtime_info_hook import RuntimeInfoHook +from .sampler_seed_hook import DistSamplerSeedHook +from .sync_buffer_hook import SyncBuffersHook +from .test_time_aug_hook import PrepareTTAHook + +__all__ = [ + 'Hook', 'IterTimerHook', 'DistSamplerSeedHook', 'ParamSchedulerHook', + 'SyncBuffersHook', 'EmptyCacheHook', 'CheckpointHook', 'LoggerHook', + 'NaiveVisualizationHook', 'EMAHook', 'RuntimeInfoHook', 'ProfilerHook', + 'PrepareTTAHook', 'NPUProfilerHook', 'EarlyStoppingHook' +] diff --git a/head_extractor/build/lib/mmengine/hooks/checkpoint_hook.py b/head_extractor/build/lib/mmengine/hooks/checkpoint_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..92a4867bb967a2eea6fd9b6024618355e5054670 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/checkpoint_hook.py @@ -0,0 +1,665 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import hashlib +import logging +import os.path as osp +import pickle +from collections import deque +from math import inf +from pathlib import Path +from typing import Callable, Dict, List, Optional, Sequence, Union + +from mmengine.dist import is_main_process, master_only +from mmengine.fileio import FileClient, get_file_backend +from mmengine.logging import print_log +from mmengine.registry import HOOKS +from mmengine.utils import is_list_of, is_seq_of +from .hook import Hook + +DATA_BATCH = Optional[Union[dict, tuple, list]] + + +@HOOKS.register_module() +class CheckpointHook(Hook): + """Save checkpoints periodically. + + Args: + interval (int): The saving period. If ``by_epoch=True``, interval + indicates epochs, otherwise it indicates iterations. + Defaults to -1, which means "never". + by_epoch (bool): Saving checkpoints by epoch or by iteration. + Defaults to True. + save_optimizer (bool): Whether to save optimizer state_dict in the + checkpoint. It is usually used for resuming experiments. + Defaults to True. + save_param_scheduler (bool): Whether to save param_scheduler state_dict + in the checkpoint. It is usually used for resuming experiments. + Defaults to True. + out_dir (str, Path, Optional): The root directory to save checkpoints. + If not specified, ``runner.work_dir`` will be used by default. If + specified, the ``out_dir`` will be the concatenation of ``out_dir`` + and the last level directory of ``runner.work_dir``. For example, + if the input ``out_dir`` is ``./tmp`` and ``runner.work_dir`` is + ``./work_dir/cur_exp``, then the ckpt will be saved in + ``./tmp/cur_exp``. Defaults to None. + max_keep_ckpts (int): The maximum number of checkpoints to keep. + In some cases we want only the latest few checkpoints and would + like to delete old ones to save disk space. + Defaults to -1, which means unlimited. + save_last (bool): Whether to force the last checkpoint to be + saved regardless of interval. Defaults to True. + save_best (str, List[str], optional): If a metric is specified, it + would measure the best checkpoint during evaluation. If a list of + metrics is passed, it would measure a group of best checkpoints + corresponding to the passed metrics. 
The information about best + checkpoint(s) would be saved in ``runner.message_hub`` to keep + best score value and best checkpoint path, which will also be + loaded when resuming from a checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Defaults to None. + rule (str, List[str], optional): Comparison rule for best score. If + set to None, it will infer a reasonable rule. Keys such as 'acc', + 'top', etc. will be inferred by the 'greater' rule. Keys containing + 'loss' will be inferred by the 'less' rule. If ``save_best`` is a list + of metrics and ``rule`` is a str, all metrics in ``save_best`` will + share the comparison rule. If ``save_best`` and ``rule`` are both + lists, their length must be the same, and metrics in ``save_best`` + will use the corresponding comparison rule in ``rule``. Options + are 'greater', 'less', None and list which contains 'greater' and + 'less'. Defaults to None. + greater_keys (List[str], optional): Metric keys that will be + inferred by the 'greater' comparison rule. If ``None``, + _default_greater_keys will be used. Defaults to None. + less_keys (List[str], optional): Metric keys that will be + inferred by the 'less' comparison rule. If ``None``, _default_less_keys + will be used. Defaults to None. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in the future. Please use + ``backend_args`` instead. + filename_tmpl (str, optional): String template to indicate checkpoint + name. If specified, must contain one and only one "{}", which will + be replaced with ``epoch + 1`` if ``by_epoch=True`` else + ``iteration + 1``. + Defaults to None, which means "epoch_{}.pth" or "iter_{}.pth" + accordingly. + backend_args (dict, optional): Arguments to instantiate the + backend corresponding to the uri prefix. Defaults to None. + `New in version 0.2.0.` + published_keys (str, List[str], optional): If ``save_last`` is ``True`` + or ``save_best`` is not ``None``, the model with the keys in the + list will automatically be published after training. + Defaults to None. + `New in version 0.7.1.` + save_begin (int): Control the epoch number or iteration number + at which checkpoint saving begins. Defaults to 0, which means + saving at the beginning. + `New in version 0.8.3.` + + Examples: + >>> # Save best based on single metric + >>> CheckpointHook(interval=2, by_epoch=True, save_best='acc', + >>> rule='less') + >>> # Save best based on multi metrics with the same comparison rule + >>> CheckpointHook(interval=2, by_epoch=True, + >>> save_best=['acc', 'mIoU'], rule='greater') + >>> # Save best based on multi metrics with different comparison rule + >>> CheckpointHook(interval=2, by_epoch=True, + >>> save_best=['FID', 'IS'], rule=['less', 'greater']) + >>> # Save best based on single metric and publish model after training + >>> CheckpointHook(interval=2, by_epoch=True, save_best='acc', + >>> rule='less', published_keys=['meta', 'state_dict']) + """ + out_dir: str + + priority = 'VERY_LOW' + + # logic to save best checkpoints + # Since the key for determining greater or less is related to the + # downstream tasks, downstream repositories may need to overwrite + # the following inner variables accordingly. 
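    # For example, a downstream repository could extend the keys inferred
    # as 'greater' by subclassing (an illustrative sketch, not part of
    # this file):
    #
    #     @HOOKS.register_module()
    #     class MyCheckpointHook(CheckpointHook):
    #         _default_greater_keys = CheckpointHook._default_greater_keys + [
    #             'PSNR', 'SSIM']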
+ + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + init_value_map = {'greater': -inf, 'less': inf} + _default_greater_keys = [ + 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', + 'mAcc', 'aAcc' + ] + _default_less_keys = ['loss'] + + def __init__(self, + interval: int = -1, + by_epoch: bool = True, + save_optimizer: bool = True, + save_param_scheduler: bool = True, + out_dir: Optional[Union[str, Path]] = None, + max_keep_ckpts: int = -1, + save_last: bool = True, + save_best: Union[str, List[str], None] = None, + rule: Union[str, List[str], None] = None, + greater_keys: Optional[Sequence[str]] = None, + less_keys: Optional[Sequence[str]] = None, + file_client_args: Optional[dict] = None, + filename_tmpl: Optional[str] = None, + backend_args: Optional[dict] = None, + published_keys: Union[str, List[str], None] = None, + save_begin: int = 0, + **kwargs) -> None: + self.interval = interval + self.by_epoch = by_epoch + self.save_optimizer = save_optimizer + self.save_param_scheduler = save_param_scheduler + self.out_dir = out_dir # type: ignore + self.max_keep_ckpts = max_keep_ckpts + self.save_last = save_last + self.args = kwargs + + if file_client_args is not None: + print_log( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', + logger='current', + level=logging.WARNING) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set ' + 'at the same time.') + + self.file_client_args = file_client_args + self.backend_args = backend_args + + if filename_tmpl is None: + if self.by_epoch: + self.filename_tmpl = 'epoch_{}.pth' + else: + self.filename_tmpl = 'iter_{}.pth' + else: + self.filename_tmpl = filename_tmpl + + # save best logic + assert (isinstance(save_best, str) or is_list_of(save_best, str) + or (save_best is None)), ( + '"save_best" should be a str or list of str or None, ' + f'but got {type(save_best)}') + + if isinstance(save_best, list): + if 'auto' in save_best: + assert len(save_best) == 1, ( + 'Only support one "auto" in "save_best" list.') + assert len(save_best) == len( + set(save_best)), ('Find duplicate element in "save_best".') + else: + # convert str to list[str] + if save_best is not None: + save_best = [save_best] # type: ignore # noqa: F401 + self.save_best = save_best + + # rule logic + assert (isinstance(rule, str) or is_list_of(rule, str) + or (rule is None)), ( + '"rule" should be a str or list of str or None, ' + f'but got {type(rule)}') + if isinstance(rule, list): + # check the length of rule list + assert len(rule) in [ + 1, + len(self.save_best) # type: ignore + ], ('Number of "rule" must be 1 or the same as number of ' + f'"save_best", but got {len(rule)}.') + else: + # convert str/None to list + rule = [rule] # type: ignore # noqa: F401 + + if greater_keys is None: + self.greater_keys = self._default_greater_keys + else: + if not isinstance(greater_keys, (list, tuple)): + greater_keys = (greater_keys, ) # type: ignore + assert is_seq_of(greater_keys, str) + self.greater_keys = greater_keys # type: ignore + + if less_keys is None: + self.less_keys = self._default_less_keys + else: + if not isinstance(less_keys, (list, tuple)): + less_keys = (less_keys, ) # type: ignore + assert is_seq_of(less_keys, str) + self.less_keys = less_keys # type: ignore + + if self.save_best is not None: + self.is_better_than: Dict[str, Callable] = dict() + self._init_rule(rule, self.save_best) + if len(self.key_indicators) == 1: + self.best_ckpt_path: 
Optional[str] = None + else: + self.best_ckpt_path_dict: Dict = dict() + + # published keys + if not (isinstance(published_keys, str) + or is_seq_of(published_keys, str) or published_keys is None): + raise TypeError( + '"published_keys" should be a str or a sequence of str or ' + f'None, but got {type(published_keys)}') + + if isinstance(published_keys, str): + published_keys = [published_keys] + elif isinstance(published_keys, (list, tuple)): + assert len(published_keys) == len(set(published_keys)), ( + 'Found duplicate elements in "published_keys".') + self.published_keys = published_keys + + self.last_ckpt = None + if save_begin < 0: + raise ValueError( + f'save_begin should not be less than 0, but got {save_begin}') + self.save_begin = save_begin + + def before_train(self, runner) -> None: + """Finish all operations related to checkpoints. + + This function will get the appropriate file client and the directory + to save the checkpoints of the model. + + Args: + runner (Runner): The runner of the training process. + """ + if self.out_dir is None: + self.out_dir = runner.work_dir + + # If self.file_client_args is None, self.file_client will not be + # used in CheckpointHook. To avoid breaking backward compatibility, + # it will not be removed until the release of MMEngine 1.0 + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + if self.file_client_args is None: + self.file_backend = get_file_backend( + self.out_dir, backend_args=self.backend_args) + else: + self.file_backend = self.file_client + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_backend.join_path( + self.out_dir, basename) # type: ignore # noqa: E501 + + runner.logger.info(f'Checkpoints will be saved to {self.out_dir}.') + + if self.save_best is not None: + if len(self.key_indicators) == 1: + if 'best_ckpt' not in runner.message_hub.runtime_info: + self.best_ckpt_path = None + else: + self.best_ckpt_path = runner.message_hub.get_info( + 'best_ckpt') + else: + for key_indicator in self.key_indicators: + best_ckpt_name = f'best_ckpt_{key_indicator}' + if best_ckpt_name not in runner.message_hub.runtime_info: + self.best_ckpt_path_dict[key_indicator] = None + else: + self.best_ckpt_path_dict[ + key_indicator] = runner.message_hub.get_info( + best_ckpt_name) + + if self.max_keep_ckpts > 0: + keep_ckpt_ids = [] + if 'keep_ckpt_ids' in runner.message_hub.runtime_info: + keep_ckpt_ids = runner.message_hub.get_info('keep_ckpt_ids') + + while len(keep_ckpt_ids) > self.max_keep_ckpts: + step = keep_ckpt_ids.pop(0) + if is_main_process(): + path = self.file_backend.join_path( + self.out_dir, self.filename_tmpl.format(step)) + if self.file_backend.isfile(path): + self.file_backend.remove(path) + elif self.file_backend.isdir(path): + # checkpoints saved by deepspeed are directories + self.file_backend.rmtree(path) + + self.keep_ckpt_ids: deque = deque(keep_ckpt_ids, + self.max_keep_ckpts) + + def after_train_epoch(self, runner) -> None: + """Save the checkpoint and synchronize buffers after each epoch. + + Args: + runner (Runner): The runner of the training process. + """ + if not self.by_epoch: + return + + # save checkpoint for the following cases: + # 1. 
every ``self.interval`` epochs which start at ``self.save_begin`` + # 2. reach the last epoch of training + if self.every_n_epochs(runner, self.interval, self.save_begin) or ( + self.save_last and self.is_last_train_epoch(runner)): + runner.logger.info( + f'Saving checkpoint at {runner.epoch + 1} epochs') + self._save_checkpoint(runner) + + def after_val_epoch(self, runner, metrics): + """Save the checkpoint and synchronize buffers after each evaluation + epoch. + + Args: + runner (Runner): The runner of the training process. + metrics (dict): Evaluation results of all metrics + """ + if len(metrics) == 0: + runner.logger.warning( + 'Since `metrics` is an empty dict, the behavior to save ' + 'the best checkpoint will be skipped in this evaluation.') + return + + self._save_best_checkpoint(runner, metrics) + + def after_train(self, runner) -> None: + """Publish the checkpoint after training. + + Args: + runner (Runner): The runner of the training process. + """ + if self.published_keys is None: + return + + if self.save_last and self.last_ckpt is not None: + self._publish_model(runner, self.last_ckpt) + + if getattr(self, 'best_ckpt_path', None) is not None: + self._publish_model(runner, str(self.best_ckpt_path)) + if getattr(self, 'best_ckpt_path_dict', None) is not None: + for best_ckpt in self.best_ckpt_path_dict.values(): + self._publish_model(runner, best_ckpt) + + @master_only + def _publish_model(self, runner, ckpt_path: str) -> None: + """Remove unnecessary keys from ckpt_path and save the new checkpoint. + + Args: + runner (Runner): The runner of the training process. + ckpt_path (str): The checkpoint path that ought to be published. + """ + from mmengine.runner import save_checkpoint + from mmengine.runner.checkpoint import _load_checkpoint + checkpoint = _load_checkpoint(ckpt_path) + assert self.published_keys is not None + removed_keys = [] + for key in list(checkpoint.keys()): + if key not in self.published_keys: + removed_keys.append(key) + checkpoint.pop(key) + if removed_keys: + print_log( + f'Key {removed_keys} will be removed because they are not ' + 'found in published_keys. 
If you want to keep them, ' + f'please set `{removed_keys}` in published_keys', + logger='current') + checkpoint_data = pickle.dumps(checkpoint) + sha = hashlib.sha256(checkpoint_data).hexdigest() + final_path = osp.splitext(ckpt_path)[0] + f'-{sha[:8]}.pth' + save_checkpoint(checkpoint, final_path) + print_log( + f'The checkpoint ({ckpt_path}) is published to ' + f'{final_path}.', + logger='current') + + def _save_checkpoint_with_step(self, runner, step, meta): + # remove other checkpoints before save checkpoint to make the + # self.keep_ckpt_ids are saved as expected + if self.max_keep_ckpts > 0: + # _save_checkpoint and _save_best_checkpoint may call this + # _save_checkpoint_with_step in one epoch + if len(self.keep_ckpt_ids) > 0 and self.keep_ckpt_ids[-1] == step: + pass + else: + if len(self.keep_ckpt_ids) == self.max_keep_ckpts: + _step = self.keep_ckpt_ids.popleft() + if is_main_process(): + ckpt_path = self.file_backend.join_path( + self.out_dir, self.filename_tmpl.format(_step)) + + if self.file_backend.isfile(ckpt_path): + self.file_backend.remove(ckpt_path) + elif self.file_backend.isdir(ckpt_path): + # checkpoints saved by deepspeed are directories + self.file_backend.rmtree(ckpt_path) + + self.keep_ckpt_ids.append(step) + runner.message_hub.update_info('keep_ckpt_ids', + list(self.keep_ckpt_ids)) + + ckpt_filename = self.filename_tmpl.format(step) + self.last_ckpt = self.file_backend.join_path(self.out_dir, + ckpt_filename) + runner.message_hub.update_info('last_ckpt', self.last_ckpt) + + runner.save_checkpoint( + self.out_dir, + ckpt_filename, + self.file_client_args, + save_optimizer=self.save_optimizer, + save_param_scheduler=self.save_param_scheduler, + meta=meta, + by_epoch=self.by_epoch, + backend_args=self.backend_args, + **self.args) + + # Model parallel-like training should involve pulling sharded states + # from all ranks, but skip the following procedure. + if not is_main_process(): + return + + save_file = osp.join(runner.work_dir, 'last_checkpoint') + with open(save_file, 'w') as f: + f.write(self.last_ckpt) # type: ignore + + def _save_checkpoint(self, runner) -> None: + """Save the current checkpoint and delete outdated checkpoint. + + Args: + runner (Runner): The runner of the training process. + """ + if self.by_epoch: + step = runner.epoch + 1 + meta = dict(epoch=step, iter=runner.iter) + else: + step = runner.iter + 1 + meta = dict(epoch=runner.epoch, iter=step) + + self._save_checkpoint_with_step(runner, step, meta=meta) + + def _save_best_checkpoint(self, runner, metrics) -> None: + """Save the current checkpoint and delete outdated checkpoint. + + Args: + runner (Runner): The runner of the training process. + metrics (dict): Evaluation results of all metrics. 
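                The keys are expected to contain the metrics listed in
                ``save_best`` (after ``'auto'`` has been resolved), e.g. a
                hypothetical ``{'coco/bbox_mAP': 0.423}``.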
+ """ + if not self.save_best: + return + + if self.by_epoch: + ckpt_filename = self.filename_tmpl.format(runner.epoch) + cur_type, cur_time = 'epoch', runner.epoch + else: + ckpt_filename = self.filename_tmpl.format(runner.iter) + cur_type, cur_time = 'iter', runner.iter + + meta = dict(epoch=runner.epoch, iter=runner.iter) + + # handle auto in self.key_indicators and self.rules before the loop + if 'auto' in self.key_indicators: + self._init_rule(self.rules, [list(metrics.keys())[0]]) + + best_ckpt_updated = False + # save best logic + # get score from messagehub + for key_indicator, rule in zip(self.key_indicators, self.rules): + key_score = metrics[key_indicator] + + if len(self.key_indicators) == 1: + best_score_key = 'best_score' + runtime_best_ckpt_key = 'best_ckpt' + best_ckpt_path = self.best_ckpt_path + else: + best_score_key = f'best_score_{key_indicator}' + runtime_best_ckpt_key = f'best_ckpt_{key_indicator}' + best_ckpt_path = self.best_ckpt_path_dict[key_indicator] + + if best_score_key not in runner.message_hub.runtime_info: + best_score = self.init_value_map[rule] + else: + best_score = runner.message_hub.get_info(best_score_key) + + if key_score is None or not self.is_better_than[key_indicator]( + key_score, best_score): + continue + + best_ckpt_updated = True + + best_score = key_score + runner.message_hub.update_info(best_score_key, best_score) + + if best_ckpt_path and is_main_process(): + is_removed = False + if self.file_backend.isfile(best_ckpt_path): + self.file_backend.remove(best_ckpt_path) + is_removed = True + elif self.file_backend.isdir(best_ckpt_path): + # checkpoints saved by deepspeed are directories + self.file_backend.rmtree(best_ckpt_path) + is_removed = True + + if is_removed: + runner.logger.info( + f'The previous best checkpoint {best_ckpt_path} ' + 'is removed') + + best_ckpt_name = f'best_{key_indicator}_{ckpt_filename}' + # Replace illegal characters for filename with `_` + best_ckpt_name = best_ckpt_name.replace('/', '_') + if len(self.key_indicators) == 1: + self.best_ckpt_path = self.file_backend.join_path( # type: ignore # noqa: E501 + self.out_dir, best_ckpt_name) + runner.message_hub.update_info(runtime_best_ckpt_key, + self.best_ckpt_path) + else: + self.best_ckpt_path_dict[ + key_indicator] = self.file_backend.join_path( # type: ignore # noqa: E501 + self.out_dir, best_ckpt_name) + runner.message_hub.update_info( + runtime_best_ckpt_key, + self.best_ckpt_path_dict[key_indicator]) + runner.save_checkpoint( + self.out_dir, + filename=best_ckpt_name, + file_client_args=self.file_client_args, + save_optimizer=False, + save_param_scheduler=False, + meta=meta, + by_epoch=False, + backend_args=self.backend_args) + runner.logger.info( + f'The best checkpoint with {best_score:0.4f} {key_indicator} ' + f'at {cur_time} {cur_type} is saved to {best_ckpt_name}.') + + # save checkpoint again to update the best_score and best_ckpt stored + # in message_hub because the checkpoint saved in `after_train_epoch` + # or `after_train_iter` stage only keep the previous best checkpoint + # not the current best checkpoint which causes the current best + # checkpoint can not be removed when resuming training. + if best_ckpt_updated and self.last_ckpt is not None: + self._save_checkpoint_with_step(runner, cur_time, meta) + + def _init_rule(self, rules, key_indicators) -> None: + """Initialize rule, key_indicator, comparison_func, and best score. If + key_indicator is a list of string and rule is a string, all metric in + the key_indicator will share the same rule. 
+ + Here is the rule to determine which rule is used for key indicator when + the rule is not specific (note that the key indicator matching is case- + insensitive): + + 1. If the key indicator is in ``self.greater_keys``, the rule + will be specified as 'greater'. + 2. Or if the key indicator is in ``self.less_keys``, the rule + will be specified as 'less'. + 3. Or if any one item in ``self.greater_keys`` is a substring of + key_indicator, the rule will be specified as 'greater'. + 4. Or if any one item in ``self.less_keys`` is a substring of + key_indicator, the rule will be specified as 'less'. + + Args: + rule (List[Optional[str]]): Comparison rule for best score. + key_indicator (List[str]): Key indicator to determine + the comparison rule. + """ + if len(rules) == 1: + rules = rules * len(key_indicators) + + self.rules = [] + for rule, key_indicator in zip(rules, key_indicators): + + if rule not in self.rule_map and rule is not None: + raise KeyError('rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None and key_indicator != 'auto': + # `_lc` here means we use the lower case of keys for + # case-insensitive matching + key_indicator_lc = key_indicator.lower() + greater_keys = {key.lower() for key in self.greater_keys} + less_keys = {key.lower() for key in self.less_keys} + + if key_indicator_lc in greater_keys: + rule = 'greater' + elif key_indicator_lc in less_keys: + rule = 'less' + elif any(key in key_indicator_lc for key in greater_keys): + rule = 'greater' + elif any(key in key_indicator_lc for key in less_keys): + rule = 'less' + else: + raise ValueError('Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + 'must be specified.') + if rule is not None: + self.is_better_than[key_indicator] = self.rule_map[rule] + self.rules.append(rule) + + self.key_indicators = key_indicators + + def after_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs=Optional[dict]) -> None: + """Save the checkpoint and synchronize buffers after each iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (dict, optional): Outputs from model. + """ + if self.by_epoch: + return + + # save checkpoint for following cases: + # 1. every ``self.interval`` iterations + # which start at ``self.save_begin`` + # 2. reach the last iteration of training + if self.every_n_train_iters(runner, self.interval, + self.save_begin) or \ + (self.save_last and + self.is_last_train_iter(runner)): + runner.logger.info( + f'Saving checkpoint at {runner.iter + 1} iterations') + self._save_checkpoint(runner) diff --git a/head_extractor/build/lib/mmengine/hooks/early_stopping_hook.py b/head_extractor/build/lib/mmengine/hooks/early_stopping_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..5533ebc84cf939a38fc12e3cb6a8e5455c2f8c3c --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/early_stopping_hook.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from math import inf, isfinite +from typing import Optional, Tuple, Union + +from mmengine.registry import HOOKS +from .hook import Hook + +DATA_BATCH = Optional[Union[dict, tuple, list]] + + +@HOOKS.register_module() +class EarlyStoppingHook(Hook): + """Early stop the training when the monitored metric reached a plateau. 
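    Concretely, the hook compares each new validation score against the
    best score seen so far and stops training once ``patience`` consecutive
    validations fail to improve on it by more than ``min_delta``. An
    illustrative configuration (the monitored key is hypothetical)::

        custom_hooks = [
            dict(type='EarlyStoppingHook', monitor='coco/bbox_mAP',
                 rule='greater', patience=10)
        ]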
+ + Args: + monitor (str): The monitored metric key to decide early stopping. + rule (str, optional): Comparison rule. Options are 'greater', + 'less'. Defaults to None. + min_delta (float, optional): Minimum difference to continue the + training. Defaults to 0.1. + strict (bool, optional): Whether to crash the training when `monitor` + is not found in the `metrics`. Defaults to False. + check_finite (bool): Whether to stop training when the monitor becomes + NaN or infinite. Defaults to True. + patience (int, optional): The number of validations with no improvement + after which training will be stopped. Defaults to 5. + stopping_threshold (float, optional): Stop training immediately once + the monitored quantity reaches this threshold. Defaults to None. + + Note: + `New in version 0.7.0.` + """ + priority = 'LOWEST' + + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + _default_greater_keys = [ + 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', + 'mAcc', 'aAcc' + ] + _default_less_keys = ['loss'] + + def __init__( + self, + monitor: str, + rule: Optional[str] = None, + min_delta: float = 0.1, + strict: bool = False, + check_finite: bool = True, + patience: int = 5, + stopping_threshold: Optional[float] = None, + ): + + self.monitor = monitor + if rule is not None: + if rule not in ['greater', 'less']: + raise ValueError( + '`rule` should be either "greater" or "less", ' + f'but got {rule}') + else: + rule = self._init_rule(monitor) + self.rule = rule + self.min_delta = min_delta if rule == 'greater' else -1 * min_delta + self.strict = strict + self.check_finite = check_finite + self.patience = patience + self.stopping_threshold = stopping_threshold + + self.wait_count = 0 + self.best_score = -inf if rule == 'greater' else inf + + def _init_rule(self, monitor: str) -> str: + greater_keys = {key.lower() for key in self._default_greater_keys} + less_keys = {key.lower() for key in self._default_less_keys} + monitor_lc = monitor.lower() + if monitor_lc in greater_keys: + rule = 'greater' + elif monitor_lc in less_keys: + rule = 'less' + elif any(key in monitor_lc for key in greater_keys): + rule = 'greater' + elif any(key in monitor_lc for key in less_keys): + rule = 'less' + else: + raise ValueError(f'Cannot infer the rule for {monitor}, thus rule ' + 'must be specified.') + return rule + + def _check_stop_condition(self, current_score: float) -> Tuple[bool, str]: + compare = self.rule_map[self.rule] + stop_training = False + reason_message = '' + + if self.check_finite and not isfinite(current_score): + stop_training = True + reason_message = (f'Monitored metric {self.monitor} = ' + f'{current_score} is infinite. ' + f'Previous best value was ' + f'{self.best_score:.3f}.') + + elif self.stopping_threshold is not None and compare( + current_score, self.stopping_threshold): + stop_training = True + self.best_score = current_score + reason_message = (f'Stopping threshold reached: ' + f'`{self.monitor}` = {current_score} is ' + f'{self.rule} than {self.stopping_threshold}.') + elif compare(self.best_score + self.min_delta, current_score): + + self.wait_count += 1 + + if self.wait_count >= self.patience: + reason_message = (f'the monitored metric did not improve ' + f'in the last {self.wait_count} records. ' + f'best score: {self.best_score:.3f}. 
') + stop_training = True + else: + self.best_score = current_score + self.wait_count = 0 + + return stop_training, reason_message + + def before_run(self, runner) -> None: + """Check `stop_training` variable in `runner.train_loop`. + + Args: + runner (Runner): The runner of the training process. + """ + + assert hasattr(runner.train_loop, 'stop_training'), \ + '`train_loop` should contain `stop_training` variable.' + + def after_val_epoch(self, runner, metrics): + """Decide whether to stop the training process. + + Args: + runner (Runner): The runner of the training process. + metrics (dict): Evaluation results of all metrics + """ + + if self.monitor not in metrics: + if self.strict: + raise RuntimeError( + 'Early stopping conditioned on metric ' + f'`{self.monitor} is not available. Please check available' + f' metrics {metrics}, or set `strict=False` in ' + '`EarlyStoppingHook`.') + warnings.warn( + 'Skip early stopping process since the evaluation ' + f'results ({metrics.keys()}) do not include `monitor` ' + f'({self.monitor}).') + return + + current_score = metrics[self.monitor] + + stop_training, message = self._check_stop_condition(current_score) + if stop_training: + runner.train_loop.stop_training = True + runner.logger.info(message) diff --git a/head_extractor/build/lib/mmengine/hooks/ema_hook.py b/head_extractor/build/lib/mmengine/hooks/ema_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..5bc1051d0b7069fa83fa4ad10454c0b6e6470716 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/ema_hook.py @@ -0,0 +1,241 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import itertools +import logging +from typing import Dict, Optional + +from mmengine.logging import print_log +from mmengine.model import is_model_wrapper +from mmengine.registry import HOOKS, MODELS +from .hook import DATA_BATCH, Hook + + +@HOOKS.register_module() +class EMAHook(Hook): + """A Hook to apply Exponential Moving Average (EMA) on the model during + training. + + Note: + - EMAHook takes priority over CheckpointHook. + - The original model parameters are actually saved in ema field after + train. + - ``begin_iter`` and ``begin_epoch`` cannot be set at the same time. + + Args: + ema_type (str): The type of EMA strategy to use. You can find the + supported strategies in :mod:`mmengine.model.averaged_model`. + Defaults to 'ExponentialMovingAverage'. + strict_load (bool): Whether to strictly enforce that the keys of + ``state_dict`` in checkpoint match the keys returned by + ``self.module.state_dict``. Defaults to False. + Changed in v0.3.0. + begin_iter (int): The number of iteration to enable ``EMAHook``. + Defaults to 0. + begin_epoch (int): The number of epoch to enable ``EMAHook``. + Defaults to 0. 
+ **kwargs: Keyword arguments passed to subclasses of + :obj:`BaseAveragedModel` + """ + + priority = 'NORMAL' + + def __init__(self, + ema_type: str = 'ExponentialMovingAverage', + strict_load: bool = False, + begin_iter: int = 0, + begin_epoch: int = 0, + **kwargs): + self.strict_load = strict_load + self.ema_cfg = dict(type=ema_type, **kwargs) + assert not (begin_iter != 0 and begin_epoch != 0), ( + '`begin_iter` and `begin_epoch` should not be both set.') + assert begin_iter >= 0, ( + '`begin_iter` must larger than or equal to 0, ' + f'but got begin_iter: {begin_iter}') + assert begin_epoch >= 0, ( + '`begin_epoch` must larger than or equal to 0, ' + f'but got begin_epoch: {begin_epoch}') + self.begin_iter = begin_iter + self.begin_epoch = begin_epoch + # If `begin_epoch` and `begin_iter` are not set, `EMAHook` will be + # enabled at 0 iteration. + self.enabled_by_epoch = self.begin_epoch > 0 + + def before_run(self, runner) -> None: + """Create an ema copy of the model. + + Args: + runner (Runner): The runner of the training process. + """ + model = runner.model + if is_model_wrapper(model): + model = model.module + self.src_model = model + self.ema_model = MODELS.build( + self.ema_cfg, default_args=dict(model=self.src_model)) + + def before_train(self, runner) -> None: + """Check the begin_epoch/iter is smaller than max_epochs/iters. + + Args: + runner (Runner): The runner of the training process. + """ + if self.enabled_by_epoch: + assert self.begin_epoch <= runner.max_epochs, ( + 'self.begin_epoch should be smaller than or equal to ' + f'runner.max_epochs: {runner.max_epochs}, but got ' + f'begin_epoch: {self.begin_epoch}') + else: + assert self.begin_iter <= runner.max_iters, ( + 'self.begin_iter should be smaller than or equal to ' + f'runner.max_iters: {runner.max_iters}, but got ' + f'begin_iter: {self.begin_iter}') + + def after_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[dict] = None) -> None: + """Update ema parameter. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (Sequence[dict], optional): Data from dataloader. + Defaults to None. + outputs (dict, optional): Outputs from model. Defaults to None. + """ + if self._ema_started(runner): + self.ema_model.update_parameters(self.src_model) + else: + ema_params = self.ema_model.module.state_dict() + src_params = self.src_model.state_dict() + for k, p in ema_params.items(): + p.data.copy_(src_params[k].data) + + def before_val_epoch(self, runner) -> None: + """We load parameter values from ema model to source model before + validation. + + Args: + runner (Runner): The runner of the training process. + """ + self._swap_ema_parameters() + + def after_val_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """We recover source model's parameter from ema model after validation. + + Args: + runner (Runner): The runner of the validation process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on validation dataset. The keys are the names of the + metrics, and the values are corresponding results. + """ + self._swap_ema_parameters() + + def before_test_epoch(self, runner) -> None: + """We load parameter values from ema model to source model before test. + + Args: + runner (Runner): The runner of the training process. 
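            Note that parameters are swapped rather than copied, so the
            matching swap in ``after_test_epoch`` restores the original
            training weights.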
+ """ + self._swap_ema_parameters() + + def after_test_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """We recover source model's parameter from ema model after test. + + Args: + runner (Runner): The runner of the testing process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on test dataset. The keys are the names of the + metrics, and the values are corresponding results. + """ + self._swap_ema_parameters() + + def before_save_checkpoint(self, runner, checkpoint: dict) -> None: + """Save ema parameters to checkpoint. + + Args: + runner (Runner): The runner of the testing process. + """ + checkpoint['ema_state_dict'] = self.ema_model.state_dict() + # Save ema parameters to the source model's state dict so that we + # can directly load the averaged model weights for deployment. + # Swapping the state_dict key-values instead of swapping model + # parameters because the state_dict is a shallow copy of model + # parameters. + self._swap_ema_state_dict(checkpoint) + + def after_load_checkpoint(self, runner, checkpoint: dict) -> None: + """Resume ema parameters from checkpoint. + + Args: + runner (Runner): The runner of the testing process. + """ + from mmengine.runner.checkpoint import load_state_dict + if 'ema_state_dict' in checkpoint and runner._resume: + # The original model parameters are actually saved in ema + # field swap the weights back to resume ema state. + self._swap_ema_state_dict(checkpoint) + self.ema_model.load_state_dict( + checkpoint['ema_state_dict'], strict=self.strict_load) + + # Support load checkpoint without ema state dict. + else: + if runner._resume: + print_log( + 'There is no `ema_state_dict` in checkpoint. ' + '`EMAHook` will make a copy of `state_dict` as the ' + 'initial `ema_state_dict`', 'current', logging.WARNING) + load_state_dict( + self.ema_model.module, + copy.deepcopy(checkpoint['state_dict']), + strict=self.strict_load) + + def _swap_ema_parameters(self) -> None: + """Swap the parameter of model with ema_model.""" + avg_param = ( + itertools.chain(self.ema_model.module.parameters(), + self.ema_model.module.buffers()) + if self.ema_model.update_buffers else + self.ema_model.module.parameters()) + src_param = ( + itertools.chain(self.src_model.parameters(), + self.src_model.buffers()) + if self.ema_model.update_buffers else self.src_model.parameters()) + for p_avg, p_src in zip(avg_param, src_param): + tmp = p_avg.data.clone() + p_avg.data.copy_(p_src.data) + p_src.data.copy_(tmp) + + def _swap_ema_state_dict(self, checkpoint): + """Swap the state dict values of model with ema_model.""" + model_state = checkpoint['state_dict'] + ema_state = checkpoint['ema_state_dict'] + for k in ema_state: + if k[:7] == 'module.': + tmp = ema_state[k] + ema_state[k] = model_state[k[7:]] + model_state[k[7:]] = tmp + + def _ema_started(self, runner) -> bool: + """Whether ``EMAHook`` has been initialized at current iteration or + epoch. + + :attr:`ema_model` will be initialized when ``runner.iter`` or + ``runner.epoch`` is greater than ``self.begin`` for the first time. + + Args: + runner (Runner): Runner of the training, validation process. + + Returns: + bool: Whether ``EMAHook`` has been initialized. 
+ """ + if self.enabled_by_epoch: + return runner.epoch + 1 >= self.begin_epoch + else: + return runner.iter + 1 >= self.begin_iter diff --git a/head_extractor/build/lib/mmengine/hooks/empty_cache_hook.py b/head_extractor/build/lib/mmengine/hooks/empty_cache_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..9a92cdebfef11ce5c9f9bf286d735b8e6202a85f --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/empty_cache_hook.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence, Union + +import torch + +from mmengine.registry import HOOKS +from ..device import is_cuda_available, is_musa_available +from .hook import Hook + +DATA_BATCH = Optional[Union[dict, tuple, list]] + + +@HOOKS.register_module() +class EmptyCacheHook(Hook): + """Releases all unoccupied cached GPU memory during the process of + training. + + Args: + before_epoch (bool): Whether to release cache before an epoch. Defaults + to False. + after_epoch (bool): Whether to release cache after an epoch. Defaults + to True. + after_iter (bool): Whether to release cache after an iteration. + Defaults to False. + """ + + priority = 'NORMAL' + + def __init__(self, + before_epoch: bool = False, + after_epoch: bool = True, + after_iter: bool = False) -> None: + self._do_before_epoch = before_epoch + self._do_after_epoch = after_epoch + self._do_after_iter = after_iter + + def _after_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Union[dict, Sequence]] = None, + mode: str = 'train') -> None: + """Empty cache after an iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (dict or sequence, optional): Outputs from model. + mode (str): Current mode of runner. Defaults to 'train'. + """ + if self._do_after_iter: + if is_cuda_available(): + torch.cuda.empty_cache() + elif is_musa_available(): + torch.musa.empty_cache() + + def _before_epoch(self, runner, mode: str = 'train') -> None: + """Empty cache before an epoch. + + Args: + runner (Runner): The runner of the training process. + mode (str): Current mode of runner. Defaults to 'train'. + """ + if self._do_before_epoch: + if is_cuda_available(): + torch.cuda.empty_cache() + elif is_musa_available(): + torch.musa.empty_cache() + + def _after_epoch(self, runner, mode: str = 'train') -> None: + """Empty cache after an epoch. + + Args: + runner (Runner): The runner of the training process. + mode (str): Current mode of runner. Defaults to 'train'. + """ + if self._do_after_epoch: + if is_cuda_available(): + torch.cuda.empty_cache() + elif is_musa_available(): + torch.musa.empty_cache() diff --git a/head_extractor/build/lib/mmengine/hooks/hook.py b/head_extractor/build/lib/mmengine/hooks/hook.py new file mode 100644 index 0000000000000000000000000000000000000000..4e1c4ce8bd28f450f911d0e92f9ebceb6dae4014 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/hook.py @@ -0,0 +1,449 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Sequence, Union + +from mmengine import is_method_overridden + +DATA_BATCH = Optional[Union[dict, tuple, list]] + + +class Hook: + """Base hook class. + + All hooks should inherit from this class. 
+ """ + + priority = 'NORMAL' + stages = ('before_run', 'after_load_checkpoint', 'before_train', + 'before_train_epoch', 'before_train_iter', 'after_train_iter', + 'after_train_epoch', 'before_val', 'before_val_epoch', + 'before_val_iter', 'after_val_iter', 'after_val_epoch', + 'after_val', 'before_save_checkpoint', 'after_train', + 'before_test', 'before_test_epoch', 'before_test_iter', + 'after_test_iter', 'after_test_epoch', 'after_test', 'after_run') + + def before_run(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before the training validation or testing process. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + """ + + def after_run(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before the training validation or testing process. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + """ + + def before_train(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before train. + + Args: + runner (Runner): The runner of the training process. + """ + + def after_train(self, runner) -> None: + """All subclasses should override this method, if they need any + operations after train. + + Args: + runner (Runner): The runner of the training process. + """ + + def before_val(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before validation. + + Args: + runner (Runner): The runner of the validation process. + """ + + def after_val(self, runner) -> None: + """All subclasses should override this method, if they need any + operations after validation. + + Args: + runner (Runner): The runner of the validation process. + """ + + def before_test(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before testing. + + Args: + runner (Runner): The runner of the testing process. + """ + + def after_test(self, runner) -> None: + """All subclasses should override this method, if they need any + operations after testing. + + Args: + runner (Runner): The runner of the testing process. + """ + + def before_save_checkpoint(self, runner, checkpoint: dict) -> None: + """All subclasses should override this method, if they need any + operations before saving the checkpoint. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + checkpoint (dict): Model's checkpoint. + """ + + def after_load_checkpoint(self, runner, checkpoint: dict) -> None: + """All subclasses should override this method, if they need any + operations after loading the checkpoint. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + checkpoint (dict): Model's checkpoint. + """ + + def before_train_epoch(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before each training epoch. + + Args: + runner (Runner): The runner of the training process. + """ + self._before_epoch(runner, mode='train') + + def before_val_epoch(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before each validation epoch. + + Args: + runner (Runner): The runner of the validation process. 
+ """ + self._before_epoch(runner, mode='val') + + def before_test_epoch(self, runner) -> None: + """All subclasses should override this method, if they need any + operations before each test epoch. + + Args: + runner (Runner): The runner of the testing process. + """ + self._before_epoch(runner, mode='test') + + def after_train_epoch(self, runner) -> None: + """All subclasses should override this method, if they need any + operations after each training epoch. + + Args: + runner (Runner): The runner of the training process. + """ + self._after_epoch(runner, mode='train') + + def after_val_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """All subclasses should override this method, if they need any + operations after each validation epoch. + + Args: + runner (Runner): The runner of the validation process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on validation dataset. The keys are the names of the + metrics, and the values are corresponding results. + """ + self._after_epoch(runner, mode='val') + + def after_test_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """All subclasses should override this method, if they need any + operations after each test epoch. + + Args: + runner (Runner): The runner of the testing process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on test dataset. The keys are the names of the + metrics, and the values are corresponding results. + """ + self._after_epoch(runner, mode='test') + + def before_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None) -> None: + """All subclasses should override this method, if they need any + operations before each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + """ + self._before_iter( + runner, batch_idx=batch_idx, data_batch=data_batch, mode='train') + + def before_val_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None) -> None: + """All subclasses should override this method, if they need any + operations before each validation iteration. + + Args: + runner (Runner): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict, optional): Data from dataloader. + Defaults to None. + """ + self._before_iter( + runner, batch_idx=batch_idx, data_batch=data_batch, mode='val') + + def before_test_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None) -> None: + """All subclasses should override this method, if they need any + operations before each test iteration. + + Args: + runner (Runner): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + Defaults to None. + """ + self._before_iter( + runner, batch_idx=batch_idx, data_batch=data_batch, mode='test') + + def after_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[dict] = None) -> None: + """All subclasses should override this method, if they need any + operations after each training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. 
+ data_batch (dict, tuple or list, optional): Data from dataloader. + outputs (dict, optional): Outputs from model. + """ + self._after_iter( + runner, + batch_idx=batch_idx, + data_batch=data_batch, + outputs=outputs, + mode='train') + + def after_val_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Sequence] = None) -> None: + """All subclasses should override this method, if they need any + operations after each validation iteration. + + Args: + runner (Runner): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (Sequence, optional): Outputs from model. + """ + self._after_iter( + runner, + batch_idx=batch_idx, + data_batch=data_batch, + outputs=outputs, + mode='val') + + def after_test_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Sequence] = None) -> None: + """All subclasses should override this method, if they need any + operations after each test iteration. + + Args: + runner (Runner): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (Sequence, optional): Outputs from model. + """ + self._after_iter( + runner, + batch_idx=batch_idx, + data_batch=data_batch, + outputs=outputs, + mode='test') + + def _before_epoch(self, runner, mode: str = 'train') -> None: + """All subclasses should override this method, if they need any + operations before each epoch. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + mode (str): Current mode of runner. Defaults to 'train'. + """ + + def _after_epoch(self, runner, mode: str = 'train') -> None: + """All subclasses should override this method, if they need any + operations after each epoch. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + mode (str): Current mode of runner. Defaults to 'train'. + """ + + def _before_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + mode: str = 'train') -> None: + """All subclasses should override this method, if they need any + operations before each iteration. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + batch_idx (int): The index of the current batch in the loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + mode (str): Current mode of runner. Defaults to 'train'. + """ + + def _after_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Union[Sequence, dict]] = None, + mode: str = 'train') -> None: + """All subclasses should override this method, if they need any + operations after each iteration. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + batch_idx (int): The index of the current batch in the loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (dict or Sequence, optional): Outputs from model. + mode (str): Current mode of runner. Defaults to 'train'. + """ + + def every_n_epochs(self, runner, n: int, start: int = 0) -> bool: + """Test whether current epoch can be evenly divided by n. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + n (int): Whether current epoch can be evenly divided by n. 
+ start (int): Starting from `start` to check the logic for + every n epochs. Defaults to 0. + + Returns: + bool: Whether current epoch can be evenly divided by n. + """ + dividend = runner.epoch + 1 - start + return dividend % n == 0 if dividend >= 0 and n > 0 else False + + def every_n_inner_iters(self, batch_idx: int, n: int) -> bool: + """Test whether current inner iteration can be evenly divided by n. + + Args: + batch_idx (int): Current batch index of the training, validation + or testing loop. + n (int): Whether current inner iteration can be evenly + divided by n. + + Returns: + bool: Whether current inner iteration can be evenly + divided by n. + """ + return (batch_idx + 1) % n == 0 if n > 0 else False + + def every_n_train_iters(self, runner, n: int, start: int = 0) -> bool: + """Test whether current training iteration can be evenly divided by n. + + Args: + runner (Runner): The runner of the training, validation or testing + process. + n (int): Whether current iteration can be evenly divided by n. + start (int): Starting from `start` to check the logic for + every n iterations. Defaults to 0. + + Returns: + bool: Return True if the current iteration can be evenly divided + by n, otherwise False. + """ + dividend = runner.iter + 1 - start + return dividend % n == 0 if dividend >= 0 and n > 0 else False + + def end_of_epoch(self, dataloader, batch_idx: int) -> bool: + """Check whether the current iteration reaches the last iteration of + the dataloader. + + Args: + dataloader (Dataloader): The dataloader of the training, + validation or testing process. + batch_idx (int): The index of the current batch in the loop. + Returns: + bool: Whether reaches the end of current epoch or not. + """ + return batch_idx + 1 == len(dataloader) + + def is_last_train_epoch(self, runner) -> bool: + """Test whether current epoch is the last train epoch. + + Args: + runner (Runner): The runner of the training process. + + Returns: + bool: Whether reaches the end of training epoch. + """ + return runner.epoch + 1 == runner.max_epochs + + def is_last_train_iter(self, runner) -> bool: + """Test whether current iteration is the last train iteration. + + Args: + runner (Runner): The runner of the training process. + + Returns: + bool: Whether current iteration is the last train iteration. + """ + return runner.iter + 1 == runner.max_iters + + def get_triggered_stages(self) -> list: + """Get all triggered stages with method name of the hook. + + Returns: + list: List of triggered stages. + """ + trigger_stages = set() + for stage in Hook.stages: + if is_method_overridden(stage, Hook, self): + trigger_stages.add(stage) + + # some methods will be triggered in multi stages + # use this dict to map method to stages. 
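        # For example, a hook that only overrides ``_after_iter`` is
        # triggered at 'after_train_iter', 'after_val_iter' and
        # 'after_test_iter' through this mapping.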
+ method_stages_map = { + '_before_epoch': + ['before_train_epoch', 'before_val_epoch', 'before_test_epoch'], + '_after_epoch': + ['after_train_epoch', 'after_val_epoch', 'after_test_epoch'], + '_before_iter': + ['before_train_iter', 'before_val_iter', 'before_test_iter'], + '_after_iter': + ['after_train_iter', 'after_val_iter', 'after_test_iter'], + } + + for method, map_stages in method_stages_map.items(): + if is_method_overridden(method, Hook, self): + trigger_stages.update(map_stages) + + return list(trigger_stages) diff --git a/head_extractor/build/lib/mmengine/hooks/iter_timer_hook.py b/head_extractor/build/lib/mmengine/hooks/iter_timer_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..5632c2b25e03b5773ec694452535be1ecb661d19 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/iter_timer_hook.py @@ -0,0 +1,107 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time +from typing import Optional, Sequence, Union + +from mmengine.registry import HOOKS +from .hook import Hook + +DATA_BATCH = Optional[Union[dict, tuple, list]] + + +@HOOKS.register_module() +class IterTimerHook(Hook): + """A hook that logs the time spent during iteration. + + E.g. ``data_time`` for loading data and ``time`` for a model train step. + """ + + priority = 'NORMAL' + + def __init__(self): + self.time_sec_tot = 0 + self.time_sec_test_val = 0 + self.start_iter = 0 + + def before_train(self, runner) -> None: + """Synchronize the number of iterations with the runner after resuming + from checkpoints. + + Args: + runner: The runner of the training, validation or testing + process. + """ + self.start_iter = runner.iter + + def _before_epoch(self, runner, mode: str = 'train') -> None: + """Record timestamp before start an epoch. + + Args: + runner (Runner): The runner of the training validation and + testing process. + mode (str): Current mode of runner. Defaults to 'train'. + """ + self.t = time.time() + + def _after_epoch(self, runner, mode: str = 'train') -> None: + self.time_sec_test_val = 0 + + def _before_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + mode: str = 'train') -> None: + """Calculating time for loading data and updating "data_time" + ``HistoryBuffer`` of ``runner.message_hub``. + + Args: + runner (Runner): The runner of the training, validation and + testing process. + batch_idx (int): The index of the current batch in the loop. + data_batch (dict or tuple or list, optional): Data from + dataloader. + mode (str): Current mode of runner. Defaults to 'train'. + """ + # Update data loading time in `runner.message_hub`. + runner.message_hub.update_scalar(f'{mode}/data_time', + time.time() - self.t) + + def _after_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Union[dict, Sequence]] = None, + mode: str = 'train') -> None: + """Calculating time for an iteration and updating "time" + ``HistoryBuffer`` of ``runner.message_hub``. + + Args: + runner (Runner): The runner of the training validation and + testing process. + batch_idx (int): The index of the current batch in the loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (dict or sequence, optional): Outputs from model. + mode (str): Current mode of runner. Defaults to 'train'. + """ + # Update iteration time in `runner.message_hub`. 
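In the ``_after_iter`` body that follows, the train branch keeps a running sum of iteration times and projects it over the remaining iterations. The arithmetic in isolation (all values invented for illustration)::

    time_sec_tot = 42.0  # accumulated train-iteration time since resuming
    start_iter, cur_iter, max_iters = 0, 99, 1000
    time_sec_avg = time_sec_tot / (cur_iter - start_iter + 1)  # 0.42 s/iter
    eta_sec = time_sec_avg * (max_iters - cur_iter - 1)        # 378.0 s to go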
+        message_hub = runner.message_hub
+        message_hub.update_scalar(f'{mode}/time', time.time() - self.t)
+        self.t = time.time()
+        iter_time = message_hub.get_scalar(f'{mode}/time')
+        if mode == 'train':
+            self.time_sec_tot += iter_time.current()
+            # Calculate the average iteration time.
+            time_sec_avg = self.time_sec_tot / (
+                runner.iter - self.start_iter + 1)
+            # Calculate eta.
+            eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
+            runner.message_hub.update_info('eta', eta_sec)
+        else:
+            if mode == 'val':
+                cur_dataloader = runner.val_dataloader
+            else:
+                cur_dataloader = runner.test_dataloader
+
+            self.time_sec_test_val += iter_time.current()
+            time_sec_avg = self.time_sec_test_val / (batch_idx + 1)
+            eta_sec = time_sec_avg * (len(cur_dataloader) - batch_idx - 1)
+            runner.message_hub.update_info('eta', eta_sec)
diff --git a/head_extractor/build/lib/mmengine/hooks/logger_hook.py b/head_extractor/build/lib/mmengine/hooks/logger_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa0b79dcf93cf9380a464a8ca50fda4b3d61c93e
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/hooks/logger_hook.py
@@ -0,0 +1,355 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import os
+import os.path as osp
+from collections import OrderedDict
+from pathlib import Path
+from typing import Dict, Optional, Sequence, Union
+
+import numpy as np
+import torch
+
+from mmengine.fileio import FileClient, dump
+from mmengine.fileio.io import get_file_backend
+from mmengine.hooks import Hook
+from mmengine.logging import print_log
+from mmengine.registry import HOOKS
+from mmengine.utils import is_seq_of, scandir
+
+DATA_BATCH = Optional[Union[dict, tuple, list]]
+SUFFIX_TYPE = Union[Sequence[str], str]
+
+
+@HOOKS.register_module()
+class LoggerHook(Hook):
+    """Collect logs from different components of ``Runner`` and write them to
+    terminal, JSON file, tensorboard, wandb, etc.
+
+    ``LoggerHook`` is used to record logs formatted by ``LogProcessor`` during
+    training/validation/testing phase. It is used to control the following
+    behaviors:
+
+    - The frequency of log updates in terminal, local files, tensorboard,
+      wandb, etc.
+    - The frequency of showing experiment information in terminal.
+    - The work directory to save logs.
+
+    Args:
+        interval (int): Logging interval (every k iterations).
+            Defaults to 10.
+        ignore_last (bool): Ignore the log of last iterations in each epoch if
+            the number of remaining iterations is less than :attr:`interval`.
+            Defaults to True.
+        interval_exp_name (int): Logging interval for experiment name. This
+            feature is to help users conveniently get the experiment
+            information from screen or log file. Defaults to 1000.
+        out_dir (str or Path, optional): The root directory to save
+            logs. If not specified, ``runner.work_dir`` will be used
+            by default. If specified, the ``out_dir`` will be the concatenation
+            of ``out_dir`` and the last level directory of ``runner.work_dir``.
+            For example, if the input ``out_dir`` is ``./tmp`` and
+            ``runner.work_dir`` is ``./work_dir/cur_exp``, then the log will be
+            saved in ``./tmp/cur_exp``. Defaults to None.
+        out_suffix (Tuple[str] or str): Those files in ``runner._log_dir``
+            ending with ``out_suffix`` will be copied to ``out_dir``. Defaults
+            to ('.json', '.log', '.py', 'yaml').
+        keep_local (bool): Whether to keep local logs in the local machine
+            when :attr:`out_dir` is specified. If False, the local log will be
+            removed. Defaults to True.
+        file_client_args (dict, optional): Arguments to instantiate a
+            FileClient. See :class:`mmengine.fileio.FileClient` for details.
+            Defaults to None. It will be deprecated in the future. Please use
+            `backend_args` instead.
+        log_metric_by_epoch (bool): Whether to output metrics in the
+            validation step by epoch. It should be True when running in an
+            epoch-based runner.
+            If set to True, `after_val_epoch` will set `step` to self.epoch in
+            `runner.visualizer.add_scalars`. Otherwise `step` will be
+            self.iter. Defaults to True.
+        backend_args (dict, optional): Arguments to instantiate the backend
+            corresponding to the prefix of the uri. Defaults to None.
+            New in v0.2.0.
+
+    Examples:
+        >>> # The simplest LoggerHook config.
+        >>> logger_hook_cfg = dict(interval=20)
+    """
+    priority = 'BELOW_NORMAL'
+
+    def __init__(self,
+                 interval: int = 10,
+                 ignore_last: bool = True,
+                 interval_exp_name: int = 1000,
+                 out_dir: Optional[Union[str, Path]] = None,
+                 out_suffix: SUFFIX_TYPE = ('.json', '.log', '.py', 'yaml'),
+                 keep_local: bool = True,
+                 file_client_args: Optional[dict] = None,
+                 log_metric_by_epoch: bool = True,
+                 backend_args: Optional[dict] = None):
+
+        if not isinstance(interval, int):
+            raise TypeError('interval must be an integer')
+        if interval <= 0:
+            raise ValueError('interval must be greater than 0')
+
+        if not isinstance(ignore_last, bool):
+            raise TypeError('ignore_last must be a boolean')
+
+        if not isinstance(interval_exp_name, int):
+            raise TypeError('interval_exp_name must be an integer')
+        if interval_exp_name <= 0:
+            raise ValueError('interval_exp_name must be greater than 0')
+
+        if out_dir is not None and not isinstance(out_dir, (str, Path)):
+            raise TypeError('out_dir must be a str or Path object')
+
+        if not isinstance(keep_local, bool):
+            raise TypeError('keep_local must be a boolean')
+
+        if out_dir is None and file_client_args is not None:
+            raise ValueError(
+                'file_client_args should be "None" when `out_dir` is not '
+                'specified.')
+
+        if file_client_args is not None:
+            print_log(
+                '"file_client_args" will be deprecated in the future. '
+                'Please use "backend_args" instead',
+                logger='current',
+                level=logging.WARNING)
+            if backend_args is not None:
+                raise ValueError(
+                    '"file_client_args" and "backend_args" cannot be set '
+                    'at the same time.')
+
+        if not (isinstance(out_suffix, str) or is_seq_of(out_suffix, str)):
+            raise TypeError('out_suffix should be a string or a sequence of '
+                            f'string, but got {type(out_suffix)}')
+
+        self.out_suffix = out_suffix
+        self.out_dir = out_dir
+        self.interval = interval
+        self.ignore_last = ignore_last
+        self.interval_exp_name = interval_exp_name
+        self.keep_local = keep_local
+        self.file_client_args = file_client_args
+        self.json_log_path: Optional[str] = None
+
+        if self.out_dir is not None:
+            self.file_client = FileClient.infer_client(file_client_args,
+                                                       self.out_dir)
+            if file_client_args is None:
+                self.file_backend = get_file_backend(
+                    self.out_dir, backend_args=backend_args)
+            else:
+                self.file_backend = self.file_client
+
+        self.log_metric_by_epoch = log_metric_by_epoch
+
+    def before_run(self, runner) -> None:
+        """Infer the final ``self.out_dir`` and initialize
+        ``self.json_log_path``.
+
+        Args:
+            runner (Runner): The runner of the training process.
+ """ + if self.out_dir is not None: + # The final `self.out_dir` is the concatenation of `self.out_dir` + # and the last level directory of `runner.work_dir` + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_backend.join_path(self.out_dir, basename) + runner.logger.info( + f'Text logs will be saved to {self.out_dir} after the ' + 'training process.') + + self.json_log_path = f'{runner.timestamp}.json' + + def after_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[dict] = None) -> None: + """Record logs after training iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (dict tuple or list, optional): Data from dataloader. + outputs (dict, optional): Outputs from model. + """ + # Print experiment name every n iterations. + if self.every_n_train_iters( + runner, self.interval_exp_name) or (self.end_of_epoch( + runner.train_dataloader, batch_idx)): + exp_info = f'Exp name: {runner.experiment_name}' + runner.logger.info(exp_info) + if self.every_n_inner_iters(batch_idx, self.interval): + tag, log_str = runner.log_processor.get_log_after_iter( + runner, batch_idx, 'train') + elif (self.end_of_epoch(runner.train_dataloader, batch_idx) + and (not self.ignore_last + or len(runner.train_dataloader) <= self.interval)): + # `runner.max_iters` may not be divisible by `self.interval`. if + # `self.ignore_last==True`, the log of remaining iterations will + # be recorded (Epoch [4][1000/1007], the logs of 998-1007 + # iterations will be recorded). + tag, log_str = runner.log_processor.get_log_after_iter( + runner, batch_idx, 'train') + else: + return + runner.logger.info(log_str) + runner.visualizer.add_scalars( + tag, step=runner.iter + 1, file_path=self.json_log_path) + + def after_val_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Sequence] = None) -> None: + """Record logs after validation iteration. + + Args: + runner (Runner): The runner of the validation process. + batch_idx (int): The index of the current batch in the validation + loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + Defaults to None. + outputs (sequence, optional): Outputs from model. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + _, log_str = runner.log_processor.get_log_after_iter( + runner, batch_idx, 'val') + runner.logger.info(log_str) + + def after_test_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Sequence] = None) -> None: + """Record logs after testing iteration. + + Args: + runner (Runner): The runner of the testing process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (sequence, optional): Outputs from model. + """ + if self.every_n_inner_iters(batch_idx, self.interval): + _, log_str = runner.log_processor.get_log_after_iter( + runner, batch_idx, 'test') + runner.logger.info(log_str) + + def after_val_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """All subclasses should override this method, if they need any + operations after each validation epoch. + + Args: + runner (Runner): The runner of the validation process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on validation dataset. 
The keys are the names of the + metrics, and the values are corresponding results. + """ + tag, log_str = runner.log_processor.get_log_after_epoch( + runner, len(runner.val_dataloader), 'val') + runner.logger.info(log_str) + if self.log_metric_by_epoch: + # Accessing the epoch attribute of the runner will trigger + # the construction of the train_loop. Therefore, to avoid + # triggering the construction of the train_loop during + # validation, check before accessing the epoch. + if (isinstance(runner._train_loop, dict) + or runner._train_loop is None): + epoch = 0 + else: + epoch = runner.epoch + runner.visualizer.add_scalars( + tag, step=epoch, file_path=self.json_log_path) + else: + if (isinstance(runner._train_loop, dict) + or runner._train_loop is None): + iter = 0 + else: + iter = runner.iter + runner.visualizer.add_scalars( + tag, step=iter, file_path=self.json_log_path) + + def after_test_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """All subclasses should override this method, if they need any + operations after each test epoch. + + Args: + runner (Runner): The runner of the testing process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on test dataset. The keys are the names of the + metrics, and the values are corresponding results. + """ + tag, log_str = runner.log_processor.get_log_after_epoch( + runner, len(runner.test_dataloader), 'test', with_non_scalar=True) + runner.logger.info(log_str) + dump( + self._process_tags(tag), + osp.join(runner.log_dir, self.json_log_path)) # type: ignore + + @staticmethod + def _process_tags(tags: dict): + """Convert tag values to json-friendly type.""" + + def process_val(value): + if isinstance(value, (list, tuple)): + # Array type of json + return [process_val(item) for item in value] + elif isinstance(value, dict): + # Object type of json + return {k: process_val(v) for k, v in value.items()} + elif isinstance(value, (str, int, float, bool)) or value is None: + # Other supported type of json + return value + elif isinstance(value, (torch.Tensor, np.ndarray)): + return value.tolist() + # Drop unsupported values. + + processed_tags = OrderedDict(process_val(tags)) + + return processed_tags + + def after_run(self, runner) -> None: + """Copy logs to ``self.out_dir`` if ``self.out_dir is not None`` + + Args: + runner (Runner): The runner of the training/testing/validation + process. + """ + # close the visualizer + runner.visualizer.close() + + # copy or upload logs to self.out_dir + if self.out_dir is None: + return + + removed_files = [] + for filename in scandir(runner._log_dir, self.out_suffix, True): + local_filepath = osp.join(runner._log_dir, filename) + removed_files.append(local_filepath) + out_filepath = self.file_backend.join_path(self.out_dir, filename) + with open(local_filepath) as f: + self.file_backend.put_text(f.read(), out_filepath) + + runner.logger.info( + f'The file {local_filepath} has been uploaded to ' + f'{out_filepath}.') + + if not self.keep_local: + runner.logger.info(f'{local_filepath} was removed due to the ' + '`self.keep_local=False`. You can check ' + f'the running logs in {out_filepath}') + + if not self.keep_local: + # Close file handler to avoid PermissionError on Windows. 
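In a runner config, the hook above is typically enabled through the ``default_hooks`` field; a minimal sketch (the ``./tmp`` path is illustrative)::

    default_hooks = dict(
        logger=dict(
            type='LoggerHook',
            interval=20,        # log every 20 train iterations
            out_dir='./tmp',    # copy logs matching out_suffix here after the run
            keep_local=False))  # drop the local copies once uploaded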
+ for handler in runner.logger.handlers: + if isinstance(handler, logging.FileHandler): + handler.close() + + for file in removed_files: + os.remove(file) diff --git a/head_extractor/build/lib/mmengine/hooks/naive_visualization_hook.py b/head_extractor/build/lib/mmengine/hooks/naive_visualization_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..fcb803a20fa47fa8cffd0a9c0defb2bbca87d1f8 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/naive_visualization_hook.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import Optional, Sequence, Tuple, Union + +import cv2 +import numpy as np + +from mmengine.hooks import Hook +from mmengine.registry import HOOKS +from mmengine.utils.dl_utils import tensor2imgs + +DATA_BATCH = Optional[Union[dict, tuple, list]] + + +# TODO: Due to interface changes, the current class +# functions incorrectly +@HOOKS.register_module() +class NaiveVisualizationHook(Hook): + """Show or Write the predicted results during the process of testing. + + Args: + interval (int): Visualization interval. Defaults to 1. + draw_gt (bool): Whether to draw the ground truth. Defaults to True. + draw_pred (bool): Whether to draw the predicted result. + Defaults to True. + """ + priority = 'NORMAL' + + def __init__(self, + interval: int = 1, + draw_gt: bool = True, + draw_pred: bool = True): + self.draw_gt = draw_gt + self.draw_pred = draw_pred + self._interval = interval + + def _unpad(self, input: np.ndarray, unpad_shape: Tuple[int, + int]) -> np.ndarray: + """Unpad the input image. + + Args: + input (np.ndarray): The image to unpad. + unpad_shape (tuple): The shape of image before padding. + + Returns: + np.ndarray: The image before padding. + """ + unpad_width, unpad_height = unpad_shape + unpad_image = input[:unpad_height, :unpad_width] + return unpad_image + + def before_train(self, runner) -> None: + """Call add_graph method of visualizer. + + Args: + runner (Runner): The runner of the training process. + """ + runner.visualizer.add_graph(runner.model, None) + + def after_test_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[Sequence] = None) -> None: + """Show or Write the predicted results. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the test loop. + data_batch (dict or tuple or list, optional): Data from dataloader. + outputs (Sequence, optional): Outputs from model. + """ + if self.every_n_inner_iters(batch_idx, self._interval): + for data, output in zip(data_batch, outputs): # type: ignore + input = data['inputs'] + data_sample = data['data_sample'] + input = tensor2imgs(input, + **data_sample.get('img_norm_cfg', + dict()))[0] + # TODO We will implement a function to revert the augmentation + # in the future. 
+                ori_shape = (data_sample.ori_width, data_sample.ori_height)
+                if 'pad_shape' in data_sample:
+                    input = self._unpad(input,
+                                        data_sample.get('scale', ori_shape))
+                origin_image = cv2.resize(input, ori_shape)
+                name = osp.basename(data_sample.img_path)
+                runner.visualizer.add_datasample(name, origin_image,
+                                                 data_sample, output,
+                                                 self.draw_gt, self.draw_pred)
diff --git a/head_extractor/build/lib/mmengine/hooks/param_scheduler_hook.py b/head_extractor/build/lib/mmengine/hooks/param_scheduler_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b2f1e610a9db058e99d03faf607514d73bce030
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/hooks/param_scheduler_hook.py
@@ -0,0 +1,128 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Union
+
+from mmengine.optim import _ParamScheduler
+from mmengine.registry import HOOKS
+from mmengine.utils import is_list_of
+from .hook import Hook
+
+DATA_BATCH = Optional[Union[dict, tuple, list]]
+
+
+@HOOKS.register_module()
+class ParamSchedulerHook(Hook):
+    """A hook to update some hyper-parameters in optimizer, e.g., learning rate
+    and momentum."""
+
+    priority = 'LOW'
+
+    def after_train_iter(self,
+                         runner,
+                         batch_idx: int,
+                         data_batch: DATA_BATCH = None,
+                         outputs: Optional[dict] = None) -> None:
+        """Call step function for each scheduler after each training iteration.
+
+        Args:
+            runner (Runner): The runner of the training process.
+            batch_idx (int): The index of the current batch in the train loop.
+            data_batch (dict or tuple or list, optional): Data from dataloader.
+                In order to keep this interface consistent with other hooks,
+                we keep ``data_batch`` here.
+            outputs (dict, optional): Outputs from model.
+                In order to keep this interface consistent with other hooks, we
+                keep ``outputs`` here.
+        """
+
+        if runner.param_schedulers is None:
+            return
+
+        def step(param_schedulers):
+            assert isinstance(param_schedulers, list)
+            for scheduler in param_schedulers:
+                if not scheduler.by_epoch:
+                    scheduler.step()
+
+        if isinstance(runner.param_schedulers, list):
+            step(runner.param_schedulers)
+        elif isinstance(runner.param_schedulers, dict):
+            for param_schedulers in runner.param_schedulers.values():
+                step(param_schedulers)
+        else:
+            raise TypeError(
+                'runner.param_schedulers should be list of ParamScheduler or '
+                'a dict containing list of ParamScheduler, '
+                f'but got {runner.param_schedulers}')
+
+    def after_train_epoch(self, runner) -> None:
+        """Call step function for each scheduler after each training epoch.
+
+        Args:
+            runner (Runner): The runner of the training process.
+        """
+
+        if runner.param_schedulers is None:
+            return
+
+        def step(param_schedulers):
+            assert isinstance(param_schedulers, list)
+            for scheduler in param_schedulers:
+                if scheduler.by_epoch:
+                    scheduler.step()
+
+        if isinstance(runner.param_schedulers, list):
+            step(runner.param_schedulers)
+        elif isinstance(runner.param_schedulers, dict):
+            for param_schedulers in runner.param_schedulers.values():
+                step(param_schedulers)
+        else:
+            raise TypeError(
+                'runner.param_schedulers should be list of ParamScheduler or '
+                'a dict containing list of ParamScheduler, '
+                f'but got {runner.param_schedulers}')
+
+    def after_val_epoch(self,
+                        runner,
+                        metrics: Optional[Dict[str, float]] = None) -> None:
+        """Call step function for each scheduler which has attribute
+        ``need_val_args`` after each validation epoch.
+
+        Args:
+            runner (Runner): The runner of the validation process.
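The stepping rules of ``ParamSchedulerHook`` above pair with ``param_scheduler`` configs that mix per-iteration and per-epoch schedulers; a sketch (scheduler types and milestones are illustrative)::

    param_scheduler = [
        # by_epoch=False: stepped by after_train_iter
        dict(type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
        # by_epoch=True: stepped by after_train_epoch
        dict(type='MultiStepLR', by_epoch=True, milestones=[8, 11], gamma=0.1),
    ]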
+ metrics (Dict[str, float], optional): Evaluation results of all + metrics on validation dataset. The keys are the names of the + metrics, and the values are corresponding results. + + Note: + if ``runner.param_schedulers`` is not built before, + the hook ``after_val_epoch`` will be skipped. + """ + + if runner.param_schedulers is None: + return + + # avoid counting scheduler._global_step + # it has counted in after_train_* hook + if metrics is None: + return + + def step(param_schedulers): + # check param_schedulers is list and built + if not is_list_of(param_schedulers, _ParamScheduler): + return + + for scheduler in param_schedulers: + if (scheduler.by_epoch + and getattr(scheduler, 'need_val_args', False)): + scheduler.step(metrics) + + if isinstance(runner.param_schedulers, list): + step(runner.param_schedulers) + elif isinstance(runner.param_schedulers, dict): + for param_schedulers in runner.param_schedulers.values(): + step(param_schedulers) + else: + raise TypeError( + 'runner.param_schedulers should be list of ParamScheduler or ' + 'a dict containing list of ParamScheduler, ' + f'but got {runner.param_schedulers}') diff --git a/head_extractor/build/lib/mmengine/hooks/profiler_hook.py b/head_extractor/build/lib/mmengine/hooks/profiler_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..6339a5da9220697e68156235e6cd9782c9391dce --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/profiler_hook.py @@ -0,0 +1,348 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +import os +import os.path as osp +import sys +from typing import Callable, Optional, Union + +import torch + +from mmengine.dist import master_only +from mmengine.hooks import Hook +from mmengine.logging import print_log +from mmengine.registry import HOOKS + + +def check_kineto() -> bool: # noqa + kineto_exist = False + try: + if torch.autograd.kineto_available(): + kineto_exist = True + except AttributeError: + print_log('NO KINETO', logger='current', level=logging.WARNING) + return kineto_exist + + +@HOOKS.register_module() +class ProfilerHook(Hook): + """A hook to analyze performance during training and inference. + + PyTorch Profiler is a tool that allows the collection of the performance + metrics during the training. More details on Profiler can be found at + `official docs `_ + + Args: + by_epoch (bool): Profile performance by epoch or by iteration. + Defaults to True. + profile_times (int): The period (epoch/iter) recorded by the profiler. + Defaults to 1. For example, profile_iters=10 and by_epoch=False, + indicate that 0-10 iterations are recorded. + activity_with_cpu (bool): Activities to be used in the analysis (CPU) + activity_with_cuda (bool): Activities to be used in the analysis (CUDA) + schedule (dict, optional): Key-word arguments passed to + `torch.profile.schedule `_. + Defaults to None, which means profiling without a schedule + on_trace_ready (callable, dict, optional): Either a handler or a dict + of generating handler. Defaults to None, which means profiling + without an on_trace_ready.The Callable type needs to construct its + own function that can handle 'torch.autograd.profiler.profile'. + Two officially recommended ways are provided: + + - ``schedule=dict(type='log_trace')``: Print the profiling result + in the terminal. See more details in the `PyTorch official tutorial`_. + The configurable arguments are the same as + ``prof.key_averages().table`` + - ``scheduler=dict(type='tb_trace')``: Profile the performance + with tensorboard. 
+            See more details in the tutorial
+            `profile with tensorboard`_.
+
+        record_shapes (bool): Save information about operator's input shapes.
+            Defaults to False.
+        profile_memory (bool): Track tensor memory allocation/deallocation.
+            Defaults to False.
+        with_stack (bool): Record source information (file and line number)
+            for the ops. Defaults to False.
+        with_flops (bool): Use formula to estimate the FLOPS of specific
+            operators (matrix multiplication and 2D convolution).
+            Defaults to False.
+        json_trace_path (str, optional): Exports the collected trace in Chrome
+            JSON format. Chrome uses 'chrome://tracing' to view the json file.
+            Defaults to None, which means profiling does not store json files.
+
+    Warnings:
+        The profiler will be closed after ``profile_times`` iterations
+        automatically. Please make sure the configuration of your scheduler
+        will not close the profiler before the iteration reaches the value of
+        ``profile_times``.
+
+    Examples:
+        >>> # tensorboard trace
+        >>> trace_config = dict(type='tb_trace')
+        >>> profiler_hook_cfg = dict(on_trace_ready=trace_config)
+
+    .. _PyTorch official tutorial: https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html#using-profiler-to-analyze-execution-time
+    .. _profile with tensorboard: https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#pytorch-profiler-with-tensorboard
+    """  # noqa: E501
+    priority = 'VERY_LOW'
+
+    def __init__(self,
+                 *,
+                 by_epoch: bool = True,
+                 profile_times: int = 1,
+                 activity_with_cpu: bool = True,
+                 activity_with_cuda: bool = False,
+                 schedule: Optional[dict] = None,
+                 on_trace_ready: Union[Callable, dict, None] = None,
+                 record_shapes: bool = False,
+                 profile_memory: bool = False,
+                 with_stack: bool = False,
+                 with_flops: bool = False,
+                 json_trace_path: Optional[str] = None) -> None:
+
+        try:
+            from torch import profiler
+        except ImportError:
+            raise ImportError('please upgrade torch above 1.8.1')
+        if not check_kineto():
+            raise ImportError('Due to Kineto support issues, please upgrade '
+                              'pytorch above 1.8.1 (windows users above 1.9.1)')
+
+        assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.'
+        self.by_epoch = by_epoch
+
+        if profile_times < 1:
+            raise ValueError('profile_times should be greater than 0, '
+                             f'but got {profile_times}')
+        if by_epoch and profile_times > 1:
+            raise ValueError(
+                f'Profiler will profile 0-{profile_times} epochs.\n'
+                'Since profiler will slow down the training, it is recommended'
+                ' to train 1 epoch with ProfilerHook and adjust your setting '
+                'according to the profiler summary.\n'
+                'During normal training (epoch > 1), '
+                'you may disable the ProfilerHook.')
+        self.profile_times = profile_times
+
+        assert isinstance(activity_with_cpu, bool), \
+            '``activity_with_cpu`` should be a boolean.'
+        assert isinstance(activity_with_cuda, bool), \
+            '``activity_with_cuda`` should be a boolean.'
+        self.activities = []
+        if activity_with_cpu:
+            self.activities.append(profiler.ProfilerActivity.CPU)
+        if activity_with_cuda:
+            self.activities.append(profiler.ProfilerActivity.CUDA)
+
+        if schedule is not None:
+            assert isinstance(schedule, dict), '``schedule`` should be a dict.'
+ self.schedule = profiler.schedule(**schedule) + else: + self.schedule = None + + self.on_trace_ready = on_trace_ready + self.record_shapes = record_shapes + self.profile_memory = profile_memory + self.with_stack = with_stack + self.with_flops = with_flops + + self.json_trace_path = json_trace_path + self._closed = False + + def before_run(self, runner): + """Initialize the profiler. + + Through the runner parameter, the validity of the parameter is further + determined. + """ + max_times = runner.max_epochs if self.by_epoch else runner.max_iters + if max_times < self.profile_times: + raise ValueError( + f'``profile_times`` should not be greater than {max_times}') + + on_trace_ready = self._parse_trace_config(runner) + + self.profiler = torch.profiler.profile( # noqa + activities=self.activities, + schedule=self.schedule, + on_trace_ready=on_trace_ready, + record_shapes=self.record_shapes, + profile_memory=self.profile_memory, + with_stack=self.with_stack, + with_flops=self.with_flops) + + self.profiler.__enter__() + runner.logger.info('profiler is profiling...') + + def _parse_trace_config(self, runner): + """Used to parse the parameter 'on_trace_ready'.""" + if self.on_trace_ready is None: + _on_trace_ready = None + elif callable(self.on_trace_ready): + _on_trace_ready = self.on_trace_ready + elif isinstance(self.on_trace_ready, dict): + trace_cfg = self.on_trace_ready.copy() + trace_type = trace_cfg.pop('type') + + # Build a log printing handle + if trace_type == 'log_trace': + + def _log_handler(_profile): + print(_profile.key_averages().table(**trace_cfg)) + + _on_trace_ready = _log_handler + + elif trace_type == 'tb_trace': # tensorboard_trace handler + try: + import torch_tb_profiler # noqa: F401 + except ImportError: + raise ImportError( + 'please run ``pip install torch-tb-profiler``') + + if 'dir_name' not in trace_cfg: + trace_cfg['dir_name'] = osp.join(runner.log_dir, + 'tf_tracing_logs') + elif not osp.isabs(trace_cfg['dir_name']): + trace_cfg['dir_name'] = osp.join(runner.log_dir, + trace_cfg['dir_name']) + runner.logger.info('trace_files of ProfilerHook will be ' + f'saved to {trace_cfg["dir_name"]}.') + + if self.json_trace_path is not None: + runner.logger.warn( + 'When using tensorboard_trace, it is recommended to ' + 'save json files by setting ``worker_name`` instead of' + ' setting ``json_trace_path``') + _on_trace_ready = torch.profiler.tensorboard_trace_handler( + **trace_cfg) + else: + raise ValueError('trace_type should be "log_trace" or ' + f'"tb_trace", but got {trace_type}') + else: + raise ValueError( + '``on_trace_ready`` should be a handler, or dict, or None, ' + f'but got {self.on_trace_ready}') + return _on_trace_ready + + def after_train_epoch(self, runner): + """Determine if the content is exported.""" + # `after_train_epoch` will also be called in IterBasedTrainLoop. + # Here we check `self._closed` to avoid exiting twice. 
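Putting the pieces above together, a plausible ``custom_hooks`` entry for iteration-based profiling (interval values are illustrative; the ``schedule`` dict is forwarded to ``torch.profiler.schedule``)::

    custom_hooks = [
        dict(
            type='ProfilerHook',
            by_epoch=False,
            profile_times=30,
            schedule=dict(wait=1, warmup=1, active=3, repeat=2),
            on_trace_ready=dict(type='tb_trace'))  # needs torch-tb-profiler
    ]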
+        if not self._closed:
+            self._export_chrome_trace(runner)
+
+    def after_train_iter(self, runner, batch_idx, data_batch, outputs):
+        """The profiler calls the `step` method if it is not closed."""
+        if not self._closed:
+            self.profiler.step()
+        if runner.iter == self.profile_times - 1 and not self.by_epoch:
+            self._export_chrome_trace(runner)
+
+    def _export_chrome_trace(self, runner):
+        """Export the collected trace and close the profiler."""
+        self._closed = True
+        runner.logger.info('profiler may take a few minutes...')
+        self.profiler.__exit__(None, None, None)
+        if self.json_trace_path is not None:
+            self.profiler.export_chrome_trace(self.json_trace_path)
+
+
+@HOOKS.register_module()
+class NPUProfilerHook(Hook):
+    """NPUProfiler to analyze performance during training.
+
+    NPU Profiling is used to count the device execution time of all operators.
+    The torch_npu.npu.profile interface is used to complete the profiling data
+    collection at each stage of the project; the data is analyzed by the
+    msprof tool, and it can be dumped to further manually analyze the
+    key performance bottlenecks. For more details on the torch_npu.npu.profile
+    interface, please visit
+    https://gitee.com/ascend/pytorch/blob/master/torch_npu/npu/profiler.py#profile
+
+    Args:
+        begin (int): Number of start iterations for profiling. Defaults to 0.
+        end (int): Number of end iterations for profiling. Defaults to 1.
+        result_path (str): The path to save the profiling results file.
+            Defaults to 'cann_profiling'.
+        exit_after_profiling (bool): Whether to exit the program after
+            profiling. Defaults to True.
+        use_e2e_profiler (bool): Turn on E2E profiling. E2E profiling combines
+            performance data at the PyTorch level and the NPU level to analyze
+            the bottlenecks of model performance end-to-end; it cannot show
+            detailed content and serves only as an auxiliary analysis.
+            Defaults to False.
+        ge_profiling_to_std_out (bool): Turn on GE profiling. GE is used to
+            collect the profiling data of the host-side scheduling of the
+            Ascend device. Defaults to False.
+
+    Examples:
+        >>> cfg = ...
+        >>> profiler_config = dict(type='NPUProfilerHook', end=2)
+        >>> cfg.merge_from_dict({'custom_hooks': [profiler_config]})
+        >>> runner = Runner.from_cfg(cfg)
+        >>> runner.train()
+    """
+    priority = 'VERY_LOW'
+
+    def __init__(self,
+                 *,
+                 begin: int = 0,
+                 end: int = 1,
+                 result_path: str = 'cann_profiling',
+                 exit_after_profiling: bool = True,
+                 use_e2e_profiler: bool = False,
+                 ge_profiling_to_std_out: bool = False):
+
+        try:
+            import torch_npu
+        except ImportError:
+            raise ImportError('Failed to import torch_npu module')
+
+        if begin >= end:
+            raise ValueError(
+                'The iteration to start profiling should not be greater '
+                'than or equal to profile end')
+
+        self.begin = begin
+        self.end = end
+        self.result_path = result_path
+        self.exit_after_profiling = exit_after_profiling
+
+        if ge_profiling_to_std_out:
+            os.environ['GE_PROFILING_TO_STD_OUT'] = '1'
+
+        if not osp.exists(self.result_path):
+            os.makedirs(self.result_path, exist_ok=True)
+
+        self.profiler = torch_npu.npu.profile(
+            self.result_path, use_e2e_profiler=use_e2e_profiler)
+
+    @master_only
+    def before_run(self, runner):
+
+        if self.end > runner.max_iters:
+            raise ValueError(
+                'The profiling end iteration should not be greater '
+                'than the max iteration')
+
+    @master_only
+    def before_train_iter(self, runner, batch_idx, data_batch=None):
+
+        if runner.iter == self.begin:
+            self.profiler.__enter__()
+            runner.logger.info('NPUProfiler starts profiling...')
+
+    @master_only
+    def after_train_iter(self,
+                         runner,
+                         batch_idx,
+                         data_batch=None,
+                         outputs=None):
+
+        if runner.iter == self.end - 1:
+            runner.logger.info('profiler may take a few minutes to'
+                               ' save the profiling result.')
+            self.profiler.__exit__(None, None, None)
+            if self.exit_after_profiling:
+                sys.exit()
diff --git a/head_extractor/build/lib/mmengine/hooks/runtime_info_hook.py b/head_extractor/build/lib/mmengine/hooks/runtime_info_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..49407e45631d96737b37a763850a68801c9bf777
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/hooks/runtime_info_hook.py
@@ -0,0 +1,184 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+
+from mmengine.registry import HOOKS
+from mmengine.utils import get_git_hash
+from mmengine.version import __version__
+from .hook import Hook
+
+DATA_BATCH = Optional[Union[dict, tuple, list]]
+
+
+def _is_scalar(value: Any) -> bool:
+    """Determine whether the value is a scalar type value.
+
+    Args:
+        value (Any): value of log.
+
+    Returns:
+        bool: whether the value is a scalar type value.
+    """
+    if isinstance(value, np.ndarray):
+        return value.size == 1
+    elif isinstance(value, (int, float, np.number)):
+        return True
+    elif isinstance(value, torch.Tensor):
+        return value.numel() == 1
+    return False
+
+
+@HOOKS.register_module()
+class RuntimeInfoHook(Hook):
+    """A hook that updates runtime information into message hub.
+
+    E.g. ``epoch``, ``iter``, ``max_epochs``, and ``max_iters`` for the
+    training state. Components that cannot access the runner can get runtime
+    information through the message hub.
+    """
+
+    priority = 'VERY_HIGH'
+
+    def before_run(self, runner) -> None:
+        """Update metainfo.
+
+        Args:
+            runner (Runner): The runner of the training process.
+ """ + metainfo = dict( + cfg=runner.cfg.pretty_text, + seed=runner.seed, + experiment_name=runner.experiment_name, + mmengine_version=__version__ + get_git_hash()) + runner.message_hub.update_info_dict(metainfo) + + self.last_loop_stage = None + + def before_train(self, runner) -> None: + """Update resumed training state. + + Args: + runner (Runner): The runner of the training process. + """ + runner.message_hub.update_info('loop_stage', 'train') + runner.message_hub.update_info('epoch', runner.epoch) + runner.message_hub.update_info('iter', runner.iter) + runner.message_hub.update_info('max_epochs', runner.max_epochs) + runner.message_hub.update_info('max_iters', runner.max_iters) + if hasattr(runner.train_dataloader.dataset, 'metainfo'): + runner.message_hub.update_info( + 'dataset_meta', runner.train_dataloader.dataset.metainfo) + + def after_train(self, runner) -> None: + runner.message_hub.pop_info('loop_stage') + + def before_train_epoch(self, runner) -> None: + """Update current epoch information before every epoch. + + Args: + runner (Runner): The runner of the training process. + """ + runner.message_hub.update_info('epoch', runner.epoch) + + def before_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None) -> None: + """Update current iter and learning rate information before every + iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (Sequence[dict], optional): Data from dataloader. + Defaults to None. + """ + runner.message_hub.update_info('iter', runner.iter) + lr_dict = runner.optim_wrapper.get_lr() + assert isinstance(lr_dict, dict), ( + '`runner.optim_wrapper.get_lr()` should return a dict ' + 'of learning rate when training with OptimWrapper(single ' + 'optimizer) or OptimWrapperDict(multiple optimizer), ' + f'but got {type(lr_dict)} please check your optimizer ' + 'constructor return an `OptimWrapper` or `OptimWrapperDict` ' + 'instance') + for name, lr in lr_dict.items(): + runner.message_hub.update_scalar(f'train/{name}', lr[0]) + + def after_train_iter(self, + runner, + batch_idx: int, + data_batch: DATA_BATCH = None, + outputs: Optional[dict] = None) -> None: + """Update ``log_vars`` in model outputs every iteration. + + Args: + runner (Runner): The runner of the training process. + batch_idx (int): The index of the current batch in the train loop. + data_batch (Sequence[dict], optional): Data from dataloader. + Defaults to None. + outputs (dict, optional): Outputs from model. Defaults to None. + """ + if outputs is not None: + for key, value in outputs.items(): + runner.message_hub.update_scalar(f'train/{key}', value) + + def before_val(self, runner) -> None: + self.last_loop_stage = runner.message_hub.get_info('loop_stage') + runner.message_hub.update_info('loop_stage', 'val') + + def after_val_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """All subclasses should override this method, if they need any + operations after each validation epoch. + + Args: + runner (Runner): The runner of the validation process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on validation dataset. The keys are the names of the + metrics, and the values are corresponding results. 
+ """ + if metrics is not None: + for key, value in metrics.items(): + if _is_scalar(value): + runner.message_hub.update_scalar(f'val/{key}', value) + else: + runner.message_hub.update_info(f'val/{key}', value) + + def after_val(self, runner) -> None: + # ValLoop may be called within the TrainLoop, so we need to reset + # the loop_stage + # workflow: before_train -> before_val -> after_val -> after_train + if self.last_loop_stage == 'train': + runner.message_hub.update_info('loop_stage', self.last_loop_stage) + self.last_loop_stage = None + else: + runner.message_hub.pop_info('loop_stage') + + def before_test(self, runner) -> None: + runner.message_hub.update_info('loop_stage', 'test') + + def after_test(self, runner) -> None: + runner.message_hub.pop_info('loop_stage') + + def after_test_epoch(self, + runner, + metrics: Optional[Dict[str, float]] = None) -> None: + """All subclasses should override this method, if they need any + operations after each test epoch. + + Args: + runner (Runner): The runner of the testing process. + metrics (Dict[str, float], optional): Evaluation results of all + metrics on test dataset. The keys are the names of the + metrics, and the values are corresponding results. + """ + if metrics is not None: + for key, value in metrics.items(): + if _is_scalar(value): + runner.message_hub.update_scalar(f'test/{key}', value) + else: + runner.message_hub.update_info(f'test/{key}', value) diff --git a/head_extractor/build/lib/mmengine/hooks/sampler_seed_hook.py b/head_extractor/build/lib/mmengine/hooks/sampler_seed_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..9aed9dbcf594fd23ca78dacaa4443d18d0ad41ce --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/sampler_seed_hook.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.registry import HOOKS +from .hook import Hook + + +@HOOKS.register_module() +class DistSamplerSeedHook(Hook): + """Data-loading sampler for distributed training. + + When distributed training, it is only useful in conjunction with + :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same + purpose with :obj:`IterLoader`. + """ + + priority = 'NORMAL' + + def before_train_epoch(self, runner) -> None: + """Set the seed for sampler and batch_sampler. + + Args: + runner (Runner): The runner of the training process. + """ + if hasattr(runner.train_loop.dataloader, 'sampler') and hasattr( + runner.train_loop.dataloader.sampler, 'set_epoch'): + # In case the` _SingleProcessDataLoaderIter` has no sampler, + # or data loader uses `SequentialSampler` in Pytorch. + runner.train_loop.dataloader.sampler.set_epoch(runner.epoch) + + elif hasattr(runner.train_loop.dataloader, + 'batch_sampler') and hasattr( + runner.train_loop.dataloader.batch_sampler.sampler, + 'set_epoch'): + # In case the` _SingleProcessDataLoaderIter` has no batch sampler. + # batch sampler in pytorch warps the sampler as its attributes. + runner.train_loop.dataloader.batch_sampler.sampler.set_epoch( + runner.epoch) diff --git a/head_extractor/build/lib/mmengine/hooks/sync_buffer_hook.py b/head_extractor/build/lib/mmengine/hooks/sync_buffer_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..7cc75757fed4c1ddac7e9867d87394a08f53abea --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/sync_buffer_hook.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.dist import all_reduce_params, is_distributed +from mmengine.registry import HOOKS +from .hook import Hook + + +@HOOKS.register_module() +class SyncBuffersHook(Hook): + """Synchronize model buffers such as running_mean and running_var in BN at + the end of each epoch.""" + + priority = 'NORMAL' + + def __init__(self) -> None: + self.distributed = is_distributed() + # A flag to mark whether synchronization has been done in + # after_train_epoch + self.called_in_train = False + + def before_val_epoch(self, runner) -> None: + """All-reduce model buffers before each validation epoch. + + Synchronize the buffers before each validation if they have not been + synchronized at the end of the previous training epoch. This method + will be called when using IterBasedTrainLoop. + + Args: + runner (Runner): The runner of the training process. + """ + if self.distributed: + if not self.called_in_train: + all_reduce_params(runner.model.buffers(), op='mean') + self.called_in_train = False + + def after_train_epoch(self, runner) -> None: + """All-reduce model buffers at the end of each epoch. + + Args: + runner (Runner): The runner of the training process. + """ + if self.distributed: + all_reduce_params(runner.model.buffers(), op='mean') + self.called_in_train = True diff --git a/head_extractor/build/lib/mmengine/hooks/test_time_aug_hook.py b/head_extractor/build/lib/mmengine/hooks/test_time_aug_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..5775736d1f783d3925f98a4dbae9fb35bf740fc0 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hooks/test_time_aug_hook.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from mmengine.runner import Runner + +from mmengine.hooks import Hook +from mmengine.registry import HOOKS, MODELS, RUNNERS + + +@HOOKS.register_module() +class PrepareTTAHook(Hook): + """Wraps `runner.model` with subclass of :class:`BaseTTAModel` in + `before_test`. + + Note: + This function will only be used with :obj:`MMFullyShardedDataParallel`. + + Args: + tta_cfg (dict): Config dictionary of the test time augmentation model. + """ + + def __init__(self, tta_cfg: dict): + self.tta_cfg = tta_cfg + + def before_test(self, runner: 'Runner') -> None: + """Wraps `runner.model` with the subclass of :class:`BaseTTAModel`. + + Args: + runner (Runner): The runner of the testing process. + """ + self.tta_cfg['module'] = runner.model # type: ignore + model = MODELS.build(self.tta_cfg) + runner.model = model # type: ignore + + +def build_runner_with_tta(cfg: dict) -> 'Runner': + """Builds runner with tta (test time augmentation) transformation and + TTAModel. + + Note: + This function will only be used with :obj:`MMFullyShardedDataParallel`. + + Args: + cfg (dict): cfg with ``tta_pipeline`` and ``tta_model`` + + Notes: + This is only an experimental feature. We may refactor the code in the + future. 
+ + Returns: + Runner: Runner with tta transformation and TTAModel + """ + assert hasattr( + cfg, + 'tta_model'), ('please make sure tta_model is defined in your config.') + assert hasattr(cfg, 'tta_pipeline'), ( + 'please make sure tta_pipeline is defined in your config.') + cfg['test_dataloader']['dataset']['pipeline'] = cfg['tta_pipeline'] + + if 'runner_type' in cfg: + runner = RUNNERS.build(cfg) + else: + from mmengine.runner import Runner + runner = Runner.from_cfg(cfg) + + runner.register_hook(PrepareTTAHook(tta_cfg=cfg['tta_model'])) + return runner diff --git a/head_extractor/build/lib/mmengine/hub/__init__.py b/head_extractor/build/lib/mmengine/hub/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e6f2add99c96c9401df11cafda2283f80353e5f1 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hub/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hub import get_config, get_model + +__all__ = ['get_config', 'get_model'] diff --git a/head_extractor/build/lib/mmengine/hub/hub.py b/head_extractor/build/lib/mmengine/hub/hub.py new file mode 100644 index 0000000000000000000000000000000000000000..b24ac2c125143789c8553086eec7767b9c761c41 --- /dev/null +++ b/head_extractor/build/lib/mmengine/hub/hub.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import importlib +import os.path as osp + +from mmengine.config import Config +from mmengine.config.utils import (_get_cfg_metainfo, + _get_external_cfg_base_path, + _get_package_and_cfg_path) +from mmengine.registry import MODELS, DefaultScope +from mmengine.runner import load_checkpoint +from mmengine.utils import get_installed_path, install_package + + +def get_config(cfg_path: str, pretrained: bool = False) -> Config: + """Get config from external package. + + Args: + cfg_path (str): External relative config path. + pretrained (bool): Whether to save pretrained model path. If + ``pretrained==True``, the url of pretrained model can be accessed + by ``cfg.model_path``. Defaults to False. + + Examples: + >>> cfg = get_config('mmdet::faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py', pretrained=True) + >>> # Equivalent to + >>> # cfg = Config.fromfile('/path/to/faster-rcnn_r50_fpn_1x_coco.py') + >>> cfg.model_path + https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth + + Returns: + Config: A `Config` parsed from external package. + """ # noqa E301 + # Get package name and relative config path. + package, cfg_path = _get_package_and_cfg_path(cfg_path) + # Install package if it's not installed. + install_package(package) + package_path = get_installed_path(package) + try: + # Use `cfg_path` to search target config file. + cfg_meta = _get_cfg_metainfo(package_path, cfg_path) + cfg_path = osp.join(package_path, '.mim', cfg_meta['Config']) + cfg = Config.fromfile(cfg_path) + if pretrained: + assert 'Weights' in cfg_meta, ('Cannot find `Weights` in cfg_file' + '.metafile.yml, please check the' + 'metafile') + cfg.model_path = cfg_meta['Weights'] + except ValueError: + # Since the base config does not contain a metafile, the absolute + # config is `osp.join(package_path, cfg_path_prefix, cfg_name)` + cfg_path = _get_external_cfg_base_path(package_path, cfg_path) + cfg = Config.fromfile(cfg_path) + except Exception as e: + raise e + return cfg + + +def get_model(cfg_path: str, pretrained: bool = False, **kwargs): + """Get built model from external package. 
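As a usage sketch for ``build_runner_with_tta`` above (the config path, TTA model type, and pipeline are placeholders; concrete values come from the downstream repo)::

    from mmengine.config import Config

    cfg = Config.fromfile('configs/my_config.py')   # hypothetical config file
    cfg.tta_model = dict(type='mmdet.DetTTAModel')  # any BaseTTAModel subclass
    cfg.tta_pipeline = [...]  # test pipeline containing a TestTimeAug transform
    runner = build_runner_with_tta(cfg)
    runner.test()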
+ + Args: + cfg_path (str): External relative config path with prefix + 'package::' and without suffix. + pretrained (bool): Whether to load pretrained model. Defaults to False. + kwargs (dict): Default arguments to build model. + + Examples: + >>> model = get_model('mmdet::faster_rcnn/faster-rcnn_r50_fpn_1x_coco.py', pretrained=True) + >>> type(model) + + + Returns: + nn.Module: Built model. + """ # noqa E301 + package = cfg_path.split('::')[0] + with DefaultScope.overwrite_default_scope(package): # type: ignore + cfg = get_config(cfg_path, pretrained) + if 'data_preprocessor' in cfg: + cfg.model.data_preprocessor = cfg.data_preprocessor + models_module = importlib.import_module(f'{package}.utils') + models_module.register_all_modules() # type: ignore + model = MODELS.build(cfg.model, default_args=kwargs) + if pretrained: + load_checkpoint(model, cfg.model_path) + # Hack to use pretrained weights. + # If we do not set _is_init here, Runner will call + # `model.init_weights()` to overwrite the pretrained model. + model._is_init = True + return model diff --git a/head_extractor/build/lib/mmengine/infer/__init__.py b/head_extractor/build/lib/mmengine/infer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a122481f148c6aa52753ea172fee7d9e6005e484 --- /dev/null +++ b/head_extractor/build/lib/mmengine/infer/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .infer import BaseInferencer + +__all__ = ['BaseInferencer'] diff --git a/head_extractor/build/lib/mmengine/infer/infer.py b/head_extractor/build/lib/mmengine/infer/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..322d8852243493944c8b9adbed4095140a8ead89 --- /dev/null +++ b/head_extractor/build/lib/mmengine/infer/infer.py @@ -0,0 +1,692 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import importlib +import os.path as osp +import re +import warnings +from abc import ABCMeta, abstractmethod +from datetime import datetime +from typing import (Any, Callable, Dict, Iterable, List, Optional, Sequence, + Tuple, Union) + +import numpy as np +import torch +import torch.nn as nn +from rich.progress import track + +from mmengine.config import Config, ConfigDict +from mmengine.config.utils import MODULE2PACKAGE +from mmengine.dataset import pseudo_collate +from mmengine.device import get_device +from mmengine.fileio import (get_file_backend, isdir, join_path, + list_dir_or_file, load) +from mmengine.logging import print_log +from mmengine.registry import FUNCTIONS, MODELS, VISUALIZERS, DefaultScope +from mmengine.runner.checkpoint import (_load_checkpoint, + _load_checkpoint_to_model) +from mmengine.structures import InstanceData +from mmengine.visualization import Visualizer + +InstanceList = List[InstanceData] +InputType = Union[str, np.ndarray, torch.Tensor] +InputsType = Union[InputType, Sequence[InputType]] +ImgType = Union[np.ndarray, Sequence[np.ndarray]] +ResType = Union[Dict, List[Dict]] +ConfigType = Union[Config, ConfigDict] +ModelType = Union[dict, ConfigType, str] + + +class InferencerMeta(ABCMeta): + """Check the legality of the inferencer. + + All Inferencers should not define duplicated keys for + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` and + ``postprocess_kwargs``. 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert isinstance(self.preprocess_kwargs, set) + assert isinstance(self.forward_kwargs, set) + assert isinstance(self.visualize_kwargs, set) + assert isinstance(self.postprocess_kwargs, set) + + all_kwargs = ( + self.preprocess_kwargs | self.forward_kwargs + | self.visualize_kwargs | self.postprocess_kwargs) + + assert len(all_kwargs) == ( + len(self.preprocess_kwargs) + len(self.forward_kwargs) + + len(self.visualize_kwargs) + len(self.postprocess_kwargs)), ( + f'Class define error! {self.__name__} should not ' + 'define duplicated keys for `preprocess_kwargs`, ' + '`forward_kwargs`, `visualize_kwargs` and ' + '`postprocess_kwargs` are not allowed.') + + +class BaseInferencer(metaclass=InferencerMeta): + """Base inferencer for downstream tasks. + + The BaseInferencer provides the standard workflow for inference as follows: + + 1. Preprocess the input data by :meth:`preprocess`. + 2. Forward the data to the model by :meth:`forward`. ``BaseInferencer`` + assumes the model inherits from :class:`mmengine.models.BaseModel` and + will call `model.test_step` in :meth:`forward` by default. + 3. Visualize the results by :meth:`visualize`. + 4. Postprocess and return the results by :meth:`postprocess`. + + When we call the subclasses inherited from BaseInferencer (not overriding + ``__call__``), the workflow will be executed in order. + + All subclasses of BaseInferencer could define the following class + attributes for customization: + + - ``preprocess_kwargs``: The keys of the kwargs that will be passed to + :meth:`preprocess`. + - ``forward_kwargs``: The keys of the kwargs that will be passed to + :meth:`forward` + - ``visualize_kwargs``: The keys of the kwargs that will be passed to + :meth:`visualize` + - ``postprocess_kwargs``: The keys of the kwargs that will be passed to + :meth:`postprocess` + + All attributes mentioned above should be a ``set`` of keys (strings), + and each key should not be duplicated. Actually, :meth:`__call__` will + dispatch all the arguments to the corresponding methods according to the + ``xxx_kwargs`` mentioned above, therefore, the key in sets should + be unique to avoid ambiguous dispatching. + + Warning: + If subclasses defined the class attributes mentioned above with + duplicated keys, an ``AssertionError`` will be raised during import + process. + + Subclasses inherited from ``BaseInferencer`` should implement + :meth:`_init_pipeline`, :meth:`visualize` and :meth:`postprocess`: + + - _init_pipeline: Return a callable object to preprocess the input data. + - visualize: Visualize the results returned by :meth:`forward`. + - postprocess: Postprocess the results returned by :meth:`forward` and + :meth:`visualize`. + + Args: + model (str, optional): Path to the config file or the model name + defined in metafile. Take the `mmdet metafile `_ + as an example, the `model` could be `retinanet_r18_fpn_1x_coco` or + its alias. If model is not specified, user must provide the + `weights` saved by MMEngine which contains the config string. + Defaults to None. + weights (str, optional): Path to the checkpoint. If it is not specified + and model is a model name of metafile, the weights will be loaded + from metafile. Defaults to None. + device (str, optional): Device to run inference. If None, the available + device will be automatically used. Defaults to None. + scope (str, optional): The scope of the model. Defaults to None. 
+ show_progress (bool): Control whether to display the progress bar during + the inference process. Defaults to True. + `New in version 0.7.4.` + + Note: + Since ``Inferencer`` could be used to infer batch data, + `collate_fn` should be defined. If `collate_fn` is not defined in config + file, the `collate_fn` will be `pseudo_collate` by default. + """ # noqa: E501 + + preprocess_kwargs: set = set() + forward_kwargs: set = set() + visualize_kwargs: set = set() + postprocess_kwargs: set = set() + + def __init__(self, + model: Union[ModelType, str, None] = None, + weights: Optional[str] = None, + device: Optional[str] = None, + scope: Optional[str] = None, + show_progress: bool = True) -> None: + if scope is None: + default_scope = DefaultScope.get_current_instance() + if default_scope is not None: + scope = default_scope.scope_name + self.scope = scope + # Load config to cfg + cfg: ConfigType + if isinstance(model, str): + if osp.isfile(model): + cfg = Config.fromfile(model) + else: + # Load config and weights from metafile. If `weights` is + # assigned, the weights defined in metafile will be ignored. + cfg, _weights = self._load_model_from_metafile(model) + if weights is None: + weights = _weights + elif isinstance(model, (Config, ConfigDict)): + cfg = copy.deepcopy(model) + elif isinstance(model, dict): + cfg = copy.deepcopy(ConfigDict(model)) + elif model is None: + if weights is None: + raise ValueError( + 'If model is None, the weights must be specified since ' + 'the config needs to be loaded from the weights') + cfg = ConfigDict() + else: + raise TypeError('model must be a filepath or any ConfigType' + f'object, but got {type(model)}') + + if device is None: + device = get_device() + + self.model = self._init_model(cfg, weights, device) # type: ignore + self.pipeline = self._init_pipeline(cfg) + self.collate_fn = self._init_collate(cfg) + self.visualizer = self._init_visualizer(cfg) + self.cfg = cfg + self.show_progress = show_progress + + def __call__( + self, + inputs: InputsType, + return_datasamples: bool = False, + batch_size: int = 1, + **kwargs, + ) -> dict: + """Call the inferencer. + + Args: + inputs (InputsType): Inputs for the inferencer. + return_datasamples (bool): Whether to return results as + :obj:`BaseDataElement`. Defaults to False. + batch_size (int): Batch size. Defaults to 1. + **kwargs: Key words arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + Returns: + dict: Inference and visualization results. + """ + ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) = self._dispatch_kwargs(**kwargs) + + ori_inputs = self._inputs_to_list(inputs) + inputs = self.preprocess( + ori_inputs, batch_size=batch_size, **preprocess_kwargs) + preds = [] + for data in (track(inputs, description='Inference') + if self.show_progress else inputs): + preds.extend(self.forward(data, **forward_kwargs)) + visualization = self.visualize( + ori_inputs, preds, + **visualize_kwargs) # type: ignore # noqa: E501 + results = self.postprocess(preds, visualization, return_datasamples, + **postprocess_kwargs) + return results + + def _inputs_to_list(self, inputs: InputsType) -> list: + """Preprocess the inputs to a list. 
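+
+        For instance, assuming ``inferencer`` is a constructed subclass
+        instance and the (hypothetical) path is not a directory, a single
+        string is simply wrapped into a list:
+
+        >>> inferencer._inputs_to_list('demo.jpg')
+        ['demo.jpg']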
+ + Preprocess inputs to a list according to its type: + + - list or tuple: return inputs + - str: + - Directory path: return all files in the directory + - other cases: return a list containing the string. The string + could be a path to file, a url or other types of string according + to the task. + + Args: + inputs (InputsType): Inputs for the inferencer. + + Returns: + list: List of input for the :meth:`preprocess`. + """ + if isinstance(inputs, str): + backend = get_file_backend(inputs) + if hasattr(backend, 'isdir') and isdir(inputs): + # Backends like HttpsBackend do not implement `isdir`, so only + # those backends that implement `isdir` could accept the inputs + # as a directory + filename_list = list_dir_or_file(inputs, list_dir=False) + inputs = [ + join_path(inputs, filename) for filename in filename_list + ] + + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + + return list(inputs) + + def preprocess(self, inputs: InputsType, batch_size: int = 1, **kwargs): + """Process the inputs into a model-feedable format. + + Customize your preprocess by overriding this method. Preprocess should + return an iterable object, of which each item will be used as the + input of ``model.test_step``. + + ``BaseInferencer.preprocess`` will return an iterable chunked data, + which will be used in __call__ like this: + + .. code-block:: python + + def __call__(self, inputs, batch_size=1, **kwargs): + chunked_data = self.preprocess(inputs, batch_size, **kwargs) + for batch in chunked_data: + preds = self.forward(batch, **kwargs) + + Args: + inputs (InputsType): Inputs given by user. + batch_size (int): batch size. Defaults to 1. + + Yields: + Any: Data processed by the ``pipeline`` and ``collate_fn``. + """ + chunked_data = self._get_chunk_data( + map(self.pipeline, inputs), batch_size) + yield from map(self.collate_fn, chunked_data) + + @torch.no_grad() + def forward(self, inputs: Union[dict, tuple], **kwargs) -> Any: + """Feed the inputs to the model.""" + return self.model.test_step(inputs) + + @abstractmethod + def visualize(self, + inputs: list, + preds: Any, + show: bool = False, + **kwargs) -> List[np.ndarray]: + """Visualize predictions. + + Customize your visualization by overriding this method. visualize + should return visualization results, which could be np.ndarray or any + other objects. + + Args: + inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`. + preds (Any): Predictions of the model. + show (bool): Whether to display the image in a popup window. + Defaults to False. + + Returns: + List[np.ndarray]: Visualization results. + """ + + @abstractmethod + def postprocess( + self, + preds: Any, + visualization: List[np.ndarray], + return_datasample=False, + **kwargs, + ) -> dict: + """Process the predictions and visualization results from ``forward`` + and ``visualize``. + + This method should be responsible for the following tasks: + + 1. Convert datasamples into a json-serializable dict if needed. + 2. Pack the predictions and visualization results and return them. + 3. Dump or log the predictions. + + Customize your postprocess by overriding this method. Make sure + ``postprocess`` will return a dict with visualization results and + inference results. + + Args: + preds (List[Dict]): Predictions of the model. + visualization (np.ndarray): Visualized predictions. + return_datasample (bool): Whether to return results as datasamples. + Defaults to False. 
+ + Returns: + dict: Inference and visualization results with key ``predictions`` + and ``visualization`` + + - ``visualization (Any)``: Returned by :meth:`visualize` + - ``predictions`` (dict or DataSample): Returned by + :meth:`forward` and processed in :meth:`postprocess`. + If ``return_datasample=False``, it usually should be a + json-serializable dict containing only basic data elements such + as strings and numbers. + """ + + def _load_model_from_metafile(self, model: str) -> Tuple[Config, str]: + """Load config and weights from metafile. + + Args: + model (str): model name defined in metafile. + + Returns: + Tuple[Config, str]: Loaded Config and weights path defined in + metafile. + """ + model = model.lower() + + assert self.scope is not None, ( + 'scope should be initialized if you want ' + 'to load config from metafile.') + assert self.scope in MODULE2PACKAGE, ( + f'{self.scope} not in {MODULE2PACKAGE}!,' + 'please pass a valid scope.') + + repo_or_mim_dir = BaseInferencer._get_repo_or_mim_dir(self.scope) + for model_cfg in BaseInferencer._get_models_from_metafile( + repo_or_mim_dir): + model_name = model_cfg['Name'].lower() + model_aliases = model_cfg.get('Alias', []) + if isinstance(model_aliases, str): + model_aliases = [model_aliases.lower()] + else: + model_aliases = [alias.lower() for alias in model_aliases] + if (model_name == model or model in model_aliases): + cfg = Config.fromfile( + osp.join(repo_or_mim_dir, model_cfg['Config'])) + weights = model_cfg['Weights'] + weights = weights[0] if isinstance(weights, list) else weights + return cfg, weights + raise ValueError(f'Cannot find model: {model} in {self.scope}') + + @staticmethod + def _get_repo_or_mim_dir(scope): + """Get the directory where the ``Configs`` located when the package is + installed or ``PYTHONPATH`` is set. + + Args: + scope (str): The scope of repository. + + Returns: + str: The directory where the ``Configs`` is located. + """ + try: + module = importlib.import_module(scope) + except ImportError: + if scope not in MODULE2PACKAGE: + raise KeyError( + f'{scope} is not a valid scope. The available scopes ' + f'are {MODULE2PACKAGE.keys()}') + else: + project = MODULE2PACKAGE[scope] + raise ImportError( + f'Cannot import {scope} correctly, please try to install ' + f'the {project} by "pip install {project}"') + # Since none of OpenMMLab series packages are namespace packages + # (https://docs.python.org/3/glossary.html#term-namespace-package), + # The first element of module.__path__ means package installation path. + package_path = module.__path__[0] + + if osp.exists(osp.join(osp.dirname(package_path), 'configs')): + repo_dir = osp.dirname(package_path) + return repo_dir + else: + mim_dir = osp.join(package_path, '.mim') + if not osp.exists(osp.join(mim_dir, 'configs')): + raise FileNotFoundError( + f'Cannot find `configs` directory in {package_path}!, ' + f'please check the completeness of the {scope}.') + return mim_dir + + def _init_model( + self, + cfg: ConfigType, + weights: Optional[str], + device: str = 'cpu', + ) -> nn.Module: + """Initialize the model with the given config and checkpoint on the + specific device. + + Args: + cfg (ConfigType): Config containing the model information. + weights (str, optional): Path to the checkpoint. + device (str, optional): Device to run inference. Defaults to 'cpu'. + + Returns: + nn.Module: Model loaded with checkpoint. 
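+
+        A minimal usage sketch (the config file and checkpoint names here
+        are illustrative assumptions, not files shipped with mmengine):
+
+        >>> cfg = Config.fromfile('demo_config.py')  # hypothetical config
+        >>> model = inferencer._init_model(cfg, 'demo.pth', device='cpu')
+        >>> model.training  # the model is switched to eval mode
+        False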
+ """ + checkpoint: Optional[dict] = None + if weights is not None: + checkpoint = _load_checkpoint(weights, map_location='cpu') + + if not cfg: + assert checkpoint is not None + try: + # Prefer to get config from `message_hub` since `message_hub` + # is a more stable module to store all runtime information. + # However, the early version of MMEngine will not save config + # in `message_hub`, so we will try to load config from `meta`. + cfg_string = checkpoint['message_hub']['runtime_info']['cfg'] + except KeyError: + assert 'meta' in checkpoint, ( + 'If model(config) is not provided, the checkpoint must' + 'contain the config string in `meta` or `message_hub`, ' + 'but both `meta` and `message_hub` are not found in the ' + 'checkpoint.') + meta = checkpoint['meta'] + if 'cfg' in meta: + cfg_string = meta['cfg'] + else: + raise ValueError( + 'Cannot find the config in the checkpoint.') + cfg.update( + Config.fromstring(cfg_string, file_format='.py')._cfg_dict) + + # Delete the `pretrained` field to prevent model from loading the + # the pretrained weights unnecessarily. + if cfg.model.get('pretrained') is not None: + del cfg.model.pretrained + + model = MODELS.build(cfg.model) + model.cfg = cfg + self._load_weights_to_model(model, checkpoint, cfg) + model.to(device) + model.eval() + return model + + def _load_weights_to_model(self, model: nn.Module, + checkpoint: Optional[dict], + cfg: Optional[ConfigType]) -> None: + """Loading model weights and meta information from cfg and checkpoint. + + Subclasses could override this method to load extra meta information + from ``checkpoint`` and ``cfg`` to model. + + Args: + model (nn.Module): Model to load weights and meta information. + checkpoint (dict, optional): The loaded checkpoint. + cfg (Config or ConfigDict, optional): The loaded config. + """ + if checkpoint is not None: + _load_checkpoint_to_model(model, checkpoint) + else: + warnings.warn('Checkpoint is not loaded, and the inference ' + 'result is calculated by the randomly initialized ' + 'model!') + + def _init_collate(self, cfg: ConfigType) -> Callable: + """Initialize the ``collate_fn`` with the given config. + + The returned ``collate_fn`` will be used to collate the batch data. + If will be used in :meth:`preprocess` like this + + .. code-block:: python + def preprocess(self, inputs, batch_size, **kwargs): + ... + dataloader = map(self.collate_fn, dataloader) + yield from dataloader + + Args: + cfg (ConfigType): Config which could contained the `collate_fn` + information. If `collate_fn` is not defined in config, it will + be :func:`pseudo_collate`. + + Returns: + Callable: Collate function. + """ + try: + with FUNCTIONS.switch_scope_and_registry(self.scope) as registry: + collate_fn = registry.get(cfg.test_dataloader.collate_fn) + except AttributeError: + collate_fn = pseudo_collate + return collate_fn # type: ignore + + @abstractmethod + def _init_pipeline(self, cfg: ConfigType) -> Callable: + """Initialize the test pipeline. + + Return a pipeline to handle various input data, such as ``str``, + ``np.ndarray``. It is an abstract method in BaseInferencer, and should + be implemented in subclasses. + + The returned pipeline will be used to process a single data. + It will be used in :meth:`preprocess` like this: + + .. code-block:: python + def preprocess(self, inputs, batch_size, **kwargs): + ... + dataset = map(self.pipeline, dataset) + ... + """ + + def _init_visualizer(self, cfg: ConfigType) -> Optional[Visualizer]: + """Initialize visualizers. 
+ + Args: + cfg (ConfigType): Config containing the visualizer information. + + Returns: + Visualizer or None: Visualizer initialized with config. + """ + if 'visualizer' not in cfg: + return None + timestamp = str(datetime.timestamp(datetime.now())) + name = cfg.visualizer.get('name', timestamp) + if Visualizer.check_instance_created(name): + name = f'{name}-{timestamp}' + cfg.visualizer.name = name + return VISUALIZERS.build(cfg.visualizer) + + def _get_chunk_data(self, inputs: Iterable, chunk_size: int): + """Get batch data from dataset. + + Args: + inputs (Iterable): An iterable dataset. + chunk_size (int): Equivalent to batch size. + + Yields: + list: batch data. + """ + inputs_iter = iter(inputs) + while True: + try: + chunk_data = [] + for _ in range(chunk_size): + processed_data = next(inputs_iter) + chunk_data.append(processed_data) + yield chunk_data + except StopIteration: + if chunk_data: + yield chunk_data + break + + def _dispatch_kwargs(self, **kwargs) -> Tuple[Dict, Dict, Dict, Dict]: + """Dispatch kwargs to preprocess(), forward(), visualize() and + postprocess() according to the actual demands. + + Returns: + Tuple[Dict, Dict, Dict, Dict]: kwargs passed to preprocess, + forward, visualize and postprocess respectively. + """ + # Ensure each argument only matches one function + method_kwargs = self.preprocess_kwargs | self.forward_kwargs | \ + self.visualize_kwargs | self.postprocess_kwargs + + union_kwargs = method_kwargs | set(kwargs.keys()) + if union_kwargs != method_kwargs: + unknown_kwargs = union_kwargs - method_kwargs + raise ValueError( + f'unknown argument {unknown_kwargs} for `preprocess`, ' + '`forward`, `visualize` and `postprocess`') + + preprocess_kwargs = {} + forward_kwargs = {} + visualize_kwargs = {} + postprocess_kwargs = {} + + for key, value in kwargs.items(): + if key in self.preprocess_kwargs: + preprocess_kwargs[key] = value + elif key in self.forward_kwargs: + forward_kwargs[key] = value + elif key in self.visualize_kwargs: + visualize_kwargs[key] = value + else: + postprocess_kwargs[key] = value + + return ( + preprocess_kwargs, + forward_kwargs, + visualize_kwargs, + postprocess_kwargs, + ) + + @staticmethod + def _get_models_from_metafile(dir: str): + """Load model config defined in metafile from package path. + + Args: + dir (str): Path to the directory of Config. It requires the + directory ``Config``, file ``model-index.yml`` exists in the + ``dir``. + + Yields: + dict: Model config defined in metafile. + """ + meta_indexes = load(osp.join(dir, 'model-index.yml')) + for meta_path in meta_indexes['Import']: + # meta_path example: mmcls/.mim/configs/conformer/metafile.yml + meta_path = osp.join(dir, meta_path) + metainfo = load(meta_path) + yield from metainfo['Models'] + + @staticmethod + def list_models(scope: Optional[str] = None, patterns: str = r'.*'): + """List models defined in metafile of corresponding packages. + + Args: + scope (str, optional): The scope to which the model belongs. + Defaults to None. + patterns (str, optional): Regular expressions for the searched + models. Once matched with ``Alias`` or ``Name`` filed in + metafile, corresponding model will be added to the return list. + Defaults to '.*'. + + Returns: + dict: Model dict with model name and its alias. 
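+
+        Example (the ``mmdet`` scope and pattern are illustrative; any
+        installed OpenMMLab package scope works):
+
+        >>> BaseInferencer.list_models('mmdet', r'retinanet.*')  # doctest: +SKIP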
+ """ + matched_models = [] + if scope is None: + default_scope = DefaultScope.get_current_instance() + assert default_scope is not None, ( + 'scope should be initialized if you want ' + 'to load config from metafile.') + assert scope in MODULE2PACKAGE, ( + f'{scope} not in {MODULE2PACKAGE}!, please make pass a valid ' + 'scope.') + root_or_mim_dir = BaseInferencer._get_repo_or_mim_dir(scope) + for model_cfg in BaseInferencer._get_models_from_metafile( + root_or_mim_dir): + model_name = [model_cfg['Name']] + model_name.extend(model_cfg.get('Alias', [])) + for name in model_name: + if re.match(patterns, name) is not None: + matched_models.append(name) + output_str = '' + for name in matched_models: + output_str += f'model_name: {name}\n' + print_log(output_str, logger='current') + return matched_models diff --git a/head_extractor/build/lib/mmengine/logging/__init__.py b/head_extractor/build/lib/mmengine/logging/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba5533c2363f49c749d0dba87f49496e3861ed80 --- /dev/null +++ b/head_extractor/build/lib/mmengine/logging/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .history_buffer import HistoryBuffer +from .logger import MMLogger, print_log +from .message_hub import MessageHub + +__all__ = ['HistoryBuffer', 'MessageHub', 'MMLogger', 'print_log'] diff --git a/head_extractor/build/lib/mmengine/logging/history_buffer.py b/head_extractor/build/lib/mmengine/logging/history_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..58effa8152381b0c9e35b64ada08473994c4bf14 --- /dev/null +++ b/head_extractor/build/lib/mmengine/logging/history_buffer.py @@ -0,0 +1,229 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Any, Callable, Optional, Sequence, Tuple, Union + +import numpy as np + + +class HistoryBuffer: + """Unified storage format for different log types. + + ``HistoryBuffer`` records the history of log for further statistics. + + Examples: + >>> history_buffer = HistoryBuffer() + >>> # Update history_buffer. + >>> history_buffer.update(1) + >>> history_buffer.update(2) + >>> history_buffer.min() # minimum of (1, 2) + 1 + >>> history_buffer.max() # maximum of (1, 2) + 2 + >>> history_buffer.mean() # mean of (1, 2) + 1.5 + >>> history_buffer.statistics('mean') # access method by string. + 1.5 + + Args: + log_history (Sequence): History logs. Defaults to []. + count_history (Sequence): Counts of history logs. Defaults to []. + max_length (int): The max length of history logs. Defaults to 1000000. 
+ """ + _statistics_methods: dict = dict() + + def __init__(self, + log_history: Sequence = [], + count_history: Sequence = [], + max_length: int = 1000000): + + self.max_length = max_length + self._set_default_statistics() + assert len(log_history) == len(count_history), \ + 'The lengths of log_history and count_histroy should be equal' + if len(log_history) > max_length: + warnings.warn(f'The length of history buffer({len(log_history)}) ' + f'exceeds the max_length({max_length}), the first ' + 'few elements will be ignored.') + self._log_history = np.array(log_history[-max_length:]) + self._count_history = np.array(count_history[-max_length:]) + else: + self._log_history = np.array(log_history) + self._count_history = np.array(count_history) + + def _set_default_statistics(self) -> None: + """Register default statistic methods: min, max, current and mean.""" + self._statistics_methods.setdefault('min', HistoryBuffer.min) + self._statistics_methods.setdefault('max', HistoryBuffer.max) + self._statistics_methods.setdefault('current', HistoryBuffer.current) + self._statistics_methods.setdefault('mean', HistoryBuffer.mean) + + def update(self, log_val: Union[int, float], count: int = 1) -> None: + """update the log history. + + If the length of the buffer exceeds ``self._max_length``, the oldest + element will be removed from the buffer. + + Args: + log_val (int or float): The value of log. + count (int): The accumulation times of log, defaults to 1. + ``count`` will be used in smooth statistics. + """ + if (not isinstance(log_val, (int, float)) + or not isinstance(count, (int, float))): + raise TypeError(f'log_val must be int or float but got ' + f'{type(log_val)}, count must be int but got ' + f'{type(count)}') + self._log_history = np.append(self._log_history, log_val) + self._count_history = np.append(self._count_history, count) + if len(self._log_history) > self.max_length: + self._log_history = self._log_history[-self.max_length:] + self._count_history = self._count_history[-self.max_length:] + + @property + def data(self) -> Tuple[np.ndarray, np.ndarray]: + """Get the ``_log_history`` and ``_count_history``. + + Returns: + Tuple[np.ndarray, np.ndarray]: History logs and the counts of + the history logs. + """ + return self._log_history, self._count_history + + @classmethod + def register_statistics(cls, method: Callable) -> Callable: + """Register custom statistics method to ``_statistics_methods``. + + The registered method can be called by ``history_buffer.statistics`` + with corresponding method name and arguments. + + Examples: + >>> @HistoryBuffer.register_statistics + >>> def weighted_mean(self, window_size, weight): + >>> assert len(weight) == window_size + >>> return (self._log_history[-window_size:] * + >>> np.array(weight)).sum() / \ + >>> self._count_history[-window_size:] + + >>> log_buffer = HistoryBuffer([1, 2], [1, 1]) + >>> log_buffer.statistics('weighted_mean', 2, [2, 1]) + 2 + + Args: + method (Callable): Custom statistics method. + Returns: + Callable: Original custom statistics method. + """ + method_name = method.__name__ + assert method_name not in cls._statistics_methods, \ + 'method_name cannot be registered twice!' + cls._statistics_methods[method_name] = method + return method + + def statistics(self, method_name: str, *arg, **kwargs) -> Any: + """Access statistics method by name. + + Args: + method_name (str): Name of method. + + Returns: + Any: Depends on corresponding method. 
+ """ + if method_name not in self._statistics_methods: + raise KeyError(f'{method_name} has not been registered in ' + 'HistoryBuffer._statistics_methods') + method = self._statistics_methods[method_name] + # Provide self arguments for registered functions. + return method(self, *arg, **kwargs) + + def mean(self, window_size: Optional[int] = None) -> np.ndarray: + """Return the mean of the latest ``window_size`` values in log + histories. + + If ``window_size is None`` or ``window_size > len(self._log_history)``, + return the global mean value of history logs. + + Args: + window_size (int, optional): Size of statistics window. + Returns: + np.ndarray: Mean value within the window. + """ + if window_size is not None: + assert isinstance(window_size, int), \ + 'The type of window size should be int, but got ' \ + f'{type(window_size)}' + else: + window_size = len(self._log_history) + logs_sum = self._log_history[-window_size:].sum() + counts_sum = self._count_history[-window_size:].sum() + return logs_sum / counts_sum + + def max(self, window_size: Optional[int] = None) -> np.ndarray: + """Return the maximum value of the latest ``window_size`` values in log + histories. + + If ``window_size is None`` or ``window_size > len(self._log_history)``, + return the global maximum value of history logs. + + Args: + window_size (int, optional): Size of statistics window. + Returns: + np.ndarray: The maximum value within the window. + """ + if window_size is not None: + assert isinstance(window_size, int), \ + 'The type of window size should be int, but got ' \ + f'{type(window_size)}' + else: + window_size = len(self._log_history) + return self._log_history[-window_size:].max() + + def min(self, window_size: Optional[int] = None) -> np.ndarray: + """Return the minimum value of the latest ``window_size`` values in log + histories. + + If ``window_size is None`` or ``window_size > len(self._log_history)``, + return the global minimum value of history logs. + + Args: + window_size (int, optional): Size of statistics window. + Returns: + np.ndarray: The minimum value within the window. + """ + if window_size is not None: + assert isinstance(window_size, int), \ + 'The type of window size should be int, but got ' \ + f'{type(window_size)}' + else: + window_size = len(self._log_history) + return self._log_history[-window_size:].min() + + def current(self) -> np.ndarray: + """Return the recently updated values in log histories. + + Returns: + np.ndarray: Recently updated values in log histories. + """ + if len(self._log_history) == 0: + raise ValueError('HistoryBuffer._log_history is an empty array! ' + 'please call update first') + return self._log_history[-1] + + def __getstate__(self) -> dict: + """Make ``_statistics_methods`` can be resumed. + + Returns: + dict: State dict including statistics_methods. + """ + self.__dict__.update(statistics_methods=self._statistics_methods) + return self.__dict__ + + def __setstate__(self, state): + """Try to load ``_statistics_methods`` from state. + + Args: + state (dict): State dict. 
+ """ + statistics_methods = state.pop('statistics_methods', {}) + self._set_default_statistics() + self._statistics_methods.update(statistics_methods) + self.__dict__.update(state) diff --git a/head_extractor/build/lib/mmengine/logging/logger.py b/head_extractor/build/lib/mmengine/logging/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..839a08cddaec1869b256af9268fbae17858855c8 --- /dev/null +++ b/head_extractor/build/lib/mmengine/logging/logger.py @@ -0,0 +1,463 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import logging +import os +import os.path as osp +import sys +import warnings +from getpass import getuser +from logging import Logger, LogRecord, handlers +from socket import gethostname +from typing import Dict, Optional, Union + +from termcolor import colored + +from mmengine.utils import ManagerMixin +from mmengine.utils.manager import _accquire_lock, _release_lock + + +class FilterDuplicateWarning(logging.Filter): + """Filter the repeated warning message. + + Args: + name (str): name of the filter. + """ + + def __init__(self, name: str = 'mmengine'): + super().__init__(name) + self.seen: set = set() + + def filter(self, record: LogRecord) -> bool: + """Filter the repeated warning message. + + Args: + record (LogRecord): The log record. + + Returns: + bool: Whether to output the log record. + """ + if record.levelno != logging.WARNING: + return True + + if record.msg not in self.seen: + self.seen.add(record.msg) + return True + return False + + +class MMFormatter(logging.Formatter): + """Colorful format for MMLogger. If the log level is error, the logger will + additionally output the location of the code. + + Args: + color (bool): Whether to use colorful format. filehandler is not + allowed to use color format, otherwise it will be garbled. + blink (bool): Whether to blink the ``INFO`` and ``DEBUG`` logging + level. + **kwargs: Keyword arguments passed to + :meth:`logging.Formatter.__init__`. + """ + _color_mapping: dict = dict( + ERROR='red', WARNING='yellow', INFO='white', DEBUG='green') + + def __init__(self, color: bool = True, blink: bool = False, **kwargs): + super().__init__(**kwargs) + assert not (not color and blink), ( + 'blink should only be available when color is True') + # Get prefix format according to color. + error_prefix = self._get_prefix('ERROR', color, blink=True) + warn_prefix = self._get_prefix('WARNING', color, blink=True) + info_prefix = self._get_prefix('INFO', color, blink) + debug_prefix = self._get_prefix('DEBUG', color, blink) + + # Config output format. + self.err_format = (f'%(asctime)s - %(name)s - {error_prefix} - ' + '%(pathname)s - %(funcName)s - %(lineno)d - ' + '%(message)s') + self.warn_format = (f'%(asctime)s - %(name)s - {warn_prefix} - %(' + 'message)s') + self.info_format = (f'%(asctime)s - %(name)s - {info_prefix} - %(' + 'message)s') + self.debug_format = (f'%(asctime)s - %(name)s - {debug_prefix} - %(' + 'message)s') + + def _get_prefix(self, level: str, color: bool, blink=False) -> str: + """Get the prefix of the target log level. + + Args: + level (str): log level. + color (bool): Whether to get colorful prefix. + blink (bool): Whether the prefix will blink. + + Returns: + str: The plain or colorful prefix. 
+ """ + if color: + attrs = ['underline'] + if blink: + attrs.append('blink') + prefix = colored(level, self._color_mapping[level], attrs=attrs) + else: + prefix = level + return prefix + + def format(self, record: LogRecord) -> str: + """Override the `logging.Formatter.format`` method `. Output the + message according to the specified log level. + + Args: + record (LogRecord): A LogRecord instance represents an event being + logged. + + Returns: + str: Formatted result. + """ + if record.levelno == logging.ERROR: + self._style._fmt = self.err_format + elif record.levelno == logging.WARNING: + self._style._fmt = self.warn_format + elif record.levelno == logging.INFO: + self._style._fmt = self.info_format + elif record.levelno == logging.DEBUG: + self._style._fmt = self.debug_format + + result = logging.Formatter.format(self, record) + return result + + +class MMLogger(Logger, ManagerMixin): + """Formatted logger used to record messages. + + ``MMLogger`` can create formatted logger to log message with different + log levels and get instance in the same way as ``ManagerMixin``. + ``MMLogger`` has the following features: + + - Distributed log storage, ``MMLogger`` can choose whether to save log of + different ranks according to `log_file`. + - Message with different log levels will have different colors and format + when displayed on terminal. + + Note: + - The `name` of logger and the ``instance_name`` of ``MMLogger`` could + be different. We can only get ``MMLogger`` instance by + ``MMLogger.get_instance`` but not ``logging.getLogger``. This feature + ensures ``MMLogger`` will not be incluenced by third-party logging + config. + - Different from ``logging.Logger``, ``MMLogger`` will not log warning + or error message without ``Handler``. + + Examples: + >>> logger = MMLogger.get_instance(name='MMLogger', + >>> logger_name='Logger') + >>> # Although logger has name attribute just like `logging.Logger` + >>> # We cannot get logger instance by `logging.getLogger`. + >>> assert logger.name == 'Logger' + >>> assert logger.instance_name = 'MMLogger' + >>> assert id(logger) != id(logging.getLogger('Logger')) + >>> # Get logger that do not store logs. + >>> logger1 = MMLogger.get_instance('logger1') + >>> # Get logger only save rank0 logs. + >>> logger2 = MMLogger.get_instance('logger2', log_file='out.log') + >>> # Get logger only save multiple ranks logs. + >>> logger3 = MMLogger.get_instance('logger3', log_file='out.log', + >>> distributed=True) + + Args: + name (str): Global instance name. + logger_name (str): ``name`` attribute of ``Logging.Logger`` instance. + If `logger_name` is not defined, defaults to 'mmengine'. + log_file (str, optional): The log filename. If specified, a + ``FileHandler`` will be added to the logger. Defaults to None. + log_level (str): The log level of the handler. Defaults to + 'INFO'. If log level is 'DEBUG', distributed logs will be saved + during distributed training. + file_mode (str): The file mode used to open log file. Defaults to 'w'. + distributed (bool): Whether to save distributed logs, Defaults to + false. + file_handler_cfg (dict, optional): Configuration of file handler. + Defaults to None. If ``file_handler_cfg`` is not specified, + ``logging.FileHandler`` will be used by default. If it is + specified, the ``type`` key should be set. It can be + ``RotatingFileHandler``, ``TimedRotatingFileHandler``, + ``WatchedFileHandler`` or other file handlers, and the remaining + fields will be used to build the handler. 
+ + Examples: + >>> file_handler_cfg = dict( + >>> type='TimedRotatingFileHandler', + >>> when='MIDNIGHT', + >>> interval=1, + >>> backupCount=365) + + `New in version 0.9.0.` + """ + + def __init__(self, + name: str, + logger_name='mmengine', + log_file: Optional[str] = None, + log_level: Union[int, str] = 'INFO', + file_mode: str = 'w', + distributed=False, + file_handler_cfg: Optional[dict] = None): + Logger.__init__(self, logger_name) + ManagerMixin.__init__(self, name) + # Get rank in DDP mode. + if isinstance(log_level, str): + log_level = logging._nameToLevel[log_level] + global_rank = _get_rank() + device_id = _get_device_id() + + # Config stream_handler. If `rank != 0`. stream_handler can only + # export ERROR logs. + stream_handler = logging.StreamHandler(stream=sys.stdout) + # `StreamHandler` record month, day, hour, minute, and second + # timestamp. + stream_handler.setFormatter( + MMFormatter(color=True, datefmt='%m/%d %H:%M:%S')) + # Only rank0 `StreamHandler` will log messages below error level. + if global_rank == 0: + stream_handler.setLevel(log_level) + else: + stream_handler.setLevel(logging.ERROR) + stream_handler.addFilter(FilterDuplicateWarning(logger_name)) + self.handlers.append(stream_handler) + + if log_file is not None: + world_size = _get_world_size() + is_distributed = (log_level <= logging.DEBUG + or distributed) and world_size > 1 + if is_distributed: + filename, suffix = osp.splitext(osp.basename(log_file)) + hostname = _get_host_info() + if hostname: + filename = (f'{filename}_{hostname}_device{device_id}_' + f'rank{global_rank}{suffix}') + else: + # Omit hostname if it is empty + filename = (f'{filename}_device{device_id}_' + f'rank{global_rank}{suffix}') + log_file = osp.join(osp.dirname(log_file), filename) + # Save multi-ranks logs if distributed is True. The logs of rank0 + # will always be saved. + if global_rank == 0 or is_distributed: + if file_handler_cfg is not None: + assert 'type' in file_handler_cfg + file_handler_type = file_handler_cfg.pop('type') + file_handlers_map = _get_logging_file_handlers() + if file_handler_type in file_handlers_map: + file_handler_cls = file_handlers_map[file_handler_type] + file_handler_cfg.setdefault('filename', log_file) + file_handler = file_handler_cls(**file_handler_cfg) + else: + raise ValueError('`logging.handlers` does not ' + f'contain {file_handler_type}') + else: + # Here, the default behavior of the official + # logger is 'a'. Thus, we provide an interface to + # change the file mode to the default behavior. + # `FileHandler` is not supported to have colors, + # otherwise it will appear garbled. + file_handler = logging.FileHandler(log_file, file_mode) + + # `StreamHandler` record year, month, day hour, minute, + # and second timestamp. file_handler will only record logs + # without color to avoid garbled code saved in files. + file_handler.setFormatter( + MMFormatter(color=False, datefmt='%Y/%m/%d %H:%M:%S')) + file_handler.setLevel(log_level) + file_handler.addFilter(FilterDuplicateWarning(logger_name)) + self.handlers.append(file_handler) + self._log_file = log_file + + @property + def log_file(self): + return self._log_file + + @classmethod + def get_current_instance(cls) -> 'MMLogger': + """Get latest created ``MMLogger`` instance. + + :obj:`MMLogger` can call :meth:`get_current_instance` before any + instance has been created, and return a logger with the instance name + "mmengine". + + Returns: + MMLogger: Configured logger instance. 
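+
+        Example (assuming no ``MMLogger`` instance has been created yet, so
+        the fallback instance is returned):
+
+        >>> logger = MMLogger.get_current_instance()
+        >>> logger.instance_name
+        'mmengine'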
+ """ + if not cls._instance_dict: + cls.get_instance('mmengine') + return super().get_current_instance() + + def callHandlers(self, record: LogRecord) -> None: + """Pass a record to all relevant handlers. + + Override ``callHandlers`` method in ``logging.Logger`` to avoid + multiple warning messages in DDP mode. Loop through all handlers of + the logger instance and its parents in the logger hierarchy. If no + handler was found, the record will not be output. + + Args: + record (LogRecord): A ``LogRecord`` instance contains logged + message. + """ + for handler in self.handlers: + if record.levelno >= handler.level: + handler.handle(record) + + def setLevel(self, level): + """Set the logging level of this logger. + + If ``logging.Logger.selLevel`` is called, all ``logging.Logger`` + instances managed by ``logging.Manager`` will clear the cache. Since + ``MMLogger`` is not managed by ``logging.Manager`` anymore, + ``MMLogger`` should override this method to clear caches of all + ``MMLogger`` instance which is managed by :obj:`ManagerMixin`. + + level must be an int or a str. + """ + self.level = logging._checkLevel(level) + _accquire_lock() + # The same logic as `logging.Manager._clear_cache`. + for logger in MMLogger._instance_dict.values(): + logger._cache.clear() + _release_lock() + + +def print_log(msg, + logger: Optional[Union[Logger, str]] = None, + level=logging.INFO) -> None: + """Print a log message. + + Args: + msg (str): The message to be logged. + logger (Logger or str, optional): If the type of logger is + ``logging.Logger``, we directly use logger to log messages. + Some special loggers are: + + - "silent": No message will be printed. + - "current": Use latest created logger to log message. + - other str: Instance name of logger. The corresponding logger + will log message if it has been created, otherwise ``print_log`` + will raise a `ValueError`. + - None: The `print()` method will be used to print log messages. + level (int): Logging level. Only available when `logger` is a Logger + object, "current", or a created logger instance name. + """ + if logger is None: + print(msg) + elif isinstance(logger, logging.Logger): + logger.log(level, msg) + elif logger == 'silent': + pass + elif logger == 'current': + logger_instance = MMLogger.get_current_instance() + logger_instance.log(level, msg) + elif isinstance(logger, str): + # If the type of `logger` is `str`, but not with value of `current` or + # `silent`, we assume it indicates the name of the logger. If the + # corresponding logger has not been created, `print_log` will raise + # a `ValueError`. 
+ if MMLogger.check_instance_created(logger): + logger_instance = MMLogger.get_instance(logger) + logger_instance.log(level, msg) + else: + raise ValueError(f'MMLogger: {logger} has not been created!') + else: + raise TypeError( + '`logger` should be either a logging.Logger object, str, ' + f'"silent", "current" or None, but got {type(logger)}') + + +def _get_world_size(): + """Support using logging module without torch.""" + try: + # requires torch + from mmengine.dist import get_world_size + except ImportError: + return 1 + else: + return get_world_size() + + +def _get_rank(): + """Support using logging module without torch.""" + try: + # requires torch + from mmengine.dist import get_rank + except ImportError: + return 0 + else: + return get_rank() + + +def _get_device_id(): + """Get device id of current machine.""" + try: + import torch + except ImportError: + return 0 + else: + MUSA_AVAILABLE = False + try: + import torch_musa + MUSA_AVAILABLE = True + except ImportError: + pass + if MUSA_AVAILABLE: + local_rank = int(os.getenv('LOCAL_RANK', '0')) + musa_visible_devices = os.getenv('MUSA_VISIBLE_DEVICES', None) + if musa_visible_devices is None: + num_device = torch_musa.device_count() + musa_visible_devices = list(range(num_device)) + else: + musa_visible_devices = musa_visible_devices.split(',') + return int(musa_visible_devices[local_rank]) + else: + local_rank = int(os.getenv('LOCAL_RANK', '0')) + # TODO: return device id of npu and mlu. + if not torch.cuda.is_available(): + return local_rank + cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None) + if cuda_visible_devices is None: + num_device = torch.cuda.device_count() + cuda_visible_devices = list(range(num_device)) + else: + cuda_visible_devices = cuda_visible_devices.split(',') + try: + return int(cuda_visible_devices[local_rank]) + except ValueError: + # handle case for Multi-Instance GPUs + # see #1148 for details + return cuda_visible_devices[local_rank] + + +def _get_host_info() -> str: + """Get hostname and username. + + Return empty string if exception raised, e.g. ``getpass.getuser()`` will + lead to error in docker container + """ + host = '' + try: + host = f'{getuser()}@{gethostname()}' + except Exception as e: + warnings.warn(f'Host or user not found: {str(e)}') + finally: + return host + + +def _get_logging_file_handlers() -> Dict: + """Get additional file_handlers in ``logging.handlers``. + + Returns: + Dict: A map of file_handlers. + """ + file_handlers_map = {} + for module_name in dir(handlers): + if module_name.startswith('__'): + continue + _fh = getattr(handlers, module_name) + if inspect.isclass(_fh) and issubclass(_fh, logging.FileHandler): + file_handlers_map[module_name] = _fh + return file_handlers_map diff --git a/head_extractor/build/lib/mmengine/logging/message_hub.py b/head_extractor/build/lib/mmengine/logging/message_hub.py new file mode 100644 index 0000000000000000000000000000000000000000..82565d88321f84f849d63626970c46c1b1b65e79 --- /dev/null +++ b/head_extractor/build/lib/mmengine/logging/message_hub.py @@ -0,0 +1,470 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Optional, Union + +import numpy as np + +from mmengine.utils import ManagerMixin +from .history_buffer import HistoryBuffer +from .logger import print_log + +if TYPE_CHECKING: + import torch + + +class MessageHub(ManagerMixin): + """Message hub for component interaction. 
MessageHub is created and + accessed in the same way as ManagerMixin. + + ``MessageHub`` will record log information and runtime information. The + log information refers to the learning rate, loss, etc. of the model + during training phase, which will be stored as ``HistoryBuffer``. The + runtime information refers to the iter times, meta information of + runner etc., which will be overwritten by next update. + + Args: + name (str): Name of message hub used to get corresponding instance + globally. + log_scalars (dict, optional): Each key-value pair in the + dictionary is the name of the log information such as "loss", "lr", + "metric" and their corresponding values. The type of value must be + HistoryBuffer. Defaults to None. + runtime_info (dict, optional): Each key-value pair in the + dictionary is the name of the runtime information and their + corresponding values. Defaults to None. + resumed_keys (dict, optional): Each key-value pair in the + dictionary decides whether the key in :attr:`_log_scalars` and + :attr:`_runtime_info` will be serialized. + + Note: + Key in :attr:`_resumed_keys` belongs to :attr:`_log_scalars` or + :attr:`_runtime_info`. The corresponding value cannot be set + repeatedly. + + Examples: + >>> # create empty `MessageHub`. + >>> message_hub1 = MessageHub('name') + >>> log_scalars = dict(loss=HistoryBuffer()) + >>> runtime_info = dict(task='task') + >>> resumed_keys = dict(loss=True) + >>> # create `MessageHub` from data. + >>> message_hub2 = MessageHub( + >>> name='name', + >>> log_scalars=log_scalars, + >>> runtime_info=runtime_info, + >>> resumed_keys=resumed_keys) + """ + + def __init__(self, + name: str, + log_scalars: Optional[dict] = None, + runtime_info: Optional[dict] = None, + resumed_keys: Optional[dict] = None): + super().__init__(name) + self._log_scalars = self._parse_input('log_scalars', log_scalars) + self._runtime_info = self._parse_input('runtime_info', runtime_info) + self._resumed_keys = self._parse_input('resumed_keys', resumed_keys) + + for value in self._log_scalars.values(): + assert isinstance(value, HistoryBuffer), \ + ("The type of log_scalars'value must be HistoryBuffer, but " + f'got {type(value)}') + + for key in self._resumed_keys.keys(): + assert key in self._log_scalars or key in self._runtime_info, \ + ('Key in `resumed_keys` must contained in `log_scalars` or ' + f'`runtime_info`, but got {key}') + + @classmethod + def get_current_instance(cls) -> 'MessageHub': + """Get latest created ``MessageHub`` instance. + + :obj:`MessageHub` can call :meth:`get_current_instance` before any + instance has been created, and return a message hub with the instance + name "mmengine". + + Returns: + MessageHub: Empty ``MessageHub`` instance. + """ + if not cls._instance_dict: + cls.get_instance('mmengine') + return super().get_current_instance() + + def update_scalar(self, + key: str, + value: Union[int, float, np.ndarray, 'torch.Tensor'], + count: int = 1, + resumed: bool = True) -> None: + """Update :attr:_log_scalars. + + Update ``HistoryBuffer`` in :attr:`_log_scalars`. If corresponding key + ``HistoryBuffer`` has been created, ``value`` and ``count`` is the + argument of ``HistoryBuffer.update``, Otherwise, ``update_scalar`` + will create an ``HistoryBuffer`` with value and count via the + constructor of ``HistoryBuffer``. 
+
+        Examples:
+            >>> message_hub = MessageHub(name='name')
+            >>> # create loss `HistoryBuffer` with value=1, count=1
+            >>> message_hub.update_scalar('loss', 1)
+            >>> # update loss `HistoryBuffer` with value
+            >>> message_hub.update_scalar('loss', 3)
+            >>> message_hub.update_scalar('loss', 3, resumed=False)
+            AssertionError: loss used to be true, but got false now. resumed
+            keys cannot be modified repeatedly.
+
+        Note:
+            The ``resumed`` argument needs to be consistent for the same
+            ``key``.
+
+        Args:
+            key (str): Key of ``HistoryBuffer``.
+            value (torch.Tensor or np.ndarray or int or float): Value of log.
+            count (int): Accumulation times of log, defaults to 1. `count`
+                will be used in smooth statistics.
+            resumed (bool): Whether the corresponding ``HistoryBuffer``
+                could be resumed. Defaults to True.
+        """
+        self._set_resumed_keys(key, resumed)
+        checked_value = self._get_valid_value(value)
+        assert isinstance(count, int), (
+            f'The type of count must be int, but got {type(count)}: {count}')
+        if key in self._log_scalars:
+            self._log_scalars[key].update(checked_value, count)
+        else:
+            self._log_scalars[key] = HistoryBuffer([checked_value], [count])
+
+    def update_scalars(self, log_dict: dict, resumed: bool = True) -> None:
+        """Update :attr:`_log_scalars` with a dict.
+
+        ``update_scalars`` iterates through each pair of log_dict key-value,
+        and calls ``update_scalar``. If the type of a value is dict, the value
+        should be ``dict(value=xxx)`` or ``dict(value=xxx, count=xxx)``. Each
+        item in ``log_dict`` has the same resume option.
+
+        Note:
+            The ``resumed`` argument needs to be consistent for the same
+            ``log_dict``.
+
+        Args:
+            log_dict (dict): Used for batch updating :attr:`_log_scalars`.
+            resumed (bool): Whether all ``HistoryBuffer`` referred in
+                log_dict should be resumed. Defaults to True.
+
+        Examples:
+            >>> message_hub = MessageHub.get_instance('mmengine')
+            >>> log_dict = dict(a=1, b=2, c=3)
+            >>> message_hub.update_scalars(log_dict)
+            >>> # The default count of `a`, `b` and `c` is 1.
+            >>> log_dict = dict(a=1, b=2, c=dict(value=1, count=2))
+            >>> message_hub.update_scalars(log_dict)
+            >>> # The count of `c` is 2.
+        """
+        assert isinstance(log_dict, dict), ('`log_dict` must be a dict, '
+                                            f'but got {type(log_dict)}')
+        for log_name, log_val in log_dict.items():
+            if isinstance(log_val, dict):
+                assert 'value' in log_val, \
+                    f'value must be defined in {log_val}'
+                count = self._get_valid_value(log_val.get('count', 1))
+                value = log_val['value']
+            else:
+                count = 1
+                value = log_val
+            assert isinstance(count, int), (
+                'The type of count must be int, but got '
+                f'{type(count)}: {count}')
+            self.update_scalar(log_name, value, count, resumed)
+
+    def update_info(self, key: str, value: Any, resumed: bool = True) -> None:
+        """Update runtime information.
+
+        The runtime information corresponding to the key will be overwritten
+        each time ``update_info`` is called.
+
+        Note:
+            The ``resumed`` argument needs to be consistent for the same
+            ``key``.
+
+        Examples:
+            >>> message_hub = MessageHub(name='name')
+            >>> message_hub.update_info('iter', 100)
+
+        Args:
+            key (str): Key of runtime information.
+            value (Any): Value of runtime information.
+            resumed (bool): Whether the corresponding runtime information
+                could be resumed. Defaults to True.
+        """
+        self._set_resumed_keys(key, resumed)
+        self._runtime_info[key] = value
+
+    def pop_info(self, key: str, default: Optional[Any] = None) -> Any:
+        """Remove runtime information by key.
+        If the key does not exist, this method will return the default
+        value.
+
+        Args:
+            key (str): Key of runtime information.
+            default (Any, optional): The default returned value for the
+                given key.
+
+        Returns:
+            Any: The runtime information if the key exists.
+        """
+        return self._runtime_info.pop(key, default)
+
+    def update_info_dict(self, info_dict: dict, resumed: bool = True) -> None:
+        """Update runtime information with a dictionary.
+
+        The runtime information corresponding to each key will be overwritten
+        each time ``update_info_dict`` is called.
+
+        Note:
+            The ``resumed`` argument needs to be consistent for the same
+            ``info_dict``.
+
+        Examples:
+            >>> message_hub = MessageHub(name='name')
+            >>> message_hub.update_info_dict({'iter': 100})
+
+        Args:
+            info_dict (dict): Runtime information dictionary.
+            resumed (bool): Whether the corresponding runtime information
+                could be resumed. Defaults to True.
+        """
+        assert isinstance(info_dict, dict), ('`info_dict` must be a dict, '
+                                             f'but got {type(info_dict)}')
+        for key, value in info_dict.items():
+            self.update_info(key, value, resumed=resumed)
+
+    def _set_resumed_keys(self, key: str, resumed: bool) -> None:
+        """Set corresponding resumed keys.
+
+        This method is called by ``update_scalar``, ``update_scalars`` and
+        ``update_info`` to set whether the corresponding key is True or False
+        in :attr:`_resumed_keys`.
+
+        Args:
+            key (str): Key of :attr:`_log_scalars` or :attr:`_runtime_info`.
+            resumed (bool): Whether the corresponding key could be resumed.
+        """
+        if key not in self._resumed_keys:
+            self._resumed_keys[key] = resumed
+        else:
+            assert self._resumed_keys[key] == resumed, \
+                f'{key} used to be {self._resumed_keys[key]}, but got ' \
+                f'{resumed} now. resumed keys cannot be modified repeatedly.'
+
+    @property
+    def log_scalars(self) -> OrderedDict:
+        """Get all ``HistoryBuffer`` instances.
+
+        Note:
+            Considering the large memory footprint of history buffers in the
+            post-training, this property returns a reference to the history
+            buffers rather than a copy.
+
+        Returns:
+            OrderedDict: All ``HistoryBuffer`` instances.
+        """
+        return self._log_scalars
+
+    @property
+    def runtime_info(self) -> OrderedDict:
+        """Get all runtime information.
+
+        Returns:
+            OrderedDict: All runtime information.
+        """
+        return self._runtime_info
+
+    def get_scalar(self, key: str) -> HistoryBuffer:
+        """Get ``HistoryBuffer`` instance by key.
+
+        Note:
+            Considering the large memory footprint of history buffers in the
+            post-training, :meth:`get_scalar` will return a reference to the
+            history buffer rather than a copy.
+
+        Args:
+            key (str): Key of ``HistoryBuffer``.
+
+        Returns:
+            HistoryBuffer: Corresponding ``HistoryBuffer`` instance if the
+            key exists.
+        """
+        if key not in self.log_scalars:
+            raise KeyError(f'{key} is not found in Messagehub.log_buffers: '
+                           f'instance name is: {MessageHub.instance_name}')
+        return self.log_scalars[key]
+
+    def get_info(self, key: str, default: Optional[Any] = None) -> Any:
+        """Get runtime information by key. If the key does not exist, this
+        method will return default information.
+
+        Args:
+            key (str): Key of runtime information.
+            default (Any, optional): The default returned value for the
+                given key.
+
+        Returns:
+            Any: The corresponding runtime information if the key exists.
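+
+        Example (the instance name is illustrative):
+
+        >>> message_hub = MessageHub.get_instance('demo')
+        >>> message_hub.update_info('iter', 10)
+        >>> message_hub.get_info('iter')
+        10
+        >>> message_hub.get_info('missing', default=-1)
+        -1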
+ """ + if key not in self.runtime_info: + return default + else: + # TODO: There are restrictions on objects that can be saved + # return copy.deepcopy(self._runtime_info[key]) + return self._runtime_info[key] + + def _get_valid_value( + self, + value: Union['torch.Tensor', np.ndarray, np.number, int, float], + ) -> Union[int, float]: + """Convert value to python built-in type. + + Args: + value (torch.Tensor or np.ndarray or np.number or int or float): + value of log. + + Returns: + float or int: python built-in type value. + """ + if isinstance(value, (np.ndarray, np.number)): + assert value.size == 1 + value = value.item() + elif isinstance(value, (int, float)): + value = value + else: + # check whether value is torch.Tensor but don't want + # to import torch in this file + assert hasattr(value, 'numel') and value.numel() == 1 + value = value.item() + return value # type: ignore + + def state_dict(self) -> dict: + """Returns a dictionary containing log scalars, runtime information and + resumed keys, which should be resumed. + + The returned ``state_dict`` can be loaded by :meth:`load_state_dict`. + + Returns: + dict: A dictionary contains ``log_scalars``, ``runtime_info`` and + ``resumed_keys``. + """ + saved_scalars = OrderedDict() + saved_info = OrderedDict() + + for key, value in self._log_scalars.items(): + if self._resumed_keys.get(key, False): + saved_scalars[key] = copy.deepcopy(value) + + for key, value in self._runtime_info.items(): + if self._resumed_keys.get(key, False): + try: + saved_info[key] = copy.deepcopy(value) + except: # noqa: E722 + print_log( + f'{key} in message_hub cannot be copied, ' + f'just return its reference. ', + logger='current', + level=logging.WARNING) + saved_info[key] = value + return dict( + log_scalars=saved_scalars, + runtime_info=saved_info, + resumed_keys=self._resumed_keys) + + def load_state_dict(self, state_dict: Union['MessageHub', dict]) -> None: + """Loads log scalars, runtime information and resumed keys from + ``state_dict`` or ``message_hub``. + + If ``state_dict`` is a dictionary returned by :meth:`state_dict`, it + will only make copies of data which should be resumed from the source + ``message_hub``. + + If ``state_dict`` is a ``message_hub`` instance, it will make copies of + all data from the source message_hub. We suggest to load data from + ``dict`` rather than a ``MessageHub`` instance. + + Args: + state_dict (dict or MessageHub): A dictionary contains key + ``log_scalars`` ``runtime_info`` and ``resumed_keys``, or a + MessageHub instance. + """ + if isinstance(state_dict, dict): + for key in ('log_scalars', 'runtime_info', 'resumed_keys'): + assert key in state_dict, ( + 'The loaded `state_dict` of `MessageHub` must contain ' + f'key: `{key}`') + # The old `MessageHub` could save non-HistoryBuffer `log_scalars`, + # therefore the loaded `log_scalars` needs to be filtered. 
+            for key, value in state_dict['log_scalars'].items():
+                if not isinstance(value, HistoryBuffer):
+                    print_log(
+                        f'{key} in message_hub is not HistoryBuffer, '
+                        f'just skip resuming it.',
+                        logger='current',
+                        level=logging.WARNING)
+                    continue
+                self.log_scalars[key] = value
+
+            for key, value in state_dict['runtime_info'].items():
+                try:
+                    self._runtime_info[key] = copy.deepcopy(value)
+                except:  # noqa: E722
+                    print_log(
+                        f'{key} in message_hub cannot be copied, '
+                        f'just return its reference.',
+                        logger='current',
+                        level=logging.WARNING)
+                    self._runtime_info[key] = value
+
+            for key, value in state_dict['resumed_keys'].items():
+                if key not in set(self.log_scalars.keys()) | \
+                        set(self._runtime_info.keys()):
+                    print_log(
+                        f'resumed key: {key} is not defined in message_hub, '
+                        f'just skip resuming this key.',
+                        logger='current',
+                        level=logging.WARNING)
+                    continue
+                elif not value:
+                    print_log(
+                        f'Although resumed key: {key} is False, {key} '
+                        'will still be loaded this time. This key will '
+                        'not be saved by the next calling of '
+                        '`MessageHub.state_dict()`',
+                        logger='current',
+                        level=logging.WARNING)
+                self._resumed_keys[key] = value
+
+        # Since some checkpoints saved serialized `message_hub` instance,
+        # `load_state_dict` supports loading `message_hub` instance for
+        # compatibility
+        else:
+            self._log_scalars = copy.deepcopy(state_dict._log_scalars)
+            self._runtime_info = copy.deepcopy(state_dict._runtime_info)
+            self._resumed_keys = copy.deepcopy(state_dict._resumed_keys)
+
+    def _parse_input(self, name: str, value: Any) -> OrderedDict:
+        """Parse input value.
+
+        Args:
+            name (str): Name of input value.
+            value (Any): Input value.
+
+        Returns:
+            OrderedDict: Parsed input value.
+        """
+        if value is None:
+            return OrderedDict()
+        elif isinstance(value, dict):
+            return OrderedDict(value)
+        else:
+            raise TypeError(f'{name} should be a dict or `None`, but '
+                            f'got {type(value)}')
diff --git a/head_extractor/build/lib/mmengine/model/__init__.py b/head_extractor/build/lib/mmengine/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..033512a985e66b300a52415fa5b206a56e268b60
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/model/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.utils.dl_utils import TORCH_VERSION +from mmengine.utils.version_utils import digit_version +from .averaged_model import (BaseAveragedModel, ExponentialMovingAverage, + MomentumAnnealingEMA, StochasticWeightAverage) +from .base_model import BaseDataPreprocessor, BaseModel, ImgDataPreprocessor +from .base_module import BaseModule, ModuleDict, ModuleList, Sequential +from .test_time_aug import BaseTTAModel +from .utils import (convert_sync_batchnorm, detect_anomalous_params, + merge_dict, revert_sync_batchnorm, stack_batch) +from .weight_init import (BaseInit, Caffe2XavierInit, ConstantInit, + KaimingInit, NormalInit, PretrainedInit, + TruncNormalInit, UniformInit, XavierInit, + bias_init_with_prob, caffe2_xavier_init, + constant_init, initialize, kaiming_init, normal_init, + trunc_normal_init, uniform_init, update_init_info, + xavier_init) +from .wrappers import (MMDistributedDataParallel, + MMSeparateDistributedDataParallel, is_model_wrapper) + +__all__ = [ + 'MMDistributedDataParallel', 'is_model_wrapper', 'BaseAveragedModel', + 'StochasticWeightAverage', 'ExponentialMovingAverage', + 'MomentumAnnealingEMA', 'BaseModel', 'BaseDataPreprocessor', + 'ImgDataPreprocessor', 'MMSeparateDistributedDataParallel', 'BaseModule', + 'stack_batch', 'merge_dict', 'detect_anomalous_params', 'ModuleList', + 'ModuleDict', 'Sequential', 'revert_sync_batchnorm', 'update_init_info', + 'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init', + 'uniform_init', 'kaiming_init', 'caffe2_xavier_init', + 'bias_init_with_prob', 'BaseInit', 'ConstantInit', 'XavierInit', + 'NormalInit', 'TruncNormalInit', 'UniformInit', 'KaimingInit', + 'Caffe2XavierInit', 'PretrainedInit', 'initialize', + 'convert_sync_batchnorm', 'BaseTTAModel' +] + +if digit_version(TORCH_VERSION) >= digit_version('2.0.0'): + from .wrappers import MMFullyShardedDataParallel # noqa:F401 + __all__.append('MMFullyShardedDataParallel') diff --git a/head_extractor/build/lib/mmengine/model/averaged_model.py b/head_extractor/build/lib/mmengine/model/averaged_model.py new file mode 100644 index 0000000000000000000000000000000000000000..58457c2a6e92311149917b1e362ae339aec8be27 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/averaged_model.py @@ -0,0 +1,263 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from abc import abstractmethod +from copy import deepcopy +from typing import Optional + +import torch +import torch.nn as nn +from torch import Tensor + +from mmengine.logging import print_log +from mmengine.registry import MODELS + + +class BaseAveragedModel(nn.Module): + """A base class for averaging model weights. + + Weight averaging, such as SWA and EMA, is a widely used technique for + training neural networks. This class implements the averaging process + for a model. All subclasses must implement the `avg_func` method. + This class creates a copy of the provided module :attr:`model` + on the :attr:`device` and allows computing running averages of the + parameters of the :attr:`model`. + + The code is referenced from: https://github.com/pytorch/pytorch/blob/master/torch/optim/swa_utils.py. + + Different from the `AveragedModel` in PyTorch, we use in-place operation + to improve the parameter updating speed, which is about 5 times faster + than the non-in-place version. + + In mmengine, we provide two ways to use the model averaging: + + 1. Use the model averaging module in hook: + We provide an :class:`mmengine.hooks.EMAHook` to apply the model + averaging during training. 
Add ``custom_hooks=[dict(type='EMAHook')]`` + to the config or the runner. + + 2. Use the model averaging module directly in the algorithm. Take the ema + teacher in semi-supervise as an example: + + >>> from mmengine.model import ExponentialMovingAverage + >>> student = ResNet(depth=50) + >>> # use ema model as teacher + >>> ema_teacher = ExponentialMovingAverage(student) + + Args: + model (nn.Module): The model to be averaged. + interval (int): Interval between two updates. Defaults to 1. + device (torch.device, optional): If provided, the averaged model will + be stored on the :attr:`device`. Defaults to None. + update_buffers (bool): if True, it will compute running averages for + both the parameters and the buffers of the model. Defaults to + False. + """ # noqa: E501 + + def __init__(self, + model: nn.Module, + interval: int = 1, + device: Optional[torch.device] = None, + update_buffers: bool = False) -> None: + super().__init__() + self.module = deepcopy(model).requires_grad_(False) + self.interval = interval + if device is not None: + self.module = self.module.to(device) + self.register_buffer('steps', + torch.tensor(0, dtype=torch.long, device=device)) + self.update_buffers = update_buffers + if update_buffers: + self.avg_parameters = self.module.state_dict() + else: + self.avg_parameters = dict(self.module.named_parameters()) + + @abstractmethod + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int) -> None: + """Use in-place operation to compute the average of the parameters. All + subclasses must implement this method. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + + def forward(self, *args, **kwargs): + """Forward method of the averaged model.""" + return self.module(*args, **kwargs) + + def update_parameters(self, model: nn.Module) -> None: + """Update the parameters of the model. This method will execute the + ``avg_func`` to compute the new parameters and update the model's + parameters. + + Args: + model (nn.Module): The model whose parameters will be averaged. + """ + src_parameters = ( + model.state_dict() + if self.update_buffers else dict(model.named_parameters())) + if self.steps == 0: + for k, p_avg in self.avg_parameters.items(): + p_avg.data.copy_(src_parameters[k].data) + elif self.steps % self.interval == 0: + for k, p_avg in self.avg_parameters.items(): + if p_avg.dtype.is_floating_point: + device = p_avg.device + self.avg_func(p_avg.data, + src_parameters[k].data.to(device), + self.steps) + if not self.update_buffers: + # If not update the buffers, + # keep the buffers in sync with the source model. + for b_avg, b_src in zip(self.module.buffers(), model.buffers()): + b_avg.data.copy_(b_src.data.to(b_avg.device)) + self.steps += 1 + + +@MODELS.register_module() +class StochasticWeightAverage(BaseAveragedModel): + """Implements the stochastic weight averaging (SWA) of the model. + + Stochastic Weight Averaging was proposed in `Averaging Weights Leads to + Wider Optima and Better Generalization, UAI 2018. + `_ by Pavel Izmailov, Dmitrii + Podoprikhin, Timur Garipov, Dmitry Vetrov and Andrew Gordon Wilson. + """ + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int) -> None: + """Compute the average of the parameters using stochastic weight + average. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. 
+            steps (int): The number of times the parameters have been
+                updated.
+        """
+        averaged_param.add_(
+            source_param - averaged_param,
+            alpha=1 / float(steps // self.interval + 1))
+
+
+@MODELS.register_module()
+class ExponentialMovingAverage(BaseAveragedModel):
+    r"""Implements the exponential moving average (EMA) of the model.
+
+    All parameters are updated by the formula below:
+
+    .. math::
+
+        Xema_{t+1} = (1 - momentum) * Xema_{t} + momentum * X_t
+
+    .. note::
+        This :attr:`momentum` argument is different from the one used in
+        optimizer classes and the conventional notion of momentum.
+        Mathematically, :math:`Xema_{t+1}` is the moving average and
+        :math:`X_t` is the new observed value. The value of momentum is
+        usually a small number, allowing observed values to slowly update
+        the ema parameters.
+
+    Args:
+        model (nn.Module): The model to be averaged.
+        momentum (float): The momentum used for updating the ema parameters.
+            Defaults to 0.0002.
+            EMA's parameters are updated with the formula
+            :math:`averaged\_param = (1-momentum) * averaged\_param +
+            momentum * source\_param`.
+        interval (int): Interval between two updates. Defaults to 1.
+        device (torch.device, optional): If provided, the averaged model will
+            be stored on the :attr:`device`. Defaults to None.
+        update_buffers (bool): If True, it will compute running averages for
+            both the parameters and the buffers of the model. Defaults to
+            False.
+    """  # noqa: W605
+
+    def __init__(self,
+                 model: nn.Module,
+                 momentum: float = 0.0002,
+                 interval: int = 1,
+                 device: Optional[torch.device] = None,
+                 update_buffers: bool = False) -> None:
+        super().__init__(model, interval, device, update_buffers)
+        assert 0.0 < momentum < 1.0, ('momentum must be in range (0.0, 1.0), '
+                                      f'but got {momentum}')
+        if momentum > 0.5:
+            print_log(
+                'The value of momentum in EMA is usually a small number, '
+                'which is different from the conventional notion of '
+                f'momentum, but got {momentum}. Please make sure the '
+                f'value is correct.',
+                logger='current',
+                level=logging.WARNING)
+        self.momentum = momentum
+
+    def avg_func(self, averaged_param: Tensor, source_param: Tensor,
+                 steps: int) -> None:
+        """Compute the moving average of the parameters using exponential
+        moving average.
+
+        Args:
+            averaged_param (Tensor): The averaged parameters.
+            source_param (Tensor): The source parameters.
+            steps (int): The number of times the parameters have been
+                updated.
+        """
+        averaged_param.lerp_(source_param, self.momentum)
+
+
+@MODELS.register_module()
+class MomentumAnnealingEMA(ExponentialMovingAverage):
+    r"""Exponential moving average (EMA) with momentum annealing strategy.
+
+    Args:
+        model (nn.Module): The model to be averaged.
+        momentum (float): The momentum used for updating the ema parameters.
+            Defaults to 0.0002.
+            EMA's parameters are updated with the formula
+            :math:`averaged\_param = (1-momentum) * averaged\_param +
+            momentum * source\_param`.
+        gamma (int): Use a larger momentum early in training and gradually
+            anneal to a smaller value to update the ema model smoothly. The
+            momentum is calculated as max(momentum, gamma / (gamma + steps)).
+            Defaults to 100.
+        interval (int): Interval between two updates. Defaults to 1.
+        device (torch.device, optional): If provided, the averaged model will
+            be stored on the :attr:`device`. Defaults to None.
+        update_buffers (bool): If True, it will compute running averages for
+            both the parameters and the buffers of the model. Defaults to
+            False.
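+
+    Examples:
+        >>> # A minimal sketch; ``nn.Linear`` stands in for any model.
+        >>> # Early in training the annealed momentum
+        >>> # ``max(momentum, gamma / (gamma + steps))`` is close to 1, so
+        >>> # the averaged model tracks the source model quickly before
+        >>> # settling down to the configured ``momentum``.
+        >>> import torch.nn as nn
+        >>> model = nn.Linear(2, 2)
+        >>> ema = MomentumAnnealingEMA(model, momentum=0.0002, gamma=100)
+        >>> for _ in range(10):
+        ...     ema.update_parameters(model)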
+ """ + + def __init__(self, + model: nn.Module, + momentum: float = 0.0002, + gamma: int = 100, + interval: int = 1, + device: Optional[torch.device] = None, + update_buffers: bool = False) -> None: + super().__init__( + model=model, + momentum=momentum, + interval=interval, + device=device, + update_buffers=update_buffers) + assert gamma > 0, f'gamma must be greater than 0, but got {gamma}' + self.gamma = gamma + + def avg_func(self, averaged_param: Tensor, source_param: Tensor, + steps: int) -> None: + """Compute the moving average of the parameters using the linear + momentum strategy. + + Args: + averaged_param (Tensor): The averaged parameters. + source_param (Tensor): The source parameters. + steps (int): The number of times the parameters have been + updated. + """ + momentum = max(self.momentum, + self.gamma / (self.gamma + self.steps.item())) + averaged_param.lerp_(source_param, momentum) diff --git a/head_extractor/build/lib/mmengine/model/base_model/__init__.py b/head_extractor/build/lib/mmengine/model/base_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..66a3cb89a9326799c2822bfb0b06cb2a0602c4e6 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/base_model/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_model import BaseModel +from .data_preprocessor import BaseDataPreprocessor, ImgDataPreprocessor + +__all__ = ['BaseModel', 'ImgDataPreprocessor', 'BaseDataPreprocessor'] diff --git a/head_extractor/build/lib/mmengine/model/base_model/base_model.py b/head_extractor/build/lib/mmengine/model/base_model/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..299cd67557ed7d7890d50bd9ded4228345230b44 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/base_model/base_model.py @@ -0,0 +1,367 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from collections import OrderedDict +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from mmengine.optim import OptimWrapper +from mmengine.registry import MODELS +from mmengine.utils import is_list_of +from ..base_module import BaseModule +from .data_preprocessor import BaseDataPreprocessor + + +class BaseModel(BaseModule): + """Base class for all algorithmic models. + + BaseModel implements the basic functions of the algorithmic model, such as + weights initialize, batch inputs preprocess(see more information in + :class:`BaseDataPreprocessor`), parse losses, and update model parameters. + + Subclasses inherit from BaseModel only need to implement the forward + method, which implements the logic to calculate loss and predictions, + then can be trained in the runner. 
+
+    Examples:
+        >>> @MODELS.register_module()
+        >>> class ToyModel(BaseModel):
+        >>>
+        >>>     def __init__(self):
+        >>>         super().__init__()
+        >>>         self.backbone = nn.Sequential()
+        >>>         self.backbone.add_module('conv1', nn.Conv2d(3, 6, 5))
+        >>>         self.backbone.add_module('pool', nn.MaxPool2d(2, 2))
+        >>>         self.backbone.add_module('conv2', nn.Conv2d(6, 16, 5))
+        >>>         self.backbone.add_module('fc1', nn.Linear(16 * 5 * 5, 120))
+        >>>         self.backbone.add_module('fc2', nn.Linear(120, 84))
+        >>>         self.backbone.add_module('fc3', nn.Linear(84, 10))
+        >>>
+        >>>         self.criterion = nn.CrossEntropyLoss()
+        >>>
+        >>>     def forward(self, batch_inputs, data_samples, mode='tensor'):
+        >>>         data_samples = torch.stack(data_samples)
+        >>>         if mode == 'tensor':
+        >>>             return self.backbone(batch_inputs)
+        >>>         elif mode == 'predict':
+        >>>             feats = self.backbone(batch_inputs)
+        >>>             predictions = torch.argmax(feats, 1)
+        >>>             return predictions
+        >>>         elif mode == 'loss':
+        >>>             feats = self.backbone(batch_inputs)
+        >>>             loss = self.criterion(feats, data_samples)
+        >>>             return dict(loss=loss)
+
+    Args:
+        data_preprocessor (dict, optional): The pre-process config of
+            :class:`BaseDataPreprocessor`.
+        init_cfg (dict, optional): The weight initialization config for
+            :class:`BaseModule`.
+
+    Attributes:
+        data_preprocessor (:obj:`BaseDataPreprocessor`): Used for
+            pre-processing data sampled by the dataloader to the format
+            accepted by :meth:`forward`.
+        init_cfg (dict, optional): Initialization config dict.
+    """
+
+    def __init__(self,
+                 data_preprocessor: Optional[Union[dict, nn.Module]] = None,
+                 init_cfg: Optional[dict] = None):
+        super().__init__(init_cfg)
+        if data_preprocessor is None:
+            data_preprocessor = dict(type='BaseDataPreprocessor')
+        if isinstance(data_preprocessor, nn.Module):
+            self.data_preprocessor = data_preprocessor
+        elif isinstance(data_preprocessor, dict):
+            self.data_preprocessor = MODELS.build(data_preprocessor)
+        else:
+            raise TypeError('data_preprocessor should be a `dict` or '
+                            f'`nn.Module` instance, but got '
+                            f'{type(data_preprocessor)}')
+
+    def train_step(self, data: Union[dict, tuple, list],
+                   optim_wrapper: OptimWrapper) -> Dict[str, torch.Tensor]:
+        """Implements the default model training process including
+        preprocessing, model forward propagation, loss calculation,
+        optimization, and back-propagation.
+
+        During non-distributed training, if subclasses do not override
+        :meth:`train_step`, :class:`EpochBasedTrainLoop` or
+        :class:`IterBasedTrainLoop` will call this method to update model
+        parameters. The default parameter update process is as follows:
+
+        1. Calls ``self.data_preprocessor(data, training=True)`` to collect
+           batch_inputs and corresponding data_samples (labels).
+        2. Calls ``self(batch_inputs, data_samples, mode='loss')`` to get the
+           raw loss.
+        3. Calls ``self.parse_losses`` to get the ``parsed_losses`` tensor
+           used for backward and a dict of loss tensors used to log messages.
+        4. Calls ``optim_wrapper.update_params(loss)`` to update the model.
+
+        Args:
+            data (dict or tuple or list): Data sampled from dataset.
+            optim_wrapper (OptimWrapper): OptimWrapper instance
+                used to update model parameters.
+
+        Returns:
+            Dict[str, torch.Tensor]: A ``dict`` of tensors for logging.
+        """
+        # Enable automatic mixed precision training context.
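+        # `optim_context` also covers gradient accumulation: it can skip
+        # gradient synchronization on non-update steps and, for an
+        # `AmpOptimWrapper`, additionally enables the autocast context, so
+        # the same training step runs with or without mixed precision.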
+ with optim_wrapper.optim_context(self): + data = self.data_preprocessor(data, True) + losses = self._run_forward(data, mode='loss') # type: ignore + parsed_losses, log_vars = self.parse_losses(losses) # type: ignore + optim_wrapper.update_params(parsed_losses) + return log_vars + + def val_step(self, data: Union[tuple, dict, list]) -> list: + """Gets the predictions of given data. + + Calls ``self.data_preprocessor(data, False)`` and + ``self(inputs, data_sample, mode='predict')`` in order. Return the + predictions which will be passed to evaluator. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. + """ + data = self.data_preprocessor(data, False) + return self._run_forward(data, mode='predict') # type: ignore + + def test_step(self, data: Union[dict, tuple, list]) -> list: + """``BaseModel`` implements ``test_step`` the same as ``val_step``. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. + """ + data = self.data_preprocessor(data, False) + return self._run_forward(data, mode='predict') # type: ignore + + def parse_losses( + self, losses: Dict[str, torch.Tensor] + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """Parses the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: There are two elements. The first is the + loss tensor passed to optim_wrapper which may be a weighted sum + of all losses, and the second is log_vars which will be sent to + the logger. + """ + log_vars = [] + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars.append([loss_name, loss_value.mean()]) + elif is_list_of(loss_value, torch.Tensor): + log_vars.append( + [loss_name, + sum(_loss.mean() for _loss in loss_value)]) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(value for key, value in log_vars if 'loss' in key) + log_vars.insert(0, ['loss', loss]) + log_vars = OrderedDict(log_vars) # type: ignore + + return loss, log_vars # type: ignore + + def to(self, *args, **kwargs) -> nn.Module: + """Overrides this method to call :meth:`BaseDataPreprocessor.to` + additionally. + + Returns: + nn.Module: The model itself. + """ + + # Since Torch has not officially merged + # the npu-related fields, using the _parse_to function + # directly will cause the NPU to not be found. + # Here, the input parameters are processed to avoid errors. + if args and isinstance(args[0], str) and 'npu' in args[0]: + import torch_npu + args = tuple([ + list(args)[0].replace( + 'npu', torch_npu.npu.native_device if hasattr( + torch_npu.npu, 'native_device') else 'privateuseone') + ]) + if kwargs and 'npu' in str(kwargs.get('device', '')): + import torch_npu + kwargs['device'] = kwargs['device'].replace( + 'npu', torch_npu.npu.native_device if hasattr( + torch_npu.npu, 'native_device') else 'privateuseone') + + device = torch._C._nn._parse_to(*args, **kwargs)[0] + if device is not None: + self._set_device(torch.device(device)) + return super().to(*args, **kwargs) + + def cuda( + self, + device: Optional[Union[int, str, torch.device]] = None, + ) -> nn.Module: + """Overrides this method to call :meth:`BaseDataPreprocessor.cuda` + additionally. + + Returns: + nn.Module: The model itself. 
+ """ + if device is None or isinstance(device, int): + device = torch.device('cuda', index=device) + self._set_device(torch.device(device)) + return super().cuda(device) + + def musa( + self, + device: Optional[Union[int, str, torch.device]] = None, + ) -> nn.Module: + """Overrides this method to call :meth:`BaseDataPreprocessor.musa` + additionally. + + Returns: + nn.Module: The model itself. + """ + if device is None or isinstance(device, int): + device = torch.device('musa', index=device) + self._set_device(torch.device(device)) + return super().musa(device) + + def mlu( + self, + device: Union[int, str, torch.device, None] = None, + ) -> nn.Module: + """Overrides this method to call :meth:`BaseDataPreprocessor.mlu` + additionally. + + Returns: + nn.Module: The model itself. + """ + device = torch.device('mlu', torch.mlu.current_device()) + self._set_device(device) + return super().mlu() + + def npu( + self, + device: Union[int, str, torch.device, None] = None, + ) -> nn.Module: + """Overrides this method to call :meth:`BaseDataPreprocessor.npu` + additionally. + + Returns: + nn.Module: The model itself. + + Note: + This generation of NPU(Ascend910) does not support + the use of multiple cards in a single process, + so the index here needs to be consistent with the default device + """ + device = torch.npu.current_device() + self._set_device(device) + return super().npu() + + def cpu(self, *args, **kwargs) -> nn.Module: + """Overrides this method to call :meth:`BaseDataPreprocessor.cpu` + additionally. + + Returns: + nn.Module: The model itself. + """ + self._set_device(torch.device('cpu')) + return super().cpu() + + def _set_device(self, device: torch.device) -> None: + """Recursively set device for `BaseDataPreprocessor` instance. + + Args: + device (torch.device): the desired device of the parameters and + buffers in this module. + """ + + def apply_fn(module): + if not isinstance(module, BaseDataPreprocessor): + return + if device is not None: + module._device = device + + self.apply(apply_fn) + + @abstractmethod + def forward(self, + inputs: torch.Tensor, + data_samples: Optional[list] = None, + mode: str = 'tensor') -> Union[Dict[str, torch.Tensor], list]: + """Returns losses or predictions of training, validation, testing, and + simple inference process. + + ``forward`` method of BaseModel is an abstract method, its subclasses + must implement this method. + + Accepts ``batch_inputs`` and ``data_sample`` processed by + :attr:`data_preprocessor`, and returns results according to mode + arguments. + + During non-distributed training, validation, and testing process, + ``forward`` will be called by ``BaseModel.train_step``, + ``BaseModel.val_step`` and ``BaseModel.test_step`` directly. + + During distributed data parallel training process, + ``MMSeparateDistributedDataParallel.train_step`` will first call + ``DistributedDataParallel.forward`` to enable automatic + gradient synchronization, and then call ``forward`` to get training + loss. + + Args: + inputs (torch.Tensor): batch input tensor collated by + :attr:`data_preprocessor`. + data_samples (list, optional): + data samples collated by :attr:`data_preprocessor`. + mode (str): mode should be one of ``loss``, ``predict`` and + ``tensor`` + + - ``loss``: Called by ``train_step`` and return loss ``dict`` + used for logging + - ``predict``: Called by ``val_step`` and ``test_step`` + and return list of results used for computing metric. + - ``tensor``: Called by custom use to get ``Tensor`` type + results. 
+ + Returns: + dict or list: + - If ``mode == loss``, return a ``dict`` of loss tensor used + for backward and logging. + - If ``mode == predict``, return a ``list`` of inference + results. + - If ``mode == tensor``, return a tensor or ``tuple`` of tensor + or ``dict`` of tensor for custom use. + """ + + def _run_forward(self, data: Union[dict, tuple, list], + mode: str) -> Union[Dict[str, torch.Tensor], list]: + """Unpacks data for :meth:`forward` + + Args: + data (dict or tuple or list): Data sampled from dataset. + mode (str): Mode of forward. + + Returns: + dict or list: Results of training or testing mode. + """ + if isinstance(data, dict): + results = self(**data, mode=mode) + elif isinstance(data, (list, tuple)): + results = self(*data, mode=mode) + else: + raise TypeError('Output of `data_preprocessor` should be ' + f'list, tuple or dict, but got {type(data)}') + return results diff --git a/head_extractor/build/lib/mmengine/model/base_model/data_preprocessor.py b/head_extractor/build/lib/mmengine/model/base_model/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..a101855203bae2287ea981c7abc0c1e3141b6ab8 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/base_model/data_preprocessor.py @@ -0,0 +1,308 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Mapping, Optional, Sequence, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmengine.registry import MODELS +from mmengine.structures import BaseDataElement +from mmengine.utils import is_seq_of +from ..utils import stack_batch + +CastData = Union[tuple, dict, BaseDataElement, torch.Tensor, list, bytes, str, + None] + + +@MODELS.register_module() +class BaseDataPreprocessor(nn.Module): + """Base data pre-processor used for copying data to the target device. + + Subclasses inherit from ``BaseDataPreprocessor`` could override the + forward method to implement custom data pre-processing, such as + batch-resize, MixUp, or CutMix. + + Args: + non_blocking (bool): Whether block current process + when transferring data to device. + New in version 0.3.0. + + Note: + Data dictionary returned by dataloader must be a dict and at least + contain the ``inputs`` key. + """ + + def __init__(self, non_blocking: Optional[bool] = False): + super().__init__() + self._non_blocking = non_blocking + self._device = torch.device('cpu') + + def cast_data(self, data: CastData) -> CastData: + """Copying data to the target device. + + Args: + data (dict): Data returned by ``DataLoader``. + + Returns: + CollatedResult: Inputs and data sample at target device. + """ + if isinstance(data, Mapping): + return {key: self.cast_data(data[key]) for key in data} + elif isinstance(data, (str, bytes)) or data is None: + return data + elif isinstance(data, tuple) and hasattr(data, '_fields'): + # namedtuple + return type(data)(*(self.cast_data(sample) for sample in data)) # type: ignore # noqa: E501 # yapf:disable + elif isinstance(data, Sequence): + return type(data)(self.cast_data(sample) for sample in data) # type: ignore # noqa: E501 # yapf:disable + elif isinstance(data, (torch.Tensor, BaseDataElement)): + return data.to(self.device, non_blocking=self._non_blocking) + else: + return data + + def forward(self, data: dict, training: bool = False) -> Union[dict, list]: + """Preprocesses the data into the model input format. 
+ + After the data pre-processing of :meth:`cast_data`, ``forward`` + will stack the input tensor list to a batch tensor at the first + dimension. + + Args: + data (dict): Data returned by dataloader + training (bool): Whether to enable training time augmentation. + + Returns: + dict or list: Data in the same format as the model input. + """ + return self.cast_data(data) # type: ignore + + @property + def device(self): + return self._device + + def to(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + + # Since Torch has not officially merged + # the npu-related fields, using the _parse_to function + # directly will cause the NPU to not be found. + # Here, the input parameters are processed to avoid errors. + if args and isinstance(args[0], str) and 'npu' in args[0]: + args = tuple( + [list(args)[0].replace('npu', torch.npu.native_device)]) + if kwargs and 'npu' in str(kwargs.get('device', '')): + kwargs['device'] = kwargs['device'].replace( + 'npu', torch.npu.native_device) + + device = torch._C._nn._parse_to(*args, **kwargs)[0] + if device is not None: + self._device = torch.device(device) + return super().to(*args, **kwargs) + + def cuda(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + self._device = torch.device(torch.cuda.current_device()) + return super().cuda() + + def musa(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + self._device = torch.device(torch.musa.current_device()) + return super().musa() + + def npu(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + self._device = torch.device(torch.npu.current_device()) + return super().npu() + + def mlu(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + self._device = torch.device(torch.mlu.current_device()) + return super().mlu() + + def cpu(self, *args, **kwargs) -> nn.Module: + """Overrides this method to set the :attr:`device` + + Returns: + nn.Module: The model itself. + """ + self._device = torch.device('cpu') + return super().cpu() + + +@MODELS.register_module() +class ImgDataPreprocessor(BaseDataPreprocessor): + """Image pre-processor for normalization and bgr to rgb conversion. + + Accepts the data sampled by the dataloader, and preprocesses it into the + format of the model input. ``ImgDataPreprocessor`` provides the + basic data pre-processing as follows + + - Collates and moves data to the target device. + - Converts inputs from bgr to rgb if the shape of input is (3, H, W). + - Normalizes image with defined std and mean. + - Pads inputs to the maximum size of current batch with defined + ``pad_value``. The padding size can be divisible by a defined + ``pad_size_divisor`` + - Stack inputs to batch_inputs. + + For ``ImgDataPreprocessor``, the dimension of the single inputs must be + (3, H, W). + + Note: + ``ImgDataPreprocessor`` and its subclass is built in the + constructor of :class:`BaseDataset`. + + Args: + mean (Sequence[float or int], optional): The pixel mean of image + channels. If ``bgr_to_rgb=True`` it means the mean value of R, + G, B channels. If the length of `mean` is 1, it means all + channels have the same mean value, or the input is a gray image. 
+            If it is not specified, images will not be normalized. Defaults
+            to None.
+        std (Sequence[float or int], optional): The pixel standard deviation of
+            image channels. If ``bgr_to_rgb=True`` it means the standard
+            deviation of R, G, B channels. If the length of `std` is 1,
+            it means all channels have the same standard deviation, or the
+            input is a gray image. If it is not specified, images will
+            not be normalized. Defaults to None.
+        pad_size_divisor (int): The size of the padded image should be
+            divisible by ``pad_size_divisor``. Defaults to 1.
+        pad_value (float or int): The padded pixel value. Defaults to 0.
+        bgr_to_rgb (bool): Whether to convert images from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): Whether to convert images from RGB to BGR.
+            Defaults to False.
+        non_blocking (bool): Whether to block the current process
+            when transferring data to the device.
+            New in version 0.3.0.
+
+    Note:
+        If images do not need to be normalized, `std` and `mean` should
+        both be set to None; otherwise both of them should be set to tuples
+        of the corresponding values.
+    """
+
+    def __init__(self,
+                 mean: Optional[Sequence[Union[float, int]]] = None,
+                 std: Optional[Sequence[Union[float, int]]] = None,
+                 pad_size_divisor: int = 1,
+                 pad_value: Union[float, int] = 0,
+                 bgr_to_rgb: bool = False,
+                 rgb_to_bgr: bool = False,
+                 non_blocking: Optional[bool] = False):
+        super().__init__(non_blocking)
+        assert not (bgr_to_rgb and rgb_to_bgr), (
+            '`bgr2rgb` and `rgb2bgr` cannot be set to True at the same time')
+        assert (mean is None) == (std is None), (
+            'mean and std should be both None or tuple')
+        if mean is not None:
+            assert len(mean) == 3 or len(mean) == 1, (
+                '`mean` should have 1 or 3 values, to be compatible with '
+                f'RGB or gray image, but got {len(mean)} values')
+            assert len(std) == 3 or len(std) == 1, (  # type: ignore
+                '`std` should have 1 or 3 values, to be compatible with RGB '  # type: ignore # noqa: E501
+                f'or gray image, but got {len(std)} values')  # type: ignore
+            self._enable_normalize = True
+            self.register_buffer('mean',
+                                 torch.tensor(mean).view(-1, 1, 1), False)
+            self.register_buffer('std',
+                                 torch.tensor(std).view(-1, 1, 1), False)
+        else:
+            self._enable_normalize = False
+        self._channel_conversion = rgb_to_bgr or bgr_to_rgb
+        self.pad_size_divisor = pad_size_divisor
+        self.pad_value = pad_value
+
+    def forward(self, data: dict, training: bool = False) -> Union[dict, list]:
+        """Performs normalization, padding and bgr2rgb conversion based on
+        ``BaseDataPreprocessor``.
+
+        Args:
+            data (dict): Data sampled from the dataset. If the collate
+                function of the DataLoader is :obj:`pseudo_collate`, data will
+                be a list of dicts. If the collate function is
+                :obj:`default_collate`, data will be a tuple with the batch
+                input tensor and a list of data samples.
+            training (bool): Whether to enable training time augmentation. If
+                subclasses override this method, they can perform different
+                preprocessing strategies for training and testing based on the
+                value of ``training``.
+
+        Returns:
+            dict or list: Data in the same format as the model input.
+        """
+        data = self.cast_data(data)  # type: ignore
+        _batch_inputs = data['inputs']
+        # Process data with `pseudo_collate`.
+        if is_seq_of(_batch_inputs, torch.Tensor):
+            batch_inputs = []
+            for _batch_input in _batch_inputs:
+                # channel transform
+                if self._channel_conversion:
+                    _batch_input = _batch_input[[2, 1, 0], ...]
+                # Convert to float after channel conversion to ensure
+                # efficiency
+                _batch_input = _batch_input.float()
+                # Normalization.
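+                # `mean`/`std` were registered in `__init__` as buffers of
+                # shape (C, 1, 1), so the expression below broadcasts over
+                # the H and W dimensions of each (C, H, W) image.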
+ if self._enable_normalize: + if self.mean.shape[0] == 3: + assert _batch_input.dim( + ) == 3 and _batch_input.shape[0] == 3, ( + 'If the mean has 3 values, the input tensor ' + 'should in shape of (3, H, W), but got the tensor ' + f'with shape {_batch_input.shape}') + _batch_input = (_batch_input - self.mean) / self.std + batch_inputs.append(_batch_input) + # Pad and stack Tensor. + batch_inputs = stack_batch(batch_inputs, self.pad_size_divisor, + self.pad_value) + # Process data with `default_collate`. + elif isinstance(_batch_inputs, torch.Tensor): + assert _batch_inputs.dim() == 4, ( + 'The input of `ImgDataPreprocessor` should be a NCHW tensor ' + 'or a list of tensor, but got a tensor with shape: ' + f'{_batch_inputs.shape}') + if self._channel_conversion: + _batch_inputs = _batch_inputs[:, [2, 1, 0], ...] + # Convert to float after channel conversion to ensure + # efficiency + _batch_inputs = _batch_inputs.float() + if self._enable_normalize: + _batch_inputs = (_batch_inputs - self.mean) / self.std + h, w = _batch_inputs.shape[2:] + target_h = math.ceil( + h / self.pad_size_divisor) * self.pad_size_divisor + target_w = math.ceil( + w / self.pad_size_divisor) * self.pad_size_divisor + pad_h = target_h - h + pad_w = target_w - w + batch_inputs = F.pad(_batch_inputs, (0, pad_w, 0, pad_h), + 'constant', self.pad_value) + else: + raise TypeError('Output of `cast_data` should be a dict of ' + 'list/tuple with inputs and data_samples, ' + f'but got {type(data)}: {data}') + data['inputs'] = batch_inputs + data.setdefault('data_samples', None) + return data diff --git a/head_extractor/build/lib/mmengine/model/base_module.py b/head_extractor/build/lib/mmengine/model/base_module.py new file mode 100644 index 0000000000000000000000000000000000000000..3cfe0b14a8452bd6188a9e64adb122ce4887f750 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/base_module.py @@ -0,0 +1,239 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging +from abc import ABCMeta +from collections import defaultdict +from logging import FileHandler +from typing import Iterable, List, Optional, Union + +import torch.nn as nn + +from mmengine.dist import master_only +from mmengine.logging import MMLogger, print_log +from .weight_init import PretrainedInit, initialize, update_init_info +from .wrappers.utils import is_model_wrapper + + +class BaseModule(nn.Module, metaclass=ABCMeta): + """Base module for all modules in openmmlab. ``BaseModule`` is a wrapper of + ``torch.nn.Module`` with additional functionality of parameter + initialization. Compared with ``torch.nn.Module``, ``BaseModule`` mainly + adds three attributes. + + - ``init_cfg``: the config to control the initialization. + - ``init_weights``: The function of parameter initialization and recording + initialization information. + - ``_params_init_info``: Used to track the parameter initialization + information. This attribute only exists during executing the + ``init_weights``. + + Note: + :obj:`PretrainedInit` has a higher priority than any other + initializer. The loaded pretrained weights will overwrite + the previous initialized weights. + + Args: + init_cfg (dict or List[dict], optional): Initialization config dict. + """ + + def __init__(self, init_cfg: Union[dict, List[dict], None] = None): + """Initialize BaseModule, inherited from `torch.nn.Module`""" + + # NOTE init_cfg can be defined in different levels, but init_cfg + # in low levels has a higher priority. 
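+        # For example, the `init_cfg` of a backbone takes effect after (and
+        # thus overrides) the one set on the model that owns it, because
+        # each submodule applies its own `init_cfg` when `init_weights` is
+        # called recursively.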
+ + super().__init__() + # define default value of init_cfg instead of hard code + # in init_weights() function + self._is_init = False + + self.init_cfg = copy.deepcopy(init_cfg) + + # Backward compatibility in derived classes + # if pretrained is not None: + # warnings.warn('DeprecationWarning: pretrained is a deprecated \ + # key, please consider using init_cfg') + # self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + @property + def is_init(self): + return self._is_init + + @is_init.setter + def is_init(self, value): + self._is_init = value + + def init_weights(self): + """Initialize the weights.""" + + is_top_level_module = False + # check if it is top-level module + if not hasattr(self, '_params_init_info'): + # The `_params_init_info` is used to record the initialization + # information of the parameters + # the key should be the obj:`nn.Parameter` of model and the value + # should be a dict containing + # - init_info (str): The string that describes the initialization. + # - tmp_mean_value (FloatTensor): The mean of the parameter, + # which indicates whether the parameter has been modified. + # this attribute would be deleted after all parameters + # is initialized. + self._params_init_info = defaultdict(dict) + is_top_level_module = True + + # Initialize the `_params_init_info`, + # When detecting the `tmp_mean_value` of + # the corresponding parameter is changed, update related + # initialization information + for name, param in self.named_parameters(): + self._params_init_info[param][ + 'init_info'] = f'The value is the same before and ' \ + f'after calling `init_weights` ' \ + f'of {self.__class__.__name__} ' + self._params_init_info[param][ + 'tmp_mean_value'] = param.data.mean().cpu() + + # pass `params_init_info` to all submodules + # All submodules share the same `params_init_info`, + # so it will be updated when parameters are + # modified at any level of the model. + for sub_module in self.modules(): + sub_module._params_init_info = self._params_init_info + + module_name = self.__class__.__name__ + if not self._is_init: + if self.init_cfg: + print_log( + f'initialize {module_name} with init_cfg {self.init_cfg}', + logger='current', + level=logging.DEBUG) + + init_cfgs = self.init_cfg + if isinstance(self.init_cfg, dict): + init_cfgs = [self.init_cfg] + + # PretrainedInit has higher priority than any other init_cfg. + # Therefore we initialize `pretrained_cfg` last to overwrite + # the previous initialized weights. 
+ # See details in https://github.com/open-mmlab/mmengine/issues/691 # noqa E501 + other_cfgs = [] + pretrained_cfg = [] + for init_cfg in init_cfgs: + assert isinstance(init_cfg, dict) + if (init_cfg['type'] == 'Pretrained' + or init_cfg['type'] is PretrainedInit): + pretrained_cfg.append(init_cfg) + else: + other_cfgs.append(init_cfg) + + initialize(self, other_cfgs) + + for m in self.children(): + if is_model_wrapper(m) and not hasattr(m, 'init_weights'): + m = m.module + if hasattr(m, 'init_weights') and not getattr( + m, 'is_init', False): + m.init_weights() + # users may overload the `init_weights` + update_init_info( + m, + init_info=f'Initialized by ' + f'user-defined `init_weights`' + f' in {m.__class__.__name__} ') + if self.init_cfg and pretrained_cfg: + initialize(self, pretrained_cfg) + self._is_init = True + else: + print_log( + f'init_weights of {self.__class__.__name__} has ' + f'been called more than once.', + logger='current', + level=logging.WARNING) + + if is_top_level_module: + self._dump_init_info() + + for sub_module in self.modules(): + del sub_module._params_init_info + + @master_only + def _dump_init_info(self): + """Dump the initialization information to a file named + `initialization.log.json` in workdir.""" + + logger = MMLogger.get_current_instance() + with_file_handler = False + # dump the information to the logger file if there is a `FileHandler` + for handler in logger.handlers: + if isinstance(handler, FileHandler): + handler.stream.write( + 'Name of parameter - Initialization information\n') + for name, param in self.named_parameters(): + handler.stream.write( + f'\n{name} - {param.shape}: ' + f"\n{self._params_init_info[param]['init_info']} \n") + handler.stream.flush() + with_file_handler = True + if not with_file_handler: + for name, param in self.named_parameters(): + logger.info( + f'\n{name} - {param.shape}: ' + f"\n{self._params_init_info[param]['init_info']} \n ") + + def __repr__(self): + s = super().__repr__() + if self.init_cfg: + s += f'\ninit_cfg={self.init_cfg}' + return s + + +class Sequential(BaseModule, nn.Sequential): + """Sequential module in openmmlab. + + Ensures that all modules in ``Sequential`` have a different initialization + strategy than the outer model + + Args: + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, *args, init_cfg: Optional[dict] = None): + BaseModule.__init__(self, init_cfg) + nn.Sequential.__init__(self, *args) + + +class ModuleList(BaseModule, nn.ModuleList): + """ModuleList in openmmlab. + + Ensures that all modules in ``ModuleList`` have a different initialization + strategy than the outer model + + Args: + modules (iterable, optional): An iterable of modules to add. + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, + modules: Optional[Iterable] = None, + init_cfg: Optional[dict] = None): + BaseModule.__init__(self, init_cfg) + nn.ModuleList.__init__(self, modules) + + +class ModuleDict(BaseModule, nn.ModuleDict): + """ModuleDict in openmmlab. + + Ensures that all modules in ``ModuleDict`` have a different initialization + strategy than the outer model + + Args: + modules (dict, optional): A mapping (dictionary) of (string: module) + or an iterable of key-value pairs of type (string, module). + init_cfg (dict, optional): Initialization config dict. 
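+
+    Examples:
+        >>> # A minimal sketch; the keys and layer shapes are illustrative.
+        >>> import torch.nn as nn
+        >>> heads = ModuleDict(
+        ...     dict(cls=nn.Linear(16, 10), reg=nn.Linear(16, 4)),
+        ...     init_cfg=dict(type='Normal', std=0.01, layer='Linear'))
+        >>> heads.init_weights()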
+ """ + + def __init__(self, + modules: Optional[dict] = None, + init_cfg: Optional[dict] = None): + BaseModule.__init__(self, init_cfg) + nn.ModuleDict.__init__(self, modules) diff --git a/head_extractor/build/lib/mmengine/model/efficient_conv_bn_eval.py b/head_extractor/build/lib/mmengine/model/efficient_conv_bn_eval.py new file mode 100644 index 0000000000000000000000000000000000000000..9cb2ad619907212d4f6cb9763bdca5feb2369b91 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/efficient_conv_bn_eval.py @@ -0,0 +1,158 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from operator import attrgetter +from typing import List, Union + +import torch +import torch.nn as nn + + +def efficient_conv_bn_eval_forward(bn: nn.modules.batchnorm._BatchNorm, + conv: nn.modules.conv._ConvNd, + x: torch.Tensor): + """Code borrowed from mmcv 2.0.1, so that this feature can be used for old + mmcv versions. + + Implementation based on https://arxiv.org/abs/2305.11624 + "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" + It leverages the associative law between convolution and affine transform, + i.e., normalize (weight conv feature) = (normalize weight) conv feature. + It works for Eval mode of ConvBN blocks during validation, and can be used + for training as well. It reduces memory and computation cost. + Args: + bn (_BatchNorm): a BatchNorm module. + conv (nn._ConvNd): a conv module + x (torch.Tensor): Input feature map. + """ + # These lines of code are designed to deal with various cases + # like bn without affine transform, and conv without bias + weight_on_the_fly = conv.weight + if conv.bias is not None: + bias_on_the_fly = conv.bias + else: + bias_on_the_fly = torch.zeros_like(bn.running_var) + + if bn.weight is not None: + bn_weight = bn.weight + else: + bn_weight = torch.ones_like(bn.running_var) + + if bn.bias is not None: + bn_bias = bn.bias + else: + bn_bias = torch.zeros_like(bn.running_var) + + # shape of [C_out, 1, 1, 1] in Conv2d + weight_coeff = torch.rsqrt(bn.running_var + + bn.eps).reshape([-1] + [1] * + (len(conv.weight.shape) - 1)) + # shape of [C_out, 1, 1, 1] in Conv2d + coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff + + # shape of [C_out, C_in, k, k] in Conv2d + weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly + # shape of [C_out] in Conv2d + bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() *\ + (bias_on_the_fly - bn.running_mean) + + return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly) + + +def efficient_conv_bn_eval_control(bn: nn.modules.batchnorm._BatchNorm, + conv: nn.modules.conv._ConvNd, + x: torch.Tensor): + """This function controls whether to use `efficient_conv_bn_eval_forward`. + + If the following `bn` is in `eval` mode, then we turn on the special + `efficient_conv_bn_eval_forward`. + """ + if not bn.training: + # bn in eval mode + output = efficient_conv_bn_eval_forward(bn, conv, x) + return output + else: + conv_out = conv._conv_forward(x, conv.weight, conv.bias) + return bn(conv_out) + + +def efficient_conv_bn_eval_graph_transform(fx_model): + """Find consecutive conv+bn calls in the graph, inplace modify the graph + with the fused operation.""" + modules = dict(fx_model.named_modules()) + + patterns = [(torch.nn.modules.conv._ConvNd, + torch.nn.modules.batchnorm._BatchNorm)] + + pairs = [] + # Iterate through nodes in the graph to find ConvBN blocks + for node in fx_model.graph.nodes: + # If our current node isn't calling a Module then we can ignore it. 
+ if node.op != 'call_module': + continue + target_module = modules[node.target] + found_pair = False + for conv_class, bn_class in patterns: + if isinstance(target_module, bn_class): + source_module = modules[node.args[0].target] + if isinstance(source_module, conv_class): + found_pair = True + # Not a conv-BN pattern or output of conv is used by other nodes + if not found_pair or len(node.args[0].users) > 1: + continue + + # Find a pair of conv and bn computation nodes to optimize + conv_node = node.args[0] + bn_node = node + pairs.append([conv_node, bn_node]) + + for conv_node, bn_node in pairs: + # set insertion point + fx_model.graph.inserting_before(conv_node) + # create `get_attr` node to access modules + # note that we directly call `create_node` to fill the `name` + # argument. `fx_model.graph.get_attr` and + # `fx_model.graph.call_function` does not allow the `name` argument. + conv_get_node = fx_model.graph.create_node( + op='get_attr', target=conv_node.target, name='get_conv') + bn_get_node = fx_model.graph.create_node( + op='get_attr', target=bn_node.target, name='get_bn') + # prepare args for the fused function + args = (bn_get_node, conv_get_node, conv_node.args[0]) + # create a new node + new_node = fx_model.graph.create_node( + op='call_function', + target=efficient_conv_bn_eval_control, + args=args, + name='efficient_conv_bn_eval') + # this node replaces the original conv + bn, and therefore + # should replace the uses of bn_node + bn_node.replace_all_uses_with(new_node) + # take care of the deletion order: + # delete bn_node first, and then conv_node + fx_model.graph.erase_node(bn_node) + fx_model.graph.erase_node(conv_node) + + # regenerate the code + fx_model.graph.lint() + fx_model.recompile() + + +def turn_on_efficient_conv_bn_eval_for_single_model(model: torch.nn.Module): + import torch.fx as fx + + # currently we use `fx.symbolic_trace` to trace models. + # in the future, we might turn to pytorch 2.0 compile infrastructure to + # get the `fx.GraphModule` IR. Nonetheless, the graph transform function + # can remain unchanged. We just need to change the way + # we get `fx.GraphModule`. + fx_model: fx.GraphModule = fx.symbolic_trace(model) + efficient_conv_bn_eval_graph_transform(fx_model) + model.forward = fx_model.forward + + +def turn_on_efficient_conv_bn_eval(model: torch.nn.Module, + modules: Union[List[str], str]): + if isinstance(modules, str): + modules = [modules] + for module_name in modules: + module = attrgetter(module_name)(model) + turn_on_efficient_conv_bn_eval_for_single_model(module) diff --git a/head_extractor/build/lib/mmengine/model/test_time_aug.py b/head_extractor/build/lib/mmengine/model/test_time_aug.py new file mode 100644 index 0000000000000000000000000000000000000000..c623eec8bc3fa7840f66bd9d3d475f91602aa2ce --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/test_time_aug.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod +from typing import Dict, List, Optional, Union + +import torch +import torch.nn as nn + +from mmengine.registry import MODELS +from mmengine.structures import BaseDataElement +from .base_model import BaseModel + +# multi-batch inputs processed by different augmentations from the same batch. +EnhancedBatchInputs = List[Union[torch.Tensor, List[torch.Tensor]]] +# multi-batch data samples processed by different augmentations from the same +# batch. The inner list stands for different augmentations and the outer list +# stands for batch. 
+EnhancedBatchDataSamples = List[List[BaseDataElement]]
+DATA_BATCH = Union[Dict[str, Union[EnhancedBatchInputs,
+                                   EnhancedBatchDataSamples]], tuple, dict]
+MergedDataSamples = List[BaseDataElement]
+
+
+@MODELS.register_module()
+class BaseTTAModel(BaseModel):
+    """Base model for inference with test-time augmentation.
+
+    ``BaseTTAModel`` is a wrapper for inference given multi-batch data.
+    It implements :meth:`test_step` for multi-batch data inference.
+    ``multi-batch`` data means data processed by different augmentations
+    of the same batch.
+
+    During test time augmentation, data processed by
+    :obj:`mmcv.transforms.TestTimeAug` and then collated by
+    ``pseudo_collate`` will have the following format:
+
+    .. code-block::
+
+        result = dict(
+            inputs=[
+                [image1_aug1, image2_aug1],
+                [image1_aug2, image2_aug2]
+            ],
+            data_samples=[
+                [data_sample1_aug1, data_sample2_aug1],
+                [data_sample1_aug2, data_sample2_aug2],
+            ]
+        )
+
+    ``image{i}_aug{j}`` means the i-th image of the batch, augmented by the
+    j-th augmentation.
+
+    ``BaseTTAModel`` will collate the data to:
+
+    .. code-block::
+
+        data1 = dict(
+            inputs=[image1_aug1, image2_aug1],
+            data_samples=[data_sample1_aug1, data_sample2_aug1]
+        )
+
+        data2 = dict(
+            inputs=[image1_aug2, image2_aug2],
+            data_samples=[data_sample1_aug2, data_sample2_aug2]
+        )
+
+    ``data1`` and ``data2`` will be passed to the model, and the results will
+    be merged by :meth:`merge_preds`.
+
+    Note:
+        :meth:`merge_preds` is an abstract method; all subclasses must
+        implement it.
+
+    Warning:
+        If ``data_preprocessor`` is not None, it will overwrite the model's
+        ``data_preprocessor``.
+
+    Args:
+        module (dict or nn.Module): Tested model.
+        data_preprocessor (dict or :obj:`BaseDataPreprocessor`, optional):
+            If the model does not define ``data_preprocessor``, this will be
+            the default value for the model.
+    """
+
+    def __init__(
+        self,
+        module: Union[dict, nn.Module],
+        data_preprocessor: Union[dict, nn.Module, None] = None,
+    ):
+        super().__init__()
+        if isinstance(module, nn.Module):
+            self.module = module
+        elif isinstance(module, dict):
+            if data_preprocessor is not None:
+                module['data_preprocessor'] = data_preprocessor
+            self.module = MODELS.build(module)
+        else:
+            raise TypeError('The type of module should be a `nn.Module` '
+                            f'instance or a dict, but got {module}')
+        assert hasattr(self.module, 'test_step'), (
+            'Model wrapped by BaseTTAModel must implement `test_step`!')
+
+    @abstractmethod
+    def merge_preds(self, data_samples_list: EnhancedBatchDataSamples) \
+            -> MergedDataSamples:
+        """Merge predictions of enhanced data into one prediction.
+
+        Args:
+            data_samples_list (EnhancedBatchDataSamples): List of predictions
+                of all enhanced data.
+
+        Returns:
+            List[BaseDataElement]: Merged prediction.
+        """
+
+    def test_step(self, data):
+        """Get the predictions of each enhanced batch of data and merge them.
+
+        Args:
+            data (DataBatch): Enhanced data batch sampled from the dataloader.
+
+        Returns:
+            MergedDataSamples: Merged prediction.
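+
+        Examples:
+            >>> # A minimal sketch of the dict branch below; plain ints
+            >>> # stand in for tensors and data samples.
+            >>> data = dict(inputs=[[11, 12], [21, 22]])
+            >>> num_augs = len(data[next(iter(data))])
+            >>> [{k: v[idx] for k, v in data.items()}
+            ...  for idx in range(num_augs)]
+            [{'inputs': [11, 12]}, {'inputs': [21, 22]}]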
+ """ + data_list: Union[List[dict], List[list]] + if isinstance(data, dict): + num_augs = len(data[next(iter(data))]) + data_list = [{key: value[idx] + for key, value in data.items()} + for idx in range(num_augs)] + elif isinstance(data, (tuple, list)): + num_augs = len(data[0]) + data_list = [[_data[idx] for _data in data] + for idx in range(num_augs)] + else: + raise TypeError('data given by dataLoader should be a dict, ' + f'tuple or a list, but got {type(data)}') + + predictions = [] + for data in data_list: # type: ignore + predictions.append(self.module.test_step(data)) + return self.merge_preds(list(zip(*predictions))) # type: ignore + + def forward(self, + inputs: torch.Tensor, + data_samples: Optional[list] = None, + mode: str = 'tensor') -> Union[Dict[str, torch.Tensor], list]: + """``BaseTTAModel.forward`` should not be called.""" + raise NotImplementedError( + '`BaseTTAModel.forward` will not be called during training or' + 'testing. Please call `test_step` instead. If you want to use' + '`BaseTTAModel.forward`, please implement this method') diff --git a/head_extractor/build/lib/mmengine/model/utils.py b/head_extractor/build/lib/mmengine/model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c78ea3134d41d3e83b8cb5b84b983ae9a94d6327 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/utils.py @@ -0,0 +1,257 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +import warnings +from typing import List, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmengine.logging import print_log +from mmengine.utils.dl_utils import mmcv_full_available + + +def stack_batch(tensor_list: List[torch.Tensor], + pad_size_divisor: int = 1, + pad_value: Union[int, float] = 0) -> torch.Tensor: + """Stack multiple tensors to form a batch and pad the tensor to the max + shape use the right bottom padding mode in these images. If + ``pad_size_divisor > 0``, add padding to ensure the shape of each dim is + divisible by ``pad_size_divisor``. + + Args: + tensor_list (List[Tensor]): A list of tensors with the same dim. + pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding + to ensure the shape of each dim is divisible by + ``pad_size_divisor``. This depends on the model, and many + models need to be divisible by 32. Defaults to 1 + pad_value (int, float): The padding value. Defaults to 0. + + Returns: + Tensor: The n dim tensor. + """ + assert isinstance( + tensor_list, + list), (f'Expected input type to be list, but got {type(tensor_list)}') + assert tensor_list, '`tensor_list` could not be an empty list' + assert len({ + tensor.ndim + for tensor in tensor_list + }) == 1, (f'Expected the dimensions of all tensors must be the same, ' + f'but got {[tensor.ndim for tensor in tensor_list]}') + + dim = tensor_list[0].dim() + num_img = len(tensor_list) + all_sizes: torch.Tensor = torch.Tensor( + [tensor.shape for tensor in tensor_list]) + max_sizes = torch.ceil( + torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor + padded_sizes = max_sizes - all_sizes + # The first dim normally means channel, which should not be padded. + padded_sizes[:, 0] = 0 + if padded_sizes.sum() == 0: + return torch.stack(tensor_list) + # `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4), + # it means that padding the last dim with 1(left) 2(right), padding the + # penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of + # the `padded_sizes`. 
Therefore, the `padded_sizes` needs to be reversed, + # and only odd index of pad should be assigned to keep padding "right" and + # "bottom". + pad = torch.zeros(num_img, 2 * dim, dtype=torch.int) + pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)] + batch_tensor = [] + for idx, tensor in enumerate(tensor_list): + batch_tensor.append( + F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value)) + return torch.stack(batch_tensor) + + +def detect_anomalous_params(loss: torch.Tensor, model) -> None: + parameters_in_graph = set() + visited = set() + + def traverse(grad_fn): + if grad_fn is None: + return + if grad_fn not in visited: + visited.add(grad_fn) + if hasattr(grad_fn, 'variable'): + parameters_in_graph.add(grad_fn.variable) + parents = grad_fn.next_functions + if parents is not None: + for parent in parents: + grad_fn = parent[0] + traverse(grad_fn) + + traverse(loss.grad_fn) + for n, p in model.named_parameters(): + if p not in parameters_in_graph and p.requires_grad: + print_log( + f'{n} with shape {p.size()} is not ' + f'in the computational graph \n', + logger='current', + level=logging.ERROR) + + +def merge_dict(*args): + """Merge all dictionaries into one dictionary. + + If pytorch version >= 1.8, ``merge_dict`` will be wrapped + by ``torch.fx.wrap``, which will make ``torch.fx.symbolic_trace`` skip + trace ``merge_dict``. + + Note: + If a function needs to be traced by ``torch.fx.symbolic_trace``, + but inevitably needs to use ``update`` method of ``dict``(``update`` + is not traceable). It should use ``merge_dict`` to replace + ``xxx.update``. + + Args: + *args: dictionary needs to be merged. + + Returns: + dict: Merged dict from args + """ + output = dict() + for item in args: + assert isinstance( + item, + dict), (f'all arguments of merge_dict should be a dict, but got ' + f'{type(item)}') + output.update(item) + return output + + +# torch.fx is only available when pytorch version >= 1.8. +# If the subclass of `BaseModel` has multiple submodules, and each module +# will return a loss dict during training process, i.e., `TwoStageDetector` +# in mmdet. It should use `merge_dict` to get the total loss, rather than +# `loss.update` to keep model traceable. +try: + import torch.fx + + # make torch.fx skip trace `merge_dict`. + merge_dict = torch.fx.wrap(merge_dict) + +except ImportError: + warnings.warn('Cannot import torch.fx, `merge_dict` is a simple function ' + 'to merge multiple dicts') + + +class _BatchNormXd(nn.modules.batchnorm._BatchNorm): + """A general BatchNorm layer without input dimension check. + + Reproduced from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc + is `_check_input_dim` that is designed for tensor sanity checks. + The check has been bypassed in this class for the convenience of converting + SyncBatchNorm. + """ + + def _check_input_dim(self, input: torch.Tensor): + return + + +def revert_sync_batchnorm(module: nn.Module) -> nn.Module: + """Helper function to convert all `SyncBatchNorm` (SyncBN) and + `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to + `BatchNormXd` layers. + + Adapted from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + + Args: + module (nn.Module): The module containing `SyncBatchNorm` layers. + + Returns: + module_output: The converted module with `BatchNormXd` layers. 
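+
+    Example:
+        >>> # A minimal sketch, assuming a model that contains a plain
+        >>> # `nn.SyncBatchNorm` layer; the converted copy can then run
+        >>> # without any distributed setup.
+        >>> import torch.nn as nn
+        >>> model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.SyncBatchNorm(8))
+        >>> model = revert_sync_batchnorm(model)
+        >>> isinstance(model[1], nn.SyncBatchNorm)
+        False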
+ """ + module_output = module + module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm] + + if mmcv_full_available(): + from mmcv.ops import SyncBatchNorm + module_checklist.append(SyncBatchNorm) + + if isinstance(module, tuple(module_checklist)): + module_output = _BatchNormXd(module.num_features, module.eps, + module.momentum, module.affine, + module.track_running_stats) + if module.affine: + # no_grad() may not be needed here but + # just to be consistent with `convert_sync_batchnorm()` + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + module_output.training = module.training + # qconfig exists in quantized models + if hasattr(module, 'qconfig'): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + # Some custom modules or 3rd party implemented modules may raise an + # error when calling `add_module`. Therefore, try to catch the error + # and do not raise it. See https://github.com/open-mmlab/mmengine/issues/638 # noqa: E501 + # for more details. + try: + module_output.add_module(name, revert_sync_batchnorm(child)) + except Exception: + print_log( + F'Failed to convert {child} from SyncBN to BN!', + logger='current', + level=logging.WARNING) + del module + return module_output + + +def convert_sync_batchnorm(module: nn.Module, + implementation='torch') -> nn.Module: + """Helper function to convert all `BatchNorm` layers in the model to + `SyncBatchNorm` (SyncBN) or `mmcv.ops.sync_bn.SyncBatchNorm` (MMSyncBN) + layers. Adapted from `PyTorch convert sync batchnorm`_. + + Args: + module (nn.Module): The module containing `SyncBatchNorm` layers. + implementation (str): The type of `SyncBatchNorm` to convert to. + + - 'torch': convert to `torch.nn.modules.batchnorm.SyncBatchNorm`. + - 'mmcv': convert to `mmcv.ops.sync_bn.SyncBatchNorm`. + + Returns: + nn.Module: The converted module with `SyncBatchNorm` layers. + + .. 
_PyTorch convert sync batchnorm: + https://pytorch.org/docs/stable/generated/torch.nn.SyncBatchNorm.html#torch.nn.SyncBatchNorm.convert_sync_batchnorm + """ # noqa: E501 + module_output = module + + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): + if implementation == 'torch': + SyncBatchNorm = torch.nn.modules.batchnorm.SyncBatchNorm + elif implementation == 'mmcv': + from mmcv.ops import SyncBatchNorm # type: ignore + else: + raise ValueError('sync_bn should be "torch" or "mmcv", but got ' + f'{implementation}') + + module_output = SyncBatchNorm(module.num_features, module.eps, + module.momentum, module.affine, + module.track_running_stats) + + if module.affine: + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + if hasattr(module, 'qconfig'): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module(name, + convert_sync_batchnorm(child, implementation)) + del module + return module_output diff --git a/head_extractor/build/lib/mmengine/model/weight_init.py b/head_extractor/build/lib/mmengine/model/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..a2e0b9a7a5185f78ab09584c500624bb4a749d2a --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/weight_init.py @@ -0,0 +1,682 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +import warnings + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from mmengine.logging import print_log +from mmengine.registry import WEIGHT_INITIALIZERS, build_from_cfg + + +def update_init_info(module, init_info): + """Update the `_params_init_info` in the module if the value of parameters + are changed. + + Args: + module (obj:`nn.Module`): The module of PyTorch with a user-defined + attribute `_params_init_info` which records the initialization + information. + init_info (str): The string that describes the initialization. + """ + assert hasattr( + module, + '_params_init_info'), f'Can not find `_params_init_info` in {module}' + for name, param in module.named_parameters(): + + assert param in module._params_init_info, ( + f'Find a new :obj:`Parameter` ' + f'named `{name}` during executing the ' + f'`init_weights` of ' + f'`{module.__class__.__name__}`. ' + f'Please do not add or ' + f'replace parameters during executing ' + f'the `init_weights`. 
') + + # The parameter has been changed during executing the + # `init_weights` of module + mean_value = param.data.mean().cpu() + if module._params_init_info[param]['tmp_mean_value'] != mean_value: + module._params_init_info[param]['init_info'] = init_info + module._params_init_info[param]['tmp_mean_value'] = mean_value + + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def trunc_normal_init(module: nn.Module, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + bias: float = 0) -> None: + if hasattr(module, 'weight') and module.weight is not None: + trunc_normal_(module.weight, mean, std, a, b) # type: ignore + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) # type: ignore + + +def uniform_init(module, a=0, b=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + a=0, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def caffe2_xavier_init(module, bias=0): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + kaiming_init( + module, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + bias=bias, + distribution='uniform') + + +def bias_init_with_prob(prior_prob): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +def _get_bases_name(m): + return [b.__name__ for b in m.__class__.__bases__] + + +class BaseInit: + + def __init__(self, *, bias=0, bias_prob=None, layer=None): + self.wholemodule = False + if not isinstance(bias, (int, float)): + raise TypeError(f'bias must be a number, but got a {type(bias)}') + + if bias_prob is not None: + if not isinstance(bias_prob, float): + raise TypeError(f'bias_prob type must be float, \ + but got {type(bias_prob)}') + + if layer is not None: + if not isinstance(layer, (str, list)): + raise TypeError(f'layer must be a str or a list of str, \ + but got a {type(layer)}') + else: + layer = [] + + if bias_prob is not None: + self.bias = 
bias_init_with_prob(bias_prob) + else: + self.bias = bias + self.layer = [layer] if isinstance(layer, str) else layer + + def _get_init_info(self): + info = f'{self.__class__.__name__}, bias={self.bias}' + return info + + +@WEIGHT_INITIALIZERS.register_module(name='Constant') +class ConstantInit(BaseInit): + """Initialize module parameters with constant values. + + Args: + val (int | float): the value to fill the weights in the module with + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, val, **kwargs): + super().__init__(**kwargs) + self.val = val + + def __call__(self, module): + + def init(m): + if self.wholemodule: + constant_init(m, self.val, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + constant_init(m, self.val, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' + return info + + +@WEIGHT_INITIALIZERS.register_module(name='Xavier') +class XavierInit(BaseInit): + r"""Initialize module parameters with values according to the method + described in the paper below. + + `Understanding the difficulty of training deep feedforward + neural networks - Glorot, X. & Bengio, Y. (2010). + `_ + + Args: + gain (int | float): an optional scaling factor. Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` + or ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, gain=1, distribution='normal', **kwargs): + super().__init__(**kwargs) + self.gain = gain + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + xavier_init(m, self.gain, self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + xavier_init(m, self.gain, self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: gain={self.gain}, ' \ + f'distribution={self.distribution}, bias={self.bias}' + return info + + +@WEIGHT_INITIALIZERS.register_module(name='Normal') +class NormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + Args: + mean (int | float):the mean of the normal distribution. Defaults to 0. + std (int | float): the standard deviation of the normal distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
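+
+    Example:
+        >>> # a minimal sketch of direct usage; the module and the values
+        >>> # of ``std`` and ``layer`` are illustrative assumptions
+        >>> import torch.nn as nn
+        >>> module = nn.Linear(2, 3)
+        >>> init = NormalInit(mean=0, std=0.01, layer='Linear')
+        >>> init(module)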
+ """ + + def __init__(self, mean=0, std=1, **kwargs): + super().__init__(**kwargs) + self.mean = mean + self.std = std + + def __call__(self, module): + + def init(m): + if self.wholemodule: + normal_init(m, self.mean, self.std, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + normal_init(m, self.mean, self.std, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: mean={self.mean},' \ + f' std={self.std}, bias={self.bias}' + return info + + +@WEIGHT_INITIALIZERS.register_module(name='TruncNormal') +class TruncNormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values + outside :math:`[a, b]`. + + Args: + mean (float): the mean of the normal distribution. Defaults to 0. + std (float): the standard deviation of the normal distribution. + Defaults to 1. + a (float): The minimum cutoff value. + b ( float): The maximum cutoff value. + bias (float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + **kwargs) -> None: + super().__init__(**kwargs) + self.mean = mean + self.std = std + self.a = a + self.b = b + + def __call__(self, module: nn.Module) -> None: + + def init(m): + if self.wholemodule: + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \ + f' mean={self.mean}, std={self.std}, bias={self.bias}' + return info + + +@WEIGHT_INITIALIZERS.register_module(name='Uniform') +class UniformInit(BaseInit): + r"""Initialize module parameters with values drawn from the uniform + distribution :math:`\mathcal{U}(a, b)`. + + Args: + a (int | float): the lower bound of the uniform distribution. + Defaults to 0. + b (int | float): the upper bound of the uniform distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, a=0, b=1, **kwargs): + super().__init__(**kwargs) + self.a = a + self.b = b + + def __call__(self, module): + + def init(m): + if self.wholemodule: + uniform_init(m, self.a, self.b, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + uniform_init(m, self.a, self.b, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a},' \ + f' b={self.b}, bias={self.bias}' + return info + + +@WEIGHT_INITIALIZERS.register_module(name='Kaiming') +class KaimingInit(BaseInit): + r"""Initialize module parameters with the values according to the method + described in the paper below. + + `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. (2015). + `_ + + Args: + a (int | float): the negative slope of the rectifier used after this + layer (only used with ``'leaky_relu'``). Defaults to 0. + mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing + ``'fan_in'`` preserves the magnitude of the variance of the weights + in the forward pass. Choosing ``'fan_out'`` preserves the + magnitudes in the backwards pass. Defaults to ``'fan_out'``. + nonlinearity (str): the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` . + Defaults to 'relu'. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` or + ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, + a=0, + mode='fan_out', + nonlinearity='relu', + distribution='normal', + **kwargs): + super().__init__(**kwargs) + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ + f'nonlinearity={self.nonlinearity}, ' \ + f'distribution ={self.distribution}, bias={self.bias}' + return info + + +@WEIGHT_INITIALIZERS.register_module(name='Caffe2Xavier') +class Caffe2XavierInit(KaimingInit): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + def __init__(self, **kwargs): + super().__init__( + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + distribution='uniform', + **kwargs) + + def __call__(self, module): + super().__call__(module) + + +@WEIGHT_INITIALIZERS.register_module(name='Pretrained') +class PretrainedInit: + """Initialize module by loading a pretrained model. + + Args: + checkpoint (str): the checkpoint file of the pretrained model should + be load. 
+ prefix (str, optional): the prefix of a sub-module in the pretrained + model. it is for loading a part of the pretrained model to + initialize. For example, if we would like to only load the + backbone of a detector model, we can set ``prefix='backbone.'``. + Defaults to None. + map_location (str): map tensors into proper locations. Defaults to cpu. + """ + + def __init__(self, checkpoint, prefix=None, map_location='cpu'): + self.checkpoint = checkpoint + self.prefix = prefix + self.map_location = map_location + + def __call__(self, module): + from mmengine.runner.checkpoint import (_load_checkpoint_with_prefix, + load_checkpoint, + load_state_dict) + if self.prefix is None: + print_log(f'load model from: {self.checkpoint}', logger='current') + load_checkpoint( + module, + self.checkpoint, + map_location=self.map_location, + strict=False, + logger='current') + else: + print_log( + f'load {self.prefix} in model from: {self.checkpoint}', + logger='current') + state_dict = _load_checkpoint_with_prefix( + self.prefix, self.checkpoint, map_location=self.map_location) + load_state_dict(module, state_dict, strict=False, logger='current') + + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: load from {self.checkpoint}' + return info + + +def _initialize(module, cfg, wholemodule=False): + func = build_from_cfg(cfg, WEIGHT_INITIALIZERS) + # wholemodule flag is for override mode, there is no layer key in override + # and initializer will give init values for the whole module with the name + # in override. + func.wholemodule = wholemodule + func(module) + + +def _initialize_override(module, override, cfg): + if not isinstance(override, (dict, list)): + raise TypeError(f'override must be a dict or a list of dict, \ + but got {type(override)}') + + override = [override] if isinstance(override, dict) else override + + for override_ in override: + + cp_override = copy.deepcopy(override_) + name = cp_override.pop('name', None) + if name is None: + raise ValueError('`override` must contain the key "name",' + f'but got {cp_override}') + # if override only has name key, it means use args in init_cfg + if not cp_override: + cp_override.update(cfg) + # if override has name key and other args except type key, it will + # raise error + elif 'type' not in cp_override.keys(): + raise ValueError( + f'`override` need "type" key, but got {cp_override}') + + if hasattr(module, name): + _initialize(getattr(module, name), cp_override, wholemodule=True) + else: + raise RuntimeError(f'module did not have attribute {name}, ' + f'but init_cfg is {cp_override}.') + + +def initialize(module, init_cfg): + r"""Initialize a module. + + Args: + module (``torch.nn.Module``): the module will be initialized. + init_cfg (dict | list[dict]): initialization configuration dict to + define initializer. OpenMMLab has implemented 6 initializers + including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, + ``Kaiming``, and ``Pretrained``. 
+ + Example: + >>> module = nn.Linear(2, 3, bias=True) + >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) + >>> initialize(module, init_cfg) + >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + >>> # define key ``'layer'`` for initializing layer with different + >>> # configuration + >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Linear', val=2)] + >>> initialize(module, init_cfg) + >>> # define key``'override'`` to initialize some specific part in + >>> # module + >>> class FooNet(nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.feat = nn.Conv2d(3, 16, 3) + >>> self.reg = nn.Conv2d(16, 10, 3) + >>> self.cls = nn.Conv2d(16, 5, 3) + >>> model = FooNet() + >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d', + >>> override=dict(type='Constant', name='reg', val=3, bias=4)) + >>> initialize(model, init_cfg) + >>> model = ResNet(depth=50) + >>> # Initialize weights with the pretrained model. + >>> init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + >>> initialize(model, init_cfg) + >>> # Initialize weights of a sub-module with the specific part of + >>> # a pretrained model by using "prefix". + >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + >>> 'retinanet_r50_fpn_1x_coco/'\ + >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + >>> init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + """ + if not isinstance(init_cfg, (dict, list)): + raise TypeError(f'init_cfg must be a dict or a list of dict, \ + but got {type(init_cfg)}') + + if isinstance(init_cfg, dict): + init_cfg = [init_cfg] + + for cfg in init_cfg: + # should deeply copy the original config because cfg may be used by + # other modules, e.g., one init_cfg shared by multiple bottleneck + # blocks, the expected cfg will be changed after pop and will change + # the initialization behavior of other modules + cp_cfg = copy.deepcopy(cfg) + override = cp_cfg.pop('override', None) + _initialize(module, cp_cfg) + + if override is not None: + cp_cfg.pop('layer', None) + _initialize_override(module, override, cp_cfg) + else: + # All attributes in module have same initialization. + pass + + +def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, + b: float) -> Tensor: + # Method based on + # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + # Modified from + # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [lower, upper], then translate + # to [2lower-1, 2upper-1]. 
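+        # (The shift into (-1, 1) is needed because `erfinv` below maps
+        # (-1, 1) onto the real line; combined with the CDF bounds above,
+        # this turns the uniform samples into truncated standard normal
+        # quantiles.)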
+ tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor: Tensor, + mean: float = 0., + std: float = 1., + a: float = -2., + b: float = 2.) -> Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Modified from + https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + + Args: + tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. + mean (float): the mean of the normal distribution. + std (float): the standard deviation of the normal distribution. + a (float): the minimum cutoff value. + b (float): the maximum cutoff value. + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/head_extractor/build/lib/mmengine/model/wrappers/__init__.py b/head_extractor/build/lib/mmengine/model/wrappers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90eddabbe1df4f694a19f1c1523fce09b40b2858 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/wrappers/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.utils.dl_utils import TORCH_VERSION +from mmengine.utils.version_utils import digit_version +from .distributed import MMDistributedDataParallel +from .seperate_distributed import MMSeparateDistributedDataParallel +from .utils import is_model_wrapper + +__all__ = [ + 'MMDistributedDataParallel', 'is_model_wrapper', + 'MMSeparateDistributedDataParallel' +] + +if digit_version(TORCH_VERSION) >= digit_version('2.0.0'): + from .fully_sharded_distributed import \ + MMFullyShardedDataParallel # noqa:F401 + __all__.append('MMFullyShardedDataParallel') diff --git a/head_extractor/build/lib/mmengine/model/wrappers/distributed.py b/head_extractor/build/lib/mmengine/model/wrappers/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..4113aebf9ee49015f84d25f2f242c43ee9d92a23 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/wrappers/distributed.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, Union + +import torch +from torch.nn.parallel import DataParallel, DistributedDataParallel + +from mmengine.optim import OptimWrapper +from mmengine.registry import MODEL_WRAPPERS +from ..utils import detect_anomalous_params + +MODEL_WRAPPERS.register_module(module=DistributedDataParallel) +MODEL_WRAPPERS.register_module(module=DataParallel) + + +@MODEL_WRAPPERS.register_module() +class MMDistributedDataParallel(DistributedDataParallel): + """A distributed model wrapper used for training,testing and validation in + loop. + + Different from DistributedDataParallel, MMDistributedDataParallel + implements three methods :meth:`train_step`, :meth:`val_step` and + :meth:`test_step`, which will be called by ``train_loop``, ``val_loop`` + and ``test_loop``. 
+ + - ``train_step``: Called by ``runner.train_loop``, and implement + default model forward, gradient back propagation, parameter updating + logic. To take advantage of DistributedDataParallel's automatic gradient + synchronization, ``train_step`` calls ``DistributedDataParallel.forward`` + to calculate the losses, and call other methods of :class:`BaseModel` to + pre-process data and parse losses. Finally, update model parameters by + :class:`OptimWrapper` and return the loss dictionary used + for logging. + + - ``val_step``: Called by ``runner.val_loop`` and get the inference + results. Since there is no gradient synchronization requirement, + this procedure is equivalent to ``BaseModel.val_step`` + + - ``test_step``: Called by ``runner.test_loop``, equivalent ``val_step``. + + Args: + detect_anomalous_params (bool): This option is only used for + debugging which will slow down the training speed. + Detect anomalous parameters that are not included in + the computational graph with `loss` as the root. + There are two cases + + - Parameters were not used during forward pass. + - Parameters were not used to produce loss. + + Defaults to False. + + **kwargs: keyword arguments passed to ``DistributedDataParallel``. + + - device_ids (List[int] or torch.device, optional): CUDA devices + for module. + - output_device (int or torch.device, optional): Device location of + output for single-device CUDA modules. + - dim (int): Defaults to 0. + - broadcast_buffers (bool): Flag that enables syncing ( + broadcasting) buffers of the module at beginning of the + ``forward`` function. Defaults to True + - find_unused_parameters (bool): Whether to find parameters of + module, which are not in the forward graph. Defaults to False. + - process_group (ProcessGroup, optional): The process group to be + used for distributed data all-reduction. + - bucket_cap_mb (int): bucket size in MegaBytes (MB). Defaults + to 25. + - check_reduction (bool): This argument is deprecated. Defaults + to False. + - gradient_as_bucket_view (bool): Defaults to False. + - static_graph (bool): Defaults to False. + + See more information about arguments in + :class:`torch.nn.parallel.DistributedDataParallel`. + + Note: + If model has multiple submodules and each module has + separate optimization strategies, + :class:`MMSeparateDistributedDataParallel` should be used to wrap + the model. + + Note: + If model itself has custom optimization strategy, rather than + simply forward model and update model. A custom model wrapper + inherit from ``MMDistributedDataParallel`` should be defined and + override the ``train_step`` method. + """ + + def __init__(self, + module, + detect_anomalous_params: bool = False, + **kwargs): + super().__init__(module=module, **kwargs) + self.detect_anomalous_params = detect_anomalous_params + + def train_step(self, data: Union[dict, tuple, list], + optim_wrapper: OptimWrapper) -> Dict[str, torch.Tensor]: + """Interface for model forward, backward and parameters updating during + training process. + + :meth:`train_step` will perform the following steps in order: + + - If :attr:`module` defines the preprocess method, + call ``module.preprocess`` to pre-processing data. + - Call ``module.forward(**data)`` and get losses. + - Parse losses. + - Call ``optim_wrapper.optimizer_step`` to update parameters. + - Return log messages of losses. + + Args: + data (dict or tuple or list): Data sampled from dataset. + optim_wrapper (OptimWrapper): A wrapper of optimizer to + update parameters. 
+ + Returns: + Dict[str, torch.Tensor]: A ``dict`` of tensor for logging. + """ + # Enable automatic mixed precision training context. + with optim_wrapper.optim_context(self): + data = self.module.data_preprocessor(data, training=True) + losses = self._run_forward(data, mode='loss') + parsed_loss, log_vars = self.module.parse_losses(losses) + optim_wrapper.update_params(parsed_loss) + if self.detect_anomalous_params: + detect_anomalous_params(parsed_loss, model=self) + return log_vars + + def val_step(self, data: Union[dict, tuple, list]) -> list: + """Gets the prediction of module during validation process. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. + """ + return self.module.val_step(data) + + def test_step(self, data: Union[dict, tuple, list]) -> list: + """Gets the predictions of module during testing process. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. + """ + return self.module.test_step(data) + + def _run_forward(self, data: Union[dict, tuple, list], mode: str) -> Any: + """Unpacks data for :meth:`forward` + + Args: + data (dict or tuple or list): Data sampled from dataset. + mode (str): Mode of forward. + + Returns: + dict or list: Results of training or testing mode. + """ + if isinstance(data, dict): + results = self(**data, mode=mode) + elif isinstance(data, (list, tuple)): + results = self(*data, mode=mode) + else: + raise TypeError('Output of `data_preprocessor` should be ' + f'list, tuple or dict, but got {type(data)}') + return results diff --git a/head_extractor/build/lib/mmengine/model/wrappers/fully_sharded_distributed.py b/head_extractor/build/lib/mmengine/model/wrappers/fully_sharded_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..d6b145ecf9a7bf8b5bbc2e759e8d4b128dd6b802 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/wrappers/fully_sharded_distributed.py @@ -0,0 +1,453 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial +from typing import Any, Callable, Dict, Iterable, List, Optional, Union + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed import ProcessGroup +# yapf: disable +from torch.distributed.fsdp.api import (FullStateDictConfig, + LocalOptimStateDictConfig, + LocalStateDictConfig, + OptimStateDictConfig, + ShardedOptimStateDictConfig, + ShardedStateDictConfig, + ShardingStrategy, StateDictConfig, + StateDictSettings, StateDictType) +from torch.distributed.fsdp.fully_sharded_data_parallel import ( + BackwardPrefetch, CPUOffload, FullOptimStateDictConfig, + FullyShardedDataParallel, MixedPrecision) + +# yapf: enable +from mmengine.optim import OptimWrapper +from mmengine.registry import FUNCTIONS, MODEL_WRAPPERS +from mmengine.structures import BaseDataElement +from mmengine.utils import digit_version, is_seq_of + + +@MODEL_WRAPPERS.register_module() +class MMFullyShardedDataParallel(FullyShardedDataParallel): + """A wrapper for sharding Module parameters across data parallel workers. + + Different from FullyShardedDataParallel, MMFullyShardedDataParallel + implements three methods :meth:`train_step`, :meth:`val_step` and + :meth:`test_step`, which will be called by ``train_loop``, ``val_loop`` + and ``test_loop``. + + - ``train_step``: Called by ``runner.train_loop``, and implement + default model forward, gradient back propagation, parameter updating + logic. 
+ + - ``val_step``: Called by ``runner.val_loop`` and get the inference + results. Specially, since MMFullyShardedDataParallel will wrap model + recursively, it may cause some problem if one just use + ``BaseModel.val_step`` to implement ``val_step`` here. To avoid that, + ``val_step`` will call methods of :obj:`BaseModel` to pre-process + data first, and use ``FullyShardedDataParallel.forward`` to get result. + + - ``test_step``: Called by ``runner.test_loop`` and get the inference + results. Its logic is equivalent to ``val_loop``. + + Args: + module (nn.Module): module to be wrapped with FSDP. + process_group (ProcessGroup, optional): process group for sharding. + cpu_offload (bool, CPUOffload, optional): + CPU offloading config. + Different from FullyShardedDataParallel,Since it can be set by + users' pre-defined config in MMEngine,its type is expected to be + `None`, `bool` or `CPUOffload`. + + Currently, only parameter and gradient CPU offload is supported. + It can be enabled via passing in + ``cpu_offload=CPUOffload(offload_params=True)``. Note that this + currently implicitly enables gradient offloading to CPU in order + for params and grads to be on same device to work with optimizer. + This API is subject to change. Default is ``None`` in which case + there will be no offloading. + auto_wrap_policy (str or Callable, optional): + Specifying a policy to recursively wrap layers with FSDP. + Different from FullyShardedDataParallel, Since it can be set by + users' pre-defined config in MMEngine, its type is expected to be + `None`, `str` or `Callable`. If it's `str`, then + MMFullyShardedDataParallel will try to get specified method in + ``FSDP_WRAP_POLICIES`` registry,and this method will be passed to + FullyShardedDataParallel to finally initialize model. + + Note that this policy currently will only apply to child modules of + the passed in module. The remainder modules are always wrapped in + the returned FSDP root instance. + ``default_auto_wrap_policy`` written in + ``torch.distributed.fsdp.wrap`` is an example of + ``auto_wrap_policy`` callable, this policy wraps layers with + parameter sizes larger than 100M. Users can supply the customized + ``auto_wrap_policy`` callable that should accept following + arguments: ``module: nn.Module``, ``recurse: bool``, + ``unwrapped_params: int``, extra customized arguments could be + added to the customized ``auto_wrap_policy`` callable as well. + + Example:: + + >>> def custom_auto_wrap_policy( + >>> module: nn.Module, + >>> recurse: bool, + >>> unwrapped_params: int, + >>> # These are customizable for this policy function. + >>> min_num_params: int = int(1e8), + >>> ) -> bool: + >>> return unwrapped_params >= min_num_params + + backward_prefetch (str or BackwardPrefetch, optional): + Different from FullyShardedDataParallel, this argument could be a + string or a BackwardPrefetch instance. If it's a string, then + it should be ``BACKWARD_PRE`` or ``BACKWARD_POST`` + mixed_precision (dict or MixedPrecision, optional): + This configures native mixed precision for FSDP. If this is set to + ``None``. Different from the native FSDP, this argument can a dict + like this: + + Examples: + >>> mixed_precision=dict(param_dtype='float16', + >>> buffer_dtype='float32', + >>> reduce_dtype='float32') + + Defaults to None. + use_orig_params (bool): Different from native + ``FullyShardedDataParallel``, it defaults to True. + **kwargs: Keyword arguments passed to + :class:`FullyShardedDataParallel`. 
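+
+    Examples:
+        >>> # a minimal sketch of a wrapper config; every value below is
+        >>> # an illustrative assumption, not a recommended default
+        >>> model_wrapper_cfg = dict(
+        >>>     type='MMFullyShardedDataParallel',
+        >>>     sharding_strategy='FULL_SHARD',
+        >>>     cpu_offload=True,
+        >>>     mixed_precision=dict(param_dtype='float16',
+        >>>                          reduce_dtype='float32',
+        >>>                          buffer_dtype='float32'))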
+ """ + + def __init__( + self, + module: nn.Module, + process_group: Union[dict, ProcessGroup, None] = None, + sharding_strategy: Union[str, ShardingStrategy] = None, + cpu_offload: Union[bool, CPUOffload, None] = None, + auto_wrap_policy: Union[str, Callable, None] = None, + backward_prefetch: Union[str, BackwardPrefetch, None] = None, + mixed_precision: Union[dict, MixedPrecision, None] = None, + param_init_fn: Union[str, Callable[[nn.Module], None]] = None, + use_orig_params: bool = True, + **kwargs, + ): + if isinstance(sharding_strategy, str): + sharding_strategy = ShardingStrategy[sharding_strategy] + if not (isinstance(sharding_strategy, ShardingStrategy) + or sharding_strategy is None): + raise TypeError( + 'sharding_strategy must be str or enum of `ShardingStrategy` ' + f', but got {sharding_strategy}') + + if isinstance(cpu_offload, bool): + cpu_offload = CPUOffload(offload_params=cpu_offload) + if not (isinstance(cpu_offload, CPUOffload) or cpu_offload is None): + raise TypeError( + '`cpu_offload` should be `None`, `bool`' + f'or `CPUOffload`, but has type {type(cpu_offload)}') + + with FUNCTIONS.switch_scope_and_registry(None): + if isinstance(auto_wrap_policy, str): + auto_wrap_policy = FUNCTIONS.get( # type: ignore + auto_wrap_policy) + if auto_wrap_policy is None: + raise ValueError('`auto_wrap_policy` is not registered!') + elif isinstance(auto_wrap_policy, dict): + policy = auto_wrap_policy.pop('type') + if isinstance(policy, str): + policy = FUNCTIONS.get(policy) # type: ignore + if policy is None: + raise ValueError('`auto_wrap_policy` is not registered!') + auto_wrap_policy = partial(policy, **auto_wrap_policy) + + if not (auto_wrap_policy is None + or callable(auto_wrap_policy)): # type: ignore + raise TypeError('`auto_wrap_policy` should be a str, a ' + 'callable, a dict or None, but has type ' + f'{type(auto_wrap_policy)}') + + if isinstance(backward_prefetch, str): + backward_prefetch = BackwardPrefetch[backward_prefetch] + if not (isinstance(backward_prefetch, BackwardPrefetch) + or backward_prefetch is None): + raise TypeError( + '`backward_prefetch` should be `None`, string of ' + '"BACKWARD_PRE" and "BACKWARD_POST", or ' + f'`BackwardPrefetch`, but has type {type(backward_prefetch)}' # noqa: E501 + ) + + if isinstance(param_init_fn, str): + param_init_fn = FUNCTIONS.get( # type: ignore + param_init_fn) + if param_init_fn is None: + raise ValueError('`param_init_fn` is not registered!') + elif isinstance(param_init_fn, dict): + init_fn = param_init_fn.pop('type') + if isinstance(param_init_fn, str): + init_fn = FUNCTIONS.get(init_fn) # type: ignore + if init_fn is None: + raise ValueError('`param_init_fn` is not registered!') + param_init_fn = partial(init_fn, **param_init_fn) + + if not (callable(param_init_fn) or param_init_fn is None): + raise TypeError('`param_init_fn` should be a str, a ' + 'callable, a dict or None, but has type ' + f'{type(param_init_fn)}') + + def parse_dtype(dtype): + if dtype is None: + return None + elif isinstance(dtype, str): + return getattr(torch, dtype) + elif isinstance(dtype, torch.dtype): + return dtype + else: + raise TypeError( + '`dtype` should be `None`, `str` or `torch.dtype`, ' + f'but has type {type(dtype)}') + + if isinstance(mixed_precision, dict): + mixed_precision['param_dtype'] = parse_dtype( + mixed_precision.get('param_dtype', None)) + mixed_precision['reduce_dtype'] = parse_dtype( + mixed_precision.get('reduce_dtype', None)) + mixed_precision['buffer_dtype'] = parse_dtype( + mixed_precision.get('buffer_dtype', 
None)) + mixed_precision = MixedPrecision(**mixed_precision) + elif isinstance(mixed_precision, MixedPrecision): + mixed_precision = mixed_precision + elif mixed_precision is not None: + raise TypeError( + '`mixed_precision` should be `None`, `dict` or ' + f'`MixedPrecision`, but has type {type(mixed_precision)}') + + # ignored_parameters and ignored_modules will be deprecated by PyTorch. + # Therefore we hide them in **kwargs. + # TODO: Update when PyTorch 2.1.0 released + if 'ignored_parameters' in kwargs: + kwargs['ignored_parameters'] = self._get_ignored_params( + module, kwargs['ignored_parameters']) + + if 'ignored_modules' in kwargs: + kwargs['ignored_modules'] = self._get_ignored_modules( + module, kwargs['ignored_modules']) + + super().__init__( + module=module, + process_group=process_group, + sharding_strategy=sharding_strategy, + auto_wrap_policy=auto_wrap_policy, + cpu_offload=cpu_offload, + backward_prefetch=backward_prefetch, + mixed_precision=mixed_precision, + param_init_fn=param_init_fn, + use_orig_params=use_orig_params, + **kwargs) + + def train_step(self, data: dict, + optim_wrapper: OptimWrapper) -> Dict[str, torch.Tensor]: + """Interface for model forward, backward and parameters updating during + training process. + + :meth:`train_step` will perform the following steps in order: + + - If :attr:`module` defines the preprocess method, + call ``module.preprocess`` to pre-processing data. + - Call ``module.forward(**data)`` and get losses. + - Parse losses. + - Call ``optim_wrapper.optimizer_step`` to update parameters. + - Return log messages of losses. + + Args: + data (dict): Data sampled by dataloader. + optim_wrapper (OptimWrapper): A wrapper of optimizer to + update parameters. + + Returns: + Dict[str, torch.Tensor]: A ``dict`` of tensor for logging. + """ + # enable automatic mixed precision training context. + with optim_wrapper.optim_context(self): + data = self.module.data_preprocessor(data, training=True) + if isinstance(data, dict): + losses = self(**data, mode='loss') + elif isinstance(data, (list, tuple)): + losses = self(*data, mode='loss') + else: + raise TypeError('Output of `data_preprocessor` should be ' + f'list tuple or dict, but got {type(data)}') + parsed_loss, log_vars = self.module.parse_losses(losses) + optim_wrapper.update_params(parsed_loss) + return log_vars + + def val_step(self, data: dict) -> List[BaseDataElement]: + """Gets the prediction of module during validation process. + + Args: + data (dict): Data sampled by dataloader. + + Returns: + List[BaseDataElement] or dict: The predictions of given data. + """ + data = self.module.data_preprocessor(data, False) + return self._run_forward(data, mode='predict') # type: ignore + + def test_step(self, data: dict) -> List[BaseDataElement]: + """Gets the predictions of module during testing process. + + Args: + data (dict): Data sampled by dataloader. + + Returns: + List[BaseDataElement]: The predictions of given data. + """ + data = self.module.data_preprocessor(data, False) + return self._run_forward(data, mode='predict') # type: ignore + + def _run_forward(self, data: Union[dict, tuple, list], + mode: str) -> Union[Dict[str, torch.Tensor], list]: + """Unpacks data for :meth:`forward` + Args: + data (dict or tuple or list): Data sampled from dataset. + mode (str): Mode of forward. + Returns: + dict or list: Results of training or testing mode. 
+ """ + if isinstance(data, dict): + results = self(**data, mode=mode) + elif isinstance(data, (list, tuple)): + results = self(*data, mode=mode) + else: + raise TypeError('Output of `data_preprocessor` should be ' + f'list, tuple or dict, but got {type(data)}') + return results + + def _get_ignored_params(self, module: nn.Module, + ignored_parameters: Union[Iterable[str], + Iterable[nn.Module]]): + """Get params from string.""" + params_dict = dict(module.named_parameters()) + if is_seq_of(ignored_parameters, str): + ignored_parameters = [ + params_dict[name] for name in ignored_parameters + ] + if not is_seq_of(ignored_parameters, + nn.Parameter) and ignored_parameters is not None: + raise TypeError( + '`ignored_modules` should be `None`, `Iterable[str]` or ' + '`Iterable[nn.Parameters]`, but has type ' + f'{type(ignored_parameters)}') + return ignored_parameters + + def _get_ignored_modules(self, module: nn.Module, + ignored_modules: Union[Iterable[str], + Iterable[nn.Module]]): + """Get modules from string.""" + modules_dict = dict(module.named_modules()) + if is_seq_of(ignored_modules, str): + ignored_modules = [modules_dict[name] for name in ignored_modules] + if not is_seq_of(ignored_modules, + nn.Module) and ignored_modules is not None: + raise TypeError( + '`ignored_modules` should be `None`, `Iterable[str]` or ' + '`Iterable[nn.Module]`, but has type ' + f'{type(ignored_modules)}') + return ignored_modules + + if digit_version(torch.__version__) < digit_version('2.0.1'): + + @staticmethod + def optim_state_dict( + model: torch.nn.Module, + optim: torch.optim.Optimizer, + group: Optional[dist.ProcessGroup] = None, + ) -> Dict[str, Any]: + """copied from pytorch 2.0.1 which has fixed some bugs.""" + state_dict_settings = FullyShardedDataParallel.get_state_dict_type( + model) + return FullyShardedDataParallel._optim_state_dict_impl( + model=model, + optim=optim, + optim_state_dict=optim.state_dict(), + optim_input=None, + rank0_only=getattr(state_dict_settings.optim_state_dict_config, + 'rank0_only', False), + full_state_dict=state_dict_settings.state_dict_type == + StateDictType.FULL_STATE_DICT, + group=group, + ) + + @staticmethod + def set_state_dict_type( + module: nn.Module, + state_dict_type: StateDictType, + state_dict_config: Optional[StateDictConfig] = None, + optim_state_dict_config: Optional[OptimStateDictConfig] = None, + ) -> StateDictSettings: + """copied from pytorch 2.0.1 which has fixed some bugs.""" + import torch.distributed.fsdp._traversal_utils as traversal_utils + _state_dict_type_to_config = { + StateDictType.FULL_STATE_DICT: FullStateDictConfig, + StateDictType.LOCAL_STATE_DICT: LocalStateDictConfig, + StateDictType.SHARDED_STATE_DICT: ShardedStateDictConfig, + } + _optim_state_dict_type_to_config = { + StateDictType.FULL_STATE_DICT: FullOptimStateDictConfig, + StateDictType.LOCAL_STATE_DICT: LocalOptimStateDictConfig, + StateDictType.SHARDED_STATE_DICT: ShardedOptimStateDictConfig, + } + + # Use the default config if a state_dict config is not set. 
+ state_dict_config_type = _state_dict_type_to_config[ + state_dict_type] + optim_state_dict_config_type = _optim_state_dict_type_to_config[ + state_dict_type] + if state_dict_config is None: + state_dict_config = state_dict_config_type() + if optim_state_dict_config is None: + optim_state_dict_config = optim_state_dict_config_type() + if state_dict_config_type != type(state_dict_config): + raise RuntimeError('Expected state_dict_config of type ' + f'{state_dict_config_type} ' + f'but got {type(state_dict_config)}') + if optim_state_dict_config_type != type(optim_state_dict_config): + raise RuntimeError('Expected optim_state_dict_config of type ' + f'{optim_state_dict_config_type} ' + f'but got {type(optim_state_dict_config)}') + + # Set the state_dict type and configurations. + prev_state_dict_type = None + prev_state_dict_config = None + prev_optim_state_dict_config = None + for submodule in traversal_utils._get_fsdp_states(module): + if prev_state_dict_type is None: + prev_state_dict_type = submodule._state_dict_type + else: + assert ( + prev_state_dict_type == submodule._state_dict_type + ), 'All FSDP modules should have the same state_dict_type.' + if prev_state_dict_config is None: + prev_state_dict_config = submodule._state_dict_config + else: + assert isinstance( + submodule._state_dict_config, + type(prev_state_dict_config)), ( + 'All FSDP modules must have the same type of ' + 'state_dict_config.') + if prev_optim_state_dict_config is None: + prev_optim_state_dict_config = \ + submodule._optim_state_dict_config + else: + assert isinstance( + submodule._optim_state_dict_config, + type(prev_optim_state_dict_config), + ), ('All FSDP modules must have the same type of ' + 'optim_state_dict_config.') + + submodule._state_dict_type = state_dict_type + submodule._state_dict_config = state_dict_config + submodule._optim_state_dict_config = optim_state_dict_config + + return StateDictSettings(prev_state_dict_type, + prev_state_dict_config, + prev_optim_state_dict_config) diff --git a/head_extractor/build/lib/mmengine/model/wrappers/seperate_distributed.py b/head_extractor/build/lib/mmengine/model/wrappers/seperate_distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..ac9c2383c325282a19d655b156820b711de04d53 --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/wrappers/seperate_distributed.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from contextlib import ExitStack, contextmanager +from typing import Dict, Union + +import torch +import torch.nn as nn +from torch.nn.parallel.distributed import DistributedDataParallel + +from mmengine.device import get_device +from mmengine.optim import OptimWrapperDict +from mmengine.registry import MODEL_WRAPPERS +from .distributed import MMDistributedDataParallel + + +@MODEL_WRAPPERS.register_module() +class MMSeparateDistributedDataParallel(DistributedDataParallel): + """A DistributedDataParallel wrapper for models in MMGeneration. + + In MMedting and MMGeneration there is a need to wrap different modules in + the models with separate DistributedDataParallel. Otherwise, it will cause + errors for GAN training. For example, the GAN model, usually has two + submodules: generator and discriminator. 
If we wrap both of them in one + standard DistributedDataParallel, it will cause errors during training, + because when we update the parameters of the generator (or discriminator), + the parameters of the discriminator (or generator) is not updated, which is + not allowed for DistributedDataParallel. So we design this wrapper to + separately wrap DistributedDataParallel for generator and discriminator. + In this wrapper, we perform two operations: + + 1. Wraps each module in the models with separate MMDistributedDataParallel. + Note that only modules with parameters will be wrapped. + 2. Calls ``train_step``, ``val_step`` and ``test_step`` of submodules to + get losses and predictions. + + Args: + module (nn.Module): model contain multiple submodules which have + separately updating strategy. + broadcast_buffers (bool): Same as that in + ``torch.nn.parallel.distributed.DistributedDataParallel``. + Defaults to False. + find_unused_parameters (bool): Same as that in + ``torch.nn.parallel.distributed.DistributedDataParallel``. + Traverse the autograd graph of all tensors contained in returned + value of the wrapped module's forward function. Defaults to False. + **kwargs: Keyword arguments passed to ``MMDistributedDataParallel``. + + - device_ids (List[int] or torch.device, optional): CUDA devices + for module. + - output_device (int or torch.device, optional): Device location of + output for single-device CUDA modules. + - dim (int): Defaults to 0. + - process_group (ProcessGroup, optional): The process group to be + used for distributed data all-reduction. + - bucket_cap_mb (int): bucket size in MegaBytes (MB). Defaults + to 25. + - check_reduction (bool): This argument is deprecated. Defaults + to False. + - gradient_as_bucket_view (bool): Defaults to False. + - static_graph (bool): Defaults to False. + + See more information about arguments in + :class:`torch.nn.parallel.DistributedDataParallel`. + """ + + def __init__(self, + module: nn.Module, + broadcast_buffers: bool = False, + find_unused_parameters: bool = False, + **kwargs): + super(DistributedDataParallel, self).__init__() + self.module = module + device = get_device() + # Wrap the submodule with parameters of `self.module` to + # `MMDistributedDataParallel` + for name, sub_module in module._modules.items(): + # module without parameters. + if next(sub_module.parameters(), None) is None: + sub_module = sub_module.to(device) + elif all(not p.requires_grad for p in sub_module.parameters()): + sub_module = sub_module.to(device) + else: + sub_module = MMDistributedDataParallel( + module=sub_module.to(device), + broadcast_buffers=broadcast_buffers, + find_unused_parameters=find_unused_parameters, + **kwargs) + module._modules[name] = sub_module + + def train_step(self, data: Union[dict, tuple, list], + optim_wrapper: OptimWrapperDict) -> Dict[str, torch.Tensor]: + """Interface for model forward, backward and parameters updating during + training process. + + Args: + data (dict or tuple or list): Data sampled from dataset. + optim_wrapper (OptimWrapperDict): A wrapper of optimizer to + update parameters. + + Returns: + Dict[str, torch.Tensor]: A dict of tensor for logging. + """ + return self.module.train_step(data, optim_wrapper) + + def val_step(self, data: Union[dict, tuple, list]) -> list: + """Gets the prediction of module during validation process. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. 
+ """ + return self.module.val_step(data) + + def test_step(self, data: Union[dict, tuple, list]) -> list: + """Gets the predictions of module during testing process. + + Args: + data (dict or tuple or list): Data sampled from dataset. + + Returns: + list: The predictions of given data. + """ + return self.module.test_step(data) + + @contextmanager + def no_sync(self): + """Enables ``no_sync`` context of all sub ``MMDistributedDataParallel`` + modules.""" + with ExitStack() as stack: + for sub_ddp_model in self.module._modules.values(): + stack.enter_context(sub_ddp_model.no_sync()) + yield + + def train(self, mode: bool = True) -> 'MMSeparateDistributedDataParallel': + """Sets the module in training mode. + + In order to make the ddp wrapper inheritance hierarchy more uniform, + ``MMSeparateDistributedDataParallel`` inherits from + ``DistributedDataParallel``, but will not call its constructor. + Since the attributes of ``DistributedDataParallel`` have not been + initialized, call the ``train`` method of ``DistributedDataParallel`` + will raise an error if pytorch version <= 1.9. Therefore, override + this method to call the ``train`` method of submodules. + + Args: + mode (bool): whether to set training mode (``True``) or evaluation + mode (``False``). Defaults to ``True``. + + Returns: + Module: self. + """ + self.training = mode + self.module.train(mode) + return self diff --git a/head_extractor/build/lib/mmengine/model/wrappers/utils.py b/head_extractor/build/lib/mmengine/model/wrappers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..86e1e123b98b8ad2172214cb3ddc8e688eacecff --- /dev/null +++ b/head_extractor/build/lib/mmengine/model/wrappers/utils.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from mmengine.registry import MODEL_WRAPPERS, Registry + + +def is_model_wrapper(model: nn.Module, registry: Registry = MODEL_WRAPPERS): + """Check if a module is a model wrapper. + + The following 4 model in MMEngine (and their subclasses) are regarded as + model wrappers: DataParallel, DistributedDataParallel, + MMDataParallel, MMDistributedDataParallel. You may add you own + model wrapper by registering it to ``mmengine.registry.MODEL_WRAPPERS``. + + Args: + model (nn.Module): The model to be checked. + registry (Registry): The parent registry to search for model wrappers. + + Returns: + bool: True if the input model is a model wrapper. + """ + module_wrappers = tuple(registry.module_dict.values()) + if isinstance(model, module_wrappers): + return True + + if not registry.children: + return False + + return any( + is_model_wrapper(model, child) for child in registry.children.values()) diff --git a/head_extractor/build/lib/mmengine/optim/__init__.py b/head_extractor/build/lib/mmengine/optim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c0a2ec6e37ddaa8b1df506cb7458007adae044ea --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .optimizer import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS, + AmpOptimWrapper, ApexOptimWrapper, BaseOptimWrapper, + DefaultOptimWrapperConstructor, OptimWrapper, + OptimWrapperDict, ZeroRedundancyOptimizer, + build_optim_wrapper) +# yapf: disable +from .scheduler import (ConstantLR, ConstantMomentum, ConstantParamScheduler, + CosineAnnealingLR, CosineAnnealingMomentum, + CosineAnnealingParamScheduler, ExponentialLR, + ExponentialMomentum, ExponentialParamScheduler, + LinearLR, LinearMomentum, LinearParamScheduler, + MultiStepLR, MultiStepMomentum, + MultiStepParamScheduler, OneCycleLR, + OneCycleParamScheduler, PolyLR, PolyMomentum, + PolyParamScheduler, ReduceOnPlateauLR, + ReduceOnPlateauMomentum, ReduceOnPlateauParamScheduler, + StepLR, StepMomentum, StepParamScheduler, + _ParamScheduler) + +# yapf: enable +__all__ = [ + 'OPTIM_WRAPPER_CONSTRUCTORS', 'OPTIMIZERS', 'build_optim_wrapper', + 'DefaultOptimWrapperConstructor', 'ConstantLR', 'CosineAnnealingLR', + 'ExponentialLR', 'LinearLR', 'MultiStepLR', 'StepLR', 'ConstantMomentum', + 'CosineAnnealingMomentum', 'ExponentialMomentum', 'LinearMomentum', + 'MultiStepMomentum', 'StepMomentum', 'ConstantParamScheduler', + 'CosineAnnealingParamScheduler', 'ExponentialParamScheduler', + 'LinearParamScheduler', 'MultiStepParamScheduler', 'StepParamScheduler', + '_ParamScheduler', 'OptimWrapper', 'AmpOptimWrapper', 'ApexOptimWrapper', + 'OptimWrapperDict', 'OneCycleParamScheduler', 'OneCycleLR', 'PolyLR', + 'PolyMomentum', 'PolyParamScheduler', 'ReduceOnPlateauLR', + 'ReduceOnPlateauMomentum', 'ReduceOnPlateauParamScheduler', + 'ZeroRedundancyOptimizer', 'BaseOptimWrapper' +] diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/__init__.py b/head_extractor/build/lib/mmengine/optim/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebf1f1e3a56badc5161fff712ea3fe552e67f695 --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .amp_optimizer_wrapper import AmpOptimWrapper +from .apex_optimizer_wrapper import ApexOptimWrapper +from .base import BaseOptimWrapper +from .builder import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS, + build_optim_wrapper) +from .default_constructor import DefaultOptimWrapperConstructor +from .optimizer_wrapper import OptimWrapper +from .optimizer_wrapper_dict import OptimWrapperDict +from .zero_optimizer import ZeroRedundancyOptimizer + +__all__ = [ + 'OPTIM_WRAPPER_CONSTRUCTORS', 'OPTIMIZERS', + 'DefaultOptimWrapperConstructor', 'build_optim_wrapper', 'OptimWrapper', + 'AmpOptimWrapper', 'ApexOptimWrapper', 'OptimWrapperDict', + 'ZeroRedundancyOptimizer', 'BaseOptimWrapper' +] diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/amp_optimizer_wrapper.py b/head_extractor/build/lib/mmengine/optim/optimizer/amp_optimizer_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..4f3323f2cccd358a75827cae9e7384860b6ebf6f --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/amp_optimizer_wrapper.py @@ -0,0 +1,190 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
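With these exports in place, the usual entry point is ``build_optim_wrapper``
(defined in ``builder.py`` below). A minimal config-driven sketch; the model
and hyperparameters are illustrative:

    import torch.nn as nn
    from mmengine.optim import build_optim_wrapper

    model = nn.Conv2d(3, 8, kernel_size=3)
    optim_wrapper = build_optim_wrapper(
        model,
        dict(
            type='OptimWrapper',
            optimizer=dict(type='SGD', lr=0.01, momentum=0.9)))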
+from contextlib import contextmanager +from typing import Union + +import torch +import torch.nn as nn + +from mmengine.device import (is_cuda_available, is_mlu_available, + is_musa_available, is_npu_available) +from mmengine.registry import OPTIM_WRAPPERS +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION +from .optimizer_wrapper import OptimWrapper + +if is_npu_available(): + from torch.npu.amp import GradScaler +elif is_mlu_available(): + from torch.mlu.amp import GradScaler +else: + from torch.cuda.amp import GradScaler + + +@OPTIM_WRAPPERS.register_module() +class AmpOptimWrapper(OptimWrapper): + """A subclass of :class:`OptimWrapper` that supports automatic mixed + precision training based on torch.cuda.amp. + + ``AmpOptimWrapper`` provides a unified interface with + ``OptimWrapper``, so ``AmpOptimWrapper`` can be used in the same way + as ``OptimWrapper``. + + Warnings: + ``AmpOptimWrapper`` requires PyTorch >= 1.6. + + Args: + loss_scale (float or str or dict): The initial configuration of + `torch.cuda.amp.GradScaler`. See more specific arguments + introduction at `PyTorch AMP `_ # noqa: E501 + Defaults to ``dynamic``. + + - "dynamic": Initialize GradScale without any arguments. + - float: Initialize GradScaler with ``init_scale``. + - dict: Initialize GradScaler with more detail configuration. + + dtype (str or torch.dtype, optional): The data type to autocast in amp. + If a ``str`` is given, it will be converted to ``torch.dtype``. + Valid ``str`` format are `'float16'`, `'bfloat16'`, `'float32'` and + `'float64'`. If set to ``None``, the default data type will be used. + Defaults to None. + `New in version 0.6.1.` + use_fsdp (bool): Using ``ShardedGradScaler`` when it is True. It should + be enabled when using ``FullyShardedDataParallel``. + Defaults to False. + `New in version 0.8.0.` + **kwargs: Keyword arguments passed to OptimWrapper. + + Warnings: + ``dtype`` argument is only available with PyTorch version >= 1.10.0. If + you use PyTorch of an older version, it will be ignored. + + Note: + If you use ``IterBasedRunner`` and enable gradient accumulation, + the original `max_iters` should be multiplied by + ``accumulative_counts``. + """ + + valid_dtypes = ('float16', 'bfloat16', 'float32', 'float64') + + def __init__(self, + loss_scale: str = 'dynamic', + dtype: Union[str, torch.dtype] = None, + use_fsdp: bool = False, + **kwargs): + assert digit_version(TORCH_VERSION) >= digit_version('1.6.0'), ( + '`torch.cuda.amp` is only available when pytorch version >= 1.6') + assert is_cuda_available() or is_npu_available() or is_mlu_available( + ) or is_musa_available(), ( + '``AmpOptimizerWrapper`` is only available training ' + 'on gpu, npu, mlu or musa') + super().__init__(**kwargs) + self._scale_update_param = None + + if use_fsdp: + if digit_version(torch.__version__) >= digit_version('2.0.0'): + from torch.distributed.fsdp.sharded_grad_scaler import \ + ShardedGradScaler + scaler_type = ShardedGradScaler + else: + raise RuntimeError( + 'PyTorch>=2.0.0 is required when sets `use_fsdp=True`') + else: + scaler_type = GradScaler + + if loss_scale == 'dynamic': + # If loss_scale is a string, it must be 'dynamic', then dynamic + # loss scaling will be used. + self.loss_scaler = scaler_type() + elif isinstance(loss_scale, float): + # Static loss scaling + self._scale_update_param = loss_scale + self.loss_scaler = scaler_type(init_scale=loss_scale) + elif isinstance(loss_scale, dict): + # More specific configuration. 
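+            # e.g. loss_scale=dict(init_scale=512., growth_factor=2.,
+            #                      backoff_factor=0.5, growth_interval=1000)
+            # is unpacked below as the grad scaler's keyword arguments.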
+ self.loss_scaler = scaler_type(**loss_scale) + else: + raise TypeError('loss_scale must be of type float, dict, or ' + f'"dynamic", but got {loss_scale}') + + # convert string value to torch.dtype + if isinstance(dtype, str): + assert dtype in self.valid_dtypes, ( + f'dtype should be any of {self.valid_dtypes}, got {dtype}') + dtype = getattr(torch, dtype) + + assert dtype is None or isinstance(dtype, torch.dtype), ( + f'dtype should be None or instance of torch.dtype, got {dtype}') + self.cast_dtype = dtype + + def backward(self, loss: torch.Tensor, **kwargs): + """Perform gradient back propagation with :attr:`loss_scaler`. + + Args: + loss (torch.Tensor): The loss of current iteration. + kwargs: Keyword arguments passed to :meth:`torch.Tensor.backward` + """ + self.loss_scaler.scale(loss).backward(**kwargs) + self._inner_count += 1 + + def step(self, **kwargs): + """Update parameters with :attr:`loss_scaler`. + + Args: + kwargs: Keyword arguments passed to + :meth:`torch.optim.Optimizer.step`. + """ + if self.clip_grad_kwargs: + self.loss_scaler.unscale_(self.optimizer) + self._clip_grad() + self.loss_scaler.step(self.optimizer, **kwargs) + self.loss_scaler.update(self._scale_update_param) + + def state_dict(self) -> dict: + """Get the state dictionary of :attr:`optimizer` and + :attr:`loss_scaler`. + + Based on the state dictionary of the optimizer, the returned state + dictionary will add a key named "loss_scaler". + + Returns: + dict: The merged state dict of :attr:`loss_scaler` and + :attr:`optimizer`. + """ + # save state_dict of loss_scaler + state_dict = super().state_dict() + state_dict['loss_scaler'] = self.loss_scaler.state_dict() + return state_dict + + def load_state_dict(self, state_dict: dict): + """Load and parse the state dictionary of :attr:`optimizer` and + :attr:`loss_scaler`. + + If state_dict contains "loss_scaler.", the :attr:`loss_scaler` will + load the corresponding keys. Otherwise, only the :attr:`optimizer` + will load the state dictionary. + + Args: + state_dict (dict): The state dict of :attr:`optimizer` and + :attr:`loss_scaler` + """ + if 'loss_scaler' in state_dict: + self.loss_scaler.load_state_dict(state_dict.pop('loss_scaler')) + + if 'base_param_settings' in state_dict: + self.base_param_settings = state_dict.pop('base_param_settings') + + # load state_dict of optimizer + self.optimizer.load_state_dict(state_dict) + + @contextmanager + def optim_context(self, model: nn.Module): + """Enables the context for mixed precision training, and enables the + context for disabling gradient synchronization during gradient + accumulation context. + + Args: + model (nn.Module): The training model. + """ + from mmengine.runner.amp import autocast + with super().optim_context(model), autocast(dtype=self.cast_dtype): + yield diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/apex_optimizer_wrapper.py b/head_extractor/build/lib/mmengine/optim/optimizer/apex_optimizer_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..5f2f6f4a1b5c2c2bf5c14a672da5fd576c72a0c2 --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/apex_optimizer_wrapper.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
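A minimal usage sketch for the ``AmpOptimWrapper`` above, assuming a CUDA
device is available; the model, data and learning rate are illustrative:

    import torch
    import torch.nn as nn
    from torch.optim import SGD
    from mmengine.optim import AmpOptimWrapper

    model = nn.Linear(4, 2).cuda()
    optim_wrapper = AmpOptimWrapper(
        optimizer=SGD(model.parameters(), lr=0.01),
        loss_scale='dynamic',  # or a float / dict forwarded to the scaler
        dtype='float16')       # autocast dtype; needs torch >= 1.10
    inputs = torch.randn(8, 4).cuda()
    with optim_wrapper.optim_context(model):  # enables autocast
        loss = model(inputs).abs().sum()
    optim_wrapper.update_params(loss)  # scaled backward, step, zero_grad

Here ``optim_context`` wraps the forward pass in ``autocast``, while
``update_params`` runs the scaled backward pass and the optimizer step.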
+from contextlib import contextmanager +from typing import Optional, Union + +import torch +import torch.nn as nn + +# a circular import will be caused by +# from mmengine.model.wrappers import is_model_wrapper +import mmengine +from mmengine.registry import OPTIM_WRAPPERS +from .optimizer_wrapper import OptimWrapper + +try: + import apex.amp as apex_amp +except ImportError: + apex_amp = None + + +@OPTIM_WRAPPERS.register_module() +class ApexOptimWrapper(OptimWrapper): + """A subclass of :class:`OptimWrapper` that supports automatic mixed + precision training based on apex.amp. + + ``ApexOptimWrapper`` provides a unified interface with + ``OptimWrapper``, so it can be used in the same way as ``OptimWrapper``. + + Warning: + ``ApexOptimWrapper`` requires `nvidia apex `_ + + Args: + opt_level (str): Pure or mixed precision optimization level. Accepted + values are "O0", "O1", "O2", and "O3". Defaults to "O1". + loss_scale (float or str, optional): If passed as a string, must be a + string representing a number, e.g., "128.0", or the string + "dynamic". Defaults to "dynamic". + enabled (bool): If False, renders all Amp calls no-ops, so your script + should run as if Amp were not present. Defaults to True. + cast_model_type (torch.dtype, optional): Model's parameters and + buffers to the desired type. Defaults to None. + patch_torch_functions (bool, optional): Patch all Torch functions + and Tensor methods to perform Tensor Core-friendly ops like GEMMs + and convolutions in FP16, and any ops that benefit from FP32 + precision in FP32. Defaults to None. + keep_batchnorm_fp32 (bool or str, optional): To enhance precision + and enable cudnn batchnorm (which improves performance), + it's often beneficial to keep batchnorm weights in FP32 + even if the rest of the model is FP16. + If passed as a string, must be the string "True" or "False". + Defaults to None. + master_weights (bool, optional): Maintain FP32 master weights to + accompany any FP16 model weights. FP32 master weights are stepped + by the optimizer to enhance precision and capture small gradients. + Defaults to None. + cast_model_outputs (torch.dtype, optional): Option to ensure that + the outputs of your model(s) are always cast to a particular type + regardless of ``opt_level``. Defaults to None. + num_losses (int): Option to tell Amp in advance how many + losses/backward passes you plan to use. Defaults to 1. + verbosity (int): Set to 0 to suppress Amp-related output. + Defaults to 1. + min_loss_scale (float, optional): Sets a floor for the loss scale + values that can be chosen by dynamic loss scaling. + The default value of None means that no floor is imposed. + If dynamic loss scaling is not used, `min_loss_scale` is ignored. + Defaults to None. + max_loss_scale (float, optional): Sets a ceiling for the loss scale + values that can be chosen by dynamic loss scaling. If dynamic + loss scaling is not used, `max_loss_scale` is ignored. + Defaults to 2.**24. + **kwargs: Keyword arguments passed to OptimWrapper. + + Note: + If you use ``IterBasedRunner`` and enable gradient accumulation, + the original `max_iters` should be multiplied by + ``accumulative_counts``. 
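+
+    Examples:
+        >>> # A usage sketch: assumes NVIDIA apex is installed and a CUDA
+        >>> # device is available; apex is initialized lazily inside
+        >>> # `optim_context` at the first iteration.
+        >>> import torch
+        >>> import torch.nn as nn
+        >>> from torch.optim import SGD
+        >>> from mmengine.optim import ApexOptimWrapper
+        >>> model = nn.Linear(1, 1).cuda()
+        >>> optim_wrapper = ApexOptimWrapper(
+        >>>     optimizer=SGD(model.parameters(), lr=0.1), opt_level='O1')
+        >>> with optim_wrapper.optim_context(model):
+        >>>     loss = model(torch.randn(2, 1).cuda()).sum()
+        >>> optim_wrapper.update_params(loss)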
+ + Note: + `New in version 0.6.0.` + """ # noqa: E501 + + def __init__(self, + opt_level: str = 'O1', + loss_scale: Union[float, str, None] = 'dynamic', + enabled: Optional[bool] = True, + cast_model_type: Optional[torch.dtype] = None, + patch_torch_functions: Optional[bool] = None, + keep_batchnorm_fp32: Union[bool, str, None] = None, + master_weights: Optional[bool] = None, + cast_model_outputs: Optional[torch.dtype] = None, + num_losses: int = 1, + verbosity: int = 1, + min_loss_scale: Optional[float] = None, + max_loss_scale: Optional[float] = 2.**24, + **kwargs): + assert apex_amp is not None, \ + 'Apex is not installed. Please check ' \ + 'https://github.com/NVIDIA/apex#linux.' + super().__init__(**kwargs) + self.opt_level = opt_level + self.loss_scale = loss_scale + self.enabled = enabled + self.cast_model_type = cast_model_type + self.patch_torch_functions = patch_torch_functions + self.keep_batchnorm_fp32 = keep_batchnorm_fp32 + self.master_weights = master_weights + self.cast_model_outputs = cast_model_outputs + self.num_losses = num_losses + self.verbosity = verbosity + self.min_loss_scale = min_loss_scale + self.max_loss_scale = max_loss_scale + self._apex_amp_state_dict = None + + def backward(self, loss: torch.Tensor, **kwargs) -> None: + """Perform gradient back propagation with :attr:`loss_scaler`. + + Args: + loss (torch.Tensor): The loss of current iteration. + kwargs: Keyword arguments passed to :meth:`torch.Tensor.backward` + """ + with apex_amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward(**kwargs) + self._inner_count += 1 + + def state_dict(self) -> dict: + """Get the state dictionary of :attr:`optimizer` and + :attr:`apex_amp`. + + Based on the state dictionary of the optimizer, the returned state + dictionary will add a key named "apex_amp". + + Returns: + dict: The merged state dict of :attr:`apex_amp` and + :attr:`optimizer`. + """ + state_dict = self.optimizer.state_dict() + state_dict['apex_amp'] = apex_amp.state_dict() + return state_dict + + def load_state_dict(self, state_dict: dict) -> None: + """Load and parse the state dictionary of :attr:`optimizer` and + :attr:`apex_amp`. + + If state_dict contains "apex_amp", the :attr:`apex_amp` will + load the corresponding keys. Otherwise, only the :attr:`optimizer` + will load the state dictionary. + + Note: + :meth:`load_state_dict` shuold be called after + `apex_amp.initialize` is called. + Args: + state_dict (dict): The state dict of :attr:`optimizer` and + :attr:`apex_amp` + """ + if 'apex_amp' in state_dict: + # when `apex_amp` is not initialized, calling `load_state_dict` + # will raise an error, so we temporarily cache the apex_amp + # part, and then load it into `apex_amp` after completing + # the `apex_amp` initialization in `optim_context` method + if hasattr(self.optimizer, '_amp_stash'): + apex_amp.load_state_dict(state_dict.pop('apex_amp')) + else: + self._apex_amp_state_dict = state_dict.pop('apex_amp') + self.optimizer.load_state_dict(state_dict) + + @contextmanager + def optim_context(self, model: nn.Module): + """Enables the context for mixed precision training, and enables the + context for disabling gradient synchronization during gradient + accumulation context. + + Args: + model (nn.Module): The training model. 
+ """ + with super().optim_context(model): + # when a given optimizer be passed through apex_amp.initialize, + # the "_amp_stash" property will be added + if not hasattr(self.optimizer, '_amp_stash'): + if mmengine.model.wrappers.is_model_wrapper(model): + model = model.module + model, self.optimizer = apex_amp.initialize( + model, + self.optimizer, + opt_level=self.opt_level, + loss_scale=self.loss_scale, + enabled=self.enabled, + cast_model_type=self.cast_model_type, + patch_torch_functions=self.patch_torch_functions, + keep_batchnorm_fp32=self.keep_batchnorm_fp32, + master_weights=self.master_weights, + cast_model_outputs=self.cast_model_outputs, + num_losses=self.num_losses, + verbosity=self.verbosity, + min_loss_scale=self.min_loss_scale, + max_loss_scale=self.max_loss_scale) + # loading apex_amp state_dict after initialization of apex_amp + if self._apex_amp_state_dict is not None: + apex_amp.load_state_dict(self._apex_amp_state_dict) + self._apex_amp_state_dict = None + yield diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/base.py b/head_extractor/build/lib/mmengine/optim/optimizer/base.py new file mode 100644 index 0000000000000000000000000000000000000000..ee53f508b146fc7057053f8805fd7345e1105352 --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/base.py @@ -0,0 +1,130 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Dict, List + +import torch + + +class BaseOptimWrapper(metaclass=ABCMeta): + + def __init__(self, optimizer): + self.optimizer = optimizer + + # The Following code is used to initialize `base_param_settings`. + # `base_param_settings` is used to store the parameters that are not + # updated by the optimizer. + # The `base_param_settings` used for tracking the base learning in the + # optimizer. If the optimizer has multiple parameter groups, this + # params will not be scaled by the loss factor. + if len(optimizer.param_groups) > 1: + self.base_param_settings = { + 'params': torch.tensor([0.0], dtype=torch.float) + } + self.base_param_settings.update(**self.optimizer.defaults) + else: + self.base_param_settings = None # type: ignore + + @abstractmethod + def update_params(self, *args, **kwargs): + """Update parameters in :attr:`optimizer`.""" + + @abstractmethod + def backward(self, loss: torch.Tensor, **kwargs) -> None: + """Perform gradient back propagation.""" + + @abstractmethod + def zero_grad(self, **kwargs) -> None: + """A wrapper of ``Optimizer.zero_grad``.""" + + @abstractmethod + def step(self, **kwargs): + """Call the step method of optimizer.""" + + def state_dict(self) -> dict: + """A wrapper of ``Optimizer.state_dict``.""" + state_dict = self.optimizer.state_dict() + if self.base_param_settings is not None: + state_dict['base_param_settings'] = self.base_param_settings + return state_dict + + def load_state_dict(self, state_dict: dict) -> None: + """A wrapper of ``Optimizer.load_state_dict``. load the state dict of + :attr:`optimizer`. + + Provide unified ``load_state_dict`` interface compatible with automatic + mixed precision training. Subclass can overload this method to + implement the required logic. For example, the state dictionary of + GradScaler should be loaded when training with ``torch.cuda.amp``. + + Args: + state_dict (dict): The state dictionary of :attr:`optimizer`. 
+ """ + base_param_settings = state_dict.pop('base_param_settings', None) + + if base_param_settings is not None: + self.base_param_settings = base_param_settings + + # load state_dict of optimizer + self.optimizer.load_state_dict(state_dict) + + @property + def param_groups(self) -> List[dict]: + """A wrapper of ``Optimizer.param_groups``. + + Make OptimizeWrapper compatible with :class:`_ParamScheduler`. + + Returns: + dict: the ``param_groups`` of :attr:`optimizer`. + """ + if self.base_param_settings is not None: + return self.optimizer.param_groups + [self.base_param_settings] + else: + return self.optimizer.param_groups + + @property + def defaults(self) -> dict: + """A wrapper of ``Optimizer.defaults``. + + Make OptimizeWrapper compatible with :class:`_ParamScheduler`. + + Returns: + dict: the ``param_groups`` of :attr:`optimizer`. + """ + return self.optimizer.defaults + + def get_lr(self): + """Get the learning rate of the optimizer. + + Provide unified interface to get learning rate of optimizer. + + Returns: + Dict[str, List[float]]: + param_groups learning rate of the optimizer. + """ + res = {} + if self.base_param_settings is not None: + res['base_lr'] = [self.base_param_settings['lr']] + + res['lr'] = [group['lr'] for group in self.optimizer.param_groups] + + return res + + def get_momentum(self) -> Dict[str, List[float]]: + """Get the momentum of the optimizer. + + Provide unified interface to get momentum of optimizer. + + Returns: + Dict[str, List[float]]: Momentum of the optimizer. + """ + momentum = [] + for group in self.optimizer.param_groups: + # Get momentum of SGD. + if 'momentum' in group.keys(): + momentum.append(group['momentum']) + # Get momentum of Adam. + elif 'betas' in group.keys(): + momentum.append(group['betas'][0]) + else: + momentum.append(0) + return dict(momentum=momentum) diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/builder.py b/head_extractor/build/lib/mmengine/optim/optimizer/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..37e0370e917e12eb2d40a063975c07ff1f41d6dc --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/builder.py @@ -0,0 +1,216 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import inspect +from typing import List, Union + +import torch +import torch.nn as nn + +from mmengine.config import Config, ConfigDict +from mmengine.device import is_npu_available, is_npu_support_full_precision +from mmengine.registry import OPTIM_WRAPPER_CONSTRUCTORS, OPTIMIZERS +from .optimizer_wrapper import OptimWrapper + + +def register_torch_optimizers() -> List[str]: + """Register optimizers in ``torch.optim`` to the ``OPTIMIZERS`` registry. + + Returns: + List[str]: A list of registered optimizers' name. + """ + torch_optimizers = [] + for module_name in dir(torch.optim): + if module_name.startswith('__'): + continue + _optim = getattr(torch.optim, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + #OPTIMIZERS.register_module(module=_optim) + if module_name == 'Adafactor': + OPTIMIZERS.register_module( + name='torch_Adafactor', module=_optim) + else: + OPTIMIZERS.register_module(module=_optim) + torch_optimizers.append(module_name) + return torch_optimizers + + +TORCH_OPTIMIZERS = register_torch_optimizers() + + +def register_torch_npu_optimizers() -> List[str]: + """Register optimizers in ``torch npu`` to the ``OPTIMIZERS`` registry. + + Returns: + List[str]: A list of registered optimizers' name. 
+ """ + if not is_npu_available(): + return [] + + import torch_npu + if not hasattr(torch_npu, 'optim'): + return [] + + torch_npu_optimizers = [] + for module_name in dir(torch_npu.optim): + if module_name.startswith('__') or module_name in OPTIMIZERS: + continue + _optim = getattr(torch_npu.optim, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + OPTIMIZERS.register_module(module=_optim) + torch_npu_optimizers.append(module_name) + return torch_npu_optimizers + + +NPU_OPTIMIZERS = register_torch_npu_optimizers() + + +def register_dadaptation_optimizers() -> List[str]: + """Register optimizers in ``dadaptation`` to the ``OPTIMIZERS`` registry. + + Returns: + List[str]: A list of registered optimizers' name. + """ + dadaptation_optimizers = [] + try: + import dadaptation + except ImportError: + pass + else: + for module_name in ['DAdaptAdaGrad', 'DAdaptAdam', 'DAdaptSGD']: + _optim = getattr(dadaptation, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + OPTIMIZERS.register_module(module=_optim) + dadaptation_optimizers.append(module_name) + return dadaptation_optimizers + + +DADAPTATION_OPTIMIZERS = register_dadaptation_optimizers() + + +def register_lion_optimizers() -> List[str]: + """Register Lion optimizer to the ``OPTIMIZERS`` registry. + + Returns: + List[str]: A list of registered optimizers' name. + """ + optimizers = [] + try: + from lion_pytorch import Lion + except ImportError: + pass + else: + OPTIMIZERS.register_module(module=Lion) + optimizers.append('Lion') + return optimizers + + +LION_OPTIMIZERS = register_lion_optimizers() + + +def register_sophia_optimizers() -> List[str]: + """Register Sophia optimizer to the ``OPTIMIZERS`` registry. + + Returns: + List[str]: A list of registered optimizers' name. + """ + optimizers = [] + try: + import Sophia + except ImportError: + pass + else: + for module_name in dir(Sophia): + _optim = getattr(Sophia, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + OPTIMIZERS.register_module(module=_optim) + optimizers.append(module_name) + return optimizers + + +SOPHIA_OPTIMIZERS = register_sophia_optimizers() + + +def register_bitsandbytes_optimizers() -> List[str]: + """Register optimizers in ``bitsandbytes`` to the ``OPTIMIZERS`` registry. + + In the `bitsandbytes` library, optimizers that have the same name as the + default optimizers in PyTorch are prefixed with ``bnb_``. For example, + ``bnb_Adagrad``. + + Returns: + List[str]: A list of registered optimizers' name. 
+ """ + dadaptation_optimizers = [] + try: + import bitsandbytes as bnb + except ImportError: + pass + else: + optim_classes = inspect.getmembers( + bnb.optim, lambda _optim: (inspect.isclass(_optim) and issubclass( + _optim, torch.optim.Optimizer))) + for name, optim_cls in optim_classes: + if name in OPTIMIZERS: + name = f'bnb_{name}' + OPTIMIZERS.register_module(module=optim_cls, name=name) + dadaptation_optimizers.append(name) + return dadaptation_optimizers + + +BITSANDBYTES_OPTIMIZERS = register_bitsandbytes_optimizers() + + +def register_transformers_optimizers(): + transformer_optimizers = [] + try: + from transformers import Adafactor + except ImportError: + pass + else: + OPTIMIZERS.register_module(name='Adafactor', module=Adafactor) + transformer_optimizers.append('Adafactor') + return transformer_optimizers + + +TRANSFORMERS_OPTIMIZERS = register_transformers_optimizers() + + +def build_optim_wrapper(model: nn.Module, + cfg: Union[dict, Config, ConfigDict]) -> OptimWrapper: + """Build function of OptimWrapper. + + If ``constructor`` is set in the ``cfg``, this method will build an + optimizer wrapper constructor, and use optimizer wrapper constructor to + build the optimizer wrapper. If ``constructor`` is not set, the + ``DefaultOptimWrapperConstructor`` will be used by default. + + Args: + model (nn.Module): Model to be optimized. + cfg (dict): Config of optimizer wrapper, optimizer constructor and + optimizer. + + Returns: + OptimWrapper: The built optimizer wrapper. + """ + optim_wrapper_cfg = copy.deepcopy(cfg) + constructor_type = optim_wrapper_cfg.pop('constructor', + 'DefaultOptimWrapperConstructor') + paramwise_cfg = optim_wrapper_cfg.pop('paramwise_cfg', None) + + # Since the current generation of NPU(Ascend 910) only supports + # mixed precision training, here we turn on mixed precision + # to make the training normal + if is_npu_available() and not is_npu_support_full_precision(): + optim_wrapper_cfg['type'] = 'AmpOptimWrapper' + + optim_wrapper_constructor = OPTIM_WRAPPER_CONSTRUCTORS.build( + dict( + type=constructor_type, + optim_wrapper_cfg=optim_wrapper_cfg, + paramwise_cfg=paramwise_cfg)) + optim_wrapper = optim_wrapper_constructor(model) + return optim_wrapper diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/default_constructor.py b/head_extractor/build/lib/mmengine/optim/optimizer/default_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..ec223a79678ff5f88483adbbf2ca769c92dd49b1 --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/default_constructor.py @@ -0,0 +1,321 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import logging +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from torch.nn import GroupNorm, LayerNorm + +from mmengine.logging import print_log +from mmengine.registry import (OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, + OPTIMIZERS) +from mmengine.utils import is_list_of +from mmengine.utils.dl_utils import mmcv_full_available +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm +from .optimizer_wrapper import OptimWrapper + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class DefaultOptimWrapperConstructor: + """Default constructor for optimizers. + + By default, each parameter share the same optimizer settings, and we + provide an argument ``paramwise_cfg`` to specify parameter-wise settings. 
+ It is a dict and may contain the following fields: + + - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If + one of the keys in ``custom_keys`` is a substring of the name of one + parameter, then the setting of the parameter will be specified by + ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will + be ignored. It should be noted that the aforementioned ``key`` is the + longest key that is a substring of the name of the parameter. If there + are multiple matched keys with the same length, then the key with lower + alphabet order will be chosen. + ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` + and ``decay_mult``. See Example 2 below. + - ``bias_lr_mult`` (float): It will be multiplied to the learning + rate for all bias parameters (except for those in normalization + layers and offset layers of DCN). + - ``bias_decay_mult`` (float): It will be multiplied to the weight + decay for all bias parameters (except for those in + normalization layers, depthwise conv layers, offset layers of DCN). + - ``norm_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of normalization + layers. + - ``flat_decay_mult`` (float): It will be multiplied to the weight + decay for all one-dimensional parameters + - ``dwconv_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of depthwise conv + layers. + - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning + rate for parameters of offset layer in the deformable convs + of a model. + - ``bypass_duplicate`` (bool): If true, the duplicate parameters + would not be added into optimizer. Defaults to False. + + Note: + + 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will + override the effect of ``bias_lr_mult`` in the bias of offset layer. + So be careful when using both ``bias_lr_mult`` and + ``dcn_offset_lr_mult``. If you wish to apply both of them to the offset + layer in deformable convs, set ``dcn_offset_lr_mult`` to the original + ``dcn_offset_lr_mult`` * ``bias_lr_mult``. + + 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will + apply it to all the DCN layers in the model. So be careful when the + model contains multiple DCN layers in places other than backbone. + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + + Required fields of ``optim_wrapper_cfg`` are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields of ``optim_wrapper_cfg`` are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + Required fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields of ``optimizer`` are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. + + Example 1: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001)) + >>> paramwise_cfg = dict(norm_decay_mult=0.) 
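+        >>> # norm_decay_mult=0. removes weight decay from norm layers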
+ >>> optim_wrapper_builder = DefaultOptimWrapperConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + + Example 2: + >>> # assume model have attribute model.backbone and model.cls_head + >>> optim_wrapper_cfg = dict(type='OptimWrapper', optimizer=dict( + >>> type='SGD', lr=0.01, weight_decay=0.95)) + >>> paramwise_cfg = dict(custom_keys={ + >>> 'backbone': dict(lr_mult=0.1, decay_mult=0.9)}) + >>> optim_wrapper_builder = DefaultOptimWrapperConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + >>> # Then the `lr` and `weight_decay` for model.backbone is + >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for + >>> # model.cls_head is (0.01, 0.95). + """ + + def __init__(self, + optim_wrapper_cfg: dict, + paramwise_cfg: Optional[dict] = None): + if not isinstance(optim_wrapper_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optim_wrapper_cfg)}') + assert 'optimizer' in optim_wrapper_cfg, ( + '`optim_wrapper_cfg` must contain "optimizer" config') + self.optim_wrapper_cfg = optim_wrapper_cfg.copy() + self.optimizer_cfg = self.optim_wrapper_cfg.pop('optimizer') + self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg + self.base_lr = self.optimizer_cfg.get('lr', None) + self.base_wd = self.optimizer_cfg.get('weight_decay', None) + self._validate_cfg() + + def _validate_cfg(self) -> None: + """verify the correctness of the config.""" + if not isinstance(self.paramwise_cfg, dict): + raise TypeError('paramwise_cfg should be None or a dict, ' + f'but got {type(self.paramwise_cfg)}') + + if 'custom_keys' in self.paramwise_cfg: + if not isinstance(self.paramwise_cfg['custom_keys'], dict): + raise TypeError( + 'If specified, custom_keys must be a dict, ' + f'but got {type(self.paramwise_cfg["custom_keys"])}') + if self.base_wd is None: + for key in self.paramwise_cfg['custom_keys']: + if 'decay_mult' in self.paramwise_cfg['custom_keys'][key]: + raise ValueError('base_wd should not be None') + + # get base lr and weight decay + # weight_decay must be explicitly specified if mult is specified + if ('bias_decay_mult' in self.paramwise_cfg + or 'norm_decay_mult' in self.paramwise_cfg + or 'dwconv_decay_mult' in self.paramwise_cfg): + if self.base_wd is None: + raise ValueError('base_wd should not be None') + + def _is_in(self, param_group: dict, param_group_list: list) -> bool: + """check whether the `param_group` is in the`param_group_list`""" + assert is_list_of(param_group_list, dict) + param = set(param_group['params']) + param_set = set() + for group in param_group_list: + param_set.update(set(group['params'])) + + return not param.isdisjoint(param_set) + + def add_params(self, + params: List[dict], + module: nn.Module, + prefix: str = '', + is_dcn_module: Optional[Union[int, float]] = None) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. 
+ """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', None) + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None) + dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', None) + flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None) + bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) + dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', None) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) + and module.in_channels == module.groups) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if bypass_duplicate and self._is_in(param_group, params): + print_log( + f'{prefix} is duplicate. It is skipped since ' + f'bypass_duplicate={bypass_duplicate}', + logger='current', + level=logging.WARNING) + continue + if not param.requires_grad: + print_log((f'{prefix}.{name} is skipped since its ' + f'requires_grad={param.requires_grad}'), + logger='current', + level=logging.WARNING) + continue + + # if the parameter match one of the custom keys, ignore other rules + is_custom = False + for key in sorted_keys: + if key in f'{prefix}.{name}': + is_custom = True + lr_mult = custom_keys[key].get('lr_mult', 1.) + param_group['lr'] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get('decay_mult', 1.) 
+ param_group['weight_decay'] = self.base_wd * decay_mult + # add custom settings to param_group + for k, v in custom_keys[key].items(): + param_group[k] = v + break + + if not is_custom: + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if name == 'bias' and not ( + is_norm or is_dcn_module) and bias_lr_mult is not None: + param_group['lr'] = self.base_lr * bias_lr_mult + + if (prefix.find('conv_offset') != -1 and is_dcn_module + and dcn_offset_lr_mult is not None + and isinstance(module, torch.nn.Conv2d)): + # deal with both dcn_offset's bias & weight + param_group['lr'] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm and norm_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * norm_decay_mult + # bias lr and decay + elif (name == 'bias' and not is_dcn_module + and bias_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * bias_decay_mult + # depth-wise conv + elif is_dwconv and dwconv_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * dwconv_decay_mult + # flatten parameters except dcn offset + elif (param.ndim == 1 and not is_dcn_module + and flat_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * flat_decay_mult + params.append(param_group) + for key, value in param_group.items(): + if key == 'params': + continue + full_name = f'{prefix}.{name}' if prefix else name + print_log( + f'paramwise_options -- {full_name}:{key}={value}', + logger='current') + + if mmcv_full_available(): + from mmcv.ops import DeformConv2d, ModulatedDeformConv2d + is_dcn_module = isinstance(module, + (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params( + params, + child_mod, + prefix=child_prefix, + is_dcn_module=is_dcn_module) + + def __call__(self, model: nn.Module) -> OptimWrapper: + if hasattr(model, 'module'): + model = model.module + + optim_wrapper_cfg = self.optim_wrapper_cfg.copy() + optim_wrapper_cfg.setdefault('type', 'OptimWrapper') + optimizer_cfg = self.optimizer_cfg.copy() + optimizer_cls = self.optimizer_cfg['type'] + # Optimizer like HybridAdam in colossalai requires the argument name + # `model_params` rather than `params`. Here we get the first argument + # name and fill it with the model parameters. 
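+        # (e.g. `params` for torch optimizers, `model_params` for
+        # colossalai's HybridAdam).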
+ if isinstance(optimizer_cls, str): + with OPTIMIZERS.switch_scope_and_registry(None) as registry: + optimizer_cls = registry.get(self.optimizer_cfg['type']) + fisrt_arg_name = next( + iter(inspect.signature(optimizer_cls).parameters)) + # if no paramwise option is specified, just use the global setting + if not self.paramwise_cfg: + optimizer_cfg[fisrt_arg_name] = model.parameters() + optimizer = OPTIMIZERS.build(optimizer_cfg) + else: + # set param-wise lr and weight decay recursively + params: List = [] + self.add_params(params, model) + optimizer_cfg[fisrt_arg_name] = params + optimizer = OPTIMIZERS.build(optimizer_cfg) + optim_wrapper = OPTIM_WRAPPERS.build( + optim_wrapper_cfg, default_args=dict(optimizer=optimizer)) + return optim_wrapper diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/optimizer_wrapper.py b/head_extractor/build/lib/mmengine/optim/optimizer/optimizer_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..41218ef768b013a4881eb357962c0dafdfc5c017 --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/optimizer_wrapper.py @@ -0,0 +1,411 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from contextlib import contextmanager +from typing import Dict, List, Optional + +import torch +import torch.nn as nn +from torch.optim import Optimizer + +from mmengine.logging import MessageHub, print_log +from mmengine.registry import OPTIM_WRAPPERS +from mmengine.utils.dl_utils import has_batch_norm +from .base import BaseOptimWrapper + + +@OPTIM_WRAPPERS.register_module() +class OptimWrapper(BaseOptimWrapper): + """Optimizer wrapper provides a common interface for updating parameters. + + Optimizer wrapper provides a unified interface for single precision + training and automatic mixed precision training with different hardware. + OptimWrapper encapsulates optimizer to provide simplified interfaces + for commonly used training techniques such as gradient accumulative and + grad clips. ``OptimWrapper`` implements the basic logic of gradient + accumulation and gradient clipping based on ``torch.optim.Optimizer``. + The subclasses only need to override some methods to implement the mixed + precision training. See more information in :class:`AmpOptimWrapper`. + + Args: + optimizer (Optimizer): Optimizer used to update model parameters. + accumulative_counts (int): The number of iterations to accumulate + gradients. The parameters will be updated per + ``accumulative_counts``. + clip_grad (dict, optional): If ``clip_grad`` is not None, it will be + the arguments of :func:`torch.nn.utils.clip_grad_norm_` or + :func:`torch.nn.utils.clip_grad_value_`. ``clip_grad`` should be a + dict, and the keys could be set as follows: + + If the key ``type`` is not set, or ``type`` is "norm", + the accepted keys are as follows: + + - max_norm (float or int): Max norm of the gradients. + - norm_type (float or int): Type of the used p-norm. Can be + ``'inf'`` for infinity norm. + - error_if_nonfinite (bool): If True, an error is thrown if + the total norm of the gradients from :attr:`parameters` is + ``nan``, ``inf``, or ``-inf``. Defaults to False (will switch + to True in the future) + + If the key ``type`` is set to "value", the accepted keys are as + follows: + + - clip_value (float or int): maximum allowed value of the + gradients. The gradients are clipped in the range + ``(-clip_value, +clip_value)``. 
+ + Note: + If ``accumulative_counts`` is larger than 1, perform + :meth:`update_params` under the context of ``optim_context`` + could avoid unnecessary gradient synchronization. + + Note: + If you use ``IterBasedRunner`` and enable gradient accumulation, + the original `max_iters` should be multiplied by + ``accumulative_counts``. + + Note: + The subclass should ensure that once :meth:`update_params` is called, + ``_inner_count += 1`` is automatically performed. + + Examples: + >>> # Config sample of OptimWrapper and enable clipping gradient by + >>> # norm. + >>> optim_wrapper_cfg = dict( + >>> type='OptimWrapper', + >>> _accumulative_counts=1, + >>> clip_grad=dict(max_norm=0.2)) + >>> # Config sample of OptimWrapper and enable clipping gradient by + >>> # value. + >>> optim_wrapper_cfg = dict( + >>> type='OptimWrapper', + >>> _accumulative_counts=1, + >>> clip_grad=dict(type='value', clip_value=0.2)) + >>> # Use OptimWrapper to update model. + >>> import torch.nn as nn + >>> import torch + >>> from torch.optim import SGD + >>> from torch.utils.data import DataLoader + >>> from mmengine.optim import OptimWrapper + >>> + >>> model = nn.Linear(1, 1) + >>> dataset = torch.randn(10, 1, 1) + >>> dataloader = DataLoader(dataset) + >>> optimizer = SGD(model.parameters(), lr=0.1) + >>> optim_wrapper = OptimWrapper(optimizer) + >>> + >>> for data in dataloader: + >>> loss = model(data) + >>> optim_wrapper.update_params(loss) + >>> # Enable gradient accumulation + >>> optim_wrapper_cfg = dict( + >>> type='OptimWrapper', + >>> _accumulative_counts=3, + >>> clip_grad=dict(max_norm=0.2)) + >>> ddp_model = DistributedDataParallel(model) + >>> optimizer = SGD(ddp_model.parameters(), lr=0.1) + >>> optim_wrapper = OptimWrapper(optimizer) + >>> optim_wrapper.initialize_count_status(0, len(dataloader)) + >>> # If model is a subclass instance of DistributedDataParallel, + >>> # `optim_context` context manager can avoid unnecessary gradient + >>> # synchronize. + >>> for iter, data in enumerate(dataloader): + >>> with optim_wrapper.optim_context(ddp_model): + >>> loss = model(data) + >>> optim_wrapper.update_params(loss) + """ + + def __init__(self, + optimizer: Optimizer, + accumulative_counts: int = 1, + clip_grad: Optional[dict] = None): + assert accumulative_counts > 0, ( + '_accumulative_counts at least greater than or equal to 1') + self._accumulative_counts = accumulative_counts + self.optimizer = optimizer + + if clip_grad is not None: + # clip_grad_kwargs should not be non-empty dict. + assert isinstance(clip_grad, dict) and clip_grad, ( + 'If `clip_grad` is not None, it should be a `dict` ' + 'which is the arguments of `torch.nn.utils.clip_grad_norm_` ' + 'or clip_grad_value_`.') + clip_type = clip_grad.pop('type', 'norm') + if clip_type == 'norm': + self.clip_func = torch.nn.utils.clip_grad_norm_ + self.grad_name = 'grad_norm' + elif clip_type == 'value': + self.clip_func = torch.nn.utils.clip_grad_value_ + self.grad_name = 'grad_value' + else: + raise ValueError('type of clip_grad should be "norm" or ' + f'"value" but got {clip_type}') + assert clip_grad, ('`clip_grad` should contain other arguments ' + 'besides `type`. The arguments should match ' + 'with the `torch.nn.utils.clip_grad_norm_` or ' + 'clip_grad_value_`') + self.clip_grad_kwargs = clip_grad + # Used to update `grad_norm` log message. + self.message_hub = MessageHub.get_current_instance() + self._inner_count = 0 + # `_max_counts` means the total number of parameter updates. 
It + # ensures that the gradient of the last few iterations will not be + # lost when the `_max_counts` is not divisible by + # `accumulative_counts`. + self._max_counts = -1 + # The `_remainder_iter` is used for calculating loss factor at the + # last few iterations. If `_max_counts` has not been initialized, + # the loss factor will always be the same as `_accumulative_counts`. + self._remainder_counts = -1 + + # The Following code is used to initialize `base_param_settings`. + # `base_param_settings` is used to store the parameters that are not + # updated by the optimizer. + # The `base_param_settings` used for tracking the base learning in the + # optimizer. If the optimizer has multiple parameter groups, this + # params will not be scaled by the loss factor. + if len(optimizer.param_groups) > 1: + self.base_param_settings = { + 'params': torch.tensor([0.0], dtype=torch.float) + } + self.base_param_settings.update(**self.optimizer.defaults) + else: + self.base_param_settings = None # type: ignore + + def update_params( # type: ignore + self, + loss: torch.Tensor, + step_kwargs: Optional[Dict] = None, + zero_kwargs: Optional[Dict] = None) -> None: + """Update parameters in :attr:`optimizer`. + + Args: + loss (torch.Tensor): A tensor for back propagation. + step_kwargs (dict): Arguments for optimizer.step. + Defaults to None. + New in version v0.4.0. + zero_kwargs (dict): Arguments for optimizer.zero_grad. + Defaults to None. + New in version v0.4.0. + """ + if step_kwargs is None: + step_kwargs = {} + if zero_kwargs is None: + zero_kwargs = {} + loss = self.scale_loss(loss) + self.backward(loss) + # Update parameters only if `self._inner_count` is divisible by + # `self._accumulative_counts` or `self._inner_count` equals to + # `self._max_counts` + if self.should_update(): + self.step(**step_kwargs) + self.zero_grad(**zero_kwargs) + + def backward(self, loss: torch.Tensor, **kwargs) -> None: + """Perform gradient back propagation. + + Provide unified ``backward`` interface compatible with automatic mixed + precision training. Subclass can overload this method to implement the + required logic. For example, ``torch.cuda.amp`` require some extra + operation on GradScaler during backward process. + + Note: + If subclasses inherit from ``OptimWrapper`` override + ``backward``, ``_inner_count +=1`` must be implemented. + + Args: + loss (torch.Tensor): The loss of current iteration. + kwargs: Keyword arguments passed to :meth:`torch.Tensor.backward`. + """ + loss.backward(**kwargs) + self._inner_count += 1 + + def zero_grad(self, **kwargs) -> None: + """A wrapper of ``Optimizer.zero_grad``. + + Provide unified ``zero_grad`` interface compatible with automatic mixed + precision training. Subclass can overload this method to implement the + required logic. + + Args: + kwargs: Keyword arguments passed to + :meth:`torch.optim.Optimizer.zero_grad`. + """ + self.optimizer.zero_grad(**kwargs) + + def step(self, **kwargs) -> None: + """A wrapper of ``Optimizer.step``. + + Provide unified ``step`` interface compatible with automatic mixed + precision training. Subclass can overload this method to implement the + required logic. For example, ``torch.cuda.amp`` require some extra + operation on ``GradScaler`` during step process. + + Clip grad if :attr:`clip_grad_kwargs` is not None, and then update + parameters. + + Args: + kwargs: Keyword arguments passed to + :meth:`torch.optim.Optimizer.step`. 
+ """ + if self.clip_grad_kwargs: + self._clip_grad() + self.optimizer.step(**kwargs) + + @contextmanager + def optim_context(self, model: nn.Module): + """A Context for gradient accumulation and automatic mix precision + training. + + If subclasses need to enable the context for mix precision training, + e.g., ``:class:`AmpOptimWrapper``, the corresponding context should be + enabled in `optim_context`. Since ``OptimWrapper`` uses default fp32 + training, ``optim_context`` will only enable the context for + blocking the unnecessary gradient synchronization during gradient + accumulation + + If model is an instance with ``no_sync`` method (which means + blocking the gradient synchronization) and + ``self._accumulative_counts != 1``. The model will not automatically + synchronize gradients if ``cur_iter`` is divisible by + ``self._accumulative_counts``. Otherwise, this method will enable an + empty context. + + Args: + model (nn.Module): The training model. + """ + # During gradient accumulation process, the gradient synchronize + # should only happen before updating parameters. + if not self.should_sync() and hasattr(model, 'no_sync'): + with model.no_sync(): + yield + else: + yield + + def _clip_grad(self) -> None: + """Clip the gradients of parameters.""" + params: List[torch.Tensor] = [] + for param_group in self.optimizer.param_groups: + params.extend(param_group['params']) + + params = list( + filter(lambda p: p.requires_grad and p.grad is not None, params)) + if len(params) > 0: + grad = self.clip_func(params, **self.clip_grad_kwargs) + # `torch.nn.utils.clip_grad_value_` will return None. + if grad is not None: + self.message_hub.update_scalar(f'train/{self.grad_name}', + float(grad)) + + def initialize_count_status(self, model: nn.Module, init_counts: int, + max_counts: int) -> None: + """Initialize gradient accumulation related attributes. + + ``OptimWrapper`` can be used without calling + ``initialize_iter_status``. However, Consider the case of ``len( + dataloader) == 10``, and the ``accumulative_iter == 3``. Since 10 is + not divisible by 3, the last iteration will not trigger + ``optimizer.step()``, resulting in one less parameter updating. + + Args: + model (nn.Module): Training model + init_counts (int): The initial value of the inner count. + max_counts (int): The maximum value of the inner count. + """ + self._inner_count = init_counts + self._max_counts = max_counts + if self._inner_count % self._accumulative_counts != 0: + print_log( + 'Resumed iteration number is not divisible by ' + '`_accumulative_counts` in `GradientCumulativeOptimizerHook`, ' + 'which means the gradient of some iterations is lost and the ' + 'result may be influenced slightly.', + logger='current', + level=logging.WARNING) + + if has_batch_norm(model) and self._accumulative_counts > 1: + print_log( + 'Gradient accumulative may slightly decrease ' + 'performance because the model has BatchNorm layers.', + logger='current', + level=logging.WARNING) + # Remainder of `_max_counts` divided by `_accumulative_counts` + self._remainder_counts = self._max_counts % self._accumulative_counts + + def should_update(self) -> bool: + """Decide whether the parameters should be updated at the current + iteration. + + Called by :meth:`update_params` and check whether the optimizer + wrapper should update parameters at current iteration. + + Returns: + bool: Whether to update parameters. 
+ """ + return (self._inner_count % self._accumulative_counts == 0 + or self._inner_count == self._max_counts) + + def should_sync(self) -> bool: + """Decide whether the automatic gradient synchronization should be + allowed at the current iteration. + + It takes effect when gradient accumulation is used to skip + synchronization at the iterations where the parameter is not updated. + + Since ``should_sync`` is called by :meth:`optim_context`, and it is + called before :meth:`backward` which means ``self._inner_count += 1`` + has not happened yet. Therefore, ``self._inner_count += 1`` should be + performed manually here. + + Returns: + bool: Whether to block the automatic gradient synchronization. + """ + return ((self._inner_count + 1) % self._accumulative_counts == 0 + or (self._inner_count + 1) == self._max_counts) + + def scale_loss(self, loss: torch.Tensor) -> torch.Tensor: + """Get scaled loss according to ``_accumulative_counts``, + ``_inner_count`` and max_counts. + + Args: + loss (torch.Tensor): Original loss calculated by model. + + Returns: + loss (torch.Tensor): Scaled loss. + """ + if self._accumulative_counts == 1: + # update parameters without gradient accumulation. The gradient + # should not be rescaled and `loss_factor=1`. + loss_factor = 1 + elif self._max_counts == -1: + loss_factor = self._accumulative_counts + else: + # if `self._accumulative_counts > 1`, the gradient needs to be + # rescaled and accumulated. In most cases, `loss_factor` equals to + # `self._accumulative_counts`. However, `self._max_counts` may not + # be divisible by `self._accumulative_counts`, so the + # `loss_scale` for the last few iterations needs to be + # recalculated. + if self._inner_count < self._max_counts - self._remainder_counts: + loss_factor = self._accumulative_counts + else: + loss_factor = self._remainder_counts + assert loss_factor > 0, ( + 'loss_factor should be larger than zero! This error could ' + 'happened when initialize_iter_status called with an ' + 'error `init_counts` or `max_counts`') + + loss = loss / loss_factor + return loss + + @property + def inner_count(self): + """Get the number of updating parameters of optimizer wrapper.""" + return self._inner_count + + def __repr__(self): + wrapper_info = (f'Type: {type(self).__name__}\n' + f'_accumulative_counts: {self._accumulative_counts}\n' + 'optimizer: \n') + optimizer_str = repr(self.optimizer) + '\n' + return wrapper_info + optimizer_str diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/optimizer_wrapper_dict.py b/head_extractor/build/lib/mmengine/optim/optimizer/optimizer_wrapper_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..a18fd99cae72f8053ee827ddd3501342539a6f05 --- /dev/null +++ b/head_extractor/build/lib/mmengine/optim/optimizer/optimizer_wrapper_dict.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from contextlib import contextmanager +from typing import Dict, Iterator, List, Optional, Tuple + +import torch +import torch.nn as nn + +from .optimizer_wrapper import OptimWrapper + + +class OptimWrapperDict(OptimWrapper): + """A dictionary container of :obj:`OptimWrapper`. + + If runner is training with multiple optimizers, all optimizer wrappers + should be managed by :obj:`OptimWrapperDict` which is built by + ``CustomOptimWrapperConstructor``. ``OptimWrapperDict`` will load and save + the state dictionary of all optimizer wrappers. 
+
+    Considering the semantic ambiguity of calling :meth:`update_params` or
+    :meth:`backward` of all optimizer wrappers, ``OptimWrapperDict`` does not
+    implement these methods.
+
+    Examples:
+        >>> import torch.nn as nn
+        >>> from torch.optim import SGD
+        >>> from mmengine.optim import OptimWrapperDict, OptimWrapper
+        >>> model1 = nn.Linear(1, 1)
+        >>> model2 = nn.Linear(1, 1)
+        >>> optim_wrapper1 = OptimWrapper(SGD(model1.parameters(), lr=0.1))
+        >>> optim_wrapper2 = OptimWrapper(SGD(model2.parameters(), lr=0.1))
+        >>> optim_wrapper_dict = OptimWrapperDict(model1=optim_wrapper1,
+        ...                                       model2=optim_wrapper2)
+
+    Note:
+        The optimizer wrappers contained in ``OptimWrapperDict`` can be
+        accessed in the same way as a `dict`.
+
+    Args:
+        **optim_wrapper_dict: A dictionary of ``OptimWrapper`` instances.
+    """

+    def __init__(self, **optim_wrapper_dict: OptimWrapper):
+        for key, value in optim_wrapper_dict.items():
+            assert isinstance(value, OptimWrapper), (
+                '`OptimWrapperDict` only accepts `OptimWrapper` instances, '
+                f'but got {key}: {type(value)}')
+        self.optim_wrappers = optim_wrapper_dict
+
+    def update_params(  # type: ignore
+            self,
+            loss: torch.Tensor,
+            step_kwargs: Optional[Dict] = None,
+            zero_kwargs: Optional[Dict] = None) -> None:
+        """Updating all optimizer wrappers would lead to duplicate backward
+        errors, and ``OptimWrapperDict`` does not know which optimizer
+        wrapper should be updated.
+
+        Therefore, this method is not implemented. Each optimizer wrapper in
+        ``OptimWrapperDict`` should be accessed directly to call its
+        ``update_params``.
+        """
+        raise NotImplementedError('`update_params` should be called by each '
+                                  'optimizer separately')
+
+    def backward(self, loss: torch.Tensor, **kwargs) -> None:
+        """Since ``OptimWrapperDict`` does not know which optimizer wrapper's
+        backward method should be called (``loss_scaler`` may be different in
+        different :obj:`AmpOptimWrapper`), this method is not implemented.
+
+        Each optimizer wrapper in ``OptimWrapperDict`` should be accessed
+        directly to call its ``backward``.
+        """
+        raise NotImplementedError('`backward` should be called by each '
+                                  'optimizer separately')
+
+    def step(self, **kwargs) -> None:
+        """Since the backward method is not implemented, ``step`` is not
+        implemented either."""
+        raise NotImplementedError('`step` should be called by each '
+                                  'optimizer separately')
+
+    def zero_grad(self, **kwargs) -> None:
+        """Set the gradients of all optimizer wrappers to zero."""
+        for optim_wrapper in self.optim_wrappers.values():
+            optim_wrapper.zero_grad()
+
+    @contextmanager
+    def optim_context(self, model: nn.Module):
+        """``optim_context`` should be called by each optimizer separately."""
+        raise NotImplementedError(
+            '`optim_context` should be called by each optimizer separately')
+
+    def initialize_count_status(self, model: nn.Module, cur_iter,
+                                max_iters) -> None:
+        """Do nothing but provide a unified interface for :obj:`OptimWrapper`.
+
+        Since ``OptimWrapperDict`` does not know the correspondence between
+        models and optimizer wrappers, ``initialize_count_status`` does
+        nothing, and each optimizer wrapper should call
+        ``initialize_count_status`` separately.
+        """
+        return
+
+    @property
+    def param_groups(self):
+        """Returns the parameter groups of each OptimWrapper."""
+        param_groups = dict()
+        for key, value in self.optim_wrappers.items():
+            param_groups[key] = value.param_groups
+        return param_groups
+
+    def get_lr(self) -> Dict[str, List[float]]:
+        """Get the learning rate of all optimizers.
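+
+        For example, a dict holding wrappers named ``model1`` and ``model2``
+        yields keys ``model1.lr`` and ``model2.lr`` (plus ``<name>.base_lr``
+        when a wrapper tracks a base learning rate).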
+
+        Returns:
+            Dict[str, List[float]]: Learning rate of all optimizers.
+        """
+        lr_dict = dict()
+        for name, optim_wrapper in self.optim_wrappers.items():
+            inner_lr_dict = optim_wrapper.get_lr()
+            if 'base_lr' in inner_lr_dict:
+                lr_dict[f'{name}.base_lr'] = inner_lr_dict['base_lr']
+            lr_dict[f'{name}.lr'] = inner_lr_dict['lr']
+        return lr_dict
+
+    def get_momentum(self) -> Dict[str, List[float]]:
+        """Get the momentum of all optimizers.
+
+        Returns:
+            Dict[str, List[float]]: Momentum of all optimizers.
+        """
+        momentum_dict = dict()
+        for name, optim_wrapper in self.optim_wrappers.items():
+            momentum_dict[f'{name}.momentum'] = optim_wrapper.get_momentum(
+            )['momentum']
+        return momentum_dict
+
+    def state_dict(self) -> dict:
+        """Get the state dictionary of all optimizer wrappers.
+
+        Returns:
+            dict: Each key-value pair in the dictionary represents the name
+            and state dictionary of the corresponding :obj:`OptimWrapper`.
+        """
+        state_dict = dict()
+        for name, optim_wrapper in self.optim_wrappers.items():
+            state_dict[name] = optim_wrapper.state_dict()
+        return state_dict
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        """Load the state dictionary from the ``state_dict``.
+
+        Args:
+            state_dict (dict): Each key-value pair in `state_dict` represents
+                the name and the state dictionary of the corresponding
+                :obj:`OptimWrapper`.
+        """
+        for name, _state_dict in state_dict.items():
+            assert name in self.optim_wrappers, (
+                f'Mismatched `state_dict`! Cannot find {name} in '
+                'OptimWrapperDict')
+            self.optim_wrappers[name].load_state_dict(_state_dict)
+
+    def items(self) -> Iterator[Tuple[str, OptimWrapper]]:
+        """A generator to get the name and the corresponding
+        :obj:`OptimWrapper`."""
+        yield from self.optim_wrappers.items()
+
+    def values(self) -> Iterator[OptimWrapper]:
+        """A generator to get each :obj:`OptimWrapper`."""
+        yield from self.optim_wrappers.values()
+
+    def keys(self) -> Iterator[str]:
+        """A generator to get the name of each :obj:`OptimWrapper`."""
+        yield from self.optim_wrappers.keys()
+
+    def __getitem__(self, key: str) -> OptimWrapper:
+        assert key in self.optim_wrappers, (
+            f'Cannot find {key} in OptimWrapperDict, please check '
+            'your optimizer constructor.')
+        return self.optim_wrappers[key]
+
+    def __contains__(self, key: str) -> bool:
+        return key in self.optim_wrappers
+
+    def __len__(self) -> int:
+        return len(self.optim_wrappers)
+
+    def __repr__(self) -> str:
+        desc = ''
+        for name, optim_wrapper in self.optim_wrappers.items():
+            desc += f'name: {name}\n'
+            desc += repr(optim_wrapper)
+        return desc
diff --git a/head_extractor/build/lib/mmengine/optim/optimizer/zero_optimizer.py b/head_extractor/build/lib/mmengine/optim/optimizer/zero_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c5630a765c0352031cbdcd1f85ebbe210129a0f
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/optim/optimizer/zero_optimizer.py
@@ -0,0 +1,79 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+from torch.distributed.rpc import is_available
+
+from mmengine.dist import is_main_process
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils import TORCH_VERSION
+
+try:
+    from torch.distributed.optim import \
+        ZeroRedundancyOptimizer as _ZeroRedundancyOptimizer
+except ImportError:
+    _ZeroRedundancyOptimizer = object
+
+from .builder import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module()
+class ZeroRedundancyOptimizer(_ZeroRedundancyOptimizer):
+    """A wrapper class of :class:`ZeroRedundancyOptimizer` that gets an
+    optimizer type as a string.
+
+    This class wraps an arbitrary :class:`torch.optim.Optimizer` and shards
+    its states across ranks in the group as described by ZeRO_. The local
+    optimizer instance in each rank is only responsible for updating
+    approximately ``1 / world_size`` parameters and hence only needs to keep
+    ``1 / world_size`` optimizer states. After parameters are updated locally,
+    each rank will broadcast its parameters to all other peers to keep all
+    model replicas in the same state. ``ZeroRedundancyOptimizer`` can be used
+    in conjunction with :class:`torch.nn.parallel.DistributedDataParallel` to
+    reduce per-rank peak memory consumption.
+
+    ``ZeroRedundancyOptimizer`` uses a sorted-greedy algorithm to pack a
+    number of parameters at each rank. Each parameter belongs to a single
+    rank and is not divided among ranks. The partition is arbitrary and might
+    not match the parameter registration or usage order.
+
+    Warning:
+        ``ZeroRedundancyOptimizer`` requires PyTorch >= 1.8.
+
+    Warning:
+        ``ZeroRedundancyOptimizer`` requires PyTorch >= 1.12 to enable param
+        groups.
+
+    Args:
+        params (``Iterable``): an ``Iterable`` of :class:`torch.Tensor` s
+            or :class:`dict` s giving all parameters, which will be sharded
+            across ranks.
+        optimizer_type (str): the string of the local optimizer class.
+
+    .. _ZeRO: https://arxiv.org/abs/1910.02054
+    """
+
+    def __init__(self, params, optimizer_type: str, **kwargs):
+        assert digit_version(TORCH_VERSION) >= digit_version('1.8.0'), (
+            '`torch.distributed.optim.ZeroRedundancyOptimizer` is only '
+            'available when pytorch version >= 1.8.')
+        assert is_available(), 'torch.distributed.rpc is not available.'
+        # Avoid the generator becoming empty after the following check
+        params = list(params)
+        assert (
+            all(isinstance(p, torch.Tensor) for p in params)
+            or digit_version(TORCH_VERSION) >= digit_version('1.12.0')), (
+                'PyTorch ZeroRedundancyOptimizer started to support param '
+                'groups since 1.12.0. Please update your pytorch version to '
+                'enable this feature, or disable param groups by deleting '
+                'the `paramwise_cfg` field in the config file.')
+        optimizer_class = getattr(torch.optim, optimizer_type)
+        # TODO: Register a DDP communication hook for `overlap_with_ddp=True`.
+        # Currently only `overlap_with_ddp=False` is supported. For more
+        # details, please refer to the pytorch's official documentation.
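+        # Illustrative sketch (hypothetical values): a registry config such
+        # as
+        #     optimizer = dict(type='ZeroRedundancyOptimizer',
+        #                      optimizer_type='AdamW', lr=1e-4)
+        # resolves `optimizer_type` against `torch.optim` above and forwards
+        # the remaining kwargs to the local optimizer class below.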
+        super().__init__(params, optimizer_class, **kwargs)
+
+    def state_dict(self):
+        """Consolidate `state_dict`s from ranks to save the `state_dict`."""
+        self.consolidate_state_dict()
+        state_dict = super().state_dict() if is_main_process() else dict()
+        return state_dict
diff --git a/head_extractor/build/lib/mmengine/optim/scheduler/__init__.py b/head_extractor/build/lib/mmengine/optim/scheduler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..48ccc34bc41b07442e2494b03a303b3c0054b42b
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/optim/scheduler/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# yapf: disable
+from .lr_scheduler import (ConstantLR, CosineAnnealingLR, CosineRestartLR,
+                           ExponentialLR, LinearLR, MultiStepLR, OneCycleLR,
+                           PolyLR, ReduceOnPlateauLR, StepLR)
+from .momentum_scheduler import (ConstantMomentum, CosineAnnealingMomentum,
+                                 CosineRestartMomentum, ExponentialMomentum,
+                                 LinearMomentum, MultiStepMomentum,
+                                 PolyMomentum, ReduceOnPlateauMomentum,
+                                 StepMomentum)
+from .param_scheduler import (ConstantParamScheduler,
+                              CosineAnnealingParamScheduler,
+                              CosineRestartParamScheduler,
+                              ExponentialParamScheduler, LinearParamScheduler,
+                              MultiStepParamScheduler, OneCycleParamScheduler,
+                              PolyParamScheduler,
+                              ReduceOnPlateauParamScheduler,
+                              StepParamScheduler, _ParamScheduler)
+
+# yapf: enable
+__all__ = [
+    'ConstantLR', 'CosineAnnealingLR', 'ExponentialLR', 'LinearLR',
+    'MultiStepLR', 'StepLR', 'ConstantMomentum', 'CosineAnnealingMomentum',
+    'ExponentialMomentum', 'LinearMomentum', 'MultiStepMomentum',
+    'StepMomentum', 'ConstantParamScheduler', 'CosineAnnealingParamScheduler',
+    'ExponentialParamScheduler', 'LinearParamScheduler',
+    'MultiStepParamScheduler', 'StepParamScheduler', '_ParamScheduler',
+    'PolyParamScheduler', 'PolyLR', 'PolyMomentum', 'OneCycleParamScheduler',
+    'OneCycleLR', 'CosineRestartParamScheduler', 'CosineRestartLR',
+    'CosineRestartMomentum', 'ReduceOnPlateauParamScheduler',
+    'ReduceOnPlateauLR', 'ReduceOnPlateauMomentum'
+]
diff --git a/head_extractor/build/lib/mmengine/optim/scheduler/lr_scheduler.py b/head_extractor/build/lib/mmengine/optim/scheduler/lr_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..b12c60d0cf29c447c3f88518b2e4905900ce15ef
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/optim/scheduler/lr_scheduler.py
@@ -0,0 +1,379 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.registry import PARAM_SCHEDULERS
+# yapf: disable
+from .param_scheduler import (ConstantParamScheduler,
+                              CosineAnnealingParamScheduler,
+                              CosineRestartParamScheduler,
+                              ExponentialParamScheduler, LinearParamScheduler,
+                              MultiStepParamScheduler, OneCycleParamScheduler,
+                              PolyParamScheduler,
+                              ReduceOnPlateauParamScheduler,
+                              StepParamScheduler)
+
+# yapf: enable
+
+
+class LRSchedulerMixin:
+    """A mixin class for learning rate schedulers."""
+
+    def __init__(self, optimizer, *args, **kwargs):
+        super().__init__(optimizer, 'lr', *args, **kwargs)
+
+
+@PARAM_SCHEDULERS.register_module()
+class ConstantLR(LRSchedulerMixin, ConstantParamScheduler):
+    """Decays the learning rate value of each parameter group by a small
+    constant factor until the number of epochs reaches a pre-defined
+    milestone: ``end``. Notice that such decay can happen simultaneously with
+    other changes to the learning rate value from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): Wrapped optimizer.
+ factor (float): The number we multiply learning rate until the + milestone. Defaults to 1./3. + begin (int): Step at which to start updating the learning rate. + Defaults to 0. + end (int): Step at which to stop updating the learning rate. + Defaults to INF. + last_step (int): The index of last step. Used for resume without state + dict. Defaults to -1. + by_epoch (bool): Whether the scheduled learning rate is updated by + epochs. Defaults to True. + verbose (bool): Whether to print the learning rate for each update. + Defaults to False. + """ + + +@PARAM_SCHEDULERS.register_module() +class CosineAnnealingLR(LRSchedulerMixin, CosineAnnealingParamScheduler): + r"""Set the learning rate of each parameter group using a cosine annealing + schedule, where :math:`\eta_{max}` is set to the initial value and + :math:`T_{cur}` is the number of epochs since the last restart in SGDR: + + .. math:: + \begin{aligned} + \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), + & T_{cur} \neq (2k+1)T_{max}; \\ + \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) + \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), + & T_{cur} = (2k+1)T_{max}. + \end{aligned} + + Notice that because the schedule + is defined recursively, the learning rate can be simultaneously modified + outside this scheduler by other operators. If the learning rate is set + solely by this scheduler, the learning rate at each step becomes: + + .. math:: + \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right) + + It has been proposed in + `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this + only implements the cosine annealing part of SGDR, and not the restarts. + + Args: + optimizer (Optimizer or OptimWrapper): Wrapped optimizer. + T_max (int): Maximum number of iterations. + eta_min (float): Minimum learning rate. Defaults to None. + begin (int): Step at which to start updating the learning rate. + Defaults to 0. + end (int): Step at which to stop updating the learning rate. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled learning rate is updated by + epochs. Defaults to True. + verbose (bool): Whether to print the learning rate for each update. + Defaults to False. + eta_min_ratio (float, optional): The ratio of the minimum parameter + value to the base parameter value. Either `eta_min` or + `eta_min_ratio` should be specified. Defaults to None. + New in version 0.3.2. + + .. _SGDR\: Stochastic Gradient Descent with Warm Restarts: + https://arxiv.org/abs/1608.03983 + """ + + +@PARAM_SCHEDULERS.register_module() +class ExponentialLR(LRSchedulerMixin, ExponentialParamScheduler): + """Decays the learning rate of each parameter group by gamma every epoch. + + Args: + optimizer (Optimizer or OptimWrapper): Wrapped optimizer. + gamma (float): Multiplicative factor of learning rate decay. + begin (int): Step at which to start updating the learning rate. + Defaults to 0. + end (int): Step at which to stop updating the learning rate. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled learning rate is updated by + epochs. Defaults to True. + verbose (bool): Whether to print the learning rate for each update. + Defaults to False. 
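+
+    The snippet below is an illustrative sketch (values are arbitrary) that
+    multiplies the learning rate by 0.9 after every epoch:
+
+    Examples:
+        >>> import torch.nn as nn
+        >>> from torch.optim import SGD
+        >>> from mmengine.optim import ExponentialLR
+        >>> optimizer = SGD(nn.Linear(1, 1).parameters(), lr=0.1)
+        >>> scheduler = ExponentialLR(optimizer, gamma=0.9)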
+ """ + + +@PARAM_SCHEDULERS.register_module() +class LinearLR(LRSchedulerMixin, LinearParamScheduler): + """Decays the learning rate of each parameter group by linearly changing + small multiplicative factor until the number of epoch reaches a pre-defined + milestone: ``end``. + + Notice that such decay can happen simultaneously with other changes to the + learning rate from outside this scheduler. + + Args: + optimizer (Optimizer or OptimWrapper): Wrapped optimizer. + start_factor (float): The number we multiply learning rate in the + first epoch. The multiplication factor changes towards end_factor + in the following epochs. Defaults to 1./3. + end_factor (float): The number we multiply learning rate at the end + of linear changing process. Defaults to 1.0. + begin (int): Step at which to start updating the learning rate. + Defaults to 0. + end (int): Step at which to stop updating the learning rate. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled learning rate is updated by + epochs. Defaults to True. + verbose (bool): Whether to print the learning rate for each update. + Defaults to False. + """ + + +@PARAM_SCHEDULERS.register_module() +class MultiStepLR(LRSchedulerMixin, MultiStepParamScheduler): + """Decays the specified learning rate in each parameter group by gamma once + the number of epoch reaches one of the milestones. Notice that such decay + can happen simultaneously with other changes to the learning rate from + outside this scheduler. + + Args: + optimizer (Optimizer or OptimWrapper): Wrapped optimizer. + milestones (list): List of epoch indices. Must be increasing. + gamma (float): Multiplicative factor of learning rate decay. + Defaults to 0.1. + begin (int): Step at which to start updating the learning rate. + Defaults to 0. + end (int): Step at which to stop updating the learning rate. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled learning rate is updated by + epochs. Defaults to True. + verbose (bool): Whether to print the learning rate for each update. + Defaults to False. + """ + + +@PARAM_SCHEDULERS.register_module() +class StepLR(LRSchedulerMixin, StepParamScheduler): + """Decays the learning rate of each parameter group by gamma every + step_size epochs. Notice that such decay can happen simultaneously with + other changes to the learning rate from outside this scheduler. + + Args: + optimizer (Optimizer or OptimWrapper): Wrapped optimizer. + step_size (int): Period of learning rate decay. + gamma (float): Multiplicative factor of learning rate decay. + Defaults to 0.1. + begin (int): Step at which to start updating the learning rate. + Defaults to 0. + end (int): Step at which to stop updating the learning rate. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled learning rate is updated by + epochs. Defaults to True. + verbose (bool): Whether to print the learning rate for each update. + Defaults to False. + """ + + +@PARAM_SCHEDULERS.register_module() +class PolyLR(LRSchedulerMixin, PolyParamScheduler): + """Decays the learning rate of each parameter group in a polynomial decay + scheme. + + Notice that such decay can happen simultaneously with other changes to the + parameter value from outside this scheduler. 
+ + Args: + optimizer (Optimizer or OptimWrapper): Wrapped optimizer. + eta_min (float): Minimum learning rate at the end of scheduling. + Defaults to 0. + power (float): The power of the polynomial. Defaults to 1.0. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + +@PARAM_SCHEDULERS.register_module() +class OneCycleLR(LRSchedulerMixin, OneCycleParamScheduler): + r"""Sets the learning rate of each parameter group according to the + 1cycle learning rate policy. The 1cycle policy anneals the learning + rate from an initial learning rate to some maximum learning rate and then + from that maximum learning rate to some minimum learning rate much lower + than the initial learning rate. + This policy was initially described in the paper `Super-Convergence: + Very Fast Training of Neural Networks Using Large Learning Rates`_. + + The 1cycle learning rate policy changes the learning rate after every + batch. `step` should be called after a batch has been used for training. + + This scheduler is not chainable. + + Note also that the total number of steps in the cycle can be determined in + one of two ways (listed in order of precedence): + + #. A value for total_steps is explicitly provided. + #. A number of epochs (epochs) and a number of steps per epoch + (steps_per_epoch) are provided. + In this case, the number of total steps is inferred by + total_steps = epochs * steps_per_epoch + + You must either provide a value for total_steps or provide a value for both + epochs and steps_per_epoch. + + The default behaviour of this scheduler follows the fastai implementation + of 1cycle, which claims that "unpublished work has shown even better + results by using only two phases". To mimic the behaviour of the original + paper instead, set ``three_phase=True``. + + Args: + optimizer (Optimizer): Wrapped optimizer. + eta_max (float or list): Upper parameter value boundaries in the cycle + for each parameter group. + total_steps (int): The total number of steps in the cycle. Note that + if a value is not provided here, then it must be inferred by + providing a value for epochs and steps_per_epoch. + Defaults to None. + pct_start (float): The percentage of the cycle (in number of steps) + spent increasing the learning rate. + Defaults to 0.3 + anneal_strategy (str): {'cos', 'linear'} + Specifies the annealing strategy: "cos" for cosine annealing, + "linear" for linear annealing. + Defaults to 'cos' + div_factor (float): Determines the initial learning rate via + initial_param = eta_max/div_factor + Defaults to 25 + final_div_factor (float): Determines the minimum learning rate via + eta_min = initial_param/final_div_factor + Defaults to 1e4 + three_phase (bool): If ``True``, use a third phase of the schedule to + annihilate the learning rate according to 'final_div_factor' + instead of modifying the second phase (the first two phases will be + symmetrical about the step indicated by 'pct_start'). + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. 
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+
+    .. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
+        https://arxiv.org/abs/1708.07120
+    """  # noqa: E501
+
+
+@PARAM_SCHEDULERS.register_module()
+class CosineRestartLR(LRSchedulerMixin, CosineRestartParamScheduler):
+    """Sets the learning rate of each parameter group according to the cosine
+    annealing with restarts scheme. The cosine restart policy anneals the
+    learning rate from the initial value to `eta_min` with a cosine annealing
+    schedule and then restarts another period from the maximum value
+    multiplied with `restart_weight`.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        periods (list[int]): Periods for each cosine annealing cycle.
+        restart_weights (list[float]): Restart weights at each
+            restart iteration. Defaults to [1].
+        eta_min (float): Minimum parameter value at the end of scheduling.
+            Defaults to None.
+        eta_min_ratio (float, optional): The ratio of minimum parameter value
+            to the base parameter value. Either `eta_min` or `eta_min_ratio`
+            should be specified. Defaults to None.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class ReduceOnPlateauLR(LRSchedulerMixin, ReduceOnPlateauParamScheduler):
+    """Reduce the learning rate of each parameter group when a metric has
+    stopped improving. Models often benefit from reducing the learning rate
+    by a factor of 2-10 once learning stagnates. This scheduler reads a
+    metric quantity and if no improvement is seen for a ``patience`` number
+    of epochs, the learning rate is reduced.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        monitor (str): Key name of the value to monitor in metrics dict.
+        rule (str): One of `less`, `greater`. In `less` rule, learning rate
+            will be reduced when the quantity monitored has stopped
+            decreasing; in `greater` rule it will be reduced when the
+            quantity monitored has stopped increasing. Defaults to 'less'.
+            The ``rule`` is the renaming of ``mode`` in pytorch.
+        factor (float): Factor by which the learning rate will be
+            reduced. new_param = param * factor. Defaults to 0.1.
+        patience (int): Number of epochs with no improvement after
+            which learning rate will be reduced. For example, if
+            ``patience = 2``, then we will ignore the first 2 epochs
+            with no improvement, and will only decrease the learning rate
+            after the 3rd epoch if the monitor value still hasn't improved
+            by then. Defaults to 10.
+        threshold (float): Threshold for measuring the new optimum,
+            to only focus on significant changes. Defaults to 1e-4.
+        threshold_rule (str): One of `rel`, `abs`. In `rel` rule,
+            dynamic_threshold = best * ( 1 + threshold ) in 'greater'
+            rule or best * ( 1 - threshold ) in `less` rule.
+            In `abs` rule, dynamic_threshold = best + threshold in
+            `greater` rule or best - threshold in `less` rule.
+            Defaults to 'rel'.
+        cooldown (int): Number of epochs to wait before resuming
+            normal operation after learning rate has been reduced.
+            Defaults to 0.
+        min_value (float or list[float]): A scalar or a sequence of scalars.
+            A lower bound on the learning rate of each parameter group
+            respectively. Defaults to 0.
+        eps (float): Minimal decay applied to learning rate. If the
+            difference between new and old learning rate is smaller than eps,
+            the update is ignored. Defaults to 1e-8.
+        begin (int): Step at which to start triggering the scheduler
+            to monitor in val within the interval calculated
+            according to epoch of training. Defaults to 0.
+        end (int): Step at which to stop triggering the scheduler
+            to monitor in val within the interval calculated
+            according to epoch of training. Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
diff --git a/head_extractor/build/lib/mmengine/optim/scheduler/momentum_scheduler.py b/head_extractor/build/lib/mmengine/optim/scheduler/momentum_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..e356e70f7b734fa4598afdc1e95478e674a3c344
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/optim/scheduler/momentum_scheduler.py
@@ -0,0 +1,362 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine.registry import PARAM_SCHEDULERS
+# yapf: disable
+from .param_scheduler import (ConstantParamScheduler,
+                              CosineAnnealingParamScheduler,
+                              CosineRestartParamScheduler,
+                              ExponentialParamScheduler, LinearParamScheduler,
+                              MultiStepParamScheduler, PolyParamScheduler,
+                              ReduceOnPlateauParamScheduler,
+                              StepParamScheduler)
+
+# yapf: enable
+
+
+class MomentumSchedulerMixin:
+    """A mixin class for momentum schedulers.
+
+    It can schedule the momentum in SGD and the beta_0 in the Adam series.
+    """
+
+    def __init__(self, optimizer, *args, **kwargs):
+        self.use_betas = False
+        if 'momentum' in optimizer.defaults:
+            param_name = 'momentum'
+        elif 'betas' in optimizer.defaults:
+            # for optimizers of the Adam series, the momentum is beta_0
+            self.use_betas = True
+            param_name = 'momentum'
+            for group in optimizer.param_groups:
+                # set a reference momentum in the param groups for scheduling
+                group[param_name] = group['betas'][0]
+        else:
+            raise ValueError(
+                'optimizer must support momentum when using momentum '
+                'scheduler')
+        super().__init__(optimizer, param_name, *args, **kwargs)
+
+    def step(self):
+        """Adjusts the momentum of each parameter group based on the
+        specified schedule."""
+        super().step()
+        if self.use_betas:
+            for group in self.optimizer.param_groups:
+                _, beta_1 = group['betas']
+                # update the betas with the calculated value
+                group['betas'] = (group['momentum'], beta_1)
+
+
+@PARAM_SCHEDULERS.register_module()
+class ConstantMomentum(MomentumSchedulerMixin, ConstantParamScheduler):
+    """Decays the momentum value of each parameter group by a small constant
+    factor until the number of epochs reaches a pre-defined milestone:
+    ``end``. Notice that such decay can happen simultaneously with other
+    changes to the momentum value from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        factor (float): The number we multiply momentum until the milestone.
+            Defaults to 1./3.
+        begin (int): Step at which to start updating the momentum.
+            Defaults to 0.
+        end (int): Step at which to stop updating the momentum.
+            Defaults to INF.
+        last_step (int): The index of last step.
Used for resume without state
+            dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled momentum is updated by epochs.
+            Defaults to True.
+        verbose (bool): Whether to print the momentum for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class CosineAnnealingMomentum(MomentumSchedulerMixin,
+                              CosineAnnealingParamScheduler):
+    r"""Set the momentum of each parameter group using a cosine annealing
+    schedule, where :math:`\eta_{max}` is set to the initial value and
+    :math:`T_{cur}` is the number of epochs since the last restart in SGDR:
+
+    .. math::
+        \begin{aligned}
+            \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+            + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right),
+            & T_{cur} \neq (2k+1)T_{max}; \\
+            \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min})
+            \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right),
+            & T_{cur} = (2k+1)T_{max}.
+        \end{aligned}
+
+    Notice that because the schedule
+    is defined recursively, the momentum can be simultaneously modified
+    outside this scheduler by other operators. If the momentum is set
+    solely by this scheduler, the momentum at each step becomes:
+
+    .. math::
+        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1
+        + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right)
+
+    It has been proposed in
+    `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this
+    only implements the cosine annealing part of SGDR, and not the restarts.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        T_max (int): Maximum number of iterations.
+        eta_min (float): Minimum momentum value. Defaults to None.
+        begin (int): Step at which to start updating the momentum.
+            Defaults to 0.
+        end (int): Step at which to stop updating the momentum.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled momentum is updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the momentum for each update.
+            Defaults to False.
+        eta_min_ratio (float, optional): The ratio of the minimum parameter
+            value to the base parameter value. Either `eta_min` or
+            `eta_min_ratio` should be specified. Defaults to None.
+            New in version 0.3.2.
+
+    .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
+        https://arxiv.org/abs/1608.03983
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class ExponentialMomentum(MomentumSchedulerMixin, ExponentialParamScheduler):
+    """Decays the momentum of each parameter group by gamma every epoch.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        gamma (float): Multiplicative factor of momentum value decay.
+        begin (int): Step at which to start updating the momentum.
+            Defaults to 0.
+        end (int): Step at which to stop updating the momentum.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled momentum is updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the momentum for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class LinearMomentum(MomentumSchedulerMixin, LinearParamScheduler):
+    """Decays the momentum of each parameter group by linearly changing a
+    small multiplicative factor until the number of epochs reaches a
+    pre-defined milestone: ``end``.
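+
+    For example (illustrative numbers), with ``start_factor=0.5``,
+    ``end_factor=1.0`` and ``end=10``, the multiplicative factor moves
+    linearly from 0.5 to 1.0, passing 0.75 halfway through the schedule.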
+
+    Notice that such decay can happen simultaneously with other changes to
+    the momentum from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        start_factor (float): The number we multiply momentum in the
+            first epoch. The multiplication factor changes towards end_factor
+            in the following epochs. Defaults to 1./3.
+        end_factor (float): The number we multiply momentum at the end
+            of linear changing process. Defaults to 1.0.
+        begin (int): Step at which to start updating the momentum.
+            Defaults to 0.
+        end (int): Step at which to stop updating the momentum.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled momentum is updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the momentum for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class MultiStepMomentum(MomentumSchedulerMixin, MultiStepParamScheduler):
+    """Decays the specified momentum in each parameter group by gamma once
+    the number of epochs reaches one of the milestones. Notice that such
+    decay can happen simultaneously with other changes to the momentum from
+    outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        milestones (list): List of epoch indices. Must be increasing.
+        gamma (float): Multiplicative factor of momentum value decay.
+            Defaults to 0.1.
+        begin (int): Step at which to start updating the momentum.
+            Defaults to 0.
+        end (int): Step at which to stop updating the momentum.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled momentum is updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the momentum for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class StepMomentum(MomentumSchedulerMixin, StepParamScheduler):
+    """Decays the momentum of each parameter group by gamma every step_size
+    epochs. Notice that such decay can happen simultaneously with other
+    changes to the momentum from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        step_size (int): Period of momentum value decay.
+        gamma (float): Multiplicative factor of momentum value decay.
+            Defaults to 0.1.
+        begin (int): Step at which to start updating the momentum.
+            Defaults to 0.
+        end (int): Step at which to stop updating the momentum.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled momentum is updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the momentum for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class PolyMomentum(MomentumSchedulerMixin, PolyParamScheduler):
+    """Decays the momentum of each parameter group in a polynomial decay
+    scheme.
+
+    Notice that such decay can happen simultaneously with other changes to
+    the parameter value from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        eta_min (float): Minimum momentum at the end of scheduling.
+            Defaults to 0.
+        power (float): The power of the polynomial. Defaults to 1.0.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class CosineRestartMomentum(MomentumSchedulerMixin,
+                            CosineRestartParamScheduler):
+    """Sets the momentum of each parameter group according to the cosine
+    annealing with restarts scheme. The cosine restart policy anneals the
+    momentum from the initial value to `eta_min` with a cosine annealing
+    schedule and then restarts another period from the maximum value
+    multiplied with `restart_weight`.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        periods (list[int]): Periods for each cosine annealing cycle.
+        restart_weights (list[float]): Restart weights at each
+            restart iteration. Defaults to [1].
+        eta_min (float): Minimum parameter value at the end of scheduling.
+            Defaults to None.
+        eta_min_ratio (float, optional): The ratio of minimum parameter value
+            to the base parameter value. Either `eta_min` or `eta_min_ratio`
+            should be specified. Defaults to None.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+
+@PARAM_SCHEDULERS.register_module()
+class ReduceOnPlateauMomentum(MomentumSchedulerMixin,
+                              ReduceOnPlateauParamScheduler):
+    """Reduce the momentum of each parameter group when a metric has stopped
+    improving. Models often benefit from reducing the momentum by a factor of
+    2-10 once learning stagnates. This scheduler reads a metric quantity and
+    if no improvement is seen for a ``patience`` number of epochs, the
+    momentum is reduced.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): optimizer or Wrapped
+            optimizer.
+        monitor (str): Key name of the value to monitor in metrics dict.
+        rule (str): One of `less`, `greater`. In `less` rule, momentum will
+            be reduced when the quantity monitored has stopped
+            decreasing; in `greater` rule it will be reduced when the
+            quantity monitored has stopped increasing. Defaults to 'less'.
+            The ``rule`` is the renaming of ``mode`` in pytorch.
+        factor (float): Factor by which the momentum will be
+            reduced. new_param = param * factor. Defaults to 0.1.
+        patience (int): Number of epochs with no improvement after
+            which momentum will be reduced. For example, if
+            ``patience = 2``, then we will ignore the first 2 epochs
+            with no improvement, and will only decrease the momentum after
+            the 3rd epoch if the monitor value still hasn't improved by then.
+            Defaults to 10.
+        threshold (float): Threshold for measuring the new optimum,
+            to only focus on significant changes. Defaults to 1e-4.
+        threshold_rule (str): One of `rel`, `abs`. In `rel` rule,
+            dynamic_threshold = best * ( 1 + threshold ) in 'greater'
+            rule or best * ( 1 - threshold ) in `less` rule.
+            In `abs` rule, dynamic_threshold = best + threshold in
+            `greater` rule or best - threshold in `less` rule.
+            Defaults to 'rel'.
+        cooldown (int): Number of epochs to wait before resuming
+            normal operation after momentum has been reduced. Defaults to 0.
+        min_value (float or list[float]): A scalar or a sequence of scalars.
+            A lower bound on the momentum of each parameter group
+            respectively. Defaults to 0.
+        eps (float): Minimal decay applied to momentum. If the difference
+            between new and old momentum is smaller than eps, the update is
+            ignored. Defaults to 1e-8.
+        begin (int): Step at which to start triggering the scheduler
+            to monitor in val within the interval calculated
+            according to epoch of training. Defaults to 0.
+        end (int): Step at which to stop triggering the scheduler
+            to monitor in val within the interval calculated
+            according to epoch of training. Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+    def step(self, metrics=None):
+        """Adjusts the momentum of each parameter group based on the
+        specified schedule.
+
+        Args:
+            metrics (Dict[str, float], optional): Evaluation results of all
+                metrics on validation dataset. The keys are the names of the
+                metrics, and the values are corresponding results.
+                Defaults to None.
+        """
+        super(MomentumSchedulerMixin, self).step(metrics)
+        if self.use_betas:
+            for group in self.optimizer.param_groups:
+                _, beta_1 = group['betas']
+                # update the betas with the calculated value
+                group['betas'] = (group['momentum'], beta_1)
diff --git a/head_extractor/build/lib/mmengine/optim/scheduler/param_scheduler.py b/head_extractor/build/lib/mmengine/optim/scheduler/param_scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..af89ccaea71dda5e973a6f014284752b701110ae
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/optim/scheduler/param_scheduler.py
@@ -0,0 +1,1578 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+# Modified from https://github.com/pytorch/pytorch
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# ------------------------------------------------------------------------
+
+import math
+import warnings
+import weakref
+from collections import Counter
+from functools import wraps
+from typing import Callable, List, Optional, Sequence, Union
+
+from torch.optim import Optimizer
+
+from mmengine.logging import print_log
+from mmengine.optim import BaseOptimWrapper
+from mmengine.registry import PARAM_SCHEDULERS
+
+INF = int(1e9)
+
+OptimizerType = Union[BaseOptimWrapper, Optimizer]
+
+
+class _ParamScheduler:
+    """Base class for parameter schedulers.
+
+    It should be inherited by all schedulers that schedule parameters in the
+    optimizer's ``param_groups``. All subclasses should overwrite the
+    ``_get_value()`` according to their own schedule strategy.
+    The implementation is motivated by
+    https://github.com/pytorch/pytorch/blob/master/torch/optim/lr_scheduler.py.
+
+    Args:
+        optimizer (BaseOptimWrapper or Optimizer): Wrapped optimizer.
+        param_name (str): Name of the parameter to be adjusted, such as
+            ``lr``, ``momentum``.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resuming without
+            state dict.
Default value ``-1`` means the ``step`` function has
+            never been called before. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """  # noqa: E501
+
+    def __init__(self,
+                 optimizer: OptimizerType,
+                 param_name: str,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+
+        # Attach optimizer
+        if not isinstance(optimizer, (Optimizer, BaseOptimWrapper)):
+            raise TypeError('``optimizer`` should be an Optimizer or '
+                            'BaseOptimWrapper, but got {}'.format(
+                                type(optimizer).__name__))
+        self.optimizer = optimizer
+        self.param_name = param_name
+
+        if end <= begin:
+            raise ValueError('end should be larger than begin, but got'
+                             ' begin={}, end={}'.format(begin, end))
+        self.begin = begin
+        self.end = end
+
+        self.by_epoch = by_epoch
+
+        assert isinstance(last_step, int) and last_step >= -1
+        # Initialize valid step count and base values
+        if last_step == -1:
+            for group in optimizer.param_groups:
+                # If the param has never been scheduled, record the current
+                # value as the initial value.
+                group.setdefault(f'initial_{param_name}', group[param_name])
+        else:
+            for i, group in enumerate(optimizer.param_groups):
+                if f'initial_{param_name}' not in group:
+                    raise KeyError(
+                        f"param 'initial_{param_name}' is not specified "
+                        'in param_groups[{}] when resuming an optimizer'.
+                        format(i))
+        self.base_values = [
+            group[f'initial_{param_name}'] for group in optimizer.param_groups
+        ]
+        self.last_step = last_step
+
+        # Following https://github.com/pytorch/pytorch/issues/20124
+        # We would like to ensure that `scheduler.step()` is called after
+        # `optimizer.step()`
+        def with_counter(method: Callable):
+            if getattr(method, '_with_counter', False):
+                # `optimizer.step()` has already been replaced, return.
+                return method
+
+            # Keep a weak reference to the optimizer instance to prevent
+            # cyclic references.
+            instance_ref = weakref.ref(method.__self__)  # type: ignore
+            # Get the unbound method for the same purpose.
+            func = method.__func__  # type: ignore
+            cls = instance_ref().__class__  # type: ignore
+            del method
+
+            @wraps(func)
+            def wrapper(*args, **kwargs):
+                instance = instance_ref()
+                instance._global_step += 1
+                wrapped = func.__get__(instance, cls)
+                return wrapped(*args, **kwargs)
+
+            # Note that the returned function here is no longer a bound
+            # method, so attributes like `__func__` and `__self__` no
+            # longer exist.
+            wrapper._with_counter = True  # type: ignore
+            return wrapper
+
+        # add counter to optimizer
+        self.optimizer.step = with_counter(self.optimizer.step)  # type: ignore
+        self.optimizer._global_step = -1  # type: ignore
+
+        self._global_step = -1
+        self.verbose = verbose
+
+        self.step()
+
+    def state_dict(self) -> dict:
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which is not
+        the optimizer.
+
+        Returns:
+            dict: scheduler state.
+        """
+        return {
+            key: value
+            for key, value in self.__dict__.items() if key != 'optimizer'
+        }
+
+    def load_state_dict(self, state_dict: dict):
+        """Loads the schedulers state.
+
+        Args:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        self.__dict__.update(state_dict)
+
+    def get_last_value(self):
+        """Return the last computed value by current scheduler.
+
+        Returns:
+            list: A list of the last computed value of the optimizer's
+                ``param_group``.
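+
+        Examples:
+            >>> # Illustrative sketch: ``scheduler`` is assumed to be an
+            >>> # already-built scheduler over a single-param-group
+            >>> # optimizer, so the hypothetical result has one entry.
+            >>> scheduler.get_last_value()
+            [0.01]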
+ """ + return self._last_value + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + raise NotImplementedError + + def print_value(self, is_verbose: bool, group: int, value: float): + """Display the current parameter value. + + Args: + is_verbose (bool): Whether to print the value. + group (int): The index of the current ``param_group``. + value (float): The parameter value. + """ + if is_verbose: + print_log( + f'Adjusting parameter value of group {group} to {value:.4e}.', + logger='current') + + def step(self): + """Adjusts the parameter value of each parameter group based on the + specified schedule.""" + # Raise a warning if old pattern is detected + # https://github.com/pytorch/pytorch/issues/20124 + if self._global_step == 0: + if not hasattr(self.optimizer.step, '_with_counter'): + warnings.warn( + 'Seems like `optimizer.step()` has been overridden after ' + 'parameter value scheduler initialization. Please, make ' + 'sure to call `optimizer.step()` before ' + '`scheduler.step()`. See more details at ' + 'https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate', # noqa: E501 + UserWarning) + + # Just check if there were two first scheduler.step() calls + # before optimizer.step() + elif self.optimizer._global_step < 0: + warnings.warn( + 'Detected call of `scheduler.step()` before ' + '`optimizer.step()`. In PyTorch 1.1.0 and later, you ' + 'should call them in the opposite order: ' + '`optimizer.step()` before `scheduler.step()`. ' + 'Failure to do this will result in PyTorch skipping ' + 'the first value of the parameter value schedule. ' + 'See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate', # noqa: E501 + UserWarning) + self._global_step += 1 + + # Compute parameter value per param group in the effective range + if self.begin <= self._global_step < self.end: + self.last_step += 1 + values = self._get_value() + + for i, data in enumerate(zip(self.optimizer.param_groups, values)): + param_group, value = data + param_group[self.param_name] = value + self.print_value(self.verbose, i, value) + + self._last_value = [ + group[self.param_name] for group in self.optimizer.param_groups + ] + + +@PARAM_SCHEDULERS.register_module() +class StepParamScheduler(_ParamScheduler): + """Decays the parameter value of each parameter group by gamma every + step_size epochs. Notice that such decay can happen simultaneously with + other changes to the parameter value from outside this scheduler. + + Args: + optimizer (BaseOptimWrapper or Optimizer): Wrapped optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + step_size (int): Period of parameter value decay. + gamma (float): Multiplicative factor of parameter value decay. + Defaults to 0.1. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. 
+ """ + + def __init__(self, + optimizer: OptimizerType, + param_name: str, + step_size: int, + gamma: float = 0.1, + begin: int = 0, + end: int = INF, + last_step: int = -1, + by_epoch: bool = True, + verbose: bool = False): + self.step_size = step_size + self.gamma = gamma + super().__init__( + optimizer=optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + step_size, + begin=0, + end=INF, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' + by_epoch = False + step_size = step_size * epoch_length + begin = int(begin * epoch_length) + if end != INF: + end = int(end * epoch_length) + return cls( + *args, + step_size=step_size, + begin=begin, + end=end, + by_epoch=by_epoch, + **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + if (self.last_step == 0) or (self.last_step % self.step_size != 0): + return [ + group[self.param_name] for group in self.optimizer.param_groups + ] + return [ + group[self.param_name] * self.gamma + for group in self.optimizer.param_groups + ] + + +@PARAM_SCHEDULERS.register_module() +class MultiStepParamScheduler(_ParamScheduler): + """Decays the specified parameter in each parameter group by gamma once the + number of epoch reaches one of the milestones. Notice that such decay can + happen simultaneously with other changes to the parameter from outside this + scheduler. + + Args: + optimizer (BaseOptimWrapper or Optimizer): Wrapped optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + milestones (list): List of epoch indices. Must be increasing. + gamma (float): Multiplicative factor of parameter value decay. + Defaults to 0.1. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + def __init__(self, + optimizer: OptimizerType, + param_name: str, + milestones: List[int], + gamma: float = 0.1, + last_step: int = -1, + begin: int = 0, + end: int = INF, + by_epoch: bool = True, + verbose: bool = False): + self.milestones = Counter(milestones) + self.gamma = gamma + super().__init__( + optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + milestones, + begin=0, + end=INF, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' 
+        by_epoch = False
+        milestones = [i * epoch_length for i in milestones]
+        begin = int(begin * epoch_length)
+        if end != INF:
+            end = int(end * epoch_length)
+        return cls(
+            *args,
+            milestones=milestones,
+            begin=begin,
+            end=end,
+            by_epoch=by_epoch,
+            **kwargs)
+
+    def _get_value(self):
+        """Compute value using chainable form of the scheduler."""
+        if self.last_step not in self.milestones:
+            return [
+                group[self.param_name] for group in self.optimizer.param_groups
+            ]
+        return [
+            group[self.param_name] *
+            self.gamma**self.milestones[self.last_step]
+            for group in self.optimizer.param_groups
+        ]
+
+
+@PARAM_SCHEDULERS.register_module()
+class ConstantParamScheduler(_ParamScheduler):
+    """Decays the parameter value of each parameter group by a small constant
+    factor until the number of epochs reaches a pre-defined milestone:
+    ``end``. Notice that such decay can happen simultaneously with other
+    changes to the parameter value from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or BaseOptimWrapper): optimizer or Wrapped
+            optimizer.
+        param_name (str): Name of the parameter to be adjusted, such as
+            ``lr``, ``momentum``.
+        factor (float): The number we multiply parameter value until the
+            milestone. Defaults to 1./3.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 optimizer: OptimizerType,
+                 param_name: str,
+                 factor: float = 1.0 / 3,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+        if factor > 1.0 or factor < 0:
+            raise ValueError(
+                'Constant multiplicative factor should be between 0 and 1.')
+
+        self.factor = factor
+        self.total_iters = end - begin - 1
+        super().__init__(
+            optimizer,
+            param_name=param_name,
+            begin=begin,
+            end=end,
+            last_step=last_step,
+            by_epoch=by_epoch,
+            verbose=verbose)
+
+    @classmethod
+    def build_iter_from_epoch(cls,
+                              *args,
+                              begin=0,
+                              end=INF,
+                              by_epoch=True,
+                              epoch_length=None,
+                              **kwargs):
+        """Build an iter-based instance of this scheduler from an epoch-based
+        config."""
+        assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \
+                         'be converted to iter-based.'
+        assert epoch_length is not None and epoch_length > 0, \
+            f'`epoch_length` must be a positive integer, ' \
+            f'but got {epoch_length}.'
+        by_epoch = False
+        begin = int(begin * epoch_length)
+        if end != INF:
+            end = int(end * epoch_length)
+        return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs)
+
+    def _get_value(self):
+        """Compute value using chainable form of the scheduler."""
+        if self.last_step == 0:
+            return [
+                group[self.param_name] * self.factor
+                for group in self.optimizer.param_groups
+            ]
+
+        # The factor is applied once at step 0 and removed again when the
+        # milestone ``total_iters`` is reached; at every other step the
+        # current values pass through unchanged.
+        if self.last_step != self.total_iters:
+            return [
+                group[self.param_name] for group in self.optimizer.param_groups
+            ]
+
+        return [
+            group[self.param_name] * (1.0 / self.factor)
+            for group in self.optimizer.param_groups
+        ]
+
+
+@PARAM_SCHEDULERS.register_module()
+class ExponentialParamScheduler(_ParamScheduler):
+    """Decays the parameter value of each parameter group by gamma every epoch.
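+
+    If the value is set solely by this scheduler, step ``t`` yields the
+    closed form ``value_t = value_0 * gamma ** t``.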
+ + Args: + optimizer (Optimizer or BaseOptimWrapper): optimizer or Wrapped + optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + gamma (float): Multiplicative factor of parameter value decay. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + def __init__(self, + optimizer: OptimizerType, + param_name: str, + gamma: float, + begin: int = 0, + end: int = INF, + last_step: int = -1, + by_epoch: bool = True, + verbose: bool = False): + self.gamma = gamma + super().__init__( + optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + @classmethod + def build_iter_from_epoch(cls, + *args, + begin=0, + end=INF, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' + by_epoch = False + begin = int(begin * epoch_length) + if end != INF: + end = int(end * epoch_length) + return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + if self.last_step == 0: + return [ + group[self.param_name] for group in self.optimizer.param_groups + ] + return [ + group[self.param_name] * self.gamma + for group in self.optimizer.param_groups + ] + + +@PARAM_SCHEDULERS.register_module() +class CosineAnnealingParamScheduler(_ParamScheduler): + r"""Set the parameter value of each parameter group using a cosine + annealing schedule, where :math:`\eta_{max}` is set to the initial value + and :math:`T_{cur}` is the number of epochs since the last restart in SGDR: + + .. math:: + \begin{aligned} + \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), + & T_{cur} \neq (2k+1)T_{max}; \\ + \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) + \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), + & T_{cur} = (2k+1)T_{max}. + \end{aligned} + + Notice that because the schedule + is defined recursively, the parameter value can be simultaneously modified + outside this scheduler by other operators. If the parameter value is set + solely by this scheduler, the parameter value at each step becomes: + + .. math:: + \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right) + + It has been proposed in + `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this + only implements the cosine annealing part of SGDR, and not the restarts. + + Args: + optimizer (Optimizer or BaseOptimWrapper): optimizer or Wrapped + optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + T_max (int, optional): Maximum number of iterations. If not specified, + use ``end - begin``. Defaults to None. 
+        eta_min (float, optional): Minimum parameter value. Defaults to None.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+        eta_min_ratio (float, optional): The ratio of the minimum parameter
+            value to the base parameter value. Either `eta_min` or
+            `eta_min_ratio` should be specified. Defaults to None.
+            New in version 0.3.2.
+
+    .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
+        https://arxiv.org/abs/1608.03983
+    """
+
+    def __init__(self,
+                 optimizer: Union[Optimizer, BaseOptimWrapper],
+                 param_name: str,
+                 T_max: Optional[int] = None,
+                 eta_min: Optional[float] = None,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False,
+                 eta_min_ratio: Optional[float] = None):
+        # To preserve backwards compatibility
+        if eta_min is None and eta_min_ratio is None:
+            eta_min = 0.
+        assert (eta_min is None) ^ (eta_min_ratio is None), \
+            'Either `eta_min` or `eta_min_ratio` should be specified'
+        self.T_max = T_max or (end - begin)
+        self.eta_min = eta_min
+        self.eta_min_ratio = eta_min_ratio
+        super().__init__(
+            optimizer,
+            param_name=param_name,
+            begin=begin,
+            end=end,
+            last_step=last_step,
+            by_epoch=by_epoch,
+            verbose=verbose)
+
+    @classmethod
+    def build_iter_from_epoch(cls,
+                              *args,
+                              T_max=None,
+                              begin=0,
+                              end=INF,
+                              by_epoch=True,
+                              epoch_length=None,
+                              **kwargs):
+        """Build an iter-based instance of this scheduler from an epoch-based
+        config."""
+        assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \
+                         'be converted to iter-based.'
+        assert epoch_length is not None and epoch_length > 0, \
+            f'`epoch_length` must be a positive integer, ' \
+            f'but got {epoch_length}.'
+        by_epoch = False
+        if T_max is not None:
+            T_max = T_max * epoch_length
+        begin = int(begin * epoch_length)
+        if end != INF:
+            end = int(end * epoch_length)
+        return cls(
+            *args,
+            T_max=T_max,
+            begin=begin,
+            end=end,
+            by_epoch=by_epoch,
+            **kwargs)
+
+    def _get_value(self) -> list:
+        """Compute value using chainable form of the scheduler."""
+
+        def _get_eta_min(base_value):
+            if self.eta_min_ratio is None:
+                return self.eta_min
+            return base_value * self.eta_min_ratio
+
+        if self.last_step == 0:
+            return [
+                group[self.param_name] for group in self.optimizer.param_groups
+            ]
+        elif (self.last_step - 1 - self.T_max) % (2 * self.T_max) == 0:
+            return [
+                group[self.param_name] +
+                (base_value - _get_eta_min(base_value)) *
+                (1 - math.cos(math.pi / self.T_max)) / 2
+                for base_value, group in zip(self.base_values,
+                                             self.optimizer.param_groups)
+            ]
+        return [(1 + math.cos(math.pi * self.last_step / self.T_max)) /
+                (1 + math.cos(math.pi * (self.last_step - 1) / self.T_max)) *
+                (group[self.param_name] - _get_eta_min(base_value)) +
+                _get_eta_min(base_value) for base_value, group in zip(
+                    self.base_values, self.optimizer.param_groups)]
+
+
+@PARAM_SCHEDULERS.register_module()
+class LinearParamScheduler(_ParamScheduler):
+    """Decays the parameter value of each parameter group by linearly changing
+    a small multiplicative factor until the number of epochs reaches a
+    pre-defined milestone: ``end``.
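+
+    If the value is set solely by this scheduler, step ``t`` (for ``t`` up to
+    ``total_iters = end - begin - 1``) yields the closed form
+    ``value_t = value_0 * (start_factor + (end_factor - start_factor) * t /
+    total_iters)``.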
+
+    Notice that such decay can happen simultaneously with other changes to the
+    parameter value from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or BaseOptimWrapper): optimizer or Wrapped
+            optimizer.
+        param_name (str): Name of the parameter to be adjusted, such as
+            ``lr``, ``momentum``.
+        start_factor (float): The number we multiply parameter value in the
+            first epoch. The multiplication factor changes towards end_factor
+            in the following epochs. Defaults to 1./3.
+        end_factor (float): The number we multiply parameter value at the end
+            of linear changing process. Defaults to 1.0.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 optimizer: Union[Optimizer, BaseOptimWrapper],
+                 param_name: str,
+                 start_factor: float = 1.0 / 3,
+                 end_factor: float = 1.0,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+        if start_factor > 1.0 or start_factor < 0:
+            raise ValueError(
+                'Starting multiplicative factor should be between 0 and 1.')
+
+        if end_factor > 1.0 or end_factor < 0:
+            raise ValueError(
+                'Ending multiplicative factor should be between 0 and 1.')
+
+        self.start_factor = start_factor
+        self.end_factor = end_factor
+        self.total_iters = end - begin - 1
+        super().__init__(
+            optimizer,
+            param_name=param_name,
+            begin=begin,
+            end=end,
+            last_step=last_step,
+            by_epoch=by_epoch,
+            verbose=verbose)
+
+    @classmethod
+    def build_iter_from_epoch(cls,
+                              *args,
+                              begin=0,
+                              end=INF,
+                              by_epoch=True,
+                              epoch_length=None,
+                              **kwargs):
+        """Build an iter-based instance of this scheduler from an epoch-based
+        config."""
+        assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \
+                         'be converted to iter-based.'
+        assert epoch_length is not None and epoch_length > 0, \
+            f'`epoch_length` must be a positive integer, ' \
+            f'but got {epoch_length}.'
+        by_epoch = False
+        begin = int(begin * epoch_length)
+        if end != INF:
+            end = int(end * epoch_length)
+        return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs)
+
+    def _get_value(self):
+        """Compute value using chainable form of the scheduler."""
+        if self.last_step == 0:
+            return [
+                group[self.param_name] * self.start_factor
+                for group in self.optimizer.param_groups
+            ]
+
+        return [
+            group[self.param_name] *
+            (1. + (self.end_factor - self.start_factor) /
+             (self.total_iters * self.start_factor + (self.last_step - 1) *
+              (self.end_factor - self.start_factor)))
+            for group in self.optimizer.param_groups
+        ]
+
+
+@PARAM_SCHEDULERS.register_module()
+class PolyParamScheduler(_ParamScheduler):
+    """Decays the parameter value of each parameter group in a polynomial decay
+    scheme.
+
+    Notice that such decay can happen simultaneously with other changes to the
+    parameter value from outside this scheduler.
+
+    Args:
+        optimizer (Optimizer or BaseOptimWrapper): optimizer or Wrapped
+            optimizer.
+        param_name (str): Name of the parameter to be adjusted, such as
+            ``lr``, ``momentum``.
+        eta_min (float): Minimum parameter value at the end of scheduling.
+            Defaults to 0.
+        power (float): The power of the polynomial. Defaults to 1.0.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 optimizer: Union[Optimizer, BaseOptimWrapper],
+                 param_name: str,
+                 eta_min: float = 0,
+                 power: float = 1.0,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+
+        self.eta_min = eta_min
+        self.power = power
+        self.total_iters = end - begin - 1
+
+        super().__init__(
+            optimizer,
+            param_name=param_name,
+            begin=begin,
+            end=end,
+            last_step=last_step,
+            by_epoch=by_epoch,
+            verbose=verbose)
+
+    @classmethod
+    def build_iter_from_epoch(cls,
+                              *args,
+                              begin=0,
+                              end=INF,
+                              by_epoch=True,
+                              epoch_length=None,
+                              **kwargs):
+        """Build an iter-based instance of this scheduler from an epoch-based
+        config."""
+        assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \
+                         'be converted to iter-based.'
+        assert epoch_length is not None and epoch_length > 0, \
+            f'`epoch_length` must be a positive integer, ' \
+            f'but got {epoch_length}.'
+        by_epoch = False
+        begin = int(begin * epoch_length)
+        if end != INF:
+            end = int(end * epoch_length)
+        return cls(*args, begin=begin, end=end, by_epoch=by_epoch, **kwargs)
+
+    def _get_value(self):
+        """Compute value using chainable form of the scheduler."""
+        if self.last_step == 0:
+            return [
+                group[self.param_name] for group in self.optimizer.param_groups
+            ]
+
+        return [(group[self.param_name] - self.eta_min) *
+                (1 - 1 / (self.total_iters - self.last_step + 1))**self.power
+                + self.eta_min for group in self.optimizer.param_groups]
+
+
+@PARAM_SCHEDULERS.register_module()
+class OneCycleParamScheduler(_ParamScheduler):
+    r"""Sets the parameters of each parameter group according to the
+    1cycle learning rate policy. The 1cycle policy anneals the learning
+    rate from an initial learning rate to some maximum learning rate and then
+    from that maximum learning rate to some minimum learning rate much lower
+    than the initial learning rate.
+    This policy was initially described in the paper `Super-Convergence:
+    Very Fast Training of Neural Networks Using Large Learning Rates`_.
+
+    The 1cycle learning rate policy changes the learning rate after every
+    batch. `step` should be called after a batch has been used for training.
+
+    This scheduler is not chainable.
+
+    Note also that the total number of steps in the cycle can be determined in
+    one of two ways (listed in order of precedence):
+
+    #. A value for total_steps is explicitly provided.
+    #. If total_steps is not defined, ``begin`` and ``end`` of this scheduler
+       are used instead. In this case, the number of total steps is inferred
+       by total_steps = end - begin.
+
+    The default behaviour of this scheduler follows the fastai implementation
+    of 1cycle, which claims that "unpublished work has shown even better
+    results by using only two phases". To mimic the behaviour of the original
+    paper instead, set ``three_phase=True``.
+
+    Args:
+        optimizer (Optimizer): Wrapped optimizer.
+        param_name (str): Name of the parameter to be adjusted, such as
+            ``lr``, ``momentum``.
+        eta_max (float or list): Upper parameter value boundaries in the cycle
+            for each parameter group.
+        total_steps (int): The total number of steps in the cycle. Note that
+            if a value is not provided here, then it will be equal to
+            ``end - begin``. Defaults to None.
+        pct_start (float): The percentage of the cycle (in number of steps)
+            spent increasing the learning rate.
+            Defaults to 0.3.
+        anneal_strategy (str): {'cos', 'linear'}
+            Specifies the annealing strategy: "cos" for cosine annealing,
+            "linear" for linear annealing.
+            Defaults to 'cos'.
+        div_factor (float): Determines the initial learning rate via
+            initial_param = eta_max/div_factor
+            Defaults to 25.
+        final_div_factor (float): Determines the minimum learning rate via
+            eta_min = initial_param/final_div_factor
+            Defaults to 1e4.
+        three_phase (bool): If ``True``, use a third phase of the schedule to
+            annihilate the learning rate according to 'final_div_factor'
+            instead of modifying the second phase (the first two phases will
+            be symmetrical about the step indicated by 'pct_start').
+            Defaults to False.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+
+    .. _Super-Convergence\: Very Fast Training of Neural Networks Using Large Learning Rates:
+        https://arxiv.org/abs/1708.07120
+    """  # noqa: E501
+
+    def __init__(self,
+                 optimizer: Union[Optimizer, BaseOptimWrapper],
+                 param_name: str,
+                 eta_max: float = 0,
+                 total_steps: Optional[int] = None,
+                 pct_start: float = 0.3,
+                 anneal_strategy: str = 'cos',
+                 div_factor: float = 25.,
+                 final_div_factor: float = 1e4,
+                 three_phase: bool = False,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+
+        assert param_name == 'lr', ('OneCycle only works for learning rate '
+                                    'updating, but got param_name as '
+                                    f'{param_name}')
+
+        self.eta_max = eta_max
+        self.div_factor = div_factor
+        self.final_div_factor = final_div_factor
+
+        # Validate total_steps
+        if total_steps is not None:
+            if total_steps <= 0 or not isinstance(total_steps, int):
+                raise ValueError('Expected positive integer total_steps, '
+                                 f'but got {total_steps}')
+            self.total_steps = total_steps
+        else:
+            self.total_steps = end - begin
+
+        # Validate pct_start
+        if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float):
+            raise ValueError('Expected float between 0 and 1 for pct_start, '
+                             f'but got {pct_start}')
+
+        # Validate anneal_strategy
+        if anneal_strategy not in ['cos', 'linear']:
+            raise ValueError(
+                'anneal_strategy must be one of "cos" or "linear", '
+                f'instead got {anneal_strategy}')
+        elif anneal_strategy == 'cos':
+            self.anneal_func = self._annealing_cos
+        elif anneal_strategy == 'linear':
+            self.anneal_func = self._annealing_linear
+
+        if three_phase:
+            self._schedule_phases = [
+                {
+                    'end_step': float(pct_start * self.total_steps) - 1,
+                    f'start_{param_name}': f'initial_{param_name}',
+                    f'end_{param_name}': f'max_{param_name}'
+                },
+                {
+                    'end_step': float(2 * pct_start * self.total_steps) - 2,
+                    f'start_{param_name}': f'max_{param_name}',
+                    f'end_{param_name}': f'initial_{param_name}'
+                },
+                {
+                    'end_step': self.total_steps - 1,
+                    f'start_{param_name}': f'initial_{param_name}',
+                    f'end_{param_name}': f'min_{param_name}'
+                },
+            ]
+        else:
+            self._schedule_phases = [
+                {
+                    'end_step': float(pct_start * self.total_steps) - 1,
+                    f'start_{param_name}': f'initial_{param_name}',
+                    f'end_{param_name}': f'max_{param_name}'
+                },
+                {
+                    'end_step':
self.total_steps - 1, + f'start_{param_name}': f'max_{param_name}', + f'end_{param_name}': f'min_{param_name}' + }, + ] + + # Initialize parameters + max_values = self._format_param(f'max_{param_name}', optimizer, + eta_max) + if last_step == -1: + for idx, group in enumerate(optimizer.param_groups): + group[f'initial_{param_name}'] = max_values[idx] / div_factor + group[f'max_{param_name}'] = max_values[idx] + group[f'min_{param_name}'] = \ + group[f'initial_{param_name}'] / final_div_factor + + super().__init__( + optimizer=optimizer, + param_name=param_name, + begin=begin, + end=end, + last_step=last_step, + by_epoch=by_epoch, + verbose=verbose) + + def _format_param(self, name, optimizer, param): + """Return correctly formatted lr/momentum for each param group.""" + if isinstance(param, (list, tuple)): + if len(param) != len(optimizer.param_groups): + raise ValueError( + f'expected {len(optimizer.param_groups)} values ' + f'for {name}, got {len(param)}') + return param + else: + return [param] * len(optimizer.param_groups) + + @staticmethod + def _annealing_cos(start, end, pct): + """Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0.""" + + cos_out = math.cos(math.pi * pct) + 1 + return end + (start - end) / 2.0 * cos_out + + @staticmethod + def _annealing_linear(start, end, pct): + """Linearly anneal from `start` to `end` as pct goes from 0.0 to + 1.0.""" + return (end - start) * pct + start + + @classmethod + def build_iter_from_epoch(cls, + *args, + begin=0, + end=INF, + total_steps=None, + by_epoch=True, + epoch_length=None, + **kwargs): + """Build an iter-based instance of this scheduler from an epoch-based + config.""" + assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \ + 'be converted to iter-based.' + assert epoch_length is not None and epoch_length > 0, \ + f'`epoch_length` must be a positive integer, ' \ + f'but got {epoch_length}.' + by_epoch = False + begin = int(begin * epoch_length) + if end != INF: + end = int(end * epoch_length) + if total_steps is not None: + total_steps = total_steps * epoch_length + return cls( + *args, + begin=begin, + end=end, + total_steps=total_steps, + by_epoch=by_epoch, + **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + + params = [] + step_num = self.last_step + + if step_num > self.total_steps: + raise ValueError( + f'Tried to step {step_num + 1} times. ' + f'The specified number of total steps is {self.total_steps}') + + for group in self.optimizer.param_groups: + start_step = 0 + for i, phase in enumerate(self._schedule_phases): + end_step = phase['end_step'] + if step_num <= end_step or i == len(self._schedule_phases) - 1: + pct = (step_num - start_step) / (end_step - start_step) + computed_param = self.anneal_func( + group[phase['start_' + self.param_name]], + group[phase['end_' + self.param_name]], pct) + break + start_step = phase['end_step'] + + params.append(computed_param) + + return params + + +@PARAM_SCHEDULERS.register_module() +class CosineRestartParamScheduler(_ParamScheduler): + """Sets the parameters of each parameter group according to the cosine + annealing with restarts scheme. The cosine restart policy anneals the + parameter from the initial value to `eta_min` with a cosine annealing + schedule and then restarts another period from the maximum value multiplied + with `restart_weight`. + + Args: + optimizer (Optimizer or BaseOptimWrapper): optimizer or Wrapped + optimizer. 
+        param_name (str): Name of the parameter to be adjusted, such as
+            ``lr``, ``momentum``.
+        periods (list[int]): Periods for each cosine annealing cycle.
+        restart_weights (list[float]): Restart weights at each
+            restart iteration. Defaults to [1].
+        eta_min (float, optional): Minimum parameter value at the end of
+            scheduling. Defaults to None.
+        eta_min_ratio (float, optional): The ratio of minimum parameter value
+            to the base parameter value. Either `eta_min` or `eta_min_ratio`
+            should be specified. Defaults to None.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+    def __init__(self,
+                 optimizer: Union[Optimizer, BaseOptimWrapper],
+                 param_name: str,
+                 periods: List[int],
+                 restart_weights: Sequence[float] = (1, ),
+                 eta_min: Optional[float] = None,
+                 eta_min_ratio: Optional[float] = None,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+        assert (eta_min is None) ^ (eta_min_ratio is None)
+        self.periods = periods
+        self.eta_min = eta_min
+        self.eta_min_ratio = eta_min_ratio
+        self.restart_weights = restart_weights
+        assert (len(self.periods) == len(self.restart_weights)
+                ), 'periods and restart_weights should have the same length.'
+        self.cumulative_periods = [
+            sum(self.periods[0:i + 1]) for i in range(0, len(self.periods))
+        ]
+
+        super().__init__(
+            optimizer,
+            param_name=param_name,
+            begin=begin,
+            end=end,
+            last_step=last_step,
+            by_epoch=by_epoch,
+            verbose=verbose)
+
+    @classmethod
+    def build_iter_from_epoch(cls,
+                              *args,
+                              periods,
+                              begin=0,
+                              end=INF,
+                              by_epoch=True,
+                              epoch_length=None,
+                              **kwargs):
+        """Build an iter-based instance of this scheduler from an epoch-based
+        config."""
+        assert by_epoch, 'Only epoch-based kwargs whose `by_epoch=True` can ' \
+                         'be converted to iter-based.'
+        assert epoch_length is not None and epoch_length > 0, \
+            f'`epoch_length` must be a positive integer, ' \
+            f'but got {epoch_length}.'
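+        # Worked example (hypothetical numbers): periods=[4, 6] with
+        # epoch_length=500 iterations per epoch becomes periods=[2000, 3000];
+        # ``begin``/``end`` are rescaled the same way below.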
+ periods = [p * epoch_length for p in periods] + by_epoch = False + begin = int(begin * epoch_length) + if end != INF: + end = int(end * epoch_length) + return cls( + *args, + periods=periods, + begin=begin, + end=end, + by_epoch=by_epoch, + **kwargs) + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + idx = self.get_position_from_periods(self.last_step, + self.cumulative_periods) + # if current step is not in the periods, return origin parameters + if idx is None: + return [ + group[self.param_name] for group in self.optimizer.param_groups + ] + current_weight = self.restart_weights[idx] + nearest_restart = 0 if idx == 0 else self.cumulative_periods[idx - 1] + current_periods = self.periods[idx] + step = self.last_step - nearest_restart + values = [] + for base_value, group in zip(self.base_values, + self.optimizer.param_groups): + eta_max = base_value * current_weight + if self.eta_min_ratio is None: + eta_min = self.eta_min + else: + eta_min = base_value * self.eta_min_ratio + if step == 0: + values.append(eta_max) + else: + values.append( + (1 + math.cos(math.pi * step / current_periods)) / + (1 + math.cos(math.pi * (step - 1) / current_periods)) * + (group[self.param_name] - eta_min) + eta_min) + + return values + + @staticmethod + def get_position_from_periods( + iteration: int, cumulative_periods: List[int]) -> Optional[int]: + """Get the position from a period list. + + It will return the index of the right-closest number in the period + list. + For example, the cumulative_periods = [100, 200, 300, 400], + if iteration == 50, return 0; + if iteration == 210, return 2; + if iteration == 300, return 3. + + Args: + iteration (int): Current iteration. + cumulative_periods (list[int]): Cumulative period list. + + Returns: + Optional[int]: The position of the right-closest number in the + period list. If not in the period, return None. + """ + for i, period in enumerate(cumulative_periods): + if iteration < period: + return i + return None + + +@PARAM_SCHEDULERS.register_module() +class ReduceOnPlateauParamScheduler(_ParamScheduler): + """Reduce the parameters of each parameter group when a metric has stopped + improving. Models often benefit from reducing the parameters by a factor of + 2-10 once learning stagnates. This scheduler reads a metrics quantity and + if no improvement is seen for a ``patience`` number of epochs, the + parameters are reduced. + + The implementation is motivated by `PyTorch ReduceLROnPlateau`_. + + Args: + optimizer (Optimizer or BaseOptimWrapper): optimizer or Wrapped + optimizer. + param_name (str): Name of the parameter to be adjusted, such as + ``lr``, ``momentum``. + monitor (str): The name of the metric to measure whether + the performance of the model is improved. + rule (str): One of `less`, `greater`. In `less` rule, parameters will + be reduced when the quantity monitored has stopped + decreasing; in `greater` rule it will be reduced when the + quantity monitored has stopped increasing. Defaults to 'less'. + The ``rule`` is the renaming of ``mode`` in pytorch. + factor (float): Factor by which the parameters will be + reduced. new_param = param * factor. Defaults to 0.1. + patience (int): Number of epochs with no improvement after + which parameters will be reduced. For example, if + ``patience = 2``, then we will ignore the first 2 epochs + with no improvement, and will only decrease the parameters after + the 3rd epoch if the monitor value still hasn't improved then. + Defaults to 10. 
+        threshold (float): Threshold for measuring the new optimum,
+            to only focus on significant changes. Defaults to 1e-4.
+        threshold_rule (str): One of `rel`, `abs`. In `rel` rule,
+            dynamic_threshold = best * ( 1 + threshold ) in 'greater'
+            rule or best * ( 1 - threshold ) in `less` rule.
+            In `abs` rule, dynamic_threshold = best + threshold in
+            `greater` rule or best - threshold in `less` rule.
+            Defaults to 'rel'.
+        cooldown (int): Number of epochs to wait before resuming
+            normal operation after parameters have been reduced. Defaults to 0.
+        min_value (float or list[float]): A scalar or a sequence of scalars.
+            A lower bound on the parameters of each parameter group
+            respectively. Defaults to 0.
+        eps (float): Minimal decay applied to parameters. If the difference
+            between new and old parameters is smaller than ``eps``, the update
+            is ignored. Defaults to 1e-8.
+        begin (int): Step at which the scheduler starts monitoring the metric
+            on the validation set, counted in training epochs. Defaults to 0.
+        end (int): Step at which the scheduler stops monitoring the metric
+            on the validation set, counted in training epochs. Defaults to
+            INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+
+    .. _PyTorch ReduceLROnPlateau:
+        https://github.com/pytorch/pytorch/blob/master/torch/optim/lr_scheduler.py
+    """
+
+    need_val_args = True
+
+    def __init__(self,
+                 optimizer: OptimizerType,
+                 param_name: str,
+                 monitor: str = 'loss',
+                 rule: str = 'less',
+                 factor: float = 0.1,
+                 patience: int = 10,
+                 threshold: float = 1e-4,
+                 threshold_rule: str = 'rel',
+                 cooldown: int = 0,
+                 min_value: Union[float, Sequence[float]] = 0.,
+                 eps: float = 1e-8,
+                 begin: int = 0,
+                 end: int = INF,
+                 last_step: int = -1,
+                 by_epoch: bool = True,
+                 verbose: bool = False):
+
+        # Attach optimizer
+        if not isinstance(optimizer, (Optimizer, BaseOptimWrapper)):
+            raise TypeError('``optimizer`` should be an Optimizer, '
+                            'but got {}'.format(type(optimizer).__name__))
+        self.optimizer = optimizer
+        self.param_name = param_name
+
+        if end <= begin:
+            raise ValueError('end should be larger than begin, but got'
+                             ' begin={}, end={}'.format(begin, end))
+        self.begin = begin
+        self.end = end
+
+        assert by_epoch, \
+            f'Now {type(self).__name__} only supports by_epoch=True'
+        self.by_epoch = by_epoch
+
+        assert isinstance(last_step, int) and last_step >= -1
+        # Initialize valid step count and base values
+        if last_step == -1:
+            for group in optimizer.param_groups:
+                # If the param has never been scheduled, record the current
+                # value as the initial value.
+                group.setdefault(f'initial_{param_name}', group[param_name])
+        else:
+            for i, group in enumerate(optimizer.param_groups):
+                if f'initial_{param_name}' not in group:
+                    raise KeyError(
+                        f"param 'initial_{param_name}' is not specified "
+                        'in param_groups[{}] when resuming an optimizer'.
+                        format(i))
+
+        self.last_step = last_step
+
+        self._global_step = 0
+        self.verbose = verbose
+
+        if factor >= 1.0:
+            raise ValueError('Factor should be < 1.0.')
+        self.factor = factor
+
+        # This code snippet handles compatibility with the optimizer wrapper.
+        # The optimizer wrapper includes an additional parameter to record the
+        # base learning rate (lr) which is not affected by the paramwise_cfg.
+        # By retrieving the base lr, we can obtain the actual base lr that
+        # reflects the learning progress.
+        if isinstance(optimizer, BaseOptimWrapper):
+            raw_optimizer = optimizer.optimizer
+        else:
+            raw_optimizer = optimizer
+
+        if isinstance(min_value, (list, tuple)):
+            if len(min_value) != len(raw_optimizer.param_groups):
+                raise ValueError('expected {} min_lrs, got {}'.format(
+                    len(raw_optimizer.param_groups), len(min_value)))
+            self.min_values = list(min_value)
+            # Consider the `min_value` of the last param_groups
+            # as the base setting. And we only add this value when
+            # the optimizer is OptimWrapper.
+            if isinstance(optimizer, BaseOptimWrapper) and \
+                    optimizer.base_param_settings is not None:  # type: ignore
+                self.min_values.append(self.min_values[-1])
+
+        else:
+            self.min_values = [min_value] * len(  # type: ignore
+                optimizer.param_groups)
+
+        self.patience = patience
+        self.cooldown = cooldown
+        self.cooldown_counter = 0
+        self.rule_worse = None  # the worst value for the chosen rule
+        self.best = None
+        self.num_bad_epochs = 0
+        self.eps = eps
+
+        self.monitor = monitor
+        self._init_is_better(
+            rule=rule, threshold=threshold, threshold_rule=threshold_rule)
+        self._reset()
+
+        # Unlike the base class, `self.step()` is not called here;
+        # `self._global_step` starts at 0 and the current values are
+        # recorded directly.
+        self._last_value = [
+            group[self.param_name] for group in self.optimizer.param_groups
+        ]
+
+    def step(self, metrics=None):
+        """Adjusts the parameter value of each parameter group based on the
+        specified schedule.
+
+        Args:
+            metrics (Dict[str, float], optional): Evaluation results of all
+                metrics on validation dataset. The keys are the names of the
+                metrics, and the values are corresponding results.
+                Defaults to None.
+        """
+        if metrics is None:
+            # only to count self._global_step
+            self._global_step += 1
+            return
+
+        if not isinstance(metrics, dict):
+            raise TypeError('metrics type should be dict,'
+                            f' but got type {type(metrics)}')
+
+        # Compute parameter value per param group in the effective range
+        if self.begin <= self._global_step < self.end:
+            self.last_step += 1
+
+            # Fetch the monitored metric if it was reported
+            metric = metrics.get(self.monitor, None)
+            if metric is not None:
+                if self._is_better(metric, self.best):
+                    self.best = metric
+                    self.num_bad_epochs = 0
+                else:
+                    self.num_bad_epochs += 1
+
+                if self._in_cooldown():
+                    self.cooldown_counter -= 1
+                    self.num_bad_epochs = 0  # ignore bad epochs in cooldown
+
+                if self.num_bad_epochs > self.patience:
+                    values = self._get_value()
+
+                    for i, data in enumerate(
+                            zip(self.optimizer.param_groups, values)):
+                        param_group, value = data
+                        if param_group[self.param_name] - value > self.eps:
+                            param_group[self.param_name] = value
+                            self.print_value(self.verbose, i, value)
+                    self.cooldown_counter = self.cooldown
+                    self.num_bad_epochs = 0
+
+            else:
+                raise KeyError(
+                    f'Expected the key {self.monitor} in metrics, '
+                    f'but got only the keys {list(metrics.keys())}')
+
+        self._last_value = [
+            group[self.param_name] for group in self.optimizer.param_groups
+        ]
+
+    def print_value(self, is_verbose: bool, group: int, value: float) -> None:
+        """Display the current parameter value.
+
+        Args:
+            is_verbose (bool): Whether to print the value.
+            group (int): The index of the current ``param_group``.
+            value (float): The parameter value.
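+
+        Example:
+            >>> # With is_verbose=True, an update is logged roughly as:
+            >>> # "Adjusting parameter value of group 0 to 1.0000e-03
+            >>> # in epoch 12."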
+ """ + if is_verbose: + step_name = 'epoch' if self.by_epoch else 'iter' + print_log( + f'Adjusting parameter value of group {group} to {value:.4e} ' + f'in {step_name} {self.last_step}.', + logger='current') + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + values = [ + float(group[self.param_name]) * self.factor + for group in self.optimizer.param_groups + ] + return [max(v, min_v) for v, min_v in zip(values, self.min_values)] + + def _in_cooldown(self): + """Judge whether it is in cooldown.""" + return self.cooldown_counter > 0 + + def _is_better(self, a, best): + """Judge whether the monitor value is better.""" + if self.rule == 'less' and self.threshold_rule == 'rel': + rel_epsilon = 1. - self.threshold + return a < best * rel_epsilon + + elif self.rule == 'less' and self.threshold_rule == 'abs': + return a < best - self.threshold + + elif self.rule == 'greater' and self.threshold_rule == 'rel': + rel_epsilon = self.threshold + 1. + return a > best * rel_epsilon + + else: # rule == 'greater' and epsilon_mode == 'abs': + return a > best + self.threshold + + def _init_is_better(self, rule, threshold, threshold_rule): + """Initialize rule and its associated values.""" + if threshold < 0: + raise ValueError(f'threshold {threshold} should be >= 0.') + if rule not in {'less', 'greater'}: + raise ValueError(f'mode {rule} is unknown!') + if threshold_rule not in {'rel', 'abs'}: + raise ValueError(f'threshold mode {threshold_rule}' + ' is unknown!') + + if rule == 'less': + self.rule_worse = INF + else: # rule == 'greater': + self.rule_worse = -INF + + self.rule = rule + self.threshold = threshold + self.threshold_rule = threshold_rule + + def _reset(self): + """Resets num_bad_epochs counter and cooldown counter.""" + self.best = self.rule_worse + self.cooldown_counter = 0 + self.num_bad_epochs = 0 diff --git a/head_extractor/build/lib/mmengine/registry/__init__.py b/head_extractor/build/lib/mmengine/registry/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cce27370437443a88484b4176ac335d27f8aaf14 --- /dev/null +++ b/head_extractor/build/lib/mmengine/registry/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .build_functions import (build_from_cfg, build_model_from_cfg,
+                              build_runner_from_cfg, build_scheduler_from_cfg)
+from .default_scope import DefaultScope
+from .registry import Registry
+from .root import (DATA_SAMPLERS, DATASETS, EVALUATOR, FUNCTIONS, HOOKS,
+                   INFERENCERS, LOG_PROCESSORS, LOOPS, METRICS, MODEL_WRAPPERS,
+                   MODELS, OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS,
+                   OPTIMIZERS, PARAM_SCHEDULERS, RUNNER_CONSTRUCTORS, RUNNERS,
+                   STRATEGIES, TASK_UTILS, TRANSFORMS, VISBACKENDS,
+                   VISUALIZERS, WEIGHT_INITIALIZERS)
+from .utils import (count_registered_modules, init_default_scope,
+                    traverse_registry_tree)
+
+__all__ = [
+    'Registry', 'RUNNERS', 'RUNNER_CONSTRUCTORS', 'HOOKS', 'DATASETS',
+    'DATA_SAMPLERS', 'TRANSFORMS', 'MODELS', 'WEIGHT_INITIALIZERS',
+    'OPTIMIZERS', 'OPTIM_WRAPPER_CONSTRUCTORS', 'TASK_UTILS',
+    'PARAM_SCHEDULERS', 'METRICS', 'MODEL_WRAPPERS', 'OPTIM_WRAPPERS', 'LOOPS',
+    'VISBACKENDS', 'VISUALIZERS', 'LOG_PROCESSORS', 'EVALUATOR', 'INFERENCERS',
+    'DefaultScope', 'traverse_registry_tree', 'count_registered_modules',
+    'build_model_from_cfg', 'build_runner_from_cfg', 'build_from_cfg',
+    'build_scheduler_from_cfg', 'init_default_scope', 'FUNCTIONS', 'STRATEGIES'
+]
diff --git a/head_extractor/build/lib/mmengine/registry/build_functions.py b/head_extractor/build/lib/mmengine/registry/build_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..585a86a437f5471b98ff8c4d9516c41cc185de90
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/registry/build_functions.py
@@ -0,0 +1,312 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+import logging
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import torch
+
+from mmengine.config import Config, ConfigDict
+from mmengine.utils import ManagerMixin, digit_version
+from .registry import Registry
+
+if TYPE_CHECKING:
+    import torch.nn as nn
+
+    from mmengine.optim.scheduler import _ParamScheduler
+    from mmengine.runner import Runner
+
+
+def build_from_cfg(
+        cfg: Union[dict, ConfigDict, Config],
+        registry: Registry,
+        default_args: Optional[Union[dict, ConfigDict, Config]] = None) -> Any:
+    """Build a module from config dict when it is a class configuration, or
+    call a function from config dict when it is a function configuration.
+
+    If the global variable default scope (:obj:`DefaultScope`) exists,
+    :meth:`build` will first get the corresponding registry and then call
+    its own :meth:`build`.
+
+    At least one of ``cfg`` and ``default_args`` must contain the key "type",
+    which should be either a str or a class. If both contain it, the key in
+    ``cfg`` is used because ``cfg`` has higher priority than ``default_args``:
+    if a key exists in both of them, the value of the key will be
+    ``cfg[key]``. The two dicts are merged first, then the key "type" is
+    popped and the remaining keys are used as initialization arguments.
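+
+    For instance, with ``cfg = dict(type='ResNet', depth=50)`` and
+    ``default_args = dict(depth=18, stages=3)``, the object is built as
+    ``ResNet(depth=50, stages=3)``.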
+ + Examples: + >>> from mmengine import Registry, build_from_cfg + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> def __init__(self, depth, stages=4): + >>> self.depth = depth + >>> self.stages = stages + >>> cfg = dict(type='ResNet', depth=50) + >>> model = build_from_cfg(cfg, MODELS) + >>> # Returns an instantiated object + >>> @MODELS.register_module() + >>> def resnet50(): + >>> pass + >>> resnet = build_from_cfg(dict(type='resnet50'), MODELS) + >>> # Return a result of the calling function + + Args: + cfg (dict or ConfigDict or Config): Config dict. It should at least + contain the key "type". + registry (:obj:`Registry`): The registry to search the type from. + default_args (dict or ConfigDict or Config, optional): Default + initialization arguments. Defaults to None. + + Returns: + object: The constructed object. + """ + # Avoid circular import + from ..logging import print_log + + if not isinstance(cfg, (dict, ConfigDict, Config)): + raise TypeError( + f'cfg should be a dict, ConfigDict or Config, but got {type(cfg)}') + + if 'type' not in cfg: + if default_args is None or 'type' not in default_args: + raise KeyError( + '`cfg` or `default_args` must contain the key "type", ' + f'but got {cfg}\n{default_args}') + + if not isinstance(registry, Registry): + raise TypeError('registry must be a mmengine.Registry object, ' + f'but got {type(registry)}') + + if not (isinstance(default_args, + (dict, ConfigDict, Config)) or default_args is None): + raise TypeError( + 'default_args should be a dict, ConfigDict, Config or None, ' + f'but got {type(default_args)}') + + args = cfg.copy() + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + # Instance should be built under target scope, if `_scope_` is defined + # in cfg, current default scope should switch to specified scope + # temporarily. + scope = args.pop('_scope_', None) + with registry.switch_scope_and_registry(scope) as registry: + obj_type = args.pop('type') + if isinstance(obj_type, str): + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError( + f'{obj_type} is not in the {registry.scope}::{registry.name} registry. ' # noqa: E501 + f'Please check whether the value of `{obj_type}` is ' + 'correct or it was registered as expected. More details ' + 'can be found at ' + 'https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#import-the-custom-module' # noqa: E501 + ) + # this will include classes, functions, partial functions and more + elif callable(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + # If `obj_cls` inherits from `ManagerMixin`, it should be + # instantiated by `ManagerMixin.get_instance` to ensure that it + # can be accessed globally. 
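+        # (For example, mmengine's MMLogger is a ManagerMixin subclass, so an
+        # instance built here can later be fetched globally via
+        # MMLogger.get_instance.)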
+        if inspect.isclass(obj_cls) and \
+                issubclass(obj_cls, ManagerMixin):  # type: ignore
+            obj = obj_cls.get_instance(**args)  # type: ignore
+        else:
+            obj = obj_cls(**args)  # type: ignore
+
+        if (inspect.isclass(obj_cls) or inspect.isfunction(obj_cls)
+                or inspect.ismethod(obj_cls)):
+            print_log(
+                f'An `{obj_cls.__name__}` instance is built from '  # type: ignore # noqa: E501
+                'registry, and its implementation can be found in '
+                f'{obj_cls.__module__}',  # type: ignore
+                logger='current',
+                level=logging.DEBUG)
+        else:
+            print_log(
+                'An instance is built from registry, and its constructor '
+                f'is {obj_cls}',
+                logger='current',
+                level=logging.DEBUG)
+        return obj
+
+
+def build_runner_from_cfg(cfg: Union[dict, ConfigDict, Config],
+                          registry: Registry) -> 'Runner':
+    """Build a Runner object.
+
+    Examples:
+        >>> from mmengine.registry import Registry, build_runner_from_cfg
+        >>> RUNNERS = Registry('runners', build_func=build_runner_from_cfg)
+        >>> @RUNNERS.register_module()
+        >>> class CustomRunner(Runner):
+        >>>     def setup_env(self, env_cfg):
+        >>>         pass
+        >>> cfg = dict(runner_type='CustomRunner', ...)
+        >>> custom_runner = RUNNERS.build(cfg)
+
+    Args:
+        cfg (dict or ConfigDict or Config): Config dict. If "runner_type" key
+            exists, it will be used to build a custom runner. Otherwise, it
+            will be used to build a default runner.
+        registry (:obj:`Registry`): The registry to search the type from.
+
+    Returns:
+        object: The constructed runner object.
+    """
+    from ..config import Config, ConfigDict
+    from ..logging import print_log
+
+    assert isinstance(
+        cfg,
+        (dict, ConfigDict, Config
+         )), f'cfg should be a dict, ConfigDict or Config, but got {type(cfg)}'
+    assert isinstance(registry, Registry), (
+        'registry should be a mmengine.Registry object, '
+        f'but got {type(registry)}')
+
+    args = cfg.copy()
+    # Runner should be built under target scope, if `_scope_` is defined
+    # in cfg, current default scope should switch to specified scope
+    # temporarily.
+    scope = args.pop('_scope_', None)
+    with registry.switch_scope_and_registry(scope) as registry:
+        obj_type = args.get('runner_type', 'Runner')
+        if isinstance(obj_type, str):
+            runner_cls = registry.get(obj_type)
+            if runner_cls is None:
+                raise KeyError(
+                    f'{obj_type} is not in the {registry.name} registry. '
+                    f'Please check whether the value of `{obj_type}` is '
+                    'correct or it was registered as expected. More details '
+                    'can be found at https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#import-the-custom-module'  # noqa: E501
+                )
+        elif inspect.isclass(obj_type):
+            runner_cls = obj_type
+        else:
+            raise TypeError(
+                f'type must be a str or valid type, but got {type(obj_type)}')
+
+        runner = runner_cls.from_cfg(args)  # type: ignore
+        print_log(
+            f'An `{runner_cls.__name__}` instance is built from '  # type: ignore # noqa: E501
+            'registry, and its implementation can be found in '
+            f'{runner_cls.__module__}',  # type: ignore
+            logger='current',
+            level=logging.DEBUG)
+        return runner
+
+
+def build_model_from_cfg(
+        cfg: Union[dict, ConfigDict, Config],
+        registry: Registry,
+        default_args: Optional[Union[dict, 'ConfigDict', 'Config']] = None
+) -> 'nn.Module':
+    """Build a PyTorch model from config dict(s). Different from
+    ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built.
+
+    Args:
+        cfg (dict, list[dict]): The config of modules, which is either a config
+            dict or a list of config dicts. If cfg is a list, the built
+            modules will be wrapped with ``nn.Sequential``.
+        registry (:obj:`Registry`): A registry the module belongs to.
+        default_args (dict, optional): Default arguments to build the module.
+            Defaults to None.
+
+    Returns:
+        nn.Module: A built nn.Module.
+    """
+    from ..model import Sequential
+    if isinstance(cfg, list):
+        modules = [
+            build_from_cfg(_cfg, registry, default_args) for _cfg in cfg
+        ]
+        return Sequential(*modules)
+    else:
+        return build_from_cfg(cfg, registry, default_args)
+
+
+def build_optimizer_from_cfg(
+        cfg: Union[dict, ConfigDict, Config],
+        registry: Registry,
+        default_args: Optional[Union[dict, ConfigDict, Config]] = None) -> Any:
+    # Since torch 2.5, torch ships its own `Adafactor`; the torch version is
+    # registered under the name `torch_Adafactor`, so emit a hint when a
+    # config still requests `Adafactor`. `cfg.get` avoids a KeyError when
+    # the config has no explicit `type` key.
+    if cfg.get('type') == 'Adafactor' and digit_version(
+            torch.__version__) >= digit_version('2.5.0'):
+        from ..logging import print_log
+        print_log(
+            'the torch version of Adafactor is registered as torch_Adafactor')
+    return build_from_cfg(cfg, registry, default_args)
+
+
+def build_scheduler_from_cfg(
+    cfg: Union[dict, ConfigDict, Config],
+    registry: Registry,
+    default_args: Optional[Union[dict, ConfigDict, Config]] = None
+) -> '_ParamScheduler':
+    """Builds a ``ParamScheduler`` instance from config.
+
+    ``ParamScheduler`` supports building an instance by its constructor or by
+    the method ``build_iter_from_epoch``. Therefore, its registry needs a
+    build function to handle both cases.
+
+    Args:
+        cfg (dict or ConfigDict or Config): Config dictionary. If it contains
+            the key ``convert_to_iter_based``, the instance will be built by
+            the method ``build_iter_from_epoch``; otherwise the instance will
+            be built by its constructor.
+        registry (:obj:`Registry`): The ``PARAM_SCHEDULERS`` registry.
+        default_args (dict or ConfigDict or Config, optional): Default
+            initialization arguments. It must contain key ``optimizer``. If
+            ``convert_to_iter_based`` is defined in ``cfg``, it must
+            additionally contain key ``epoch_length``. Defaults to None.
+
+    Returns:
+        object: The constructed ``ParamScheduler``.
+    """
+    assert isinstance(
+        cfg,
+        (dict, ConfigDict, Config
+         )), f'cfg should be a dict, ConfigDict or Config, but got {type(cfg)}'
+    assert isinstance(registry, Registry), (
+        'registry should be a mmengine.Registry object, '
+        f'but got {type(registry)}')
+
+    args = cfg.copy()
+    if default_args is not None:
+        for name, value in default_args.items():
+            args.setdefault(name, value)
+    scope = args.pop('_scope_', None)
+    with registry.switch_scope_and_registry(scope) as registry:
+        convert_to_iter = args.pop('convert_to_iter_based', False)
+        if convert_to_iter:
+            scheduler_type = args.pop('type')
+            assert 'epoch_length' in args and args.get('by_epoch', True), (
+                'Only epoch-based parameter schedulers can be converted to '
+                'iter-based, and `epoch_length` should be set')
+            if isinstance(scheduler_type, str):
+                scheduler_cls = registry.get(scheduler_type)
+                if scheduler_cls is None:
+                    raise KeyError(
+                        f'{scheduler_type} is not in the {registry.name} '
+                        'registry. Please check whether the value of '
+                        f'`{scheduler_type}` is correct or it was registered '
+                        'as expected. More details can be found at '
+                        'https://mmengine.readthedocs.io/en/latest/advanced_tutorials/config.html#import-the-custom-module'  # noqa: E501
+                    )
+            elif inspect.isclass(scheduler_type):
+                scheduler_cls = scheduler_type
+            else:
+                raise TypeError('type must be a str or valid type, but got '
+                                f'{type(scheduler_type)}')
+            return scheduler_cls.build_iter_from_epoch(  # type: ignore
+                **args)
+        else:
+            args.pop('epoch_length', None)
+            return build_from_cfg(args, registry)
diff --git a/head_extractor/build/lib/mmengine/registry/default_scope.py b/head_extractor/build/lib/mmengine/registry/default_scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f1afcaba6dadfd6542fd58fd90dc7b6948c9e7
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/registry/default_scope.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import time
+from contextlib import contextmanager
+from typing import Generator, Optional
+
+from mmengine.utils.manager import ManagerMixin, _accquire_lock, _release_lock
+
+
+class DefaultScope(ManagerMixin):
+    """Scope of current task used to reset the current registry, which can be
+    accessed globally.
+
+    Consider the case of resetting the current ``Registry`` by
+    ``default_scope`` in an internal module that cannot access the runner
+    directly: it is difficult to get the ``default_scope`` defined in
+    ``Runner``. However, if ``Runner`` creates a ``DefaultScope`` instance
+    with the given ``default_scope``, internal modules can get the
+    ``default_scope`` by ``DefaultScope.get_current_instance`` everywhere.
+
+    Args:
+        name (str): Name of default scope for global access.
+        scope_name (str): Scope of current task.
+
+    Examples:
+        >>> from mmengine.registry import DefaultScope
+        >>> # Define default scope in runner.
+        >>> DefaultScope.get_instance('task', scope_name='mmdet')
+        >>> # Get default scope globally.
+        >>> scope_name = DefaultScope.get_instance('task').scope_name
+    """
+
+    def __init__(self, name: str, scope_name: str):
+        super().__init__(name)
+        assert isinstance(
+            scope_name,
+            str), (f'scope_name should be a string, but got {scope_name}')
+        self._scope_name = scope_name
+
+    @property
+    def scope_name(self) -> str:
+        """
+        Returns:
+            str: Get current scope.
+        """
+        return self._scope_name
+
+    @classmethod
+    def get_current_instance(cls) -> Optional['DefaultScope']:
+        """Get latest created default scope.
+
+        Since ``default_scope`` is an optional argument for
+        ``Registry.build``, ``get_current_instance`` should return ``None``
+        if there is no ``DefaultScope`` created.
+
+        Examples:
+            >>> default_scope = DefaultScope.get_current_instance()
+            >>> # There is no `DefaultScope` created yet, so
+            >>> # `get_current_instance` returns `None`.
+            >>> default_scope = DefaultScope.get_instance(
+            >>>     'instance_name', scope_name='mmengine')
+            >>> default_scope.scope_name
+            mmengine
+            >>> default_scope = DefaultScope.get_current_instance()
+            >>> default_scope.scope_name
+            mmengine
+
+        Returns:
+            Optional[DefaultScope]: Return None if no ``DefaultScope``
+                instance has been created yet; otherwise return the
+                latest created DefaultScope instance.
+ """ + _accquire_lock() + if cls._instance_dict: + instance = super().get_current_instance() + else: + instance = None + _release_lock() + return instance + + @classmethod + @contextmanager + def overwrite_default_scope(cls, scope_name: Optional[str]) -> Generator: + """overwrite the current default scope with `scope_name`""" + if scope_name is None: + yield + else: + tmp = copy.deepcopy(cls._instance_dict) + # To avoid create an instance with the same name. + time.sleep(1e-6) + cls.get_instance(f'overwrite-{time.time()}', scope_name=scope_name) + try: + yield + finally: + cls._instance_dict = tmp diff --git a/head_extractor/build/lib/mmengine/registry/registry.py b/head_extractor/build/lib/mmengine/registry/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..31fd44d82769c0171714888ffbf9ccea983f519b --- /dev/null +++ b/head_extractor/build/lib/mmengine/registry/registry.py @@ -0,0 +1,669 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import logging +import sys +from collections.abc import Callable +from contextlib import contextmanager +from importlib import import_module +from typing import Any, Dict, Generator, List, Optional, Tuple, Type, Union + +from rich.console import Console +from rich.table import Table + +from mmengine.config.utils import MODULE2PACKAGE +from mmengine.utils import get_object_from_string, is_seq_of +from .default_scope import DefaultScope + + +class Registry: + """A registry to map strings to classes or functions. + + Registered object could be built from registry. Meanwhile, registered + functions could be called from registry. + + Args: + name (str): Registry name. + build_func (callable, optional): A function to construct instance + from Registry. :func:`build_from_cfg` is used if neither ``parent`` + or ``build_func`` is specified. If ``parent`` is specified and + ``build_func`` is not given, ``build_func`` will be inherited + from ``parent``. Defaults to None. + parent (:obj:`Registry`, optional): Parent registry. The class + registered in children registry could be built from parent. + Defaults to None. + scope (str, optional): The scope of registry. It is the key to search + for children registry. If not specified, scope will be the name of + the package where class is defined, e.g. mmdet, mmcls, mmseg. + Defaults to None. + locations (list): The locations to import the modules registered + in this registry. Defaults to []. + New in version 0.4.0. 
+ + Examples: + >>> # define a registry + >>> MODELS = Registry('models') + >>> # register `ResNet` in `MODELS` + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + >>> # build model from `MODELS` + >>> resnet = MODELS.build(dict(type='ResNet')) + >>> @MODELS.register_module() + >>> def resnet50(): + >>> pass + >>> resnet = MODELS.build(dict(type='resnet50')) + + >>> # hierarchical registry + >>> DETECTORS = Registry('detectors', parent=MODELS, scope='det') + >>> @DETECTORS.register_module() + >>> class FasterRCNN: + >>> pass + >>> fasterrcnn = DETECTORS.build(dict(type='FasterRCNN')) + + >>> # add locations to enable auto import + >>> DETECTORS = Registry('detectors', parent=MODELS, + >>> scope='det', locations=['det.models.detectors']) + >>> # define this class in 'det.models.detectors' + >>> @DETECTORS.register_module() + >>> class MaskRCNN: + >>> pass + >>> # The registry will auto import det.models.detectors.MaskRCNN + >>> maskrcnn = DETECTORS.build(dict(type='det.MaskRCNN')) + + More advanced usages can be found at + https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html. + """ + + def __init__(self, + name: str, + build_func: Optional[Callable] = None, + parent: Optional['Registry'] = None, + scope: Optional[str] = None, + locations: List = []): + from .build_functions import build_from_cfg + self._name = name + self._module_dict: Dict[str, Type] = dict() + self._children: Dict[str, 'Registry'] = dict() + self._locations = locations + self._imported = False + + if scope is not None: + assert isinstance(scope, str) + self._scope = scope + else: + self._scope = self.infer_scope() + + # See https://mypy.readthedocs.io/en/stable/common_issues.html# + # variables-vs-type-aliases for the use + self.parent: Optional['Registry'] + if parent is not None: + assert isinstance(parent, Registry) + parent._add_child(self) + self.parent = parent + else: + self.parent = None + + # self.build_func will be set with the following priority: + # 1. build_func + # 2. parent.build_func + # 3. build_from_cfg + self.build_func: Callable + if build_func is None: + if self.parent is not None: + self.build_func = self.parent.build_func + else: + self.build_func = build_from_cfg + else: + self.build_func = build_func + + def __len__(self): + return len(self._module_dict) + + def __contains__(self, key): + return self.get(key) is not None + + def __repr__(self): + table = Table(title=f'Registry of {self._name}') + table.add_column('Names', justify='left', style='cyan') + table.add_column('Objects', justify='left', style='green') + + for name, obj in sorted(self._module_dict.items()): + table.add_row(name, str(obj)) + + console = Console() + with console.capture() as capture: + console.print(table, end='') + + return capture.get() + + @staticmethod + def infer_scope() -> str: + """Infer the scope of registry. + + The name of the package where registry is defined will be returned. + + Returns: + str: The inferred scope name. + + Examples: + >>> # in mmdet/models/backbone/resnet.py + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + >>> # The scope of ``ResNet`` will be ``mmdet``. + """ + from ..logging import print_log + + # `sys._getframe(k)` returns the frame object k calls below the + # top of the stack.
The call stack for `infer_scope` can be listed as + # follows: + # frame-0: `infer_scope` itself + # frame-1: `__init__` of `Registry` which calls the `infer_scope` + # frame-2: Where the `Registry(...)` is called + module = inspect.getmodule(sys._getframe(2)) + if module is not None: + filename = module.__name__ + split_filename = filename.split('.') + scope = split_filename[0] + else: + # use "mmengine" to handle cases where the scope cannot be + # inferred, such as initializing a Registry in interactive mode + scope = 'mmengine' + print_log( + 'set scope as "mmengine" when the scope cannot be inferred. ' + 'You can silence this warning by passing a "scope" argument ' + 'to Registry like `Registry(name, scope="toy")`', + logger='current', + level=logging.WARNING) + + return scope + + @staticmethod + def split_scope_key(key: str) -> Tuple[Optional[str], str]: + """Split scope and key. + + The first scope will be split from key. + + Returns: + tuple[str | None, str]: The former element is the first scope of + the key, which can be ``None``. The latter is the remaining key. + + Examples: + >>> Registry.split_scope_key('mmdet.ResNet') + 'mmdet', 'ResNet' + >>> Registry.split_scope_key('ResNet') + None, 'ResNet' + """ + split_index = key.find('.') + if split_index != -1: + return key[:split_index], key[split_index + 1:] + else: + return None, key + + @property + def name(self): + return self._name + + @property + def scope(self): + return self._scope + + @property + def module_dict(self): + return self._module_dict + + @property + def children(self): + return self._children + + @property + def root(self): + return self._get_root_registry() + + @contextmanager + def switch_scope_and_registry(self, scope: Optional[str]) -> Generator: + """Temporarily switch default scope to the target scope, and get the + corresponding registry. + + If the registry of the corresponding scope exists, yield the + registry, otherwise yield the registry itself. + + Args: + scope (str, optional): The target scope. + + Examples: + >>> from mmengine.registry import Registry, DefaultScope, MODELS + >>> import time + >>> # External Registry + >>> MMDET_MODELS = Registry('mmdet_model', scope='mmdet', + >>> parent=MODELS) + >>> MMCLS_MODELS = Registry('mmcls_model', scope='mmcls', + >>> parent=MODELS) + >>> # Local Registry + >>> CUSTOM_MODELS = Registry('custom_model', scope='custom', + >>> parent=MODELS) + >>> + >>> # Initiate DefaultScope + >>> DefaultScope.get_instance(f'scope_{time.time()}', + >>> scope_name='custom') + >>> # Check default scope + >>> DefaultScope.get_current_instance().scope_name + custom + >>> # Switch to mmcls scope and get `MMCLS_MODELS` registry. + >>> with CUSTOM_MODELS.switch_scope_and_registry(scope='mmcls') as registry: + >>> DefaultScope.get_current_instance().scope_name + mmcls + >>> registry.scope + mmcls + >>> # Nested switch scope + >>> with CUSTOM_MODELS.switch_scope_and_registry(scope='mmdet') as mmdet_registry: + >>> DefaultScope.get_current_instance().scope_name + mmdet + >>> mmdet_registry.scope + mmdet + >>> with CUSTOM_MODELS.switch_scope_and_registry(scope='mmcls') as mmcls_registry: + >>> DefaultScope.get_current_instance().scope_name + mmcls + >>> mmcls_registry.scope + mmcls + >>> + >>> # Check switch back to original scope. + >>> DefaultScope.get_current_instance().scope_name + custom + """ # noqa: E501 + from ..logging import print_log + + # Switch to the given scope temporarily.
If the corresponding registry + # can be found in root registry, return the registry under the scope, + # otherwise return the registry itself. + with DefaultScope.overwrite_default_scope(scope): + # Get the global default scope + default_scope = DefaultScope.get_current_instance() + # Get registry by scope + if default_scope is not None: + scope_name = default_scope.scope_name + try: + import_module(f'{scope_name}.registry') + except (ImportError, AttributeError, ModuleNotFoundError): + if scope in MODULE2PACKAGE: + print_log( + f'{scope} is not installed and its ' + 'modules will not be registered. If you ' + 'want to use modules defined in ' + f'{scope}, please install {scope} by ' + f'`pip install {MODULE2PACKAGE[scope]}`.', + logger='current', + level=logging.WARNING) + else: + print_log( + f'Failed to import `{scope}.registry`, ' + f'make sure registry.py exists in the `{scope}` ' + 'package.', + logger='current', + level=logging.WARNING) + root = self._get_root_registry() + registry = root._search_child(scope_name) + if registry is None: + # if `default_scope` cannot be found, fall back to the + # argument `registry` + print_log( + f'Failed to search registry with scope "{scope_name}" ' + f'in the "{root.name}" registry tree. ' + f'As a workaround, the current "{self.name}" registry ' + f'in "{self.scope}" is used to build instance. This ' + 'may cause unexpected failure when running the built ' + f'modules. Please check whether "{scope_name}" is a ' + 'correct scope, or whether the registry is ' + 'initialized.', + logger='current', + level=logging.WARNING) + registry = self + # If there is no built default scope, just return the current + # registry. + else: + registry = self + yield registry + + def _get_root_registry(self) -> 'Registry': + """Return the root registry.""" + root = self + while root.parent is not None: + root = root.parent + return root + + def import_from_location(self) -> None: + """Import modules from the predefined locations in ``self._locations``.""" + if not self._imported: + # Avoid circular import + from ..logging import print_log + + # avoid BC breaking + if len(self._locations) == 0 and self.scope in MODULE2PACKAGE: + print_log( + f'The "{self.name}" registry in {self.scope} did not ' + 'set import location. Fallback to call ' + f'`{self.scope}.utils.register_all_modules` ' + 'instead.', + logger='current', + level=logging.DEBUG) + try: + module = import_module(f'{self.scope}.utils') + except (ImportError, AttributeError, ModuleNotFoundError): + if self.scope in MODULE2PACKAGE: + print_log( + f'{self.scope} is not installed and its ' + 'modules will not be registered. If you ' + 'want to use modules defined in ' + f'{self.scope}, please install {self.scope} by ' + f'`pip install {MODULE2PACKAGE[self.scope]}`.', + logger='current', + level=logging.WARNING) + else: + print_log( + f'Failed to import {self.scope} and register ' + 'its modules, please make sure you ' + 'have registered the module manually.', + logger='current', + level=logging.WARNING) + else: + # The import errors triggered during the registration + # may be more complex; just raise the error here to avoid + # causing more implicit registry errors + # like `xxx` not found in `yyy` registry.
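+ # [Editor's sketch, not upstream code] The two auto-import paths, + # with hypothetical names: a registry created as + # Registry('models', scope='toy', locations=['toy.models']) + # lazily runs import_module('toy.models') on the first get()/build(), + # so classes decorated there register themselves; without `locations`, + # the legacy branch below imports toy.utils and calls + # register_all_modules(False).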
+ module.register_all_modules(False) # type: ignore + + for loc in self._locations: + import_module(loc) + print_log( + f"Modules of {self.scope}'s {self.name} registry have " + f'been automatically imported from {loc}', + logger='current', + level=logging.DEBUG) + self._imported = True + + def get(self, key: str) -> Optional[Type]: + """Get the registry record. + + If ``key`` represents the whole object name with its module + information, for example, ``mmengine.model.BaseModel``, ``get`` + will directly return the class object :class:`BaseModel`. + + Otherwise, it will first parse ``key`` and check whether it + contains a scope name. The logic to search for ``key``: + + - ``key`` does not contain a scope name, i.e., it is purely a module + name like "ResNet": :meth:`get` will search for ``ResNet`` from the + current registry to its parent or ancestors until finding it. + + - ``key`` contains a scope name and it is equal to the scope of the + current registry (e.g., "mmcls"), e.g., "mmcls.ResNet": :meth:`get` + will only search for ``ResNet`` in the current registry. + + - ``key`` contains a scope name and it is not equal to the scope of + the current registry (e.g., "mmdet"), e.g., "mmcls.FCNet": If the + scope exists in its children, :meth:`get` will get "FCNet" from + them. If not, :meth:`get` will first get the root registry and let + the root registry call its own :meth:`get` method. + + Args: + key (str): Name of the registered item, e.g., the class name in + string format. + + Returns: + Type or None: Return the corresponding class if ``key`` exists, + otherwise return None. + + Examples: + >>> # define a registry + >>> MODELS = Registry('models') + >>> # register `ResNet` to `MODELS` + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + >>> resnet_cls = MODELS.get('ResNet') + + >>> # hierarchical registry + >>> DETECTORS = Registry('detector', parent=MODELS, scope='det') + >>> # `ResNet` does not exist in `DETECTORS` but `get` method + >>> # will try to search from its parents or ancestors + >>> resnet_cls = DETECTORS.get('ResNet') + >>> CLASSIFIER = Registry('classifier', parent=MODELS, scope='cls') + >>> @CLASSIFIER.register_module() + >>> class MobileNet: + >>> pass + >>> # `get` from its sibling registries + >>> mobilenet_cls = DETECTORS.get('cls.MobileNet') + """ + # Avoid circular import + from ..logging import print_log + + if not isinstance(key, str): + raise TypeError( + 'The key argument of `Registry.get` must be a str, ' + f'got {type(key)}') + + scope, real_key = self.split_scope_key(key) + obj_cls = None + registry_name = self.name + scope_name = self.scope + + # lazy import the modules to register them into the registry + self.import_from_location() + + if scope is None or scope == self._scope: + # get from self + if real_key in self._module_dict: + obj_cls = self._module_dict[real_key] + elif scope is None: + # try to get the target from its parent or ancestors + parent = self.parent + while parent is not None: + if real_key in parent._module_dict: + obj_cls = parent._module_dict[real_key] + registry_name = parent.name + scope_name = parent.scope + break + parent = parent.parent + else: + # import the registry to add the nodes into the registry tree + try: + import_module(f'{scope}.registry') + print_log( + f'Registry node of {scope} has been automatically ' + 'imported.', + logger='current', + level=logging.DEBUG) + except (ImportError, AttributeError, ModuleNotFoundError): + print_log( + f'Cannot auto import {scope}.registry, please check ' + f'whether the package
"{scope}" is installed correctly ' + 'or import the registry manually.', + logger='current', + level=logging.DEBUG) + # get from self._children + if scope in self._children: + obj_cls = self._children[scope].get(real_key) + registry_name = self._children[scope].name + scope_name = scope + else: + root = self._get_root_registry() + + if scope != root._scope and scope not in root._children: + # If not skip directly, `root.get(key)` will recursively + # call itself until RecursionError is thrown. + pass + else: + obj_cls = root.get(key) + + if obj_cls is None: + # Actually, it's strange to implement this `try ... except` to + # get the object by its name in `Registry.get`. However, If we + # want to build the model using a configuration like + # `dict(type='mmengine.model.BaseModel')`, which can + # be dumped by lazy import config, we need this code snippet + # for `Registry.get` to work. + try: + obj_cls = get_object_from_string(key) + except Exception: + raise RuntimeError(f'Failed to get {key}') + + if obj_cls is not None: + # For some rare cases (e.g. obj_cls is a partial function), obj_cls + # doesn't have `__name__`. Use default value to prevent error + cls_name = getattr(obj_cls, '__name__', str(obj_cls)) + print_log( + f'Get class `{cls_name}` from "{registry_name}"' + f' registry in "{scope_name}"', + logger='current', + level=logging.DEBUG) + + return obj_cls + + def _search_child(self, scope: str) -> Optional['Registry']: + """Depth-first search for the corresponding registry in its children. + + Note that the method only search for the corresponding registry from + the current registry. Therefore, if we want to search from the root + registry, :meth:`_get_root_registry` should be called to get the + root registry first. + + Args: + scope (str): The scope name used for searching for its + corresponding registry. + + Returns: + Registry or None: Return the corresponding registry if ``scope`` + exists, otherwise return None. + """ + if self._scope == scope: + return self + + for child in self._children.values(): + registry = child._search_child(scope) + if registry is not None: + return registry + + return None + + def build(self, cfg: dict, *args, **kwargs) -> Any: + """Build an instance. + + Build an instance by calling :attr:`build_func`. + + Args: + cfg (dict): Config dict needs to be built. + + Returns: + Any: The constructed object. + + Examples: + >>> from mmengine import Registry + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> def __init__(self, depth, stages=4): + >>> self.depth = depth + >>> self.stages = stages + >>> cfg = dict(type='ResNet', depth=50) + >>> model = MODELS.build(cfg) + """ + return self.build_func(cfg, *args, **kwargs, registry=self) + + def _add_child(self, registry: 'Registry') -> None: + """Add a child for a registry. + + Args: + registry (:obj:`Registry`): The ``registry`` will be added as a + child of the ``self``. + """ + + assert isinstance(registry, Registry) + assert registry.scope is not None + assert registry.scope not in self.children, \ + f'scope {registry.scope} exists in {self.name} registry' + self.children[registry.scope] = registry + + def _register_module(self, + module: Type, + module_name: Optional[Union[str, List[str]]] = None, + force: bool = False) -> None: + """Register a module. + + Args: + module (type): Module to be registered. Typically a class or a + function, but generally all ``Callable`` are acceptable. 
+ module_name (str or list of str, optional): The module name to be + registered. If not specified, the class name will be used. + Defaults to None. + force (bool): Whether to override an existing class with the same + name. Defaults to False. + """ + if not callable(module): + raise TypeError(f'module must be Callable, but got {type(module)}') + + if module_name is None: + module_name = module.__name__ + if isinstance(module_name, str): + module_name = [module_name] + for name in module_name: + if not force and name in self._module_dict: + existed_module = self.module_dict[name] + raise KeyError(f'{name} is already registered in {self.name} ' + f'at {existed_module.__module__}') + self._module_dict[name] = module + + def register_module( + self, + name: Optional[Union[str, List[str]]] = None, + force: bool = False, + module: Optional[Type] = None) -> Union[type, Callable]: + """Register a module. + + A record will be added to ``self._module_dict``, whose key is the class + name or the specified name, and value is the class itself. + It can be used as a decorator or a normal function. + + Args: + name (str or list of str, optional): The module name to be + registered. If not specified, the class name will be used. + force (bool): Whether to override an existing class with the same + name. Defaults to False. + module (type, optional): Module class or function to be registered. + Defaults to None. + + Examples: + >>> backbones = Registry('backbone') + >>> # as a decorator + >>> @backbones.register_module() + >>> class ResNet: + >>> pass + >>> backbones = Registry('backbone') + >>> @backbones.register_module(name='mnet') + >>> class MobileNet: + >>> pass + + >>> # as a normal function + >>> class ResNet: + >>> pass + >>> backbones.register_module(module=ResNet) + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + + # raise the error ahead of time + if not (name is None or isinstance(name, str) or is_seq_of(name, str)): + raise TypeError( + 'name must be None, an instance of str, or a sequence of str, ' + f'but got {type(name)}') + + # use it as a normal method: x.register_module(module=SomeClass) + if module is not None: + self._register_module(module=module, module_name=name, force=force) + return module + + # use it as a decorator: @x.register_module() + def _register(module): + self._register_module(module=module, module_name=name, force=force) + return module + + return _register diff --git a/head_extractor/build/lib/mmengine/registry/root.py b/head_extractor/build/lib/mmengine/registry/root.py new file mode 100644 index 0000000000000000000000000000000000000000..8acd157b316dcb7e888ce8180b294fa45f1253dd --- /dev/null +++ b/head_extractor/build/lib/mmengine/registry/root.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMEngine provides more than 20 root registries to support using modules +across projects. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html.
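+ + Example (editor's sketch; ``myproj`` and ``MyNet`` are hypothetical + names): a downstream project usually creates children of these roots, + e.g. ``Registry('model', parent=MODELS, scope='myproj')``, after which + ``MODELS.build(dict(type='myproj.MyNet'))`` can build across projects.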
+""" + +#from .build_functions import (build_model_from_cfg, build_runner_from_cfg, +# build_scheduler_from_cfg) + +from .build_functions import (build_model_from_cfg, build_optimizer_from_cfg, + build_runner_from_cfg, build_scheduler_from_cfg) +from .registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry('runner', build_func=build_runner_from_cfg) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry('runner constructor') +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry('loop') +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry('hook') + +# manage all kinds of strategies like `NativeStrategy` and `DDPStrategy` +STRATEGIES = Registry('strategy') + +# manage data-related modules +DATASETS = Registry('dataset') +DATA_SAMPLERS = Registry('data sampler') +TRANSFORMS = Registry('transform') + +# mangage all kinds of modules inheriting `nn.Module` +MODELS = Registry('model', build_model_from_cfg) +# mangage all kinds of model wrappers like 'MMDistributedDataParallel' +MODEL_WRAPPERS = Registry('model_wrapper') +# mangage all kinds of weight initialization modules like `Uniform` +WEIGHT_INITIALIZERS = Registry('weight initializer') + +# mangage all kinds of optimizers like `SGD` and `Adam` +#OPTIMIZERS = Registry('optimizer') +OPTIMIZERS = Registry('optimizer', build_func=build_optimizer_from_cfg) +# manage optimizer wrapper +OPTIM_WRAPPERS = Registry('optim_wrapper') +# manage constructors that customize the optimization hyperparameters. +OPTIM_WRAPPER_CONSTRUCTORS = Registry('optimizer wrapper constructor') +# mangage all kinds of parameter schedulers like `MultiStepLR` +PARAM_SCHEDULERS = Registry( + 'parameter scheduler', build_func=build_scheduler_from_cfg) + +# manage all kinds of metrics +METRICS = Registry('metric') +# manage evaluator +EVALUATOR = Registry('evaluator') + +# manage task-specific modules like anchor generators and box coders +TASK_UTILS = Registry('task util') + +# manage visualizer +VISUALIZERS = Registry('visualizer') +# manage visualizer backend +VISBACKENDS = Registry('vis_backend') + +# manage logprocessor +LOG_PROCESSORS = Registry('log_processor') + +# manage inferencer +INFERENCERS = Registry('inferencer') + +# manage function +FUNCTIONS = Registry('function') diff --git a/head_extractor/build/lib/mmengine/registry/utils.py b/head_extractor/build/lib/mmengine/registry/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..568970bbf121e2cb5e659c27351995a1f46926ae --- /dev/null +++ b/head_extractor/build/lib/mmengine/registry/utils.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import logging +import os.path as osp +from typing import Optional + +from mmengine.fileio import dump +from mmengine.logging import print_log +from . import root +from .default_scope import DefaultScope +from .registry import Registry + + +def traverse_registry_tree(registry: Registry, verbose: bool = True) -> list: + """Traverse the whole registry tree from any given node, and collect + information of all registered modules in this registry tree. + + Args: + registry (Registry): a registry node in the registry tree. + verbose (bool): Whether to print log. Defaults to True + + Returns: + list: Statistic results of all modules in each node of the registry + tree. 
+ """ + root_registry = registry.root + modules_info = [] + + def _dfs_registry(_registry): + if isinstance(_registry, Registry): + num_modules = len(_registry.module_dict) + scope = _registry.scope + registry_info = dict(num_modules=num_modules, scope=scope) + for name, registered_class in _registry.module_dict.items(): + folder = '/'.join(registered_class.__module__.split('.')[:-1]) + if folder in registry_info: + registry_info[folder].append(name) + else: + registry_info[folder] = [name] + if verbose: + print_log( + f"Find {num_modules} modules in {scope}'s " + f"'{_registry.name}' registry ", + logger='current') + modules_info.append(registry_info) + else: + return + for _, child in _registry.children.items(): + _dfs_registry(child) + + _dfs_registry(root_registry) + return modules_info + + +def count_registered_modules(save_path: Optional[str] = None, + verbose: bool = True) -> dict: + """Scan all modules in MMEngine's root and child registries and dump to + json. + + Args: + save_path (str, optional): Path to save the json file. + verbose (bool): Whether to print log. Defaults to True. + + Returns: + dict: Statistic results of all registered modules. + """ + # import modules to trigger registering + import mmengine.dataset + import mmengine.evaluator + import mmengine.hooks + import mmengine.model + import mmengine.optim + import mmengine.runner + import mmengine.visualization # noqa: F401 + + registries_info = {} + # traverse all registries in MMEngine + for item in dir(root): + if not item.startswith('__'): + registry = getattr(root, item) + if isinstance(registry, Registry): + registries_info[item] = traverse_registry_tree( + registry, verbose) + scan_data = dict( + scan_date=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + registries=registries_info) + if verbose: + print_log( + f'Finish registry analysis, got: {scan_data}', logger='current') + if save_path is not None: + json_path = osp.join(save_path, 'modules_statistic_results.json') + dump(scan_data, json_path, indent=2) + print_log(f'Result has been saved to {json_path}', logger='current') + return scan_data + + +def init_default_scope(scope: str) -> None: + """Initialize the given default scope. + + Args: + scope (str): The name of the default scope. + """ + never_created = DefaultScope.get_current_instance( + ) is None or not DefaultScope.check_instance_created(scope) + if never_created: + DefaultScope.get_instance(scope, scope_name=scope) + return + current_scope = DefaultScope.get_current_instance() # type: ignore + if current_scope.scope_name != scope: # type: ignore + print_log( + 'The current default scope ' # type: ignore + f'"{current_scope.scope_name}" is not "{scope}", ' + '`init_default_scope` will force set the current' + f'default scope to "{scope}".', + logger='current', + level=logging.WARNING) + # avoid name conflict + new_instance_name = f'{scope}-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name=scope) diff --git a/head_extractor/build/lib/mmengine/runner/__init__.py b/head_extractor/build/lib/mmengine/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b00f8e83915854e83254cb9152d97e4df4348b8c --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from ._flexible_runner import FlexibleRunner +from .activation_checkpointing import turn_on_activation_checkpointing +from .amp import autocast +from .base_loop import BaseLoop +from .checkpoint import (CheckpointLoader, find_latest_checkpoint, + get_deprecated_model_names, get_external_models, + get_mmcls_models, get_state_dict, + get_torchvision_models, load_checkpoint, + load_state_dict, save_checkpoint, weights_to_cpu) +from .log_processor import LogProcessor +from .loops import EpochBasedTrainLoop, IterBasedTrainLoop, TestLoop, ValLoop +from .priority import Priority, get_priority +from .runner import Runner +from .utils import set_random_seed + +__all__ = [ + 'BaseLoop', 'load_state_dict', 'get_torchvision_models', + 'get_external_models', 'get_mmcls_models', 'get_deprecated_model_names', + 'CheckpointLoader', 'load_checkpoint', 'weights_to_cpu', 'get_state_dict', + 'save_checkpoint', 'EpochBasedTrainLoop', 'IterBasedTrainLoop', 'ValLoop', + 'TestLoop', 'Runner', 'get_priority', 'Priority', 'find_latest_checkpoint', + 'autocast', 'LogProcessor', 'set_random_seed', 'FlexibleRunner', + 'turn_on_activation_checkpointing' +] diff --git a/head_extractor/build/lib/mmengine/runner/_flexible_runner.py b/head_extractor/build/lib/mmengine/runner/_flexible_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..6d727fb4d5f6d772f940d21b8c4de94558626b4f --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/_flexible_runner.py @@ -0,0 +1,1650 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging +import os.path as osp +import pickle +import warnings +from functools import partial +from typing import Callable, Dict, List, Optional, Union + +import torch.nn as nn +from torch.utils.data import DataLoader + +import mmengine +from mmengine._strategy import BaseStrategy +from mmengine.config import Config, ConfigDict +from mmengine.dataset import worker_init_fn as default_worker_init_fn +from mmengine.dist import get_rank, infer_launcher, master_only +from mmengine.evaluator import Evaluator +from mmengine.fileio import FileClient, join_path +from mmengine.hooks import Hook +from mmengine.logging import MessageHub, print_log +from mmengine.optim import OptimWrapper, OptimWrapperDict, _ParamScheduler +from mmengine.registry import (DATA_SAMPLERS, DATASETS, EVALUATOR, FUNCTIONS, + HOOKS, LOG_PROCESSORS, LOOPS, RUNNERS, + STRATEGIES, VISUALIZERS, DefaultScope) +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION +from mmengine.visualization import Visualizer +from .base_loop import BaseLoop +from .checkpoint import find_latest_checkpoint +from .log_processor import LogProcessor +from .loops import EpochBasedTrainLoop, IterBasedTrainLoop, TestLoop, ValLoop +from .priority import Priority, get_priority +from .utils import _get_batch_size + +ConfigType = Union[Dict, Config, ConfigDict] +ParamSchedulerType = Union[List[_ParamScheduler], Dict[str, + List[_ParamScheduler]]] +OptimWrapperType = Union[OptimWrapper, OptimWrapperDict] + + +@RUNNERS.register_module() +class FlexibleRunner: + """A training helper for PyTorch. + + Runner object can be built from config by ``runner = Runner.from_cfg(cfg)`` + where the ``cfg`` usually contains training, validation, and test-related + configurations to build corresponding components. We usually use the + same config to launch training, testing, and validation tasks. 
However, + only some of these components are necessary at the same time, e.g., + testing a model does not need training or validation-related components. + + To avoid repeatedly modifying config, the construction of ``Runner`` adopts + lazy initialization to only initialize components when they are going to be + used. Therefore, the model is always initialized at the beginning, and + training, validation, and, testing related components are only initialized + when calling ``runner.train()``, ``runner.val()``, and ``runner.test()``, + respectively. + + Warning: + This is an experimental feature, and its interface is subject to + change. + + Args: + model (:obj:`torch.nn.Module` or dict): The model to be run. It can be + a dict used for build a model. + + Kwargs: + work_dir (str, optional): The working directory to save checkpoints. + The logs will be saved in the subdirectory of `work_dir` named + :attr:`timestamp`. Defaults to 'work_dir'. + experiment_name (str, optional): Name of current experiment. If not + specified, timestamp will be used as ``experiment_name``. + Defaults to None. + train_dataloader (Dataloader or dict, optional): A dataloader object or + a dict to build a dataloader. If ``None`` is given, it means + skipping training steps. Defaults to None. + See :meth:`build_dataloader` for more details. + optim_wrapper (OptimWrapper or dict, optional): + Computing gradient of model parameters. If specified, + :attr:`train_dataloader` should also be specified. If automatic + mixed precision or gradient accmulation + training is required. The type of ``optim_wrapper`` should be + AmpOptimizerWrapper. See :meth:`build_optim_wrapper` for + examples. Defaults to None. + param_scheduler (_ParamScheduler or dict or list, optional): + Parameter scheduler for updating optimizer parameters. If + specified, :attr:`optimizer` should also be specified. + Defaults to None. + See :meth:`build_param_scheduler` for examples. + train_cfg (dict, optional): A dict to build a training loop. If it does + not provide "type" key, it should contain "by_epoch" to decide + which type of training loop :class:`EpochBasedTrainLoop` or + :class:`IterBasedTrainLoop` should be used. If ``train_cfg`` + specified, :attr:`train_dataloader` should also be specified. + Defaults to None. See :meth:`build_train_loop` for more details. + val_dataloader (Dataloader or dict, optional): A dataloader object or + a dict to build a dataloader. If ``None`` is given, it means + skipping validation steps. Defaults to None. + See :meth:`build_dataloader` for more details. + val_evaluator (Evaluator or dict or list, optional): A evaluator object + used for computing metrics for validation. It can be a dict or a + list of dict to build a evaluator. If specified, + :attr:`val_dataloader` should also be specified. Defaults to None. + val_cfg (dict, optional): A dict to build a validation loop. If it does + not provide "type" key, :class:`ValLoop` will be used by default. + If ``val_cfg`` specified, :attr:`val_dataloader` should also be + specified. If ``ValLoop`` is built with `fp16=True``, + ``runner.val()`` will be performed under fp16 precision. + test_dataloader (Dataloader or dict, optional): A dataloader object or + a dict to build a dataloader. If ``None`` is given, it means + skipping test steps. Defaults to None. + See :meth:`build_dataloader` for more details. + Defaults to None. See :meth:`build_val_loop` for more details. 
+ test_evaluator (Evaluator or dict or list, optional): An evaluator + object used for computing metrics for test steps. It can be a dict + or a list of dicts to build an evaluator. If specified, + :attr:`test_dataloader` should also be specified. Defaults to None. + test_cfg (dict, optional): A dict to build a test loop. If it does + not provide the "type" key, :class:`TestLoop` will be used by + default. If ``test_cfg`` specified, :attr:`test_dataloader` should + also be specified. If ``TestLoop`` is built with ``fp16=True``, + ``runner.test()`` will be performed under fp16 precision. + Defaults to None. See :meth:`build_test_loop` for more details. + strategy (BaseStrategy or dict, optional): A strategy object or a dict + to build a strategy. Defaults to None. If not specified, the + strategy will be inferred automatically. + auto_scale_lr (dict, optional): Config to scale the learning rate + automatically. It includes ``base_batch_size`` and ``enable``. + ``base_batch_size`` is the batch size that the optimizer lr is + based on. ``enable`` is the switch to turn on and off the feature. + default_hooks (dict[str, dict] or dict[str, Hook], optional): Hooks to + execute default actions like updating model parameters and saving + checkpoints. Default hooks are ``OptimizerHook``, + ``IterTimerHook``, ``LoggerHook``, ``ParamSchedulerHook`` and + ``CheckpointHook``. Defaults to None. + See :meth:`register_default_hooks` for more details. + custom_hooks (list[dict] or list[Hook], optional): Hooks to execute + custom actions like visualizing images processed by pipeline. + Defaults to None. + data_preprocessor (dict, optional): The pre-process config of + :class:`BaseDataPreprocessor`. If the ``model`` argument is a dict + and doesn't contain the key ``data_preprocessor``, set the argument + as the ``data_preprocessor`` of the ``model`` dict. + Defaults to None. + load_from (str, optional): The checkpoint file to load from. + Defaults to None. + resume (bool): Whether to resume training. Defaults to False. If + ``resume`` is True and ``load_from`` is None, the runner will + automatically find the latest checkpoint in ``work_dir``. If not + found, resuming does nothing. + launcher (str, optional): The way to launch multi-process jobs. + Supported launchers are 'pytorch', 'mpi', 'slurm' and 'none'. If + 'none' is provided, a non-distributed environment will be launched. + If launcher is None, the launcher will be inferred from environment + variables. Defaults to None. + env_cfg (dict): A dict used for setting environment. Defaults to + dict(dist_cfg=dict(backend='nccl')). + log_processor (dict, optional): A processor to format logs. Defaults to + None. + log_level (int or str): The log level of MMLogger handlers. + Defaults to 'INFO'. + visualizer (Visualizer or dict, optional): A Visualizer object or a + dict to build a Visualizer object. Defaults to None. If not + specified, default config will be used. + default_scope (str): Used to reset registries location. + Defaults to "mmengine". + randomness (dict): Some settings to make the experiment as reproducible + as possible like seed and deterministic. + Defaults to ``dict(seed=None)``. If seed is None, a random number + will be generated and it will be broadcast to all other processes + in a distributed environment. If ``cudnn_benchmark`` is + ``True`` in ``env_cfg`` but ``deterministic`` is ``True`` in + ``randomness``, the value of ``torch.backends.cudnn.benchmark`` + will be ``False`` finally. + compile (bool or dict, optional): Whether to enable ``torch.compile``.
+ Defaults to False. + cfg (dict or Configdict or :obj:`Config`, optional): Full config. + Defaults to None. + + Note: + Since PyTorch 2.0.0, you can enable ``torch.compile`` by passing in + `compile = True`. If you want to control compile options, you + can pass a dict, e.g. ``cfg.compile = dict(backend='eager')``. + Refer to `PyTorch API Documentation `_ for more valid + options. + + Examples: + >>> from mmengine.runner import Runner + >>> cfg = dict( + >>> model=dict(type='ToyModel'), + >>> work_dir='path/of/work_dir', + >>> train_dataloader=dict( + >>> dataset=dict(type='ToyDataset'), + >>> sampler=dict(type='DefaultSampler', shuffle=True), + >>> batch_size=1, + >>> num_workers=0), + >>> val_dataloader=dict( + >>> dataset=dict(type='ToyDataset'), + >>> sampler=dict(type='DefaultSampler', shuffle=False), + >>> batch_size=1, + >>> num_workers=0), + >>> test_dataloader=dict( + >>> dataset=dict(type='ToyDataset'), + >>> sampler=dict(type='DefaultSampler', shuffle=False), + >>> batch_size=1, + >>> num_workers=0), + >>> auto_scale_lr=dict(base_batch_size=16, enable=False), + >>> optim_wrapper=dict(type='OptimizerWrapper', optimizer=dict( + >>> type='SGD', lr=0.01)), + >>> param_scheduler=dict(type='MultiStepLR', milestones=[1, 2]), + >>> val_evaluator=dict(type='ToyEvaluator'), + >>> test_evaluator=dict(type='ToyEvaluator'), + >>> train_cfg=dict(by_epoch=True, max_epochs=3, val_interval=1), + >>> val_cfg=dict(), + >>> test_cfg=dict(), + >>> custom_hooks=[], + >>> default_hooks=dict( + >>> timer=dict(type='IterTimerHook'), + >>> checkpoint=dict(type='CheckpointHook', interval=1), + >>> logger=dict(type='LoggerHook'), + >>> optimizer=dict(type='OptimizerHook', grad_clip=False), + >>> param_scheduler=dict(type='ParamSchedulerHook')), + >>> launcher='none', + >>> env_cfg=dict(dist_cfg=dict(backend='nccl')), + >>> log_processor=dict(window_size=20), + >>> visualizer=dict(type='Visualizer', + >>> vis_backends=[dict(type='LocalVisBackend', + >>> save_dir='temp_dir')]) + >>> ) + >>> runner = Runner.from_cfg(cfg) + >>> runner.train() + >>> runner.test() + """ + cfg: Config + _train_loop: Optional[Union[BaseLoop, Dict]] + _val_loop: Optional[Union[BaseLoop, Dict]] + _test_loop: Optional[Union[BaseLoop, Dict]] + + def __init__( + self, + model: Union[nn.Module, Dict], + *, + work_dir: str = 'work_dirs', + experiment_name: Optional[str] = None, + train_dataloader: Optional[Union[DataLoader, Dict]] = None, + optim_wrapper: Optional[Union[OptimWrapper, Dict]] = None, + param_scheduler: Optional[Union[_ParamScheduler, Dict, List]] = None, + train_cfg: Optional[Dict] = None, + val_dataloader: Optional[Union[DataLoader, Dict]] = None, + val_evaluator: Optional[Union[Evaluator, Dict, List]] = None, + val_cfg: Optional[Dict] = None, + test_dataloader: Optional[Union[DataLoader, Dict]] = None, + test_evaluator: Optional[Union[Evaluator, Dict, List]] = None, + test_cfg: Optional[Dict] = None, + strategy: Optional[Union[BaseStrategy, Dict]] = None, + auto_scale_lr: Optional[Dict] = None, + default_hooks: Optional[Dict[str, Union[Hook, Dict]]] = None, + custom_hooks: Optional[List[Union[Hook, Dict]]] = None, + data_preprocessor: Union[nn.Module, Dict, None] = None, + load_from: Optional[str] = None, + resume: Union[str, bool] = False, + launcher: Optional[str] = None, + env_cfg: Dict = dict(dist_cfg=dict(backend='nccl')), + log_processor: Optional[Dict] = None, + log_level: str = 'INFO', + visualizer: Optional[Union[Visualizer, Dict]] = None, + default_scope: Optional[str] = 'mmengine', + randomness: Dict = 
dict(seed=None), + compile: Union[bool, Dict] = False, + cfg: Optional[ConfigType] = None, + ): + if isinstance(model, dict) and data_preprocessor is not None: + # Merge the data_preprocessor to model config. + model.setdefault('data_preprocessor', data_preprocessor) + self.model = model + + self._work_dir = osp.abspath(work_dir) + mmengine.mkdir_or_exist(self._work_dir) + + # recursively copy the `cfg` because `self.cfg` will be modified + # everywhere. + if cfg is not None: + if isinstance(cfg, Config): + self.cfg = copy.deepcopy(cfg) + elif isinstance(cfg, dict): + self.cfg = Config(cfg) + else: + self.cfg = Config(dict()) + + # lazy initialization + training_related = [train_dataloader, train_cfg, optim_wrapper] + if not (all(item is None for item in training_related) + or all(item is not None for item in training_related)): + raise ValueError( + 'train_dataloader, train_cfg, and optim_wrapper should be ' + 'either all None or not None, but got ' + f'train_dataloader={train_dataloader}, ' + f'train_cfg={train_cfg}, ' + f'optim_wrapper={optim_wrapper}.') + self._train_dataloader = train_dataloader + self._train_loop = train_cfg + + self.optim_wrapper: Optional[Union[OptimWrapper, dict]] + self.optim_wrapper = optim_wrapper + + self._auto_scale_lr = auto_scale_lr + + # If there is no need to adjust learning rate, momentum or other + # parameters of optimizer, param_scheduler can be None + if param_scheduler is not None and self.optim_wrapper is None: + raise ValueError( + 'param_scheduler should be None when optim_wrapper is None, ' + f'but got {param_scheduler}') + + self.param_schedulers = param_scheduler + + val_related = [val_dataloader, val_cfg, val_evaluator] + if not (all(item is None + for item in val_related) or all(item is not None + for item in val_related)): + raise ValueError( + 'val_dataloader, val_cfg, and val_evaluator should be either ' + 'all None or not None, but got ' + f'val_dataloader={val_dataloader}, val_cfg={val_cfg}, ' + f'val_evaluator={val_evaluator}') + self._val_dataloader = val_dataloader + self._val_loop = val_cfg + self._val_evaluator = val_evaluator + + test_related = [test_dataloader, test_cfg, test_evaluator] + if not (all(item is None for item in test_related) + or all(item is not None for item in test_related)): + raise ValueError( + 'test_dataloader, test_cfg, and test_evaluator should be ' + 'either all None or not None, but got ' + f'test_dataloader={test_dataloader}, test_cfg={test_cfg}, ' + f'test_evaluator={test_evaluator}') + self._test_dataloader = test_dataloader + self._test_loop = test_cfg + self._test_evaluator = test_evaluator + + if not isinstance(compile, bool) and not isinstance(compile, dict): + raise TypeError( + f'compile should be a bool or dict, but got {type(compile)}') + self._compile = compile + + if isinstance(resume, str) and load_from is not None: + raise ValueError('If resume is a str, load_from should be None.') + self._load_from = load_from + self._resume = resume + # flag to mark whether checkpoint has been loaded or resumed + self._has_loaded = False + + if launcher is None: + launcher = infer_launcher() + + if experiment_name is None and self.cfg.filename is not None: + experiment_name = osp.splitext(osp.basename(self.cfg.filename))[0] + + self._randomness_cfg = randomness + self.strategy = self.build_strategy( + strategy, + launcher=launcher, + randomness=randomness, + env_cfg=env_cfg, + experiment_name=experiment_name, + log_level=log_level, + ) + + # Used to reset registries location. 
See :meth:`Registry.build` for + # more details. + if default_scope is not None: + default_scope = DefaultScope.get_instance( # type: ignore + self.experiment_name, + scope_name=default_scope) + self.default_scope = default_scope + # Build log processor to format message. + log_processor = dict() if log_processor is None else log_processor + self.log_processor = self.build_log_processor(log_processor) + + # Collect and log environment information. + self._log_env() + + # Build `message_hub` for communication among components. + # `message_hub` can store log scalars (loss, learning rate) and + # runtime information (iter and epoch). Those components that do not + # have access to the runner can get iteration or epoch information + # from `message_hub`. For example, models can get the latest created + # `message_hub` by + # `self.message_hub=MessageHub.get_current_instance()` and then get + # current epoch by `cur_epoch = self.message_hub.get_info('epoch')`. + # See `MessageHub` and `ManagerMixin` for more details. + self.message_hub = self.build_message_hub() + # visualizer used for writing log or visualizing all kinds of data + self.visualizer = self.build_visualizer(visualizer) + if self.cfg: + self.visualizer.add_config(self.cfg) + + self._hooks: List[Hook] = [] + # register hooks to `self._hooks` + self.register_hooks(default_hooks, custom_hooks) + # log hooks information + self.logger.info(f'Hooks will be executed in the following ' + f'order:\n{self.get_hooks_info()}') + + # dump `cfg` to `work_dir` + self.dump_config() + + @classmethod + def from_cfg(cls, cfg: ConfigType) -> 'FlexibleRunner': + """Build a runner from config. + + Args: + cfg (ConfigType): A config used for building runner. See + :meth:`__init__` for the accepted keys of ``cfg``. + + Returns: + FlexibleRunner: A runner built from ``cfg``.
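+ + Example (editor's sketch; ``ToyModel`` is a hypothetical registered + model): + >>> runner = FlexibleRunner.from_cfg(dict(model=dict(type='ToyModel')))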
+ """ + cfg = copy.deepcopy(cfg) + runner = cls( + model=cfg['model'], + work_dir=cfg.get('work_dir', 'work_dirs'), + experiment_name=cfg.get('experiment_name'), + train_dataloader=cfg.get('train_dataloader'), + optim_wrapper=cfg.get('optim_wrapper'), + param_scheduler=cfg.get('param_scheduler'), + train_cfg=cfg.get('train_cfg'), + val_dataloader=cfg.get('val_dataloader'), + val_evaluator=cfg.get('val_evaluator'), + val_cfg=cfg.get('val_cfg'), + test_dataloader=cfg.get('test_dataloader'), + test_evaluator=cfg.get('test_evaluator'), + test_cfg=cfg.get('test_cfg'), + strategy=cfg.get('strategy'), + auto_scale_lr=cfg.get('auto_scale_lr'), + default_hooks=cfg.get('default_hooks'), + custom_hooks=cfg.get('custom_hooks'), + data_preprocessor=cfg.get('data_preprocessor'), + load_from=cfg.get('load_from'), + resume=cfg.get('resume', False), + launcher=cfg.get('launcher'), + env_cfg=cfg.get('env_cfg'), # type: ignore + log_processor=cfg.get('log_processor'), + log_level=cfg.get('log_level', 'INFO'), + visualizer=cfg.get('visualizer'), + default_scope=cfg.get('default_scope', 'mmengine'), + randomness=cfg.get('randomness', dict(seed=None)), + cfg=cfg, + ) + + return runner + + @property + def experiment_name(self): + """str: Name of experiment.""" + return self.strategy.experiment_name + + @property + def model_name(self): + """str: Name of the model, usually the module class name.""" + return self._model_name + + @property + def work_dir(self): + """str: The working directory to save checkpoints and logs.""" + return self._work_dir + + @property + def log_dir(self): + return self.strategy.log_dir + + @property + def logger(self): + return self.strategy.logger + + @property + def max_epochs(self): + """int: Total epochs to train model.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.max_epochs + else: + return 0 + + @property + def max_iters(self): + """int: Total iterations to train model.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.max_iters + else: + return 0 + + @property + def epoch(self): + """int: Current epoch.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.epoch + else: + return 0 + + @property + def iter(self): + """int: Current iteration.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.iter + else: + return 0 + + @property + def distributed(self): + """bool: Whether current environment is distributed.""" + return self.strategy.distributed + + @property + def rank(self): + """int: Rank of current process.""" + return self.strategy.rank + + @property + def world_size(self): + """int: Number of processes participating in the job.""" + return self.strategy.world_size + + @property + def deterministic(self): + """int: Whether cudnn to select deterministic algorithms.""" + return self._deterministic + + @property + def seed(self): + """int: A number to set random modules.""" + return self.strategy.seed + + @property + def timestamp(self): + """str: Timestamp when creating experiment.""" + return self.strategy.timestamp + + @property + def hooks(self): + """list[:obj:`Hook`]: A list of registered hooks.""" + return self._hooks + + @property + def train_loop(self): + """:obj:`BaseLoop`: A loop to run training.""" + if isinstance(self._train_loop, BaseLoop) or self._train_loop is None: + return self._train_loop + else: + self._train_loop = self.build_train_loop(self._train_loop) + return self._train_loop + + @property + def val_loop(self): + """:obj:`BaseLoop`: A loop to run validation.""" + if 
isinstance(self._val_loop, BaseLoop) or self._val_loop is None: + return self._val_loop + else: + self._val_loop = self.build_val_loop(self._val_loop) + return self._val_loop + + @property + def test_loop(self): + """:obj:`BaseLoop`: A loop to run testing.""" + if isinstance(self._test_loop, BaseLoop) or self._test_loop is None: + return self._test_loop + else: + self._test_loop = self.build_test_loop(self._test_loop) + return self._test_loop + + @property + def train_dataloader(self): + """The data loader for training.""" + return self.train_loop.dataloader + + @property + def val_dataloader(self): + """The data loader for validation.""" + return self.val_loop.dataloader + + @property + def test_dataloader(self): + """The data loader for testing.""" + return self.test_loop.dataloader + + @property + def val_evaluator(self): + """:obj:`Evaluator`: An evaluator for validation.""" + return self.val_loop.evaluator + + @property + def test_evaluator(self): + """:obj:`Evaluator`: An evaluator for testing.""" + return self.test_loop.evaluator + + @property + def val_interval(self): + """int: Interval to run validation during training.""" + return self.train_loop.val_interval + + @property + def val_begin(self): + """int: The epoch/iteration to start running validation during + training.""" + return self.train_loop.val_begin + + def build_strategy( + self, + strategy: Optional[Union[BaseStrategy, Dict]] = None, + launcher: str = 'none', + randomness: Optional[dict] = None, + env_cfg: dict = dict(dist_cfg=dict(backend='nccl')), + experiment_name: Optional[str] = None, + log_level: Optional[str] = None, + ) -> BaseStrategy: + """Build a strategy. + + Args: + strategy (BaseStrategy, optional): A strategy object or dict to + build the strategy. Defaults to None. + + Returns: + BaseStrategy: A strategy object. + """ + if isinstance(strategy, BaseStrategy): + strategy_obj = strategy + else: + if launcher == 'none': + if strategy is None: + strategy = dict(type='SingleDeviceStrategy') + else: + if strategy is None: + strategy = dict(type='DDPStrategy') + + assert isinstance(strategy, dict) + + # train_micro_batch_size_per_gpu is required by DeepSpeed + if isinstance(strategy['type'], str): + strategy_name = strategy['type'] + else: + strategy_name = strategy['type'].__name__ + if strategy_name == 'DeepSpeedStrategy': + if self._train_dataloader is None: + strategy['train_micro_batch_size_per_gpu'] = 1 + else: + strategy['train_micro_batch_size_per_gpu'] = \ + _get_batch_size(self._train_dataloader) + + strategy.setdefault('work_dir', self._work_dir) + strategy.setdefault('experiment_name', experiment_name) + strategy.setdefault('auto_scale_lr', self._auto_scale_lr) + + env_kwargs = dict( + launcher=launcher, + randomness=randomness, + **env_cfg, + ) + strategy.setdefault('env_kwargs', env_kwargs) + + log_kwargs = dict(log_level=log_level) + strategy.setdefault('log_kwargs', log_kwargs) + + strategy_obj = STRATEGIES.build(strategy) + + return strategy_obj + + def build_message_hub( + self, + message_hub: Optional[Dict] = None, + ) -> MessageHub: + """Build a globally accessible MessageHub. + + Args: + message_hub (dict, optional): A dict to build MessageHub object. + If not specified, default config will be used to build + MessageHub object. Defaults to None. + + Returns: + MessageHub: A MessageHub object built from ``message_hub``.
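+ + Example (editor's sketch; ``update_info`` is the upstream MessageHub + API): + >>> hub = runner.build_message_hub(dict(name='demo_hub')) + >>> hub.update_info('epoch', 0)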
+ """ + if message_hub is None: + message_hub = dict(name=self.experiment_name) + elif isinstance(message_hub, dict): + # ensure message_hub containing name key + message_hub.setdefault('name', self.experiment_name) + else: + raise TypeError( + f'message_hub should be dict or None, but got {message_hub}') + + return MessageHub.get_instance(**message_hub) + + def build_visualizer( + self, + visualizer: Optional[Union[Visualizer, Dict]] = None, + ) -> Visualizer: + """Build a global asscessable Visualizer. + + Args: + visualizer (Visualizer or dict, optional): A Visualizer object + or a dict to build Visualizer object. If ``visualizer`` is a + Visualizer object, just returns itself. If not specified, + default config will be used to build Visualizer object. + Defaults to None. + + Returns: + Visualizer: A Visualizer object build from ``visualizer``. + """ + if visualizer is None: + visualizer = dict( + name=self.experiment_name, + vis_backends=[dict(type='LocalVisBackend')], + save_dir=self.log_dir) + return Visualizer.get_instance(**visualizer) + + if isinstance(visualizer, Visualizer): + return visualizer + + if isinstance(visualizer, dict): + # ensure visualizer containing name key + visualizer.setdefault('name', self.experiment_name) + visualizer.setdefault('save_dir', self.log_dir) + return VISUALIZERS.build(visualizer) + else: + raise TypeError( + 'visualizer should be Visualizer object, a dict or None, ' + f'but got {visualizer}') + + def build_evaluator( + self, + evaluator: Union[Dict, List, Evaluator], + ) -> Evaluator: + """Build evaluator. + + Examples of ``evaluator``:: + + # evaluator could be a built Evaluator instance + evaluator = Evaluator(metrics=[ToyMetric()]) + + # evaluator can also be a list of dict + evaluator = [ + dict(type='ToyMetric1'), + dict(type='ToyEvaluator2') + ] + + # evaluator can also be a list of built metric + evaluator = [ToyMetric1(), ToyMetric2()] + + # evaluator can also be a dict with key metrics + evaluator = dict(metrics=ToyMetric()) + # metric is a list + evaluator = dict(metrics=[ToyMetric()]) + + Args: + evaluator (Evaluator or dict or list): An Evaluator object or a + config dict or list of config dict used to build an Evaluator. + + Returns: + Evaluator: Evaluator build from ``evaluator``. + """ + if isinstance(evaluator, Evaluator): + return evaluator + elif isinstance(evaluator, dict): + # if `metrics` in dict keys, it means to build customized evalutor + if 'metrics' in evaluator: + evaluator.setdefault('type', 'Evaluator') + return EVALUATOR.build(evaluator) + # otherwise, default evalutor will be built + else: + return Evaluator(evaluator) # type: ignore + elif isinstance(evaluator, list): + # use the default `Evaluator` + return Evaluator(evaluator) # type: ignore + else: + raise TypeError( + 'evaluator should be one of dict, list of dict, and Evaluator' + f', but got {evaluator}') + + @staticmethod + def build_dataloader( + dataloader: Union[DataLoader, Dict], + seed: Optional[int] = None, + diff_rank_seed: bool = False, + ) -> DataLoader: + """Build dataloader. + + The method builds three components: + + - Dataset + - Sampler + - Dataloader + + An example of ``dataloader``:: + + dataloader = dict( + dataset=dict(type='ToyDataset'), + sampler=dict(type='DefaultSampler', shuffle=True), + batch_size=1, + num_workers=9 + ) + + Args: + dataloader (DataLoader or dict): A Dataloader object or a dict to + build Dataloader object. If ``dataloader`` is a Dataloader + object, just returns itself. + seed (int, optional): Random seed. 
Defaults to None. + diff_rank_seed (bool): Whether or not set different seeds to + different ranks. If True, the seed passed to sampler is set + to None, in order to synchronize the seeds used in samplers + across different ranks. Defaults to False. + + Returns: + Dataloader: DataLoader built from ``dataloader_cfg``. + """ + if isinstance(dataloader, DataLoader): + return dataloader + + dataloader_cfg = copy.deepcopy(dataloader) + + # build dataset + dataset_cfg = dataloader_cfg.pop('dataset') + if isinstance(dataset_cfg, dict): + dataset = DATASETS.build(dataset_cfg) + if hasattr(dataset, 'full_init'): + dataset.full_init() + else: + # fallback to raise error in dataloader + # if `dataset_cfg` is not a valid type + dataset = dataset_cfg + + # build sampler + sampler_cfg = dataloader_cfg.pop('sampler') + if isinstance(sampler_cfg, dict): + sampler_seed = None if diff_rank_seed else seed + sampler = DATA_SAMPLERS.build( + sampler_cfg, + default_args=dict(dataset=dataset, seed=sampler_seed)) + else: + # fallback to raise error in dataloader + # if `sampler_cfg` is not a valid type + sampler = sampler_cfg + + # build batch sampler + batch_sampler_cfg = dataloader_cfg.pop('batch_sampler', None) + if batch_sampler_cfg is None: + batch_sampler = None + elif isinstance(batch_sampler_cfg, dict): + batch_sampler = DATA_SAMPLERS.build( + batch_sampler_cfg, + default_args=dict( + sampler=sampler, + batch_size=dataloader_cfg.pop('batch_size'))) + else: + # fallback to raise error in dataloader + # if `batch_sampler_cfg` is not a valid type + batch_sampler = batch_sampler_cfg + + # build dataloader + init_fn: Optional[partial] + if 'worker_init_fn' in dataloader_cfg: + worker_init_fn_cfg = dataloader_cfg.pop('worker_init_fn') + worker_init_fn_type = worker_init_fn_cfg.pop('type') + worker_init_fn = FUNCTIONS.get(worker_init_fn_type) + assert callable(worker_init_fn) + init_fn = partial(worker_init_fn, + **worker_init_fn_cfg) # type: ignore + else: + if seed is not None: + disable_subprocess_warning = dataloader_cfg.pop( + 'disable_subprocess_warning', False) + assert isinstance(disable_subprocess_warning, bool), ( + 'disable_subprocess_warning should be a bool, but got ' + f'{type(disable_subprocess_warning)}') + init_fn = partial( + default_worker_init_fn, + num_workers=dataloader_cfg.get('num_workers'), + rank=get_rank(), + seed=seed, + disable_subprocess_warning=disable_subprocess_warning) + else: + init_fn = None + + # `persistent_workers` requires pytorch version >= 1.7 + if ('persistent_workers' in dataloader_cfg + and digit_version(TORCH_VERSION) < digit_version('1.7.0')): + print_log( + '`persistent_workers` is only available when ' + 'pytorch version >= 1.7', + logger='current', + level=logging.WARNING) + dataloader_cfg.pop('persistent_workers') + + # The default behavior of `collate_fn` in dataloader is to + # merge a list of samples to form a mini-batch of Tensor(s). + # However, in mmengine, if `collate_fn` is not defined in + # dataloader_cfg, `pseudo_collate` will only convert the list of + # samples into a dict without stacking the batch tensor.
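+ # [Editor's note] For example, collate_fn=dict(type='default_collate') + # in the dataloader config stacks samples into batched tensors, while + # the default 'pseudo_collate' keeps them unstacked; both are + # registered in FUNCTIONS, and a plain callable is accepted as well.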
+        collate_fn_cfg = dataloader_cfg.pop('collate_fn',
+                                            dict(type='pseudo_collate'))
+        if isinstance(collate_fn_cfg, dict):
+            collate_fn_type = collate_fn_cfg.pop('type')
+            if isinstance(collate_fn_type, str):
+                collate_fn = FUNCTIONS.get(collate_fn_type)
+            else:
+                collate_fn = collate_fn_type
+            collate_fn = partial(collate_fn, **collate_fn_cfg)  # type: ignore
+        elif callable(collate_fn_cfg):
+            collate_fn = collate_fn_cfg
+        else:
+            raise TypeError(
+                'collate_fn should be a dict or callable object, but got '
+                f'{collate_fn_cfg}')
+        data_loader = DataLoader(
+            dataset=dataset,
+            sampler=sampler if batch_sampler is None else None,
+            batch_sampler=batch_sampler,
+            collate_fn=collate_fn,
+            worker_init_fn=init_fn,
+            **dataloader_cfg)
+        return data_loader
+
+    def build_train_loop(self, loop: Union[BaseLoop, Dict]) -> BaseLoop:
+        """Build training loop.
+
+        Examples of ``loop``::
+
+            # `EpochBasedTrainLoop` will be used
+            loop = dict(by_epoch=True, max_epochs=3)
+
+            # `IterBasedTrainLoop` will be used
+            loop = dict(by_epoch=False, max_epochs=3)
+
+            # custom training loop
+            loop = dict(type='CustomTrainLoop', max_epochs=3)
+
+        Args:
+            loop (BaseLoop or dict): A training loop or a dict to build
+                training loop. If ``loop`` is a training loop object, just
+                returns itself.
+
+        Returns:
+            :obj:`BaseLoop`: Training loop object built from ``loop``.
+        """
+        if isinstance(loop, BaseLoop):
+            return loop
+        elif not isinstance(loop, dict):
+            raise TypeError(
+                f'loop should be a Loop object or dict, but got {loop}')
+
+        loop_cfg = copy.deepcopy(loop)
+
+        if 'type' in loop_cfg and 'by_epoch' in loop_cfg:
+            raise RuntimeError(
+                'Only one of `type` or `by_epoch` can exist in `loop_cfg`.')
+
+        if 'type' in loop_cfg:
+            loop = LOOPS.build(
+                loop_cfg,
+                default_args=dict(
+                    runner=self, dataloader=self._train_dataloader))
+        else:
+            by_epoch = loop_cfg.pop('by_epoch')
+            if by_epoch:
+                loop = EpochBasedTrainLoop(
+                    **loop_cfg, runner=self, dataloader=self._train_dataloader)
+            else:
+                loop = IterBasedTrainLoop(
+                    **loop_cfg, runner=self, dataloader=self._train_dataloader)
+        return loop  # type: ignore
+
+    def build_val_loop(self, loop: Union[BaseLoop, Dict]) -> BaseLoop:
+        """Build validation loop.
+
+        Examples of ``loop``::
+
+            # `ValLoop` will be used
+            loop = dict()
+
+            # custom validation loop
+            loop = dict(type='CustomValLoop')
+
+        Args:
+            loop (BaseLoop or dict): A validation loop or a dict to build
+                validation loop. If ``loop`` is a validation loop object, just
+                returns itself.
+
+        Returns:
+            :obj:`BaseLoop`: Validation loop object built from ``loop``.
+        """
+        if isinstance(loop, BaseLoop):
+            return loop
+        elif not isinstance(loop, dict):
+            raise TypeError(
+                f'val_loop should be a Loop object or dict, but got {loop}')
+
+        loop_cfg = copy.deepcopy(loop)
+
+        if 'type' in loop_cfg:
+            loop = LOOPS.build(
+                loop_cfg,
+                default_args=dict(
+                    runner=self,
+                    dataloader=self._val_dataloader,
+                    evaluator=self._val_evaluator))
+        else:
+            loop = ValLoop(
+                **loop_cfg,
+                runner=self,
+                dataloader=self._val_dataloader,
+                evaluator=self._val_evaluator)  # type: ignore
+
+        return loop  # type: ignore
+
+    def build_test_loop(self, loop: Union[BaseLoop, Dict]) -> BaseLoop:
+        """Build test loop.
+
+        Examples of ``loop``::
+
+            # `TestLoop` will be used
+            loop = dict()
+
+            # custom test loop
+            loop = dict(type='CustomTestLoop')
+
+        Args:
+            loop (BaseLoop or dict): A test loop or a dict to build test loop.
+                If ``loop`` is a test loop object, just returns itself.
+
+        Returns:
+            :obj:`BaseLoop`: Test loop object built from ``loop``.
+        """
+        if isinstance(loop, BaseLoop):
+            return loop
+        elif not isinstance(loop, dict):
+            raise TypeError(
+                f'test_loop should be a Loop object or dict, but got {loop}')
+
+        loop_cfg = copy.deepcopy(loop)  # type: ignore
+
+        if 'type' in loop_cfg:
+            loop = LOOPS.build(
+                loop_cfg,
+                default_args=dict(
+                    runner=self,
+                    dataloader=self._test_dataloader,
+                    evaluator=self._test_evaluator))
+        else:
+            loop = TestLoop(
+                **loop_cfg,
+                runner=self,
+                dataloader=self._test_dataloader,
+                evaluator=self._test_evaluator)  # type: ignore
+
+        return loop  # type: ignore
+
+    def build_log_processor(
+        self,
+        log_processor: Union[LogProcessor, Dict],
+    ) -> LogProcessor:
+        """Build log processor.
+
+        Examples of ``log_processor``::
+
+            # `LogProcessor` will be used
+            log_processor = dict()
+
+            # custom log_processor
+            log_processor = dict(type='CustomLogProcessor')
+
+        Args:
+            log_processor (LogProcessor or dict): A log processor or a dict
+                to build log processor. If ``log_processor`` is a log processor
+                object, just returns itself.
+
+        Returns:
+            :obj:`LogProcessor`: Log processor object built from
+            ``log_processor``.
+        """
+        if isinstance(log_processor, LogProcessor):
+            return log_processor
+        elif not isinstance(log_processor, dict):
+            raise TypeError(
+                'log processor should be a LogProcessor object or dict, but '
+                f'got {log_processor}')
+
+        log_processor_cfg = copy.deepcopy(log_processor)  # type: ignore
+
+        if 'type' in log_processor_cfg:
+            log_processor = LOG_PROCESSORS.build(log_processor_cfg)
+        else:
+            log_processor = LogProcessor(**log_processor_cfg)  # type: ignore
+
+        return log_processor  # type: ignore
+
+    def get_hooks_info(self) -> str:
+        # Get hooks info in each stage
+        stage_hook_map: Dict[str, list] = {stage: [] for stage in Hook.stages}
+        for hook in self.hooks:
+            try:
+                priority = Priority(hook.priority).name  # type: ignore
+            except ValueError:
+                priority = hook.priority  # type: ignore
+            classname = hook.__class__.__name__
+            hook_info = f'({priority:<12}) {classname:<35}'
+            for trigger_stage in hook.get_triggered_stages():
+                stage_hook_map[trigger_stage].append(hook_info)
+
+        stage_hook_infos = []
+        for stage in Hook.stages:
+            hook_infos = stage_hook_map[stage]
+            if len(hook_infos) > 0:
+                info = f'{stage}:\n'
+                info += '\n'.join(hook_infos)
+                info += '\n -------------------- '
+                stage_hook_infos.append(info)
+        return '\n'.join(stage_hook_infos)
+
+    def load_or_resume(self):
+        """Load or resume a checkpoint."""
+        if self._has_loaded:
+            return None
+
+        if not self._resume and self._load_from is None:
+            return None
+
+        # decide to load from checkpoint or resume from checkpoint
+        resume_from = None
+        if isinstance(self._resume, str):
+            resume_from = self._resume
+        elif self._resume and self._load_from is None:
+            # auto resume from the latest checkpoint
+            resume_from = find_latest_checkpoint(self.work_dir)
+            self.logger.info(
+                f'Auto resumed from the latest checkpoint {resume_from}.')
+        elif self._resume and self._load_from is not None:
+            # resume from the specified checkpoint
+            resume_from = self._load_from
+
+        if resume_from is not None:
+            self.resume(resume_from)
+            self._has_loaded = True
+        elif self._load_from is not None:
+            self.load_checkpoint(self._load_from)
+            self._has_loaded = True
+
+    def train(self) -> nn.Module:
+        """Launch training.
+
+        Returns:
+            nn.Module: The model after training.
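+
+        Examples:
+            >>> # A minimal sketch; assumes ``runner`` was built with
+            >>> # ``train_dataloader``, ``train_cfg``, ``optim_wrapper``
+            >>> # and (optionally) ``param_scheduler`` already configured.
+            >>> model = runner.train()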
+ """ + if self._train_loop is None: + raise RuntimeError( + '`self._train_loop` should not be None when calling train ' + 'method. Please provide `train_dataloader`, `train_cfg`, ' + '`optimizer` and `param_scheduler` arguments when ' + 'initializing runner.') + + self._train_loop = self.build_train_loop( + self._train_loop) # type: ignore + + if self._val_loop is not None: + self._val_loop = self.build_val_loop( + self._val_loop) # type: ignore + + compile: Union[dict, bool] = False + if isinstance(self._compile, bool): + if self._compile: + compile = dict(target='train_step') + else: + compile = copy.copy(self._compile) + compile.setdefault('target', 'train_step') + + dispatch_kwargs = dict( + epoch_length=len(self.train_dataloader), + max_epochs=self.max_epochs, + max_iters=self.max_iters, + train_micro_batch_size_per_gpu=_get_batch_size( + self.train_dataloader)) # type: ignore + + self.strategy.prepare( + self.model, + optim_wrapper=self.optim_wrapper, + param_scheduler=self.param_schedulers, + compile=compile, + dispatch_kwargs=dispatch_kwargs, + ) + + self.model = self.strategy.model + self.optim_wrapper = self.strategy.optim_wrapper # type: ignore + if self.param_schedulers is not None: + self.param_schedulers = self.strategy.param_schedulers + + self.load_or_resume() + + # TODO: add a contextmanager to avoid calling `before_run` many times + self.call_hook('before_run') + + model = self.train_loop.run() # type: ignore + self.call_hook('after_run') + return model + + def val(self) -> dict: + """Launch validation. + + Returns: + dict: A dict of metrics on validation set. + """ + if self._val_loop is None: + raise RuntimeError( + '`self._val_loop` should not be None when calling val method.' + 'Please provide `val_dataloader`, `val_cfg` and ' + '`val_evaluator` arguments when initializing runner.') + + self._val_loop = self.build_val_loop(self._val_loop) # type: ignore + + dispatch_kwargs = dict( + init_weights_for_test_or_val=self.cfg.get( + 'init_weights_for_test_or_val', True)) + self.strategy.prepare(self.model, dispatch_kwargs=dispatch_kwargs) + self.model = self.strategy.model + + self.load_or_resume() + + self.call_hook('before_run') + metrics = self.val_loop.run() # type: ignore + self.call_hook('after_run') + + return metrics + + def test(self) -> dict: + """Launch test. + + Returns: + dict: A dict of metrics on testing set. + """ + if self._test_loop is None: + raise RuntimeError( + '`self._test_loop` should not be None when calling test ' + 'method. Please provide `test_dataloader`, `test_cfg` and ' + '`test_evaluator` arguments when initializing runner.') + + self._test_loop = self.build_test_loop(self._test_loop) # type: ignore + dispatch_kwargs = dict( + init_weights_for_test_or_val=self.cfg.get( + 'init_weights_for_test_or_val', True)) + self.strategy.prepare(self.model, dispatch_kwargs=dispatch_kwargs) + self.model = self.strategy.model + + self.load_or_resume() + + self.call_hook('before_run') + metrics = self.test_loop.run() # type: ignore + self.call_hook('after_run') + + return metrics + + def call_hook(self, fn_name: str, **kwargs) -> None: + """Call all hooks. + + Args: + fn_name (str): The function name in each hook to be called, such as + "before_train_epoch". + **kwargs: Keyword arguments passed to hook. 
+ """ + for hook in self._hooks: + # support adding additional custom hook methods + if hasattr(hook, fn_name): + try: + getattr(hook, fn_name)(self, **kwargs) + except TypeError as e: + raise TypeError(f'{e} in {hook}') from e + + def register_hook( + self, + hook: Union[Hook, Dict], + priority: Optional[Union[str, int, Priority]] = None, + ) -> None: + """Register a hook into the hook list. + + The hook will be inserted into a priority queue, with the specified + priority (See :class:`Priority` for details of priorities). + For hooks with the same priority, they will be triggered in the same + order as they are registered. + + Priority of hook will be decided with the following priority: + + - ``priority`` argument. If ``priority`` is given, it will be priority + of hook. + - If ``hook`` argument is a dict and ``priority`` in it, the priority + will be the value of ``hook['priority']``. + - If ``hook`` argument is a dict but ``priority`` not in it or ``hook`` + is an instance of ``hook``, the priority will be ``hook.priority``. + + Args: + hook (:obj:`Hook` or dict): The hook to be registered. + priority (int or str or :obj:`Priority`, optional): Hook priority. + Lower value means higher priority. + """ + if not isinstance(hook, (Hook, dict)): + raise TypeError( + f'hook should be an instance of Hook or dict, but got {hook}') + + _priority = None + if isinstance(hook, dict): + if 'priority' in hook: + _priority = hook.pop('priority') + + hook_obj = HOOKS.build(hook) + else: + hook_obj = hook + + if priority is not None: + hook_obj.priority = priority + elif _priority is not None: + hook_obj.priority = _priority + + inserted = False + for i in range(len(self._hooks) - 1, -1, -1): + if get_priority(hook_obj.priority) >= get_priority( + self._hooks[i].priority): + self._hooks.insert(i + 1, hook_obj) + inserted = True + break + if not inserted: + self._hooks.insert(0, hook_obj) + + def register_default_hooks( + self, + hooks: Optional[Dict[str, Union[Hook, Dict]]] = None, + ) -> None: + """Register default hooks into hook list. + + ``hooks`` will be registered into runner to execute some default + actions like updating model parameters or saving checkpoints. + + Default hooks and their priorities: + + +----------------------+-------------------------+ + | Hooks | Priority | + +======================+=========================+ + | RuntimeInfoHook | VERY_HIGH (10) | + +----------------------+-------------------------+ + | IterTimerHook | NORMAL (50) | + +----------------------+-------------------------+ + | DistSamplerSeedHook | NORMAL (50) | + +----------------------+-------------------------+ + | LoggerHook | BELOW_NORMAL (60) | + +----------------------+-------------------------+ + | ParamSchedulerHook | LOW (70) | + +----------------------+-------------------------+ + | CheckpointHook | VERY_LOW (90) | + +----------------------+-------------------------+ + + If ``hooks`` is None, above hooks will be registered by + default:: + + default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + logger=dict(type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + ) + + If not None, ``hooks`` will be merged into ``default_hooks``. 
+        If ``hooks`` contains ``None`` values, the corresponding items will
+        be popped from ``default_hooks``::
+
+            hooks = dict(timer=None)
+
+        The final registered default hooks will be :obj:`RuntimeInfoHook`,
+        :obj:`DistSamplerSeedHook`, :obj:`LoggerHook`,
+        :obj:`ParamSchedulerHook` and :obj:`CheckpointHook`.
+
+        Args:
+            hooks (dict[str, Hook or dict], optional): Default hooks or configs
+                to be registered.
+        """
+        default_hooks: dict = dict(
+            runtime_info=dict(type='RuntimeInfoHook'),
+            timer=dict(type='IterTimerHook'),
+            sampler_seed=dict(type='DistSamplerSeedHook'),
+            logger=dict(type='LoggerHook'),
+            param_scheduler=dict(type='ParamSchedulerHook'),
+            checkpoint=dict(type='CheckpointHook', interval=1),
+        )
+        if hooks is not None:
+            for name, hook in hooks.items():
+                if name in default_hooks and hook is None:
+                    # remove hook from _default_hooks
+                    default_hooks.pop(name)
+                else:
+                    assert hook is not None
+                    default_hooks[name] = hook
+
+        for hook in default_hooks.values():
+            self.register_hook(hook)
+
+    def register_custom_hooks(self, hooks: List[Union[Hook, Dict]]) -> None:
+        """Register custom hooks into hook list.
+
+        Args:
+            hooks (list[Hook | dict]): List of hooks or configs to be
+                registered.
+        """
+        for hook in hooks:
+            self.register_hook(hook)
+
+    def register_hooks(
+        self,
+        default_hooks: Optional[Dict[str, Union[Hook, Dict]]] = None,
+        custom_hooks: Optional[List[Union[Hook, Dict]]] = None,
+    ) -> None:
+        """Register default hooks and custom hooks into hook list.
+
+        Args:
+            default_hooks (dict[str, dict] or dict[str, Hook], optional): Hooks
+                to execute default actions like updating model parameters and
+                saving checkpoints. Defaults to None.
+            custom_hooks (list[dict] or list[Hook], optional): Hooks to execute
+                custom actions like visualizing images processed by pipeline.
+                Defaults to None.
+        """
+        self.register_default_hooks(default_hooks)
+
+        if custom_hooks is not None:
+            self.register_custom_hooks(custom_hooks)
+
+    def resume(
+        self,
+        filename: str,
+        resume_optimizer: bool = True,
+        resume_param_scheduler: bool = True,
+        map_location: Union[str, Callable] = 'default',
+    ) -> None:
+        """Resume model from checkpoint.
+
+        Args:
+            filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+                ``open-mmlab://xxx``.
+            resume_optimizer (bool): Whether to resume optimizer state.
+                Defaults to True.
+            resume_param_scheduler (bool): Whether to resume param scheduler
+                state. Defaults to True.
+            map_location (str or callable): A string or a callable function
+                specifying how to remap storage locations.
+                Defaults to 'default'.
+        """
+
+        def callback(checkpoint):
+            self.call_hook('after_load_checkpoint', checkpoint=checkpoint)
+
+        checkpoint = self.strategy.resume(
+            filename,
+            resume_optimizer=resume_optimizer,
+            resume_param_scheduler=resume_param_scheduler,
+            map_location=map_location,
+            callback=callback,
+        )
+
+        self.train_loop._epoch = checkpoint['meta']['epoch']
+        self.train_loop._iter = checkpoint['meta']['iter']
+
+        # check whether the number of GPUs used for the current experiment
+        # is consistent with the checkpoint being resumed from
+        if 'config' in checkpoint['meta']:
+            config = mmengine.Config.fromstring(
+                checkpoint['meta']['config'], file_format='.py')
+            previous_gpu_ids = config.get('gpu_ids', None)
+            if (previous_gpu_ids is not None and len(previous_gpu_ids) > 0
+                    and len(previous_gpu_ids) != self.world_size):
+                # TODO, should we modify the iteration?
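+                # With a different number of GPUs the effective batch size
+                # changes, so resuming is only allowed when `auto_scale_lr`
+                # can rescale the learning rate accordingly (checked below).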
+                self.logger.info(
+                    'Number of GPUs used for current experiment is not '
+                    'consistent with resuming from checkpoint')
+                if (self._auto_scale_lr is None
+                        or not self._auto_scale_lr.get('enable', False)):
+                    raise RuntimeError(
+                        'Cannot automatically rescale lr in resuming. Please '
+                        'make sure the number of GPUs is consistent with the '
+                        'previous training state resuming from the checkpoint '
+                        'or set `enable` in `auto_scale_lr` to False.')
+
+        resumed_dataset_meta = checkpoint['meta'].get('dataset_meta', None)
+        dataset_meta = getattr(self.train_dataloader.dataset, 'metainfo', None)
+
+        # `resumed_dataset_meta` and `dataset_meta` could be objects like
+        # np.ndarray, which cannot be directly compared for equality,
+        # therefore we just compare their pickled dumps.
+        if pickle.dumps(resumed_dataset_meta) != pickle.dumps(dataset_meta):
+            self.logger.warning(
+                'The dataset metainfo from the resumed checkpoint is '
+                'different from the current training dataset, please '
+                'check the correctness of the checkpoint or the training '
+                'dataset.')
+
+        self.message_hub.load_state_dict(checkpoint['message_hub'])
+
+        self.logger.info(f'resumed epoch: {self.epoch}, iter: {self.iter}')
+
+    def load_checkpoint(self,
+                        filename: str,
+                        map_location: Union[str, Callable] = 'cpu',
+                        strict: bool = False,
+                        revise_keys: list = [(r'^module\.', '')]):
+        """Load checkpoint from given ``filename``.
+
+        Args:
+            filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+                ``open-mmlab://xxx``.
+            map_location (str or callable): A string or a callable function
+                specifying how to remap storage locations.
+                Defaults to 'cpu'.
+            strict (bool): Whether to allow different params for
+                the model and checkpoint.
+            revise_keys (list): A list of customized keywords to modify the
+                state_dict in checkpoint. Each item is a (pattern, replacement)
+                pair of the regular expression operations. Defaults to strip
+                the prefix 'module.' by [(r'^module\\.', '')].
+        """
+
+        def callback(checkpoint):
+            self.call_hook('after_load_checkpoint', checkpoint=checkpoint)
+
+        self.strategy.load_checkpoint(
+            filename,
+            map_location=map_location,
+            strict=strict,
+            revise_keys=revise_keys,
+            callback=callback)
+
+    def save_checkpoint(
+        self,
+        out_dir: str,
+        filename: str,
+        file_client_args: Optional[dict] = None,
+        save_optimizer: bool = True,
+        save_param_scheduler: bool = True,
+        meta: Optional[dict] = None,
+        by_epoch: bool = True,
+        backend_args: Optional[dict] = None,
+    ):
+        """Save checkpoints.
+
+        ``CheckpointHook`` invokes this method to save checkpoints
+        periodically.
+
+        Args:
+            out_dir (str): The directory that checkpoints are saved.
+            filename (str): The checkpoint filename.
+            file_client_args (dict, optional): Arguments to instantiate a
+                FileClient. See :class:`mmengine.fileio.FileClient` for
+                details. Defaults to None. It will be deprecated in future.
+                Please use `backend_args` instead.
+            save_optimizer (bool): Whether to save the optimizer to
+                the checkpoint. Defaults to True.
+            save_param_scheduler (bool): Whether to save the param_scheduler
+                to the checkpoint. Defaults to True.
+            meta (dict, optional): The meta information to be saved in the
+                checkpoint. Defaults to None.
+            by_epoch (bool): Whether the checkpoint is saved at the end of an
+                epoch (True) or an iteration (False), which decides how the
+                ``epoch`` and ``iter`` meta fields are counted.
+                Defaults to True.
+            backend_args (dict, optional): Arguments to instantiate the
+                backend corresponding to the uri prefix. Defaults to None.
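+
+        Examples:
+            >>> # A minimal sketch; assumes ``runner`` is a built runner.
+            >>> # The directory and filename below are illustrative.
+            >>> runner.save_checkpoint('work_dirs/demo', 'epoch_1.pth')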
+ """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError( + f'meta should be a dict or None, but got {type(meta)}') + + if by_epoch: + # self.epoch increments 1 after + # `self.call_hook('after_train_epoch)` but `save_checkpoint` is + # called by `after_train_epoch`` method of `CheckpointHook` so + # `epoch` should be `self.epoch + 1` + meta.update(epoch=self.epoch + 1, iter=self.iter) + else: + meta.update(epoch=self.epoch, iter=self.iter + 1) + + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at ' + 'the same time.') + + file_client = FileClient.infer_client(file_client_args, out_dir) + filepath = file_client.join_path(out_dir, filename) + else: + filepath = join_path( # type: ignore + out_dir, filename, backend_args=backend_args) + + meta.update( + cfg=self.cfg.pretty_text, experiment_name=self.experiment_name) + + if hasattr(self.train_dataloader.dataset, 'metainfo'): + meta.update(dataset_meta=self.train_dataloader.dataset.metainfo) + + checkpoint = { + 'meta': meta, + 'message_hub': self.message_hub.state_dict() + } + + def callback(checkpoint): + self.call_hook('before_save_checkpoint', checkpoint=checkpoint) + + self.strategy.save_checkpoint( + filename=filepath, + save_optimizer=save_optimizer, + save_param_scheduler=save_param_scheduler, + extra_ckpt=checkpoint, + callback=callback, + ) + + @master_only + def dump_config(self) -> None: + """Dump config to `work_dir`.""" + if self.cfg.filename is not None: + filename = osp.basename(self.cfg.filename) + else: + filename = f'{self.timestamp}.py' + self.cfg.dump(osp.join(self.work_dir, filename)) + + def _log_env(self) -> None: + """Logging environment information of the current task. + + Args: + env_cfg (dict): The environment config of the runner. + """ + # Collect and log environment information. + system_env, runtime_env = self.strategy.collect_env() + + env_info = '\n ' + '\n '.join(f'{k}: {v}' + for k, v in system_env.items()) + runtime_env_info = '\n ' + '\n '.join( + f'{k}: {v}' for k, v in runtime_env.items()) + dash_line = '-' * 60 + self.logger.info('\n' + dash_line + '\nSystem environment:' + + env_info + '\n' + '\nRuntime environment:' + runtime_env_info + '\n' + + dash_line + '\n') + + if self.cfg._cfg_dict: + self.logger.info(f'Config:\n{self.cfg.pretty_text}') diff --git a/head_extractor/build/lib/mmengine/runner/activation_checkpointing.py b/head_extractor/build/lib/mmengine/runner/activation_checkpointing.py new file mode 100644 index 0000000000000000000000000000000000000000..3db67f057ced0f56a533eaeb98a1ea52fdd183cf --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/activation_checkpointing.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from functools import wraps
+from operator import attrgetter
+from typing import List, Union
+
+import torch
+from torch.utils.checkpoint import checkpoint
+
+
+def wrap_forward(forward):
+
+    @wraps(forward)
+    def wrapper(*args):
+        return checkpoint(forward, *args)
+
+    return wrapper
+
+
+def turn_on_activation_checkpointing(model: torch.nn.Module,
+                                     modules: Union[List[str], str]):
+
+    if isinstance(modules, str):
+        modules = [modules]
+    for module_name in modules:
+        module = attrgetter(module_name)(model)
+        module.forward = wrap_forward(module.forward)
diff --git a/head_extractor/build/lib/mmengine/runner/amp.py b/head_extractor/build/lib/mmengine/runner/amp.py
new file mode 100644
index 0000000000000000000000000000000000000000..198babc58239ee4bc54335b869b1418f511fcb10
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/runner/amp.py
@@ -0,0 +1,161 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from contextlib import contextmanager
+from typing import Optional
+
+import torch
+
+from mmengine.device import (get_device, is_cuda_available, is_mlu_available,
+                             is_npu_available)
+from mmengine.logging import print_log
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils import TORCH_VERSION
+
+
+@contextmanager
+def autocast(device_type: Optional[str] = None,
+             dtype: Optional[torch.dtype] = None,
+             enabled: bool = True,
+             cache_enabled: Optional[bool] = None):
+    """A wrapper of ``torch.autocast`` and ``torch.cuda.amp.autocast``.
+
+    PyTorch 1.5.0 provides ``torch.cuda.amp.autocast`` for running in
+    mixed precision, which was updated to ``torch.autocast`` in 1.10.0.
+    The two interfaces have different arguments, and ``torch.autocast``
+    additionally supports running on cpu.
+
+    This function provides a unified interface by wrapping
+    ``torch.autocast`` and ``torch.cuda.amp.autocast``, which resolves the
+    compatibility issue that ``torch.cuda.amp.autocast`` does not support
+    running mixed precision on cpu and that the two contexts take different
+    arguments. We suggest that users use this function in their code
+    to achieve maximum compatibility across PyTorch versions.
+
+    Note:
+        ``autocast`` requires pytorch version >= 1.5.0. If pytorch version
+        < 1.10.0 and cuda is not available, it will raise an error with
+        ``enabled=True``, since ``torch.cuda.amp.autocast`` only supports
+        cuda mode.
+
+    Examples:
+         >>> # case1: 1.10 > Pytorch version >= 1.5.0
+         >>> with autocast():
+         >>>    # run in mixed precision context
+         >>>    pass
+         >>> with autocast(device_type='cpu'):
+         >>>    # raise error, torch.cuda.amp.autocast only supports cuda mode.
+         >>>    pass
+         >>> # case2: Pytorch version >= 1.10.0
+         >>> with autocast():
+         >>>    # default cuda mixed precision context
+         >>>    pass
+         >>> with autocast(device_type='cpu'):
+         >>>    # cpu mixed precision context
+         >>>    pass
+         >>> with autocast(
+         >>>     device_type='cuda', enabled=True, cache_enabled=True):
+         >>>    # enable precision context with more specific arguments.
+         >>>    pass
+
+    Args:
+        device_type (str, optional): The device type, e.g. 'cuda' or 'cpu'.
+            If None, the current device is used.
+        enabled (bool): Whether autocasting should be enabled in the region.
+            Defaults to True.
+        dtype (torch.dtype, optional): The data type to autocast to, either
+            ``torch.float16`` or ``torch.bfloat16``.
+        cache_enabled (bool, optional): Whether the weight cache inside
+            autocast should be enabled.
+    """
+    # If `enabled` is False, an empty context is entered and all calculations
+    # are performed under fp32.
+ assert digit_version(TORCH_VERSION) >= digit_version('1.5.0'), ( + 'The minimum pytorch version requirements of mmengine is 1.5.0, but ' + f'got {TORCH_VERSION}') + + if (digit_version('1.5.0') <= digit_version(TORCH_VERSION) < + digit_version('1.10.0')): + # If pytorch version is between 1.5.0 and 1.10.0, the default value of + # dtype for `torch.cuda.amp.autocast` is torch.float16. + assert ( + device_type == 'cuda' or device_type == 'mlu' + or device_type is None), ( + 'Pytorch version under 1.10.0 only supports running automatic ' + 'mixed training with cuda or mlu') + if dtype is not None or cache_enabled is not None: + print_log( + f'{dtype} and {device_type} will not work for ' + '`autocast` since your Pytorch version: ' + f'{TORCH_VERSION} <= 1.10.0', + logger='current', + level=logging.WARNING) + + if is_npu_available(): + with torch.npu.amp.autocast(enabled=enabled): + yield + elif is_mlu_available(): + with torch.mlu.amp.autocast(enabled=enabled): + yield + elif is_cuda_available(): + with torch.cuda.amp.autocast(enabled=enabled): + yield + else: + if not enabled: + yield + else: + raise RuntimeError( + 'If pytorch versions is between 1.5.0 and 1.10, ' + '`autocast` is only available in gpu mode') + + else: + # Modified from https://github.com/pytorch/pytorch/blob/master/torch/amp/autocast_mode.py # noqa: E501 + # This code should update with the `torch.autocast`. + if cache_enabled is None: + cache_enabled = torch.is_autocast_cache_enabled() + device = get_device() + device_type = device if device_type is None else device_type + + if device_type == 'cuda': + if dtype is None: + dtype = torch.get_autocast_gpu_dtype() + + if dtype == torch.bfloat16 and not \ + torch.cuda.is_bf16_supported(): + raise RuntimeError( + 'Current CUDA Device does not support bfloat16. Please ' + 'switch dtype to float16.') + + elif device_type == 'cpu': + if dtype is None: + dtype = torch.bfloat16 + assert dtype == torch.bfloat16, ( + 'In CPU autocast, only support `torch.bfloat16` dtype') + + elif device_type == 'mlu': + pass + + elif device_type == 'npu': + pass + elif device_type == 'musa': + if dtype is None: + dtype = torch.get_autocast_gpu_dtype() + with torch.musa.amp.autocast( + enabled=enabled, dtype=dtype, cache_enabled=cache_enabled): + yield + return + else: + # Device like MPS does not support fp16 training or testing. + # If an inappropriate device is set and fp16 is enabled, an error + # will be thrown. + if enabled is False: + yield + return + else: + raise ValueError('User specified autocast device_type must be ' + f'cuda or cpu, but got {device_type}') + + with torch.autocast( + device_type=device_type, + enabled=enabled, + dtype=dtype, + cache_enabled=cache_enabled): + yield diff --git a/head_extractor/build/lib/mmengine/runner/base_loop.py b/head_extractor/build/lib/mmengine/runner/base_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..5bae459a2071b1560ddf159a2922925f6a58bdb7 --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/base_loop.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import Any, Dict, Union + +from torch.utils.data import DataLoader + + +class BaseLoop(metaclass=ABCMeta): + """Base loop class. + + All subclasses inherited from ``BaseLoop`` should overwrite the + :meth:`run` method. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): An iterator to generate one batch of + dataset each iteration. 
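+
+    Examples:
+        >>> # A minimal sketch of a subclass; the single pass over the
+        >>> # dataloader is illustrative, not a required contract.
+        >>> class ToyLoop(BaseLoop):
+        ...     def run(self):
+        ...         for data_batch in self.dataloader:
+        ...             pass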
+ """ + + def __init__(self, runner, dataloader: Union[DataLoader, Dict]) -> None: + self._runner = runner + if isinstance(dataloader, dict): + # Determine whether or not different ranks use different seed. + diff_rank_seed = runner._randomness_cfg.get( + 'diff_rank_seed', False) + self.dataloader = runner.build_dataloader( + dataloader, seed=runner.seed, diff_rank_seed=diff_rank_seed) + else: + self.dataloader = dataloader + + @property + def runner(self): + return self._runner + + @abstractmethod + def run(self) -> Any: + """Execute loop.""" diff --git a/head_extractor/build/lib/mmengine/runner/checkpoint.py b/head_extractor/build/lib/mmengine/runner/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..60d71a735baaf0f0e93f8a99e927124c71d70763 --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/checkpoint.py @@ -0,0 +1,815 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import logging +import os +import os.path as osp +import pkgutil +import re +from collections import OrderedDict, namedtuple +from importlib import import_module +from tempfile import TemporaryDirectory +from typing import Callable, Dict, Optional + +import torch + +import mmengine +from mmengine.dist import get_dist_info +from mmengine.fileio import FileClient, get_file_backend +from mmengine.fileio import load as load_file +from mmengine.logging import print_log +from mmengine.model import BaseTTAModel, is_model_wrapper +from mmengine.utils import (apply_to, deprecated_function, digit_version, + mkdir_or_exist) +from mmengine.utils.dl_utils import load_url + +# `MMENGINE_HOME` is the highest priority directory to save checkpoints +# downloaded from Internet. If it is not set, as a workaround, using +# `XDG_CACHE_HOME`` or `~/.cache` instead. +# Note that `XDG_CACHE_HOME` defines the base directory relative to which +# user-specific non-essential data files should be stored. If `XDG_CACHE_HOME` +# is either not set or empty, a default equal to `~/.cache` should be used. +ENV_MMENGINE_HOME = 'MMENGINE_HOME' +ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' +DEFAULT_CACHE_DIR = '~/.cache' + + +class _IncompatibleKeys( + namedtuple('IncompatibleKeys', ['missing_keys', 'unexpected_keys'])): + + def __repr__(self): + if not self.missing_keys and not self.unexpected_keys: + return '' + return super().__repr__() + + __str__ = __repr__ + + +def _get_mmengine_home(): + mmengine_home = os.path.expanduser( + os.getenv( + ENV_MMENGINE_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmengine'))) + + mkdir_or_exist(mmengine_home) + return mmengine_home + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. + + This method is modified from :meth:`torch.nn.Module.load_state_dict`. + Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Defaults to False. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. 
+ """ + unexpected_keys = [] + missing_keys = [] + err_msg = [] + + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + # use _load_from_state_dict to enable checkpoint version control + def load(module, local_state_dict, prefix=''): + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_model_wrapper(module) or isinstance(module, BaseTTAModel): + module = module.module + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(local_state_dict, prefix, local_metadata, + True, missing_keys, unexpected_keys, + err_msg) + for name, child in module._modules.items(): + if child is not None: + child_prefix = prefix + name + '.' + child_state_dict = { + k: v + for k, v in local_state_dict.items() + if k.startswith(child_prefix) + } + load(child, child_state_dict, child_prefix) + + # Note that the hook can modify missing_keys and unexpected_keys. + incompatible_keys = _IncompatibleKeys(missing_keys, unexpected_keys) + if hasattr(module, '_load_state_dict_post_hooks'): + for hook in module._load_state_dict_post_hooks.values(): + out = hook(module, incompatible_keys) + assert out is None, ( + 'Hooks registered with ' + '``register_load_state_dict_post_hook`` are not expected ' + 'to return new values, if incompatible_keys need to be ' + 'modified, it should be done inplace.') + + load(module, state_dict) + load = None # break load->load reference cycle + + # ignore "num_batches_tracked" of BN layers + missing_keys = [ + key for key in missing_keys if 'num_batches_tracked' not in key + ] + + if unexpected_keys: + err_msg.append('unexpected key in source ' + f'state_dict: {", ".join(unexpected_keys)}\n') + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n') + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert( + 0, 'The model and loaded state dict do not match exactly\n') + err_msg = '\n'.join(err_msg) + if strict: + raise RuntimeError(err_msg) + else: + print_log(err_msg, logger=logger, level=logging.WARNING) + + +def get_torchvision_models(): + import torchvision + if digit_version(torchvision.__version__) < digit_version('0.13.0a0'): + model_urls = dict() + # When the version of torchvision is lower than 0.13, the model url is + # not declared in `torchvision.model.__init__.py`, so we need to + # iterate through `torchvision.models.__path__` to get the url for each + # model. + for _, name, ispkg in pkgutil.walk_packages( + torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f'torchvision.models.{name}') + if hasattr(_zoo, 'model_urls'): + _urls = getattr(_zoo, 'model_urls') + model_urls.update(_urls) + else: + # Since torchvision bumps to v0.13, the weight loading logic, + # model keys and model urls have been changed. Here the URLs of old + # version is loaded to avoid breaking back compatibility. If the + # torchvision version>=0.13.0, new URLs will be added. Users can get + # the resnet50 checkpoint by setting 'resnet50.imagent1k_v1', + # 'resnet50' or 'ResNet50_Weights.IMAGENET1K_V1' in the config. 
+ json_path = osp.join(mmengine.__path__[0], 'hub/torchvision_0.12.json') + model_urls = mmengine.load(json_path) + if digit_version(torchvision.__version__) < digit_version('0.14.0a0'): + weights_list = [ + cls for cls_name, cls in torchvision.models.__dict__.items() + if cls_name.endswith('_Weights') + ] + else: + weights_list = [ + torchvision.models.get_model_weights(model) + for model in torchvision.models.list_models(torchvision.models) + ] + + for cls in weights_list: + # The name of torchvision model weights classes ends with + # `_Weights` such as `ResNet18_Weights`. However, some model weight + # classes, such as `MNASNet0_75_Weights` does not have any urls in + # torchvision 0.13.0 and cannot be iterated. Here we simply check + # `DEFAULT` attribute to ensure the class is not empty. + if not hasattr(cls, 'DEFAULT'): + continue + # Since `cls.DEFAULT` can not be accessed by iterating cls, we set + # default urls explicitly. + cls_name = cls.__name__ + cls_key = cls_name.replace('_Weights', '').lower() + model_urls[f'{cls_key}.default'] = cls.DEFAULT.url + for weight_enum in cls: + cls_key = cls_name.replace('_Weights', '').lower() + cls_key = f'{cls_key}.{weight_enum.name.lower()}' + model_urls[cls_key] = weight_enum.url + + return model_urls + + +def get_external_models(): + mmengine_home = _get_mmengine_home() + default_json_path = osp.join(mmengine.__path__[0], 'hub/openmmlab.json') + default_urls = load_file(default_json_path) + assert isinstance(default_urls, dict) + external_json_path = osp.join(mmengine_home, 'open_mmlab.json') + if osp.exists(external_json_path): + external_urls = load_file(external_json_path) + assert isinstance(external_urls, dict) + default_urls.update(external_urls) + + return default_urls + + +def get_mmcls_models(): + mmcls_json_path = osp.join(mmengine.__path__[0], 'hub/mmcls.json') + mmcls_urls = load_file(mmcls_json_path) + + return mmcls_urls + + +def get_deprecated_model_names(): + deprecate_json_path = osp.join(mmengine.__path__[0], 'hub/deprecated.json') + deprecate_urls = load_file(deprecate_json_path) + assert isinstance(deprecate_urls, dict) + + return deprecate_urls + + +def _process_mmcls_checkpoint(checkpoint): + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + # Some checkpoints converted from 3rd-party repo don't + # have the "state_dict" key. + state_dict = checkpoint + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k.startswith('backbone.'): + new_state_dict[k[9:]] = v + new_checkpoint = dict(state_dict=new_state_dict) + + return new_checkpoint + + +class CheckpointLoader: + """A general checkpoint loader to manage all schemes.""" + + _schemes: Dict[str, Callable] = {} + + @classmethod + def _register_scheme(cls, prefixes, loader, force=False): + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + for prefix in prefixes: + if (prefix not in cls._schemes) or force: + cls._schemes[prefix] = loader + else: + raise KeyError( + f'{prefix} is already registered as a loader backend, ' + 'add "force=True" if you want to override it') + # sort, longer prefixes take priority + cls._schemes = OrderedDict( + sorted(cls._schemes.items(), key=lambda t: t[0], reverse=True)) + + @classmethod + def register_scheme(cls, prefixes, loader=None, force=False): + """Register a loader to CheckpointLoader. + + This method can be used as a normal class method or a decorator. 
+ + Args: + prefixes (str or list[str] or tuple[str]): + The prefix of the registered loader. + loader (function, optional): The loader function to be registered. + When this method is used as a decorator, loader is None. + Defaults to None. + force (bool, optional): Whether to override the loader + if the prefix has already been registered. Defaults to False. + """ + + if loader is not None: + cls._register_scheme(prefixes, loader, force=force) + return + + def _register(loader_cls): + cls._register_scheme(prefixes, loader_cls, force=force) + return loader_cls + + return _register + + @classmethod + def _get_checkpoint_loader(cls, path): + """Finds a loader that supports the given path. Falls back to the local + loader if no other loader is found. + + Args: + path (str): checkpoint path + + Returns: + callable: checkpoint loader + """ + for p in cls._schemes: + # use regular match to handle some cases that where the prefix of + # loader has a prefix. For example, both 's3://path' and + # 'open-mmlab:s3://path' should return `load_from_ceph` + if re.match(p, path) is not None: + return cls._schemes[p] + + @classmethod + def load_checkpoint(cls, filename, map_location=None, logger='current'): + """load checkpoint through URL scheme path. + + Args: + filename (str): checkpoint file name with given prefix + map_location (str, optional): Same as :func:`torch.load`. + Defaults to None + logger (str): The logger for message. Defaults to 'current'. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + checkpoint_loader = cls._get_checkpoint_loader(filename) + class_name = checkpoint_loader.__name__ + print_log( + f'Loads checkpoint by {class_name[10:]} backend from path: ' + f'{filename}', + logger=logger) + return checkpoint_loader(filename, map_location) + + +@CheckpointLoader.register_scheme(prefixes='') +def load_from_local(filename, map_location): + """load checkpoint by local file path. + + Args: + filename (str): local checkpoint file path + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + filename = osp.expanduser(filename) + if not osp.isfile(filename): + raise FileNotFoundError(f'{filename} can not be found.') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes=('http://', 'https://')) +def load_from_http(filename, + map_location=None, + model_dir=None, + progress=os.isatty(0)): + """load checkpoint through HTTP or HTTPS scheme path. In distributed + setting, this function only download checkpoint at local rank 0. + + Args: + filename (str): checkpoint file path with modelzoo or + torchvision prefix + map_location (str, optional): Same as :func:`torch.load`. + model_dir (string, optional): directory in which to save the object, + Defaults to None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + rank, world_size = get_dist_info() + if rank == 0: + checkpoint = load_url( + filename, + model_dir=model_dir, + map_location=map_location, + progress=progress) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = load_url( + filename, + model_dir=model_dir, + map_location=map_location, + progress=progress) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes='pavi://') +def load_from_pavi(filename, map_location=None): + """load checkpoint through the file path prefixed with pavi. 
In distributed + setting, this function download ckpt at all ranks to different temporary + directories. + + Args: + filename (str): checkpoint file path with pavi prefix + map_location (str, optional): Same as :func:`torch.load`. + Defaults to None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + assert filename.startswith('pavi://'), \ + f'Expected filename startswith `pavi://`, but get {filename}' + model_path = filename[7:] + + try: + from pavi import modelcloud + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme( + prefixes=[r'(\S+\:)?s3://', r'(\S+\:)?petrel://']) +def load_from_ceph(filename, map_location=None, backend='petrel'): + """load checkpoint through the file path prefixed with s3. In distributed + setting, this function download ckpt at all ranks to different temporary + directories. + + Args: + filename (str): checkpoint file path with s3 prefix + map_location (str, optional): Same as :func:`torch.load`. + backend (str, optional): The storage backend type. + Defaults to 'petrel'. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + file_backend = get_file_backend( + filename, backend_args={'backend': backend}) + with io.BytesIO(file_backend.get(filename)) as buffer: + checkpoint = torch.load(buffer, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes=('modelzoo://', 'torchvision://')) +def load_from_torchvision(filename, map_location=None): + """load checkpoint through the file path prefixed with modelzoo or + torchvision. + + Args: + filename (str): checkpoint file path with modelzoo or + torchvision prefix + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + model_urls = get_torchvision_models() + if filename.startswith('modelzoo://'): + print_log( + 'The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead', + logger='current', + level=logging.WARNING) + model_name = filename[11:] + else: + model_name = filename[14:] + return load_from_http(model_urls[model_name], map_location=map_location) + + +@CheckpointLoader.register_scheme(prefixes=('open-mmlab://', 'openmmlab://')) +def load_from_openmmlab(filename, map_location=None): + """load checkpoint through the file path prefixed with open-mmlab or + openmmlab. + + Args: + filename (str): checkpoint file path with open-mmlab or + openmmlab prefix + map_location (str, optional): Same as :func:`torch.load`. + Defaults to None + + Returns: + dict or OrderedDict: The loaded checkpoint. 
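+
+    Examples:
+        >>> # The model name below is illustrative; valid names are the keys
+        >>> # of mmengine's hub/openmmlab.json.
+        >>> ckpt = load_from_openmmlab('open-mmlab://resnet50_caffe')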
+ """ + + model_urls = get_external_models() + prefix_str = 'open-mmlab://' + if filename.startswith(prefix_str): + model_name = filename[13:] + else: + model_name = filename[12:] + prefix_str = 'openmmlab://' + + deprecated_urls = get_deprecated_model_names() + if model_name in deprecated_urls: + print_log( + f'{prefix_str}{model_name} is deprecated in favor ' + f'of {prefix_str}{deprecated_urls[model_name]}', + logger='current', + level=logging.WARNING) + model_name = deprecated_urls[model_name] + model_url = model_urls[model_name] + # check if is url + if model_url.startswith(('http://', 'https://')): + checkpoint = load_from_http(model_url, map_location=map_location) + else: + filename = osp.join(_get_mmengine_home(), model_url) + if not osp.isfile(filename): + raise FileNotFoundError(f'{filename} can not be found.') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes='mmcls://') +def load_from_mmcls(filename, map_location=None): + """load checkpoint through the file path prefixed with mmcls. + + Args: + filename (str): checkpoint file path with mmcls prefix + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + model_urls = get_mmcls_models() + model_name = filename[8:] + checkpoint = load_from_http( + model_urls[model_name], map_location=map_location) + checkpoint = _process_mmcls_checkpoint(checkpoint) + return checkpoint + + +def _load_checkpoint(filename, map_location=None, logger=None): + """Load checkpoint from somewhere (modelzoo, file, url). + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str, optional): Same as :func:`torch.load`. + Defaults to None. + logger (:mod:`logging.Logger`, optional): The logger for error message. + Defaults to None + + Returns: + dict or OrderedDict: The loaded checkpoint. It can be either an + OrderedDict storing model weights or a dict containing other + information, which depends on the checkpoint. + """ + return CheckpointLoader.load_checkpoint(filename, map_location, logger) + + +def _load_checkpoint_with_prefix(prefix, filename, map_location=None): + """Load partial pretrained model with specific prefix. + + Args: + prefix (str): The prefix of sub-module. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str | None): Same as :func:`torch.load`. + Defaults to None. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + checkpoint = _load_checkpoint(filename, map_location=map_location) + + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + if not prefix.endswith('.'): + prefix += '.' 
+ prefix_len = len(prefix) + + state_dict = { + k[prefix_len:]: v + for k, v in state_dict.items() if k.startswith(prefix) + } + + assert state_dict, f'{prefix} is not in the pretrained model' + return state_dict + + +def _load_checkpoint_to_model(model, + checkpoint, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # strip prefix of state_dict + metadata = getattr(state_dict, '_metadata', OrderedDict()) + for p, r in revise_keys: + state_dict = OrderedDict( + {re.sub(p, r, k): v + for k, v in state_dict.items()}) + # Keep metadata in state_dict + state_dict._metadata = metadata + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Defaults to strip + the prefix 'module.' by [(r'^module\\.', '')]. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + checkpoint = _load_checkpoint(filename, map_location, logger) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + + return _load_checkpoint_to_model(model, checkpoint, strict, logger, + revise_keys) + + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + + Args: + state_dict (OrderedDict): Model weights on GPU. + + Returns: + OrderedDict: Model weights on GPU. + """ + # stash metadata to put in state_dict later + metadata = getattr(state_dict, '_metadata', OrderedDict()) + state_dict = apply_to(state_dict, lambda x: hasattr(x, 'cpu'), + lambda x: x.cpu()) + state_dict._metadata = metadata + return state_dict + + +@deprecated_function( + since='0.3.0', + removed_in='0.5.0', + instructions='`_save_to_state_dict` will be deprecated in the future, ' + 'please use `nn.Module._save_to_state_dict` directly.') +def _save_to_state_dict(module, destination, prefix, keep_vars): + """Saves module state to `destination` dictionary. + + This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. + + Args: + module (nn.Module): The module to generate state_dict. + destination (dict): A dict where state will be stored. + prefix (str): The prefix for parameters and buffers used in this + module. + keep_vars (bool): Whether to keep the variable property of the + parameters. 
+ """ + for name, param in module._parameters.items(): + if param is not None: + destination[prefix + name] = param if keep_vars else param.detach() + for name, buf in module._buffers.items(): + if buf is not None and name not in module._non_persistent_buffers_set: + destination[prefix + name] = buf if keep_vars else buf.detach() + + +def get_state_dict(module, destination=None, prefix='', keep_vars=False): + """Returns a dictionary containing a whole state of the module. + + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. + This method is modified from :meth:`torch.nn.Module.state_dict` to + recursively check parallel module in case that the model has a complicated + structure, e.g., nn.Module(nn.Module(DDP)). + + Args: + module (nn.Module): The module to generate state_dict. + destination (OrderedDict): Returned dict for the state of the + module. + prefix (str): Prefix of the key. + keep_vars (bool): Whether to keep the variable property of the + parameters. Defaults to False. + + Returns: + dict: A dictionary containing a whole state of the module. + """ + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_model_wrapper(module): + module = module.module + + # below is the same as torch.nn.Module.state_dict() + if destination is None: + destination = OrderedDict() + destination._metadata = OrderedDict() + destination._metadata[prefix[:-1]] = local_metadata = dict( + version=module._version) + module._save_to_state_dict(destination, prefix, keep_vars) + for name, child in module._modules.items(): + if child is not None: + get_state_dict( + child, destination, prefix + name + '.', keep_vars=keep_vars) + for hook in module._state_dict_hooks.values(): + hook_result = hook(module, destination, prefix, local_metadata) + if hook_result is not None: + destination = hook_result + return destination + + +def save_checkpoint(checkpoint, + filename, + file_client_args=None, + backend_args=None): + """Save checkpoint to file. + + Args: + checkpoint (dict): Module whose params are to be saved. + filename (str): Checkpoint filename. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Defaults to None. It will be deprecated in future. Please use + `backend_args` instead. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + New in v0.2.0. + """ + if file_client_args is not None: + print_log( + '"file_client_args" will be deprecated in future. 
' + 'Please use "backend_args" instead', + logger='current', + level=logging.WARNING) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set ' + 'at the same time.') + + if filename.startswith('pavi://'): + if file_client_args is not None or backend_args is not None: + raise ValueError( + '"file_client_args" or "backend_args" should be "None" if ' + 'filename starts with "pavi://"') + try: + from pavi import exception, modelcloud + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + model_path = filename[7:] + root = modelcloud.Folder() + model_dir, model_name = osp.split(model_path) + try: + model = modelcloud.get(model_dir) + except exception.NodeNotFoundError: + model = root.create_training_model(model_dir) + with TemporaryDirectory() as tmp_dir: + checkpoint_file = osp.join(tmp_dir, model_name) + with open(checkpoint_file, 'wb') as f: + torch.save(checkpoint, f) + f.flush() + model.create_file(checkpoint_file, name=model_name) + else: + file_client = FileClient.infer_client(file_client_args, filename) + if file_client_args is None: + file_backend = get_file_backend( + filename, backend_args=backend_args) + else: + file_backend = file_client + + with io.BytesIO() as f: + torch.save(checkpoint, f) + file_backend.put(f.getvalue(), filename) + + +def find_latest_checkpoint(path: str) -> Optional[str]: + """Find the latest checkpoint from the given path. + + Refer to https://github.com/facebookresearch/fvcore/blob/main/fvcore/common/checkpoint.py # noqa: E501 + + Args: + path(str): The path to find checkpoints. + + Returns: + str or None: File path of the latest checkpoint. + """ + save_file = osp.join(path, 'last_checkpoint') + last_saved: Optional[str] + if os.path.exists(save_file): + with open(save_file) as f: + last_saved = f.read().strip() + else: + print_log('Did not find last_checkpoint to be resumed.') + last_saved = None + return last_saved diff --git a/head_extractor/build/lib/mmengine/runner/log_processor.py b/head_extractor/build/lib/mmengine/runner/log_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..98183ae317d813b946ae540899fcdae4e7764e99 --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/log_processor.py @@ -0,0 +1,582 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import datetime +import re +from collections import OrderedDict +from itertools import chain +from typing import List, Optional, Tuple + +import numpy as np +import torch + +from mmengine.device import (get_max_cuda_memory, get_max_musa_memory, + is_cuda_available, is_musa_available) +from mmengine.registry import LOG_PROCESSORS + + +@LOG_PROCESSORS.register_module() +class LogProcessor: + """A log processor used to format log information collected from + ``runner.message_hub.log_scalars``. + + ``LogProcessor`` instance is built by runner and will format + ``runner.message_hub.log_scalars`` to ``tag`` and ``log_str``, which can + directly used by ``LoggerHook`` and ``MMLogger``. Besides, the argument + ``custom_cfg`` of constructor can control the statistics method of logs. + + Args: + window_size (int): default smooth interval. Defaults to 10. + by_epoch (bool): Whether to format logs with epoch stype. Defaults to + True. + custom_cfg (list[dict], optional): Contains multiple log config dict, + in which key means the data source name of log and value means the + statistic method and corresponding arguments used to count the + data source. 
Defaults to None. + + - If custom_cfg is None, all logs will be formatted via default + methods, such as smoothing loss by default window_size. If + custom_cfg is defined as a list of config dict, for example: + [dict(data_src='loss', method='mean', log_name='global_loss', + window_size='global')]. It means the log item ``loss`` will be + counted as global mean and additionally logged as ``global_loss`` + (defined by ``log_name``). If ``log_name`` is not defined in the + config dict, the original logged key will be overwritten. + + - The original log item cannot be overwritten twice. Here is + an error example: + [dict(data_src='loss', method='mean', window_size='global'), + dict(data_src='loss', method='mean', window_size='epoch')]. + Neither log config dict in custom_cfg has the ``log_name`` key, + which means the loss item would be overwritten twice. + + - For those statistic methods with the ``window_size`` argument, + if ``by_epoch`` is set to False, ``window_size`` should not be + `epoch`, since the log value cannot be counted by epoch in that + case. + num_digits (int): The number of significant digits shown in the + logging message. Defaults to 4. + log_with_hierarchy (bool): Whether to log with hierarchy. If it is + True, the information is written to visualizer backend such as + :obj:`LocalVisBackend` and :obj:`TensorboardBackend` + with hierarchy. For example, ``loss`` will be saved as + ``train/loss``, and accuracy will be saved as ``val/accuracy``. + Defaults to False. + `New in version 0.7.0.` + mean_pattern (str): This is a regular expression used to match the logs + that need to be included in the smoothing statistics. + `New in version 0.7.3.` + + Examples: + >>> # `log_name` is defined, `loss_large_window` will be an additional + >>> # record. + >>> log_processor = dict( + >>> window_size=10, + >>> by_epoch=True, + >>> custom_cfg=[dict(data_src='loss', + >>> log_name='loss_large_window', + >>> method_name='mean', + >>> window_size=100)]) + >>> # `log_name` is not defined. `loss` will be overwritten. + >>> log_processor = dict( + >>> window_size=10, + >>> by_epoch=True, + >>> custom_cfg=[dict(data_src='loss', + >>> method_name='mean', + >>> window_size=100)]) + >>> # Record loss with different statistics methods. + >>> log_processor = dict( + >>> window_size=10, + >>> by_epoch=True, + >>> custom_cfg=[dict(data_src='loss', + >>> log_name='loss_large_window', + >>> method_name='mean', + >>> window_size=100), + >>> dict(data_src='loss', + >>> method_name='mean', + >>> window_size=100)]) + >>> # Overwriting the loss item twice will raise an error. + >>> log_processor = dict( + >>> window_size=10, + >>> by_epoch=True, + >>> custom_cfg=[dict(data_src='loss', + >>> method_name='mean', + >>> window_size=100), + >>> dict(data_src='loss', + >>> method_name='max', + >>> window_size=100)]) + AssertionError + """ + + def __init__(self, + window_size=10, + by_epoch=True, + custom_cfg: Optional[List[dict]] = None, + num_digits: int = 4, + log_with_hierarchy: bool = False, + mean_pattern=r'.*(loss|time|data_time|grad_norm).*'): + self.window_size = window_size + self.by_epoch = by_epoch + self.custom_cfg = custom_cfg if custom_cfg else [] + self.num_digits = num_digits + self.log_with_hierarchy = log_with_hierarchy + self.mean_pattern = re.compile(mean_pattern) + self._check_custom_cfg() + + def get_log_after_iter(self, runner, batch_idx: int, + mode: str) -> Tuple[dict, str]: + """Format log string after training, validation or testing iteration. + + Args: + runner (Runner): The runner of the training phase. 
+ batch_idx (int): The index of the current batch in the current + loop. + mode (str): Current mode of runner, train, test or val. + + Return: + Tuple[dict, str]: Formatted log dict/string which will be + recorded by :obj:`runner.message_hub` and :obj:`runner.visualizer`. + """ + assert mode in ['train', 'test', 'val'] + # Overwrite ``window_size`` defined in ``custom_cfg`` to int value. + parsed_cfg = self._parse_windows_size(runner, batch_idx, + self.custom_cfg) + # log_tag is used to write log information to terminal + log_tag = self._collect_scalars(parsed_cfg, runner, mode) + + # If `self.log_with_hierarchy` is False, the tag is the same as + # log_tag. Otherwise, each key in tag starts with prefix `train`, + # `test` or `val` + if not self.log_with_hierarchy: + tag = copy.deepcopy(log_tag) + else: + tag = self._collect_scalars(parsed_cfg, runner, mode, True) + + # Record learning rate. + lr_str_list = [] + for key, value in tag.items(): + if key.endswith('lr'): + key = self._remove_prefix(key, f'{mode}/') + log_tag.pop(key) + lr_str_list.append(f'{key}: ' + f'{value:.{self.num_digits}e}') + lr_str = ' '.join(lr_str_list) + # Format log header. + # by_epoch == True + # train/val: Epoch [5][5/10] ... + # test: Epoch [5/10] + # by_epoch == False + # train: Epoch [5/10000] ... (divided by `max_iter`) + # val/test: Epoch [5/2000] ... (divided by length of dataloader) + if self.by_epoch: + # Align the iteration log: + # Epoch(train) [ 9][010/270] + # ... ||| ||| + # Epoch(train) [ 10][100/270] + dataloader_len = self._get_dataloader_size(runner, mode) + cur_iter = self._get_iter(runner, batch_idx) + cur_iter_str = str(cur_iter).rjust(len(str(dataloader_len))) + if mode in ['train', 'val']: + cur_epoch = self._get_epoch(runner, mode) + if not (isinstance(runner._train_loop, dict) + or runner._train_loop is None): + # Right Align the epoch log: + # Epoch(train) [9][100/270] + # ... || + # Epoch(train) [100][100/270] + max_epochs = runner.max_epochs + # 3 means the three characters: "[", "]", and " " occupied + # in " [{max_epochs}]" + cur_epoch_str = f'[{cur_epoch}]'.rjust( + len(str(max_epochs)) + 3, ' ') + else: + cur_epoch_str = f'[{cur_epoch}]' + tag['epoch'] = cur_epoch + log_str = (f'Epoch({mode}){cur_epoch_str}' + f'[{cur_iter_str}/{dataloader_len}] ') + else: + log_str = (f'Epoch({mode}) ' + f'[{cur_iter_str}/{dataloader_len}] ') + else: + if mode == 'train': + cur_iter = self._get_iter(runner, batch_idx) + cur_iter_str = str(cur_iter).rjust(len(str(runner.max_iters))) + log_str = (f'Iter({mode}) ' + f'[{cur_iter_str}/{runner.max_iters}] ') + else: + dataloader_len = self._get_dataloader_size(runner, mode) + cur_iter_str = str(batch_idx + 1).rjust( + len(str(dataloader_len))) + log_str = (f'Iter({mode}) [{cur_iter_str}/{dataloader_len}] ') + # Add global iter. + if isinstance(runner._train_loop, dict) or runner._train_loop is None: + tag['iter'] = 0 + else: + tag['iter'] = runner.iter + 1 + # Concatenate lr, momentum string with log header. + log_str += f'{lr_str} ' + # If IterTimerHook used in runner, eta, time, and data_time should be + # recorded. 
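+        # Illustrative sketch only (values are made up): with `by_epoch=True`
+        # the fully assembled train `log_str` comes out as one line roughly
+        # like
+        #   Epoch(train)  [ 9][ 10/270]  lr: 1.0000e-02  eta: 0:01:30
+        #   time: 0.0510  data_time: 0.0080  memory: 1024  loss: 0.2310
+        # (wrapped here across two comment lines for readability).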
+ if (all(item in log_tag for item in ['time', 'data_time']) + and 'eta' in runner.message_hub.runtime_info): + eta = runner.message_hub.get_info('eta') + eta_str = str(datetime.timedelta(seconds=int(eta))) + log_str += f'eta: {eta_str} ' + log_str += (f'time: {log_tag["time"]:.{self.num_digits}f} ' + f'data_time: ' + f'{log_tag["data_time"]:.{self.num_digits}f} ') + # Pop recorded keys + log_tag.pop('time') + log_tag.pop('data_time') + + # If cuda/musa is available, + # the max memory occupied should be calculated. + if is_cuda_available() or is_musa_available(): + max_memory = self._get_max_memory(runner) + log_str += f'memory: {max_memory} ' + tag['memory'] = max_memory + + # Loop left keys to fill `log_str`. + if mode in ('train', 'val'): + log_items = [] + for name, val in log_tag.items(): + if mode == 'val' and not name.startswith('val/loss'): + continue + if isinstance(val, float): + val = f'{val:.{self.num_digits}f}' + log_items.append(f'{name}: {val}') + log_str += ' '.join(log_items) + return tag, log_str + + def get_log_after_epoch(self, + runner, + batch_idx: int, + mode: str, + with_non_scalar: bool = False) -> Tuple[dict, str]: + """Format log string after validation or testing epoch. + + Args: + runner (Runner): The runner of validation/testing phase. + batch_idx (int): The index of the current batch in the current + loop. + mode (str): Current mode of runner. + with_non_scalar (bool): Whether to include non-scalar infos in the + returned tag. Defaults to False. + + Return: + Tuple[dict, str]: Formatted log dict/string which will be + recorded by :obj:`runner.message_hub` and :obj:`runner.visualizer`. + """ + assert mode in [ + 'test', 'val' + ], ('`_get_metric_log_str` only accept val or test mode, but got ' + f'{mode}') + dataloader_len = self._get_dataloader_size(runner, mode) + + # By epoch: + # Epoch(val) [10][1000/1000] ... + # Epoch(test) [1000/1000] ... + # By iteration: + # Iteration(val) [1000/1000] ... + # Iteration(test) [1000/1000] ... + if self.by_epoch: + if mode == 'val': + cur_epoch = self._get_epoch(runner, mode) + log_str = (f'Epoch({mode}) [{cur_epoch}][{dataloader_len}/' + f'{dataloader_len}] ') + else: + log_str = ( + f'Epoch({mode}) [{dataloader_len}/{dataloader_len}] ') + + else: + log_str = (f'Iter({mode}) [{dataloader_len}/{dataloader_len}] ') + + custom_cfg_copy = copy.deepcopy(self.custom_cfg) + # remove prefix + custom_keys = [ + self._remove_prefix(cfg['data_src'], f'{mode}/') + for cfg in custom_cfg_copy + ] + # Count the averaged time and data_time by epoch + if 'time' not in custom_keys: + custom_cfg_copy.append( + dict(data_src='time', window_size='epoch', method_name='mean')) + if 'data_time' not in custom_keys: + custom_cfg_copy.append( + dict( + data_src='data_time', + window_size='epoch', + method_name='mean')) + parsed_cfg = self._parse_windows_size(runner, batch_idx, + custom_cfg_copy) + # tag is used to write log information to different backends. + ori_tag = self._collect_scalars(parsed_cfg, runner, mode, + self.log_with_hierarchy) + non_scalar_tag = self._collect_non_scalars(runner, mode) + # move `time` or `data_time` to the end of the log + tag = OrderedDict() + time_tag = OrderedDict() + for key, value in ori_tag.items(): + if key in (f'{mode}/time', f'{mode}/data_time', 'time', + 'data_time'): + time_tag[key] = value + else: + tag[key] = value + # Log other messages. 
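+        # Illustrative sketch only (hypothetical metric name and values):
+        # after a validation epoch the assembled string looks roughly like
+        #   Epoch(val) [10][1000/1000]    accuracy: 0.9500  time: 0.0320
+        # with `time`/`data_time` deliberately moved to the end as above.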
+ log_items = [] + log_str += ' ' + for name, val in chain(tag.items(), non_scalar_tag.items(), + time_tag.items()): + if isinstance(val, float): + val = f'{val:.{self.num_digits}f}' + if isinstance(val, (torch.Tensor, np.ndarray)): + # newline to display tensor and array. + val = f'\n{val}\n' + log_items.append(f'{name}: {val}') + log_str += ' '.join(log_items) + + if with_non_scalar: + tag.update(non_scalar_tag) + tag.update(time_tag) + return tag, log_str + + def _collect_scalars(self, + custom_cfg: List[dict], + runner, + mode: str, + reserve_prefix: bool = False) -> dict: + """Collect log information to compose a dict according to mode. + + Args: + custom_cfg (List[dict]): A copy of ``self.custom_cfg`` with int + ``window_size``. + runner (Runner): The runner of the training/testing/validation + process. + mode (str): Current mode of runner. + reserve_prefix (bool): Whether to reserve the prefix of the key. + + Returns: + dict: Statistical values of logs. + """ + custom_cfg = copy.deepcopy(custom_cfg) + tag = OrderedDict() + # history_scalars of train/val/test phase. + history_scalars = runner.message_hub.log_scalars + # corresponding mode history_scalars + mode_history_scalars = OrderedDict() + # extract log scalars and remove prefix to `mode_history_scalars` + # according to mode. + for prefix_key, log_buffer in history_scalars.items(): + if prefix_key.startswith(mode): + if not reserve_prefix: + key = self._remove_prefix(prefix_key, f'{mode}/') + else: + key = prefix_key + mode_history_scalars[key] = log_buffer + for key in mode_history_scalars: + # Update the latest learning rate and smoothed time logs. + if re.search(self.mean_pattern, key) is not None: + tag[key] = mode_history_scalars[key].mean(self.window_size) + else: + # Default statistic method is current. + tag[key] = mode_history_scalars[key].current() + # Update custom keys. + for log_cfg in custom_cfg: + data_src = log_cfg.pop('data_src') + log_name = log_cfg.pop('log_name', data_src) + if reserve_prefix: + data_src = f'{mode}/{data_src}' + log_name = f'{mode}/{log_name}' + # log item in custom_cfg could only exist in train or val + # mode. + if data_src in mode_history_scalars: + tag[log_name] = mode_history_scalars[data_src].statistics( + **log_cfg) + return tag + + def _collect_non_scalars(self, runner, mode: str) -> dict: + """Collect log information to compose a dict according to mode. + + Args: + runner (Runner): The runner of the training/testing/validation + process. + mode (str): Current mode of runner. + + Returns: + dict: non-scalar infos of the specified mode. + """ + # infos of train/val/test phase. + infos = runner.message_hub.runtime_info + # corresponding mode infos + mode_infos = OrderedDict() + # extract log info and remove prefix to `mode_infos` according to mode. + for prefix_key, value in infos.items(): + if prefix_key.startswith(mode): + if self.log_with_hierarchy: + key = prefix_key + else: + key = self._remove_prefix(prefix_key, f'{mode}/') + mode_infos[key] = value + return mode_infos + + def _remove_prefix(self, string: str, prefix: str): + """Remove the prefix ``train``, ``val`` and ``test`` of the key.""" + if string.startswith(prefix): + return string[len(prefix):] + else: + return string + + def _check_custom_cfg(self) -> None: + """Check the legality of ``self.custom_cfg``.""" + + def _check_window_size(): + for log_cfg in self.custom_cfg: + if not self.by_epoch: + assert log_cfg['window_size'] != 'epoch', \ + 'window_size cannot be epoch if LoggerHook.by_epoch' \ + ' is False.' 
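+        # A hypothetical `custom_cfg` that trips the check defined below,
+        # because both entries would rewrite the same `loss` key without a
+        # distinct `log_name` (mirrors the error example in the class
+        # docstring):
+        #   custom_cfg = [dict(data_src='loss', method_name='mean',
+        #                      window_size=100),
+        #                 dict(data_src='loss', method_name='max',
+        #                      window_size=100)]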
+ + def _check_repeated_log_name(): + # The `log_name` of the same data_src should not be repeated. + # If `log_name` is not specified, `data_src` will be overwritten. + # But only allowed to be overwritten once. + check_set = set() + for log_cfg in self.custom_cfg: + assert 'data_src' in log_cfg + data_src = log_cfg['data_src'] + log_name = log_cfg.get('log_name', data_src) + assert log_name not in check_set, ( + f'Found duplicate {log_name} for {data_src}. Please check' + 'your `custom_cfg` for `log_processor`. You should ' + f'neither define duplicate `{log_name}` for {data_src} ' + f'nor do not define any {log_name} for multiple ' + f'{data_src}, See more information in the docstring of ' + 'LogProcessor') + + check_set.add(log_name) + + _check_repeated_log_name() + _check_window_size() + + def _parse_windows_size(self, + runner, + batch_idx: int, + custom_cfg: Optional[list] = None) -> list: + """Parse window_size defined in custom_cfg to int value. + + Args: + runner (Runner): The runner of the training/testing/validation + process. + batch_idx (int): The iteration index of current dataloader. + custom_cfg (list): A copy of ``self.custom_cfg``. Defaults to None + to keep backward compatibility. + """ + if custom_cfg is None: + custom_cfg = copy.deepcopy(self.custom_cfg) + else: + custom_cfg = copy.deepcopy(custom_cfg) + for log_cfg in custom_cfg: + window_size = log_cfg.get('window_size', None) + if window_size is None or isinstance(window_size, int): + continue + elif window_size == 'epoch': + log_cfg['window_size'] = batch_idx + 1 + elif window_size == 'global': + log_cfg['window_size'] = runner.iter + 1 + else: + raise TypeError( + 'window_size should be int, epoch or global, but got ' + f'invalid {window_size}') + return custom_cfg + + def _get_max_memory(self, runner) -> int: + """Returns the maximum GPU memory occupied by tensors in megabytes (MB) + for a given device. + + Args: + runner (Runner): The runner of the training/testing/validation + process. + + Returns: + The maximum GPU memory occupied by tensors in megabytes for a given + device. + """ + + device = getattr(runner.model, 'output_device', None) + + if is_musa_available(): + return get_max_musa_memory(device) + return get_max_cuda_memory(device) + + def _get_iter(self, runner, batch_idx: int) -> int: + """Get current iteration index. + + Args: + runner (Runner): The runner of the training/testing/validation + process. + batch_idx (int): The iteration index of current + dataloader. Defaults to None. + + Returns: + int: The current global iter or inner iter. + """ + if self.by_epoch: + current_iter = batch_idx + 1 + else: + current_iter = runner.iter + 1 + return current_iter + + def _get_epoch(self, runner, mode: str) -> int: + """Get current epoch according to mode. + + Args: + runner (Runner): The runner of the training/testing/validation + process. + mode (str): Current mode of runner. + + Returns: + int: The current epoch. + """ + if mode == 'train': + epoch = runner.epoch + 1 + elif mode == 'val': + if (isinstance(runner._train_loop, dict) + or runner._train_loop is None): + epoch = 0 + else: + # normal val mode + # runner.epoch += 1 has been done before validation + epoch = runner.epoch + else: + raise ValueError( + f"runner mode should be 'train' or 'val', but got {mode}") + return epoch + + def _get_cur_loop(self, runner, mode: str): + """Get current loop according to mode. + + Args: + runner (Runner): The runner of the training/validation/testing + process. + mode (str): Current mode of runner. 
+ + Returns: + BaseLoop: Current loop of runner. + """ + # a return type hint here would cause a circular import + if mode == 'train': + return runner.train_loop + elif mode == 'val': + return runner.val_loop + else: + return runner.test_loop + + def _get_dataloader_size(self, runner, mode) -> int: + """Get dataloader size of current loop. + + Args: + runner (Runner): The runner of the training/validation/testing + process. + mode (str): Current mode of runner. + + Returns: + int: The dataloader size of current loop. + """ + return len(self._get_cur_loop(runner=runner, mode=mode).dataloader) diff --git a/head_extractor/build/lib/mmengine/runner/loops.py b/head_extractor/build/lib/mmengine/runner/loops.py new file mode 100644 index 0000000000000000000000000000000000000000..5a678db7b9f7e57bf088c1d166536fb6dff2a838 --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/loops.py @@ -0,0 +1,550 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import bisect +import logging +import time +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +from torch.utils.data import DataLoader + +from mmengine.evaluator import Evaluator +from mmengine.logging import HistoryBuffer, print_log +from mmengine.registry import LOOPS +from mmengine.structures import BaseDataElement +from mmengine.utils import is_list_of +from .amp import autocast +from .base_loop import BaseLoop +from .utils import calc_dynamic_intervals + + +@LOOPS.register_module() +class EpochBasedTrainLoop(BaseLoop): + """Loop for epoch-based training. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): A dataloader object or a dict to + build a dataloader. + max_epochs (int): Total training epochs. + val_begin (int): The epoch that begins validating. + Defaults to 1. + val_interval (int): Validation interval. Defaults to 1. + dynamic_intervals (List[Tuple[int, int]], optional): The + first element in the tuple is a milestone and the second + element is an interval. The interval is used after the + corresponding milestone. Defaults to None. + """ + + def __init__( + self, + runner, + dataloader: Union[DataLoader, Dict], + max_epochs: int, + val_begin: int = 1, + val_interval: int = 1, + dynamic_intervals: Optional[List[Tuple[int, int]]] = None) -> None: + super().__init__(runner, dataloader) + self._max_epochs = int(max_epochs) + assert self._max_epochs == max_epochs, \ + f'`max_epochs` should be an integer number, but got {max_epochs}.' + self._max_iters = self._max_epochs * len(self.dataloader) + self._epoch = 0 + self._iter = 0 + self.val_begin = val_begin + self.val_interval = val_interval + # This attribute will be updated by `EarlyStoppingHook` + # when it is enabled. + self.stop_training = False + if hasattr(self.dataloader.dataset, 'metainfo'): + self.runner.visualizer.dataset_meta = \ + self.dataloader.dataset.metainfo + else: + print_log( + f'Dataset {self.dataloader.dataset.__class__.__name__} has no ' + 'metainfo. 
+ ``dataset_meta`` in visualizer will be ' + 'None.', + logger='current', + level=logging.WARNING) + + self.dynamic_milestones, self.dynamic_intervals = \ + calc_dynamic_intervals( + self.val_interval, dynamic_intervals) + + @property + def max_epochs(self): + """int: Total epochs to train model.""" + return self._max_epochs + + @property + def max_iters(self): + """int: Total iterations to train model.""" + return self._max_iters + + @property + def epoch(self): + """int: Current epoch.""" + return self._epoch + + @property + def iter(self): + """int: Current iteration.""" + return self._iter + + def run(self) -> torch.nn.Module: + """Launch training.""" + self.runner.call_hook('before_train') + + while self._epoch < self._max_epochs and not self.stop_training: + self.run_epoch() + + self._decide_current_val_interval() + if (self.runner.val_loop is not None + and self._epoch >= self.val_begin + and (self._epoch % self.val_interval == 0 + or self._epoch == self._max_epochs)): + self.runner.val_loop.run() + + self.runner.call_hook('after_train') + return self.runner.model + + def run_epoch(self) -> None: + """Iterate one epoch.""" + self.runner.call_hook('before_train_epoch') + self.runner.model.train() + for idx, data_batch in enumerate(self.dataloader): + self.run_iter(idx, data_batch) + + self.runner.call_hook('after_train_epoch') + self._epoch += 1 + + def run_iter(self, idx, data_batch: Sequence[dict]) -> None: + """Iterate one mini-batch. + + Args: + data_batch (Sequence[dict]): Batch of data from dataloader. + """ + self.runner.call_hook( + 'before_train_iter', batch_idx=idx, data_batch=data_batch) + # Enable gradient accumulation mode and avoid unnecessary gradient + # synchronization during gradient accumulation process. + # outputs should be a dict of loss. + outputs = self.runner.model.train_step( + data_batch, optim_wrapper=self.runner.optim_wrapper) + + self.runner.call_hook( + 'after_train_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=outputs) + self._iter += 1 + + def _decide_current_val_interval(self) -> None: + """Dynamically modify the ``val_interval``.""" + step = bisect.bisect(self.dynamic_milestones, (self.epoch + 1)) + self.val_interval = self.dynamic_intervals[step - 1] + + +class _InfiniteDataloaderIterator: + """An infinite dataloader iterator wrapper for IterBasedTrainLoop. + + It resets the dataloader to continue iterating when the iterator has + iterated over all the data. However, this approach is not efficient, as the + workers need to be restarted every time the dataloader is reset. It is + recommended to use `mmengine.dataset.InfiniteSampler` to enable the + dataloader to iterate infinitely. + """ + + def __init__(self, dataloader: DataLoader) -> None: + self._dataloader = dataloader + self._iterator = iter(self._dataloader) + self._epoch = 0 + + def __iter__(self): + return self + + def __next__(self) -> Sequence[dict]: + try: + data = next(self._iterator) + except StopIteration: + print_log( + 'Reached the end of the dataloader, it will be ' + 'restarted and continue to iterate. It is ' + 'recommended to use ' + '`mmengine.dataset.InfiniteSampler` to enable the ' + 'dataloader to iterate infinitely.', + logger='current', + level=logging.WARNING) + self._epoch += 1 + if hasattr(self._dataloader, 'sampler') and hasattr( + self._dataloader.sampler, 'set_epoch'): + # In case the `_SingleProcessDataLoaderIter` has no sampler, + # or the data loader uses `SequentialSampler` in PyTorch. 
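+                # Bumping the epoch before re-instantiating the iterator is
+                # what lets samplers such as `DistributedSampler` reshuffle
+                # with a fresh seed on every restart of the dataloader.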
+ self._dataloader.sampler.set_epoch(self._epoch) + + elif hasattr(self._dataloader, 'batch_sampler') and hasattr( + self._dataloader.batch_sampler.sampler, 'set_epoch'): + # In case the `_SingleProcessDataLoaderIter` has no batch + # sampler. The batch sampler in PyTorch wraps the sampler as + # its attribute. + self._dataloader.batch_sampler.sampler.set_epoch(self._epoch) + time.sleep(2) # Prevent possible deadlock during epoch transition + self._iterator = iter(self._dataloader) + data = next(self._iterator) + return data + + +@LOOPS.register_module() +class IterBasedTrainLoop(BaseLoop): + """Loop for iter-based training. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): A dataloader object or a dict to + build a dataloader. + max_iters (int): Total training iterations. + val_begin (int): The iteration that begins validating. + Defaults to 1. + val_interval (int): Validation interval. Defaults to 1000. + dynamic_intervals (List[Tuple[int, int]], optional): The + first element in the tuple is a milestone and the second + element is an interval. The interval is used after the + corresponding milestone. Defaults to None. + """ + + def __init__( + self, + runner, + dataloader: Union[DataLoader, Dict], + max_iters: int, + val_begin: int = 1, + val_interval: int = 1000, + dynamic_intervals: Optional[List[Tuple[int, int]]] = None) -> None: + super().__init__(runner, dataloader) + self._max_iters = int(max_iters) + assert self._max_iters == max_iters, \ + f'`max_iters` should be an integer number, but got {max_iters}' + self._max_epochs = 1 # for compatibility with EpochBasedTrainLoop + self._epoch = 0 + self._iter = 0 + self.val_begin = val_begin + self.val_interval = val_interval + # This attribute will be updated by `EarlyStoppingHook` + # when it is enabled. + self.stop_training = False + if hasattr(self.dataloader.dataset, 'metainfo'): + self.runner.visualizer.dataset_meta = \ + self.dataloader.dataset.metainfo + else: + print_log( + f'Dataset {self.dataloader.dataset.__class__.__name__} has no ' + 'metainfo. ``dataset_meta`` in visualizer will be ' + 'None.', + logger='current', + level=logging.WARNING) + # get the iterator of the dataloader + self.dataloader_iterator = _InfiniteDataloaderIterator(self.dataloader) + + self.dynamic_milestones, self.dynamic_intervals = \ + calc_dynamic_intervals( + self.val_interval, dynamic_intervals) + + @property + def max_epochs(self): + """int: Total epochs to train model.""" + return self._max_epochs + + @property + def max_iters(self): + """int: Total iterations to train model.""" + return self._max_iters + + @property + def epoch(self): + """int: Current epoch.""" + return self._epoch + + @property + def iter(self): + """int: Current iteration.""" + return self._iter + + def run(self) -> torch.nn.Module: + """Launch training.""" + self.runner.call_hook('before_train') + # In iteration-based training loop, we treat the whole training process + # as a big epoch and execute the corresponding hook. 
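+        # Hence `before_train_epoch` fires exactly once here and
+        # `after_train_epoch` fires exactly once after the `while` loop
+        # below, regardless of how many iterations are run in between.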
+ self.runner.call_hook('before_train_epoch') + if self._iter > 0: + print_log( + f'Advance dataloader {self._iter} steps to skip data ' + 'that has already been trained', + logger='current', + level=logging.WARNING) + for _ in range(self._iter): + next(self.dataloader_iterator) + while self._iter < self._max_iters and not self.stop_training: + self.runner.model.train() + + data_batch = next(self.dataloader_iterator) + self.run_iter(data_batch) + + self._decide_current_val_interval() + if (self.runner.val_loop is not None + and self._iter >= self.val_begin + and (self._iter % self.val_interval == 0 + or self._iter == self._max_iters)): + self.runner.val_loop.run() + + self.runner.call_hook('after_train_epoch') + self.runner.call_hook('after_train') + return self.runner.model + + def run_iter(self, data_batch: Sequence[dict]) -> None: + """Iterate one mini-batch. + + Args: + data_batch (Sequence[dict]): Batch of data from dataloader. + """ + self.runner.call_hook( + 'before_train_iter', batch_idx=self._iter, data_batch=data_batch) + # Enable gradient accumulation mode and avoid unnecessary gradient + # synchronization during gradient accumulation process. + # outputs should be a dict of loss. + outputs = self.runner.model.train_step( + data_batch, optim_wrapper=self.runner.optim_wrapper) + + self.runner.call_hook( + 'after_train_iter', + batch_idx=self._iter, + data_batch=data_batch, + outputs=outputs) + self._iter += 1 + + def _decide_current_val_interval(self) -> None: + """Dynamically modify the ``val_interval``.""" + step = bisect.bisect(self.dynamic_milestones, (self._iter + 1)) + self.val_interval = self.dynamic_intervals[step - 1] + + +@LOOPS.register_module() +class ValLoop(BaseLoop): + """Loop for validation. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): A dataloader object or a dict to + build a dataloader. + evaluator (Evaluator or dict or list): Used for computing metrics. + fp16 (bool): Whether to enable fp16 validation. Defaults to + False. + """ + + def __init__(self, + runner, + dataloader: Union[DataLoader, Dict], + evaluator: Union[Evaluator, Dict, List], + fp16: bool = False) -> None: + super().__init__(runner, dataloader) + + if isinstance(evaluator, (dict, list)): + self.evaluator = runner.build_evaluator(evaluator) # type: ignore + else: + assert isinstance(evaluator, Evaluator), ( + 'evaluator must be one of dict, list or Evaluator instance, ' + f'but got {type(evaluator)}.') + self.evaluator = evaluator # type: ignore + if hasattr(self.dataloader.dataset, 'metainfo'): + self.evaluator.dataset_meta = self.dataloader.dataset.metainfo + self.runner.visualizer.dataset_meta = \ + self.dataloader.dataset.metainfo + else: + print_log( + f'Dataset {self.dataloader.dataset.__class__.__name__} has no ' + 'metainfo. 
``dataset_meta`` in evaluator, metric and ' + 'visualizer will be None.', + logger='current', + level=logging.WARNING) + self.fp16 = fp16 + self.val_loss: Dict[str, HistoryBuffer] = dict() + + def run(self) -> dict: + """Launch validation.""" + self.runner.call_hook('before_val') + self.runner.call_hook('before_val_epoch') + self.runner.model.eval() + + # clear val loss + self.val_loss.clear() + for idx, data_batch in enumerate(self.dataloader): + self.run_iter(idx, data_batch) + + # compute metrics + metrics = self.evaluator.evaluate(len(self.dataloader.dataset)) + + if self.val_loss: + loss_dict = _parse_losses(self.val_loss, 'val') + metrics.update(loss_dict) + + self.runner.call_hook('after_val_epoch', metrics=metrics) + self.runner.call_hook('after_val') + return metrics + + @torch.no_grad() + def run_iter(self, idx, data_batch: Sequence[dict]): + """Iterate one mini-batch. + + Args: + data_batch (Sequence[dict]): Batch of data + from dataloader. + """ + self.runner.call_hook( + 'before_val_iter', batch_idx=idx, data_batch=data_batch) + # outputs should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + outputs = self.runner.model.val_step(data_batch) + + outputs, self.val_loss = _update_losses(outputs, self.val_loss) + + self.evaluator.process(data_samples=outputs, data_batch=data_batch) + self.runner.call_hook( + 'after_val_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=outputs) + + +@LOOPS.register_module() +class TestLoop(BaseLoop): + """Loop for test. + + Args: + runner (Runner): A reference of runner. + dataloader (Dataloader or dict): A dataloader object or a dict to + build a dataloader. + evaluator (Evaluator or dict or list): Used for computing metrics. + fp16 (bool): Whether to enable fp16 testing. Defaults to + False. + """ + + def __init__(self, + runner, + dataloader: Union[DataLoader, Dict], + evaluator: Union[Evaluator, Dict, List], + fp16: bool = False): + super().__init__(runner, dataloader) + + if isinstance(evaluator, dict) or isinstance(evaluator, list): + self.evaluator = runner.build_evaluator(evaluator) # type: ignore + else: + self.evaluator = evaluator # type: ignore + if hasattr(self.dataloader.dataset, 'metainfo'): + self.evaluator.dataset_meta = self.dataloader.dataset.metainfo + self.runner.visualizer.dataset_meta = \ + self.dataloader.dataset.metainfo + else: + print_log( + f'Dataset {self.dataloader.dataset.__class__.__name__} has no ' + 'metainfo. ``dataset_meta`` in evaluator, metric and ' + 'visualizer will be None.', + logger='current', + level=logging.WARNING) + self.fp16 = fp16 + self.test_loss: Dict[str, HistoryBuffer] = dict() + + def run(self) -> dict: + """Launch test.""" + self.runner.call_hook('before_test') + self.runner.call_hook('before_test_epoch') + self.runner.model.eval() + + # clear test loss + self.test_loss.clear() + for idx, data_batch in enumerate(self.dataloader): + self.run_iter(idx, data_batch) + + # compute metrics + metrics = self.evaluator.evaluate(len(self.dataloader.dataset)) + + if self.test_loss: + loss_dict = _parse_losses(self.test_loss, 'test') + metrics.update(loss_dict) + + self.runner.call_hook('after_test_epoch', metrics=metrics) + self.runner.call_hook('after_test') + return metrics + + @torch.no_grad() + def run_iter(self, idx, data_batch: Sequence[dict]) -> None: + """Iterate one mini-batch. + + Args: + data_batch (Sequence[dict]): Batch of data from dataloader. 
+ """ + self.runner.call_hook( + 'before_test_iter', batch_idx=idx, data_batch=data_batch) + # predictions should be sequence of BaseDataElement + with autocast(enabled=self.fp16): + outputs = self.runner.model.test_step(data_batch) + + outputs, self.test_loss = _update_losses(outputs, self.test_loss) + + self.evaluator.process(data_samples=outputs, data_batch=data_batch) + self.runner.call_hook( + 'after_test_iter', + batch_idx=idx, + data_batch=data_batch, + outputs=outputs) + + +def _parse_losses(losses: Dict[str, HistoryBuffer], + stage: str) -> Dict[str, float]: + """Parses the raw losses of the network. + + Args: + losses (dict): raw losses of the network. + stage (str): The stage of loss, e.g., 'val' or 'test'. + + Returns: + dict[str, float]: The key is the loss name, and the value is the + average loss. + """ + all_loss = 0 + loss_dict: Dict[str, float] = dict() + + for loss_name, loss_value in losses.items(): + avg_loss = loss_value.mean() + loss_dict[loss_name] = avg_loss + if 'loss' in loss_name: + all_loss += avg_loss + + loss_dict[f'{stage}_loss'] = all_loss + return loss_dict + + +def _update_losses(outputs: list, losses: dict) -> Tuple[list, dict]: + """Update and record the losses of the network. + + Args: + outputs (list): The outputs of the network. + losses (dict): The losses of the network. + + Returns: + list: The updated outputs of the network. + dict: The updated losses of the network. + """ + if isinstance(outputs[-1], + BaseDataElement) and outputs[-1].keys() == ['loss']: + loss = outputs[-1].loss # type: ignore + outputs = outputs[:-1] + else: + loss = dict() + + for loss_name, loss_value in loss.items(): + if loss_name not in losses: + losses[loss_name] = HistoryBuffer() + if isinstance(loss_value, torch.Tensor): + losses[loss_name].update(loss_value.item()) + elif is_list_of(loss_value, torch.Tensor): + for loss_value_i in loss_value: + losses[loss_name].update(loss_value_i.item()) + return outputs, losses diff --git a/head_extractor/build/lib/mmengine/runner/priority.py b/head_extractor/build/lib/mmengine/runner/priority.py new file mode 100644 index 0000000000000000000000000000000000000000..ff644043b810c49dbe673e2ba5e35900650c3f02 --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/priority.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import Enum +from typing import Union + + +class Priority(Enum): + """Hook priority levels. + + +--------------+------------+ + | Level | Value | + +==============+============+ + | HIGHEST | 0 | + +--------------+------------+ + | VERY_HIGH | 10 | + +--------------+------------+ + | HIGH | 30 | + +--------------+------------+ + | ABOVE_NORMAL | 40 | + +--------------+------------+ + | NORMAL | 50 | + +--------------+------------+ + | BELOW_NORMAL | 60 | + +--------------+------------+ + | LOW | 70 | + +--------------+------------+ + | VERY_LOW | 90 | + +--------------+------------+ + | LOWEST | 100 | + +--------------+------------+ + """ + + HIGHEST = 0 + VERY_HIGH = 10 + HIGH = 30 + ABOVE_NORMAL = 40 + NORMAL = 50 + BELOW_NORMAL = 60 + LOW = 70 + VERY_LOW = 90 + LOWEST = 100 + + +def get_priority(priority: Union[int, str, Priority]) -> int: + """Get priority value. + + Args: + priority (int or str or :obj:`Priority`): Priority. + + Returns: + int: The priority value. 
+ """ + if isinstance(priority, int): + if priority < 0 or priority > 100: + raise ValueError('priority must be between 0 and 100') + return priority + elif isinstance(priority, Priority): + return priority.value + elif isinstance(priority, str): + return Priority[priority.upper()].value + else: + raise TypeError('priority must be an integer or Priority enum value') diff --git a/head_extractor/build/lib/mmengine/runner/runner.py b/head_extractor/build/lib/mmengine/runner/runner.py new file mode 100644 index 0000000000000000000000000000000000000000..68716ab2538d58ee73db1e9615d9d4f79d484e35 --- /dev/null +++ b/head_extractor/build/lib/mmengine/runner/runner.py @@ -0,0 +1,2413 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging +import os +import os.path as osp +import pickle +import platform +import time +import warnings +from collections import OrderedDict +from functools import partial +from typing import Callable, Dict, List, Optional, Sequence, Union + +import torch +import torch.nn as nn +from torch.nn.parallel.distributed import DistributedDataParallel +from torch.optim import Optimizer +from torch.utils.data import DataLoader + +import mmengine +from mmengine.config import Config, ConfigDict +from mmengine.dataset import worker_init_fn as default_worker_init_fn +from mmengine.device import get_device +from mmengine.dist import (broadcast, get_dist_info, get_rank, get_world_size, + init_dist, is_distributed, master_only) +from mmengine.evaluator import Evaluator +from mmengine.fileio import FileClient, join_path +from mmengine.hooks import Hook +from mmengine.logging import MessageHub, MMLogger, print_log +from mmengine.model import (MMDistributedDataParallel, convert_sync_batchnorm, + is_model_wrapper, revert_sync_batchnorm) +from mmengine.model.efficient_conv_bn_eval import \ + turn_on_efficient_conv_bn_eval +from mmengine.optim import (OptimWrapper, OptimWrapperDict, _ParamScheduler, + build_optim_wrapper) +from mmengine.registry import (DATA_SAMPLERS, DATASETS, EVALUATOR, FUNCTIONS, + HOOKS, LOG_PROCESSORS, LOOPS, MODEL_WRAPPERS, + MODELS, OPTIM_WRAPPERS, PARAM_SCHEDULERS, + RUNNERS, VISUALIZERS, DefaultScope) +from mmengine.utils import apply_to, digit_version, get_git_hash, is_seq_of +from mmengine.utils.dl_utils import (TORCH_VERSION, collect_env, + set_multi_processing) +from mmengine.visualization import Visualizer +from .activation_checkpointing import turn_on_activation_checkpointing +from .base_loop import BaseLoop +from .checkpoint import (_load_checkpoint, _load_checkpoint_to_model, + find_latest_checkpoint, save_checkpoint, + weights_to_cpu) +from .log_processor import LogProcessor +from .loops import EpochBasedTrainLoop, IterBasedTrainLoop, TestLoop, ValLoop +from .priority import Priority, get_priority +from .utils import _get_batch_size, set_random_seed + +ConfigType = Union[Dict, Config, ConfigDict] +ParamSchedulerType = Union[List[_ParamScheduler], Dict[str, + List[_ParamScheduler]]] +OptimWrapperType = Union[OptimWrapper, OptimWrapperDict] + + +class _SlicedDataset: + + def __init__(self, dataset, length) -> None: + self._dataset = dataset + self._length = length + + def __getattr__(self, name): + return getattr(self._dataset, name) + + def __getitem__(self, idx): + return self._dataset[idx] + + def __len__(self): + return self._length + + +@RUNNERS.register_module() +class Runner: + """A training helper for PyTorch. 
+ + Runner object can be built from config by ``runner = Runner.from_cfg(cfg)`` + where the ``cfg`` usually contains training, validation, and test-related + configurations to build corresponding components. We usually use the + same config to launch training, testing, and validation tasks. However, + only some of these components are necessary at the same time, e.g., + testing a model does not need training or validation-related components. + + To avoid repeatedly modifying config, the construction of ``Runner`` adopts + lazy initialization to only initialize components when they are going to be + used. Therefore, the model is always initialized at the beginning, and + training, validation, and testing related components are only initialized + when calling ``runner.train()``, ``runner.val()``, and ``runner.test()``, + respectively. + + Args: + model (:obj:`torch.nn.Module` or dict): The model to be run. It can be + a dict used to build a model. + work_dir (str): The working directory to save checkpoints. The logs + will be saved in the subdirectory of `work_dir` named + :attr:`timestamp`. + train_dataloader (Dataloader or dict, optional): A dataloader object or + a dict to build a dataloader. If ``None`` is given, it means + skipping training steps. Defaults to None. + See :meth:`build_dataloader` for more details. + val_dataloader (Dataloader or dict, optional): A dataloader object or + a dict to build a dataloader. If ``None`` is given, it means + skipping validation steps. Defaults to None. + See :meth:`build_dataloader` for more details. + test_dataloader (Dataloader or dict, optional): A dataloader object or + a dict to build a dataloader. If ``None`` is given, it means + skipping test steps. Defaults to None. + See :meth:`build_dataloader` for more details. + train_cfg (dict, optional): A dict to build a training loop. If it does + not provide the "type" key, it should contain "by_epoch" to decide + which type of training loop :class:`EpochBasedTrainLoop` or + :class:`IterBasedTrainLoop` should be used. If ``train_cfg`` is + specified, :attr:`train_dataloader` should also be specified. + Defaults to None. See :meth:`build_train_loop` for more details. + val_cfg (dict, optional): A dict to build a validation loop. If it does + not provide the "type" key, :class:`ValLoop` will be used by default. + If ``val_cfg`` is specified, :attr:`val_dataloader` should also be + specified. If ``ValLoop`` is built with ``fp16=True``, + ``runner.val()`` will be performed under fp16 precision. + Defaults to None. See :meth:`build_val_loop` for more details. + test_cfg (dict, optional): A dict to build a test loop. If it does + not provide the "type" key, :class:`TestLoop` will be used by default. + If ``test_cfg`` is specified, :attr:`test_dataloader` should also be + specified. If ``TestLoop`` is built with ``fp16=True``, + ``runner.test()`` will be performed under fp16 precision. + Defaults to None. See :meth:`build_test_loop` for more details. + auto_scale_lr (dict, optional): Config to scale the learning rate + automatically. It includes ``base_batch_size`` and ``enable``. + ``base_batch_size`` is the batch size that the optimizer lr is + based on. ``enable`` is the switch to turn on and off the feature. + optim_wrapper (OptimWrapper or dict, optional): + Computing gradient of model parameters. If specified, + :attr:`train_dataloader` should also be specified. If automatic + mixed precision or gradient accumulation + training is required, the type of ``optim_wrapper`` should be + ``AmpOptimWrapper``. 
See :meth:`build_optim_wrapper` for + examples. Defaults to None. + param_scheduler (_ParamScheduler or dict or list, optional): + Parameter scheduler for updating optimizer parameters. If + specified, :attr:`optimizer` should also be specified. + Defaults to None. + See :meth:`build_param_scheduler` for examples. + val_evaluator (Evaluator or dict or list, optional): An evaluator object + used for computing metrics for validation. It can be a dict or a + list of dicts to build an evaluator. If specified, + :attr:`val_dataloader` should also be specified. Defaults to None. + test_evaluator (Evaluator or dict or list, optional): An evaluator + object used for computing metrics for test steps. It can be a dict + or a list of dicts to build an evaluator. If specified, + :attr:`test_dataloader` should also be specified. Defaults to None. + default_hooks (dict[str, dict] or dict[str, Hook], optional): Hooks to + execute default actions like updating model parameters and saving + checkpoints. Default hooks are ``OptimizerHook``, + ``IterTimerHook``, ``LoggerHook``, ``ParamSchedulerHook`` and + ``CheckpointHook``. Defaults to None. + See :meth:`register_default_hooks` for more details. + custom_hooks (list[dict] or list[Hook], optional): Hooks to execute + custom actions like visualizing images processed by pipeline. + Defaults to None. + data_preprocessor (dict, optional): The pre-process config of + :class:`BaseDataPreprocessor`. If the ``model`` argument is a dict + and doesn't contain the key ``data_preprocessor``, set the argument + as the ``data_preprocessor`` of the ``model`` dict. + Defaults to None. + load_from (str, optional): The checkpoint file to load from. + Defaults to None. + resume (bool): Whether to resume training. Defaults to False. If + ``resume`` is True and ``load_from`` is None, the latest checkpoint + in ``work_dir`` will be found and used automatically. If not found, + resuming does nothing. + launcher (str): Way to launch multi-process jobs. Supported launchers + are 'pytorch', 'mpi', 'slurm' and 'none'. If 'none' is provided, + a non-distributed environment will be launched. + env_cfg (dict): A dict used for setting environment. Defaults to + dict(dist_cfg=dict(backend='nccl')). + log_processor (dict, optional): A processor to format logs. Defaults to + None. + log_level (int or str): The log level of MMLogger handlers. + Defaults to 'INFO'. + visualizer (Visualizer or dict, optional): A Visualizer object or a + dict to build a Visualizer object. Defaults to None. If not + specified, default config will be used. + default_scope (str): Used to reset registries location. + Defaults to "mmengine". + randomness (dict): Some settings to make the experiment as reproducible + as possible, like seed and deterministic. + Defaults to ``dict(seed=None)``. If seed is None, a random number + will be generated and it will be broadcast to all other processes + if in a distributed environment. If ``cudnn_benchmark`` is + ``True`` in ``env_cfg`` but ``deterministic`` is ``True`` in + ``randomness``, the value of ``torch.backends.cudnn.benchmark`` + will be ``False`` finally. + experiment_name (str, optional): Name of current experiment. If not + specified, timestamp will be used as ``experiment_name``. + Defaults to None. + cfg (dict or ConfigDict or :obj:`Config`, optional): Full config. + Defaults to None. + + Note: + Since PyTorch 2.0.0, you can enable ``torch.compile`` by passing in + `cfg.compile = True`. If you want to control compile options, you + can pass a dict, e.g. ``cfg.compile = dict(backend='eager')``. 
+ Refer to `PyTorch API Documentation `_ for more valid + options. + + Examples: + >>> from mmengine.runner import Runner + >>> cfg = dict( + >>> model=dict(type='ToyModel'), + >>> work_dir='path/of/work_dir', + >>> train_dataloader=dict( + >>> dataset=dict(type='ToyDataset'), + >>> sampler=dict(type='DefaultSampler', shuffle=True), + >>> batch_size=1, + >>> num_workers=0), + >>> val_dataloader=dict( + >>> dataset=dict(type='ToyDataset'), + >>> sampler=dict(type='DefaultSampler', shuffle=False), + >>> batch_size=1, + >>> num_workers=0), + >>> test_dataloader=dict( + >>> dataset=dict(type='ToyDataset'), + >>> sampler=dict(type='DefaultSampler', shuffle=False), + >>> batch_size=1, + >>> num_workers=0), + >>> auto_scale_lr=dict(base_batch_size=16, enable=False), + >>> optim_wrapper=dict(type='OptimizerWrapper', optimizer=dict( + >>> type='SGD', lr=0.01)), + >>> param_scheduler=dict(type='MultiStepLR', milestones=[1, 2]), + >>> val_evaluator=dict(type='ToyEvaluator'), + >>> test_evaluator=dict(type='ToyEvaluator'), + >>> train_cfg=dict(by_epoch=True, max_epochs=3, val_interval=1), + >>> val_cfg=dict(), + >>> test_cfg=dict(), + >>> custom_hooks=[], + >>> default_hooks=dict( + >>> timer=dict(type='IterTimerHook'), + >>> checkpoint=dict(type='CheckpointHook', interval=1), + >>> logger=dict(type='LoggerHook'), + >>> optimizer=dict(type='OptimizerHook', grad_clip=False), + >>> param_scheduler=dict(type='ParamSchedulerHook')), + >>> launcher='none', + >>> env_cfg=dict(dist_cfg=dict(backend='nccl')), + >>> log_processor=dict(window_size=20), + >>> visualizer=dict(type='Visualizer', + >>> vis_backends=[dict(type='LocalVisBackend', + >>> save_dir='temp_dir')]) + >>> ) + >>> runner = Runner.from_cfg(cfg) + >>> runner.train() + >>> runner.test() + """ + cfg: Config + _train_loop: Optional[Union[BaseLoop, Dict]] + _val_loop: Optional[Union[BaseLoop, Dict]] + _test_loop: Optional[Union[BaseLoop, Dict]] + + def __init__( + self, + model: Union[nn.Module, Dict], + work_dir: str, + train_dataloader: Optional[Union[DataLoader, Dict]] = None, + val_dataloader: Optional[Union[DataLoader, Dict]] = None, + test_dataloader: Optional[Union[DataLoader, Dict]] = None, + train_cfg: Optional[Dict] = None, + val_cfg: Optional[Dict] = None, + test_cfg: Optional[Dict] = None, + auto_scale_lr: Optional[Dict] = None, + optim_wrapper: Optional[Union[OptimWrapper, Dict]] = None, + param_scheduler: Optional[Union[_ParamScheduler, Dict, List]] = None, + val_evaluator: Optional[Union[Evaluator, Dict, List]] = None, + test_evaluator: Optional[Union[Evaluator, Dict, List]] = None, + default_hooks: Optional[Dict[str, Union[Hook, Dict]]] = None, + custom_hooks: Optional[List[Union[Hook, Dict]]] = None, + data_preprocessor: Union[nn.Module, Dict, None] = None, + load_from: Optional[str] = None, + resume: bool = False, + launcher: str = 'none', + env_cfg: Dict = dict(dist_cfg=dict(backend='nccl')), + log_processor: Optional[Dict] = None, + log_level: str = 'INFO', + visualizer: Optional[Union[Visualizer, Dict]] = None, + default_scope: str = 'mmengine', + randomness: Dict = dict(seed=None), + experiment_name: Optional[str] = None, + cfg: Optional[ConfigType] = None, + ): + self._work_dir = osp.abspath(work_dir) + mmengine.mkdir_or_exist(self._work_dir) + + # recursively copy the `cfg` because `self.cfg` will be modified + # everywhere. 
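+        # Sketch of the three accepted forms (illustrative only):
+        #   Runner(..., cfg=Config(dict(...)))  # deep-copied below
+        #   Runner(..., cfg=dict(...))          # wrapped into a Config
+        #   Runner(..., cfg=None)               # replaced by an empty Config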
+ if cfg is not None: + if isinstance(cfg, Config): + self.cfg = copy.deepcopy(cfg) + elif isinstance(cfg, dict): + self.cfg = Config(cfg) + else: + self.cfg = Config(dict()) + + # lazy initialization + training_related = [train_dataloader, train_cfg, optim_wrapper] + if not (all(item is None for item in training_related) + or all(item is not None for item in training_related)): + raise ValueError( + 'train_dataloader, train_cfg, and optim_wrapper should be ' + 'either all None or not None, but got ' + f'train_dataloader={train_dataloader}, ' + f'train_cfg={train_cfg}, ' + f'optim_wrapper={optim_wrapper}.') + self._train_dataloader = train_dataloader + self._train_loop = train_cfg + + self.optim_wrapper: Optional[Union[OptimWrapper, dict]] + self.optim_wrapper = optim_wrapper + + self.auto_scale_lr = auto_scale_lr + + # If there is no need to adjust learning rate, momentum or other + # parameters of optimizer, param_scheduler can be None + if param_scheduler is not None and self.optim_wrapper is None: + raise ValueError( + 'param_scheduler should be None when optim_wrapper is None, ' + f'but got {param_scheduler}') + + # Parse `param_scheduler` to a list or a dict. If `optim_wrapper` is a + # `dict` with single optimizer, parsed param_scheduler will be a + # list of parameter schedulers. If `optim_wrapper` is + # a `dict` with multiple optimizers, parsed `param_scheduler` will be + # dict with multiple list of parameter schedulers. + self._check_scheduler_cfg(param_scheduler) + self.param_schedulers = param_scheduler + + val_related = [val_dataloader, val_cfg, val_evaluator] + if not (all(item is None + for item in val_related) or all(item is not None + for item in val_related)): + raise ValueError( + 'val_dataloader, val_cfg, and val_evaluator should be either ' + 'all None or not None, but got ' + f'val_dataloader={val_dataloader}, val_cfg={val_cfg}, ' + f'val_evaluator={val_evaluator}') + self._val_dataloader = val_dataloader + self._val_loop = val_cfg + self._val_evaluator = val_evaluator + + test_related = [test_dataloader, test_cfg, test_evaluator] + if not (all(item is None for item in test_related) + or all(item is not None for item in test_related)): + raise ValueError( + 'test_dataloader, test_cfg, and test_evaluator should be ' + 'either all None or not None, but got ' + f'test_dataloader={test_dataloader}, test_cfg={test_cfg}, ' + f'test_evaluator={test_evaluator}') + self._test_dataloader = test_dataloader + self._test_loop = test_cfg + self._test_evaluator = test_evaluator + + self._launcher = launcher + if self._launcher == 'none': + self._distributed = False + else: + self._distributed = True + + # self._timestamp will be set in the `setup_env` method. Besides, + # it also will initialize multi-process and (or) distributed + # environment. + self.setup_env(env_cfg) + # self._deterministic and self._seed will be set in the + # `set_randomness`` method + self._randomness_cfg = randomness + self.set_randomness(**randomness) + + if experiment_name is not None: + self._experiment_name = f'{experiment_name}_{self._timestamp}' + elif self.cfg.filename is not None: + filename_no_ext = osp.splitext(osp.basename(self.cfg.filename))[0] + self._experiment_name = f'{filename_no_ext}_{self._timestamp}' + else: + self._experiment_name = self.timestamp + self._log_dir = osp.join(self.work_dir, self.timestamp) + mmengine.mkdir_or_exist(self._log_dir) + # Used to reset registries location. See :meth:`Registry.build` for + # more details. 
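+        # Illustrative note: with the default scope 'mmengine', string types
+        # in configs (e.g. dict(type='LoggerHook')) are resolved against
+        # mmengine's own registries unless a downstream scope takes over.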
+ if default_scope is not None: + default_scope = DefaultScope.get_instance( # type: ignore + self._experiment_name, + scope_name=default_scope) + self.default_scope = default_scope + + # Build log processor to format message. + log_processor = dict() if log_processor is None else log_processor + self.log_processor = self.build_log_processor(log_processor) + # Since `get_instance` could return any subclass of ManagerMixin, the + # corresponding attribute needs a type hint. + self.logger = self.build_logger(log_level=log_level) + + # Collect and log environment information. + self._log_env(env_cfg) + + # Build `message_hub` for communication among components. + # `message_hub` can store log scalars (loss, learning rate) and + # runtime information (iter and epoch). Those components that do not + # have access to the runner can get iteration or epoch information + # from `message_hub`. For example, models can get the latest created + # `message_hub` by + # `self.message_hub=MessageHub.get_current_instance()` and then get + # current epoch by `cur_epoch = self.message_hub.get_info('epoch')`. + # See `MessageHub` and `ManagerMixin` for more details. + self.message_hub = self.build_message_hub() + # visualizer used for writing log or visualizing all kinds of data + self.visualizer = self.build_visualizer(visualizer) + if self.cfg: + self.visualizer.add_config(self.cfg) + + self._load_from = load_from + self._resume = resume + # flag to mark whether checkpoint has been loaded or resumed + self._has_loaded = False + + # build a model + if isinstance(model, dict) and data_preprocessor is not None: + # Merge the data_preprocessor to model config. + model.setdefault('data_preprocessor', data_preprocessor) + self.model = self.build_model(model) + # wrap model + self.model = self.wrap_model( + self.cfg.get('model_wrapper_cfg'), self.model) + + # get model name from the model class + if hasattr(self.model, 'module'): + self._model_name = self.model.module.__class__.__name__ + else: + self._model_name = self.model.__class__.__name__ + + self._hooks: List[Hook] = [] + # register hooks to `self._hooks` + self.register_hooks(default_hooks, custom_hooks) + # log hooks information + self.logger.info(f'Hooks will be executed in the following ' + f'order:\n{self.get_hooks_info()}') + + # dump `cfg` to `work_dir` + self.dump_config() + + @classmethod + def from_cfg(cls, cfg: ConfigType) -> 'Runner': + """Build a runner from config. + + Args: + cfg (ConfigType): A config used for building runner. See + :meth:`__init__` for the accepted keys of ``cfg``. + + Returns: + Runner: A runner built from ``cfg``. 
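+
+        Examples:
+            >>> # Minimal sketch; 'ToyModel' is the placeholder type used in
+            >>> # the class docstring above, not a real registered model.
+            >>> runner = Runner.from_cfg(
+            ...     dict(model=dict(type='ToyModel'), work_dir='./work_dir'))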
+ """ + cfg = copy.deepcopy(cfg) + runner = cls( + model=cfg['model'], + work_dir=cfg['work_dir'], + train_dataloader=cfg.get('train_dataloader'), + val_dataloader=cfg.get('val_dataloader'), + test_dataloader=cfg.get('test_dataloader'), + train_cfg=cfg.get('train_cfg'), + val_cfg=cfg.get('val_cfg'), + test_cfg=cfg.get('test_cfg'), + auto_scale_lr=cfg.get('auto_scale_lr'), + optim_wrapper=cfg.get('optim_wrapper'), + param_scheduler=cfg.get('param_scheduler'), + val_evaluator=cfg.get('val_evaluator'), + test_evaluator=cfg.get('test_evaluator'), + default_hooks=cfg.get('default_hooks'), + custom_hooks=cfg.get('custom_hooks'), + data_preprocessor=cfg.get('data_preprocessor'), + load_from=cfg.get('load_from'), + resume=cfg.get('resume', False), + launcher=cfg.get('launcher', 'none'), + env_cfg=cfg.get('env_cfg', dict(dist_cfg=dict(backend='nccl'))), + log_processor=cfg.get('log_processor'), + log_level=cfg.get('log_level', 'INFO'), + visualizer=cfg.get('visualizer'), + default_scope=cfg.get('default_scope', 'mmengine'), + randomness=cfg.get('randomness', dict(seed=None)), + experiment_name=cfg.get('experiment_name'), + cfg=cfg, + ) + + return runner + + @property + def experiment_name(self): + """str: Name of experiment.""" + return self._experiment_name + + @property + def model_name(self): + """str: Name of the model, usually the module class name.""" + return self._model_name + + @property + def work_dir(self): + """str: The working directory to save checkpoints and logs.""" + return self._work_dir + + @property + def log_dir(self): + return self._log_dir + + @property + def max_epochs(self): + """int: Total epochs to train model.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.max_epochs + else: + return 0 + + @property + def max_iters(self): + """int: Total iterations to train model.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.max_iters + else: + return 0 + + @property + def epoch(self): + """int: Current epoch.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.epoch + else: + return 0 + + @property + def iter(self): + """int: Current iteration.""" + if isinstance(self.train_loop, BaseLoop): + return self.train_loop.iter + else: + return 0 + + @property + def launcher(self): + """str: Way to launcher multi processes.""" + return self._launcher + + @property + def distributed(self): + """bool: Whether current environment is distributed.""" + return self._distributed + + @property + def rank(self): + """int: Rank of current process.""" + return self._rank + + @property + def world_size(self): + """int: Number of processes participating in the job.""" + return self._world_size + + @property + def deterministic(self): + """int: Whether cudnn to select deterministic algorithms.""" + return self._deterministic + + @property + def seed(self): + """int: A number to set random modules.""" + return self._seed + + @property + def timestamp(self): + """str: Timestamp when creating experiment.""" + return self._timestamp + + @property + def hooks(self): + """list[:obj:`Hook`]: A list of registered hooks.""" + return self._hooks + + @property + def train_loop(self): + """:obj:`BaseLoop`: A loop to run training.""" + if isinstance(self._train_loop, BaseLoop) or self._train_loop is None: + return self._train_loop + else: + self._train_loop = self.build_train_loop(self._train_loop) + return self._train_loop + + @property + def val_loop(self): + """:obj:`BaseLoop`: A loop to run validation.""" + if isinstance(self._val_loop, BaseLoop) 
or self._val_loop is None: + return self._val_loop + else: + self._val_loop = self.build_val_loop(self._val_loop) + return self._val_loop + + @property + def test_loop(self): + """:obj:`BaseLoop`: A loop to run testing.""" + if isinstance(self._test_loop, BaseLoop) or self._test_loop is None: + return self._test_loop + else: + self._test_loop = self.build_test_loop(self._test_loop) + return self._test_loop + + @property + def train_dataloader(self): + """The data loader for training.""" + return self.train_loop.dataloader + + @property + def val_dataloader(self): + """The data loader for validation.""" + return self.val_loop.dataloader + + @property + def test_dataloader(self): + """The data loader for testing.""" + return self.test_loop.dataloader + + @property + def val_evaluator(self): + """:obj:`Evaluator`: An evaluator for validation.""" + return self.val_loop.evaluator + + @property + def test_evaluator(self): + """:obj:`Evaluator`: An evaluator for testing.""" + return self.test_loop.evaluator + + @property + def val_interval(self): + """int: Interval to run validation during training.""" + return self.train_loop.val_interval + + @property + def val_begin(self): + """int: The epoch/iteration to start running validation during + training.""" + return self.train_loop.val_begin + + def setup_env(self, env_cfg: Dict) -> None: + """Setup environment. + + An example of ``env_cfg``:: + + env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict( + mp_start_method='fork', + opencv_num_threads=0 + ), + dist_cfg=dict(backend='nccl', timeout=1800), + resource_limit=4096 + ) + + Args: + env_cfg (dict): Config for setting environment. + """ + if env_cfg.get('cudnn_benchmark'): + torch.backends.cudnn.benchmark = True + + mp_cfg: dict = env_cfg.get('mp_cfg', {}) + set_multi_processing(**mp_cfg, distributed=self.distributed) + + # init distributed env first, since logger depends on the dist info. + if self.distributed and not is_distributed(): + dist_cfg: dict = env_cfg.get('dist_cfg', {}) + init_dist(self.launcher, **dist_cfg) + + self._rank, self._world_size = get_dist_info() + + timestamp = torch.tensor(time.time(), dtype=torch.float64) + # broadcast timestamp from 0 process to other processes + broadcast(timestamp) + self._timestamp = time.strftime('%Y%m%d_%H%M%S', + time.localtime(timestamp.item())) + + # https://github.com/pytorch/pytorch/issues/973 + # set resource limit + if platform.system() != 'Windows': + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min( + max(env_cfg.get('resource_limit', 4096), base_soft_limit), + hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, + (soft_limit, hard_limit)) + + def set_randomness(self, + seed, + diff_rank_seed: bool = False, + deterministic: bool = False) -> None: + """Set random seed to guarantee reproducible results. + + Args: + seed (int): A number to set random modules. + diff_rank_seed (bool): Whether or not set different seeds according + to global rank. Defaults to False. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Defaults to False. + See https://pytorch.org/docs/stable/notes/randomness.html for + more details. 
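+
+        Examples:
+            >>> # a minimal sketch: fix the seed, give each rank its own
+            >>> # offset, and make cuDNN pick deterministic algorithms
+            >>> runner.set_randomness(
+            ...     seed=2023, diff_rank_seed=True, deterministic=True)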
+ """ + self._deterministic = deterministic + self._seed = set_random_seed( + seed=seed, + deterministic=deterministic, + diff_rank_seed=diff_rank_seed) + + def build_logger(self, + log_level: Union[int, str] = 'INFO', + log_file: str = None, + **kwargs) -> MMLogger: + """Build a global asscessable MMLogger. + + Args: + log_level (int or str): The log level of MMLogger handlers. + Defaults to 'INFO'. + log_file (str, optional): Path of filename to save log. + Defaults to None. + **kwargs: Remaining parameters passed to ``MMLogger``. + + Returns: + MMLogger: A MMLogger object build from ``logger``. + """ + if log_file is None: + log_file = osp.join(self._log_dir, f'{self.timestamp}.log') + + log_cfg = dict(log_level=log_level, log_file=log_file, **kwargs) + log_cfg.setdefault('name', self._experiment_name) + # `torch.compile` in PyTorch 2.0 could close all user defined handlers + # unexpectedly. Using file mode 'a' can help prevent abnormal + # termination of the FileHandler and ensure that the log file could + # be continuously updated during the lifespan of the runner. + log_cfg.setdefault('file_mode', 'a') + + return MMLogger.get_instance(**log_cfg) # type: ignore + + def build_message_hub(self, + message_hub: Optional[Dict] = None) -> MessageHub: + """Build a global asscessable MessageHub. + + Args: + message_hub (dict, optional): A dict to build MessageHub object. + If not specified, default config will be used to build + MessageHub object. Defaults to None. + + Returns: + MessageHub: A MessageHub object build from ``message_hub``. + """ + if message_hub is None: + message_hub = dict(name=self._experiment_name) + elif isinstance(message_hub, dict): + # ensure message_hub containing name key + message_hub.setdefault('name', self._experiment_name) + else: + raise TypeError( + f'message_hub should be dict or None, but got {message_hub}') + + return MessageHub.get_instance(**message_hub) + + def build_visualizer( + self, + visualizer: Optional[Union[Visualizer, + Dict]] = None) -> Visualizer: + """Build a global asscessable Visualizer. + + Args: + visualizer (Visualizer or dict, optional): A Visualizer object + or a dict to build Visualizer object. If ``visualizer`` is a + Visualizer object, just returns itself. If not specified, + default config will be used to build Visualizer object. + Defaults to None. + + Returns: + Visualizer: A Visualizer object build from ``visualizer``. + """ + if visualizer is None: + visualizer = dict( + name=self._experiment_name, + vis_backends=[dict(type='LocalVisBackend')], + save_dir=self._log_dir) + return Visualizer.get_instance(**visualizer) + + if isinstance(visualizer, Visualizer): + return visualizer + + if isinstance(visualizer, dict): + # ensure visualizer containing name key + visualizer.setdefault('name', self._experiment_name) + visualizer.setdefault('save_dir', self._log_dir) + return VISUALIZERS.build(visualizer) + else: + raise TypeError( + 'visualizer should be Visualizer object, a dict or None, ' + f'but got {visualizer}') + + def build_model(self, model: Union[nn.Module, Dict]) -> nn.Module: + """Build model. + + If ``model`` is a dict, it will be used to build a nn.Module object. + Else, if ``model`` is a nn.Module object it will be returned directly. + + An example of ``model``:: + + model = dict(type='ResNet') + + Args: + model (nn.Module or dict): A ``nn.Module`` object or a dict to + build nn.Module object. If ``model`` is a nn.Module object, + just returns itself. 
+ + Note: + The returned model must implement ``train_step``, ``test_step`` + if ``runner.train`` or ``runner.test`` will be called. If + ``runner.val`` will be called or ``val_cfg`` is configured, + model must implement `val_step`. + + Returns: + nn.Module: Model build from ``model``. + """ + if isinstance(model, nn.Module): + return model + elif isinstance(model, dict): + model = MODELS.build(model) + return model # type: ignore + else: + raise TypeError('model should be a nn.Module object or dict, ' + f'but got {model}') + + def wrap_model( + self, model_wrapper_cfg: Optional[Dict], + model: nn.Module) -> Union[DistributedDataParallel, nn.Module]: + """Wrap the model to :obj:`MMDistributedDataParallel` or other custom + distributed data-parallel module wrappers. + + An example of ``model_wrapper_cfg``:: + + model_wrapper_cfg = dict( + broadcast_buffers=False, + find_unused_parameters=False + ) + + Args: + model_wrapper_cfg (dict, optional): Config to wrap model. If not + specified, ``DistributedDataParallel`` will be used in + distributed environment. Defaults to None. + model (nn.Module): Model to be wrapped. + + Returns: + nn.Module or DistributedDataParallel: nn.Module or subclass of + ``DistributedDataParallel``. + """ + if is_model_wrapper(model): + if model_wrapper_cfg is not None: + raise TypeError( + 'model has been wrapped and "model_wrapper_cfg" should be ' + f'None, but got {model_wrapper_cfg}') + + return model + + # Set `export CUDA_VISIBLE_DEVICES=-1` to enable CPU training. + model = model.to(get_device()) + + if not self.distributed: + self.logger.info( + 'Distributed training is not used, all SyncBatchNorm (SyncBN) ' + 'layers in the model will be automatically reverted to ' + 'BatchNormXd layers if they are used.') + model = revert_sync_batchnorm(model) + return model # type: ignore + else: + sync_bn = self.cfg.get('sync_bn', None) + if sync_bn is not None: + try: + model = convert_sync_batchnorm(model, sync_bn) + except ValueError as e: + self.logger.error('cfg.sync_bn should be "torch" or ' + f'"mmcv", but got {sync_bn}') + raise e + if model_wrapper_cfg is None: + find_unused_parameters = self.cfg.get('find_unused_parameters', + False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + # TODO: may use a more elegant way to get local device ID. 
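+            # For reference, the default wrapping below behaves like the
+            # following explicit config (values are illustrative only):
+            #   model_wrapper_cfg = dict(
+            #       type='MMDistributedDataParallel',
+            #       broadcast_buffers=False,
+            #       find_unused_parameters=False)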
+            model = MMDistributedDataParallel(
+                module=model,
+                device_ids=[int(os.environ['LOCAL_RANK'])],
+                broadcast_buffers=False,
+                find_unused_parameters=find_unused_parameters)
+        else:
+            model_wrapper_cfg.setdefault('type', 'MMDistributedDataParallel')
+            model_wrapper_type = MODEL_WRAPPERS.get(
+                model_wrapper_cfg.get('type'))  # type: ignore
+            default_args: dict = dict()
+            if issubclass(
+                    model_wrapper_type,  # type: ignore
+                    DistributedDataParallel):
+                default_args['device_ids'] = [int(os.environ['LOCAL_RANK'])]
+            default_args['module'] = model
+            model = MODEL_WRAPPERS.build(
+                model_wrapper_cfg, default_args=default_args)
+        return model
+
+    def _init_model_weights(self) -> None:
+        """Initialize the model weights if the model has
+        :meth:`init_weights`"""
+        model = self.model.module if is_model_wrapper(
+            self.model) else self.model
+        if hasattr(model, 'init_weights'):
+            model.init_weights()
+            # sync params and buffers
+            for name, params in model.state_dict().items():
+                broadcast(params)
+
+    def scale_lr(self,
+                 optim_wrapper: OptimWrapper,
+                 auto_scale_lr: Optional[Dict] = None) -> None:
+        """Automatically scale the learning rate during training according to
+        the ratio of ``base_batch_size`` in ``auto_scale_lr`` to the real
+        batch size.
+
+        It scales the learning rate linearly according to the
+        `paper <https://arxiv.org/abs/1706.02677>`_.
+
+        Note:
+            ``scale_lr`` must be called after building optimizer wrappers
+            and before building parameter schedulers.
+
+        Args:
+            optim_wrapper (OptimWrapper): An OptimWrapper object whose
+                parameter groups' learning rate needs to be scaled.
+            auto_scale_lr (dict, optional): Config to scale the learning
+                rate automatically. It includes ``base_batch_size`` and
+                ``enable``. ``base_batch_size`` is the batch size that the
+                optimizer lr is based on. ``enable`` is the switch to turn on
+                and off the feature.
+        """
+        if (auto_scale_lr is None or not auto_scale_lr.get('enable', False)):
+            return None
+
+        assert 'base_batch_size' in auto_scale_lr, \
+            'Lack of `base_batch_size` in `auto_scale_lr`.'
+        dataloader: Union[DataLoader, Dict] = self._train_dataloader
+        bs = dataloader.batch_size if isinstance(
+            dataloader, DataLoader) else dataloader['batch_size']
+        real_bs = self.world_size * bs
+        base_bs = auto_scale_lr['base_batch_size']
+        ratio = float(real_bs) / float(base_bs)
+        self.logger.info(f'LR is set based on batch size of {base_bs} '
+                         f'and the current batch size is {real_bs}. '
+                         f'Scaling the original LR by {ratio}.')
+
+        def _is_built(schedulers):
+            if isinstance(schedulers, dict):
+                return False if 'type' in schedulers else any(
+                    _is_built(s) for s in schedulers.values())
+            if isinstance(schedulers, list):
+                return any(_is_built(s) for s in schedulers)
+            return isinstance(schedulers, _ParamScheduler)
+
+        if _is_built(self.param_schedulers):
+            raise RuntimeError('`scale_lr` should be called before building '
+                               'ParamScheduler because ParamScheduler will '
+                               'store initial lr from optimizer wrappers')
+
+        assert isinstance(optim_wrapper, OptimWrapper), \
+            '`scale_lr` should be called after building OptimWrapper'
+        wrappers = list(optim_wrapper.values()) if isinstance(
+            optim_wrapper, OptimWrapperDict) else [optim_wrapper]
+        for wrapper in wrappers:
+            for group in wrapper.optimizer.param_groups:
+                group['lr'] = group['lr'] * ratio
+
+    def build_optim_wrapper(
+            self, optim_wrapper: Union[Optimizer, OptimWrapper, Dict]
+    ) -> Union[OptimWrapper, OptimWrapperDict]:
+        """Build optimizer wrapper.
+
+        If ``optim_wrapper`` is a config dict for only one optimizer,
+        the keys must contain ``optimizer``, and ``type`` is optional.
+        It will build a :obj:`OptimWrapper` by default.
+
+        If ``optim_wrapper`` is a config dict for multiple optimizers, i.e.,
+        it has multiple keys and each key is for an optimizer wrapper, the
+        constructor must be specified since
+        :obj:`DefaultOptimizerConstructor` cannot handle the building of
+        training with multiple optimizers.
+
+        If ``optim_wrapper`` is a dict of pre-built optimizer wrappers, i.e.,
+        each value of ``optim_wrapper`` represents an ``OptimWrapper``
+        instance, ``build_optim_wrapper`` will directly build the
+        :obj:`OptimWrapperDict` instance from ``optim_wrapper``.
+
+        Args:
+            optim_wrapper (OptimWrapper or dict): An OptimWrapper object or a
+                dict to build OptimWrapper objects. If ``optim_wrapper`` is an
+                OptimWrapper, just return an ``OptimWrapper`` instance.
+
+        Note:
+            For single optimizer training, if `optim_wrapper` is a config
+            dict, `type` is optional (defaults to :obj:`OptimWrapper`) and it
+            must contain `optimizer` to build the corresponding optimizer.
+
+        Examples:
+            >>> # build an optimizer
+            >>> optim_wrapper_cfg = dict(type='OptimWrapper', optimizer=dict(
+            ...     type='SGD', lr=0.01))
+            >>> # optim_wrapper_cfg = dict(optimizer=dict(type='SGD', lr=0.01))
+            >>> # is also valid.
+            >>> optim_wrapper = runner.build_optim_wrapper(optim_wrapper_cfg)
+            >>> optim_wrapper
+            Type: OptimWrapper
+            accumulative_counts: 1
+            optimizer:
+            SGD (
+            Parameter Group 0
+                dampening: 0
+                lr: 0.01
+                momentum: 0
+                nesterov: False
+                weight_decay: 0
+            )
+            >>> # build optimizer without `type`
+            >>> optim_wrapper_cfg = dict(optimizer=dict(type='SGD', lr=0.01))
+            >>> optim_wrapper = runner.build_optim_wrapper(optim_wrapper_cfg)
+            >>> optim_wrapper
+            Type: OptimWrapper
+            accumulative_counts: 1
+            optimizer:
+            SGD (
+            Parameter Group 0
+                dampening: 0
+                lr: 0.01
+                maximize: False
+                momentum: 0
+                nesterov: False
+                weight_decay: 0
+            )
+            >>> # build multiple optimizers
+            >>> optim_wrapper_cfg = dict(
+            ...     generator=dict(type='OptimWrapper', optimizer=dict(
+            ...         type='SGD', lr=0.01)),
+            ...     discriminator=dict(type='OptimWrapper', optimizer=dict(
+            ...         type='Adam', lr=0.001)),
+            ...     # need to customize a multiple optimizer constructor
+            ...     constructor='CustomMultiOptimizerConstructor',
+            ... )
+            >>> optim_wrapper = runner.build_optim_wrapper(optim_wrapper_cfg)
+            >>> optim_wrapper
+            name: generator
+            Type: OptimWrapper
+            accumulative_counts: 1
+            optimizer:
+            SGD (
+            Parameter Group 0
+                dampening: 0
+                lr: 0.01
+                momentum: 0
+                nesterov: False
+                weight_decay: 0
+            )
+            name: discriminator
+            Type: OptimWrapper
+            accumulative_counts: 1
+            optimizer:
+            Adam (
+            Parameter Group 0
+                amsgrad: False
+                betas: (0.9, 0.999)
+                eps: 1e-08
+                lr: 0.001
+                weight_decay: 0
+            )
+
+        Important:
+            If you need to build multiple optimizers, you should implement a
+            MultiOptimWrapperConstructor which gets parameters passed to
+            corresponding optimizers and composes the ``OptimWrapperDict``.
+            More details about how to customize OptimizerConstructor can be
+            found at `optimizer-docs`_.
+
+        Returns:
+            OptimWrapper: Optimizer wrapper built from ``optimizer_cfg``.
+        """  # noqa: E501
+        if isinstance(optim_wrapper, OptimWrapper):
+            return optim_wrapper
+        if isinstance(optim_wrapper, (dict, ConfigDict, Config)):
+            # optimizer must be defined for single optimizer training.
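+            # Rough dispatch sketch for the branches below (configs are
+            # illustrative, not exhaustive):
+            #   dict(optimizer=dict(type='SGD', lr=0.01))
+            #       -> built by the optimizer wrapper constructor
+            #   dict(optimizer=torch.optim.SGD(...), type='OptimWrapper')
+            #       -> built directly from the OPTIM_WRAPPERS registry
+            #   dict(gen=OptimWrapper(...), disc=OptimWrapper(...))
+            #       -> collected into an OptimWrapperDict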
+            optimizer = optim_wrapper.get('optimizer', None)
+
+            # If optimizer is a built `Optimizer` instance, the optimizer
+            # wrapper should be built by the `OPTIM_WRAPPERS` registry.
+            if isinstance(optimizer, Optimizer):
+                optim_wrapper.setdefault('type', 'OptimWrapper')
+                return OPTIM_WRAPPERS.build(optim_wrapper)  # type: ignore
+
+            # If `optimizer` is not None or `constructor` is defined, it means
+            # the optimizer wrapper will be built by the optimizer wrapper
+            # constructor. Therefore, `build_optim_wrapper` should be called.
+            if optimizer is not None or 'constructor' in optim_wrapper:
+                return build_optim_wrapper(self.model, optim_wrapper)
+            else:
+                # if `optimizer` is not defined, it should be the case of
+                # training with multiple optimizers. If `constructor` is not
+                # defined either, each value of `optim_wrapper` must be an
+                # `OptimWrapper` instance since `DefaultOptimizerConstructor`
+                # will not handle the case of training with multiple
+                # optimizers. `build_optim_wrapper` will directly build the
+                # `OptimWrapperDict` instance from `optim_wrapper`.
+                optim_wrappers = OrderedDict()
+                for name, optim in optim_wrapper.items():
+                    if not isinstance(optim, OptimWrapper):
+                        raise ValueError(
+                            'each item must be an optimizer object when '
+                            '"type" and "constructor" are not in '
+                            f'optimizer, but got {name}={optim}')
+                    optim_wrappers[name] = optim
+                return OptimWrapperDict(**optim_wrappers)
+        else:
+            raise TypeError('optimizer wrapper should be an OptimWrapper '
+                            f'object or dict, but got {optim_wrapper}')
+
+    def _build_param_scheduler(
+            self, scheduler: Union[_ParamScheduler, Dict, List],
+            optim_wrapper: OptimWrapper) -> List[_ParamScheduler]:
+        """Build parameter schedulers for a single optimizer.
+
+        Args:
+            scheduler (_ParamScheduler or dict or list): A Param Scheduler
+                object or a dict or list of dict to build parameter
+                schedulers.
+            optim_wrapper (OptimWrapper): An optimizer wrapper object that is
+                passed to construct the ParamScheduler object.
+
+        Returns:
+            list[_ParamScheduler]: List of parameter schedulers built from
+            ``scheduler``.
+
+        Note:
+            If the train loop is built, when building parameter schedulers,
+            it supports setting the max epochs/iters as the default ``end``
+            of schedulers, and supports converting epoch-based schedulers
+            to iter-based according to the ``convert_to_iter_based`` key.
+        """
+        if not isinstance(scheduler, Sequence):
+            schedulers = [scheduler]
+        else:
+            schedulers = scheduler
+
+        param_schedulers = []
+        for scheduler in schedulers:
+            if isinstance(scheduler, _ParamScheduler):
+                param_schedulers.append(scheduler)
+            elif isinstance(scheduler, dict):
+                _scheduler = copy.deepcopy(scheduler)
+
+                # Set default end
+                if isinstance(self._train_loop, BaseLoop):
+                    default_end = self.max_epochs if _scheduler.get(
+                        'by_epoch', True) else self.max_iters
+                    _scheduler.setdefault('end', default_end)
+                    self.logger.debug(
+                        f'The `end` of {_scheduler["type"]} is not set. '
+                        'Use the max epochs/iters of train loop as default.')
+
+                param_schedulers.append(
+                    PARAM_SCHEDULERS.build(
+                        _scheduler,
+                        default_args=dict(
+                            optimizer=optim_wrapper,
+                            epoch_length=len(self.train_dataloader))))
+            else:
+                raise TypeError(
+                    'scheduler should be a _ParamScheduler object or dict, '
+                    f'but got {scheduler}')
+        return param_schedulers
+
+    def build_param_scheduler(
+            self, scheduler: Union[_ParamScheduler, Dict,
+                                   List]) -> ParamSchedulerType:
+        """Build parameter schedulers.
+
+        ``build_param_scheduler`` should be called after
+        ``build_optim_wrapper`` because the building logic will change
+        according to the number of optimizers built by the runner.
+        The cases are as below:
+
+        - Single optimizer: When only one optimizer is built and used in the
+          runner, ``build_param_scheduler`` will return a list of
+          parameter schedulers.
+        - Multiple optimizers: When two or more optimizers are built and used
+          in the runner, ``build_param_scheduler`` will return a dict
+          containing the same keys as the multiple optimizers, where each
+          value is a list of parameter schedulers. Note that if you want
+          different optimizers to use different parameter schedulers to
+          update the optimizers' hyper-parameters, the input parameter
+          ``scheduler`` also needs to be a dict whose keys are consistent
+          with the multiple optimizers. Otherwise, the same parameter
+          schedulers will be used to update all optimizers'
+          hyper-parameters.
+
+        Args:
+            scheduler (_ParamScheduler or dict or list): A Param Scheduler
+                object or a dict or list of dict to build parameter
+                schedulers.
+
+        Examples:
+            >>> # build one scheduler
+            >>> optim_cfg = dict(optimizer=dict(type='SGD', lr=0.01))
+            >>> runner.optim_wrapper = runner.build_optim_wrapper(
+            ...     optim_cfg)
+            >>> scheduler_cfg = dict(type='MultiStepLR', milestones=[1, 2])
+            >>> schedulers = runner.build_param_scheduler(scheduler_cfg)
+            >>> schedulers
+            [<mmengine.optim.scheduler.lr_scheduler.MultiStepLR object at ...>]  # noqa: E501
+
+            >>> # build multiple schedulers
+            >>> scheduler_cfg = [
+            ...     dict(type='MultiStepLR', milestones=[1, 2]),
+            ...     dict(type='StepLR', step_size=1)
+            ... ]
+            >>> schedulers = runner.build_param_scheduler(scheduler_cfg)
+            >>> schedulers
+            [<mmengine.optim.scheduler.lr_scheduler.MultiStepLR object at ...>,  # noqa: E501
+             <mmengine.optim.scheduler.lr_scheduler.StepLR object at ...>]
+
+        The above examples only cover the case of one optimizer with one or
+        multiple schedulers. If you want to know how to set parameter
+        schedulers when using multiple optimizers, you can find more examples
+        in `optimizer-docs`_.
+
+        Returns:
+            list[_ParamScheduler] or dict[str, list[_ParamScheduler]]: List of
+            parameter schedulers or a dictionary containing lists of parameter
+            schedulers built from ``scheduler``.
+
+        .. _optimizer-docs:
+           https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html
+        """
+        param_schedulers: ParamSchedulerType
+        if not isinstance(self.optim_wrapper, OptimWrapperDict):
+            # Since `OptimWrapperDict` inherits from `OptimWrapper`,
+            # `isinstance(self.optim_wrapper, OptimWrapper)` cannot tell
+            # whether `self.optim_wrapper` is an `OptimWrapper` or an
+            # `OptimWrapperDict` instance. Therefore, here we simply check
+            # that self.optim_wrapper is not an `OptimWrapperDict` instance
+            # and then assert it is an OptimWrapper instance.
+            assert isinstance(self.optim_wrapper, OptimWrapper), (
+                '`build_optimizer` should be called before '
+                '`build_param_scheduler` because the latter depends '
+                'on the former')
+            param_schedulers = self._build_param_scheduler(
+                scheduler, self.optim_wrapper)  # type: ignore
+            return param_schedulers
+        else:
+            param_schedulers = dict()
+            for name, optimizer in self.optim_wrapper.items():
+                if isinstance(scheduler, dict) and 'type' not in scheduler:
+                    # scheduler is a dict and each item is a ParamScheduler
+                    # object or a config to build ParamScheduler objects
+                    param_schedulers[name] = self._build_param_scheduler(
+                        scheduler[name], optimizer)
+                else:
+                    param_schedulers[name] = self._build_param_scheduler(
+                        scheduler, optimizer)
+
+            return param_schedulers
+
+    def build_evaluator(self, evaluator: Union[Dict, List,
+                                               Evaluator]) -> Evaluator:
+        """Build evaluator.
+
+        Examples of ``evaluator``::
+
+            # evaluator could be a built Evaluator instance
+            evaluator = Evaluator(metrics=[ToyMetric()])
+
+            # evaluator can also be a list of dict
+            evaluator = [
+                dict(type='ToyMetric1'),
+                dict(type='ToyEvaluator2')
+            ]
+
+            # evaluator can also be a list of built metrics
+            evaluator = [ToyMetric1(), ToyMetric2()]
+
+            # evaluator can also be a dict with the key `metrics`
+            evaluator = dict(metrics=ToyMetric())
+            # `metrics` can also be a list
+            evaluator = dict(metrics=[ToyMetric()])
+
+        Args:
+            evaluator (Evaluator or dict or list): An Evaluator object or a
+                config dict or list of config dicts used to build an
+                Evaluator.
+
+        Returns:
+            Evaluator: Evaluator built from ``evaluator``.
+        """
+        if isinstance(evaluator, Evaluator):
+            return evaluator
+        elif isinstance(evaluator, dict):
+            # if `metrics` is in the dict keys, it means to build a
+            # customized evaluator
+            if 'metrics' in evaluator:
+                evaluator.setdefault('type', 'Evaluator')
+                return EVALUATOR.build(evaluator)
+            # otherwise, the default evaluator will be built
+            else:
+                return Evaluator(evaluator)  # type: ignore
+        elif isinstance(evaluator, list):
+            # use the default `Evaluator`
+            return Evaluator(evaluator)  # type: ignore
+        else:
+            raise TypeError(
+                'evaluator should be one of dict, list of dict, and Evaluator'
+                f', but got {evaluator}')
+
+    @staticmethod
+    def build_dataloader(dataloader: Union[DataLoader, Dict],
+                         seed: Optional[int] = None,
+                         diff_rank_seed: bool = False) -> DataLoader:
+        """Build dataloader.
+
+        The method builds three components:
+
+        - Dataset
+        - Sampler
+        - Dataloader
+
+        An example of ``dataloader``::
+
+            dataloader = dict(
+                dataset=dict(type='ToyDataset'),
+                sampler=dict(type='DefaultSampler', shuffle=True),
+                batch_size=1,
+                num_workers=9
+            )
+
+        Args:
+            dataloader (DataLoader or dict): A Dataloader object or a dict to
+                build Dataloader object. If ``dataloader`` is a Dataloader
+                object, just returns itself.
+            seed (int, optional): Random seed. Defaults to None.
+            diff_rank_seed (bool): Whether or not to set different seeds for
+                different ranks. If True, the seed passed to sampler is set
+                to None, in order to synchronize the seeds used in samplers
+                across different ranks.
+
+        Returns:
+            Dataloader: DataLoader built from ``dataloader_cfg``.
+ """ + if isinstance(dataloader, DataLoader): + return dataloader + + dataloader_cfg = copy.deepcopy(dataloader) + + # build dataset + dataset_cfg = dataloader_cfg.pop('dataset') + if isinstance(dataset_cfg, dict): + dataset = DATASETS.build(dataset_cfg) + if hasattr(dataset, 'full_init'): + dataset.full_init() + else: + # fallback to raise error in dataloader + # if `dataset_cfg` is not a valid type + dataset = dataset_cfg + + num_batch_per_epoch = dataloader_cfg.pop('num_batch_per_epoch', None) + if num_batch_per_epoch is not None: + world_size = get_world_size() + num_samples = ( + num_batch_per_epoch * _get_batch_size(dataloader_cfg) * + world_size) + dataset = _SlicedDataset(dataset, num_samples) + + # build sampler + sampler_cfg = dataloader_cfg.pop('sampler') + if isinstance(sampler_cfg, dict): + sampler_seed = None if diff_rank_seed else seed + sampler = DATA_SAMPLERS.build( + sampler_cfg, + default_args=dict(dataset=dataset, seed=sampler_seed)) + else: + # fallback to raise error in dataloader + # if `sampler_cfg` is not a valid type + sampler = sampler_cfg + + # build batch sampler + batch_sampler_cfg = dataloader_cfg.pop('batch_sampler', None) + if batch_sampler_cfg is None: + batch_sampler = None + elif isinstance(batch_sampler_cfg, dict): + batch_sampler = DATA_SAMPLERS.build( + batch_sampler_cfg, + default_args=dict( + sampler=sampler, + batch_size=dataloader_cfg.pop('batch_size'))) + else: + # fallback to raise error in dataloader + # if `batch_sampler_cfg` is not a valid type + batch_sampler = batch_sampler_cfg + + # build dataloader + init_fn: Optional[partial] + + if 'worker_init_fn' in dataloader_cfg: + worker_init_fn_cfg = dataloader_cfg.pop('worker_init_fn') + worker_init_fn_type = worker_init_fn_cfg.pop('type') + if isinstance(worker_init_fn_type, str): + worker_init_fn = FUNCTIONS.get(worker_init_fn_type) + elif callable(worker_init_fn_type): + worker_init_fn = worker_init_fn_type + else: + raise TypeError( + 'type of worker_init_fn should be string or callable ' + f'object, but got {type(worker_init_fn_type)}') + assert callable(worker_init_fn) + init_fn = partial(worker_init_fn, + **worker_init_fn_cfg) # type: ignore + else: + if seed is not None: + disable_subprocess_warning = dataloader_cfg.pop( + 'disable_subprocess_warning', False) + assert isinstance(disable_subprocess_warning, bool), ( + 'disable_subprocess_warning should be a bool, but got ' + f'{type(disable_subprocess_warning)}') + init_fn = partial( + default_worker_init_fn, + num_workers=dataloader_cfg.get('num_workers'), + rank=get_rank(), + seed=seed, + disable_subprocess_warning=disable_subprocess_warning) + else: + init_fn = None + + # `persistent_workers` requires pytorch version >= 1.7 + if ('persistent_workers' in dataloader_cfg + and digit_version(TORCH_VERSION) < digit_version('1.7.0')): + print_log( + '`persistent_workers` is only available when ' + 'pytorch version >= 1.7', + logger='current', + level=logging.WARNING) + dataloader_cfg.pop('persistent_workers') + + # The default behavior of `collat_fn` in dataloader is to + # merge a list of samples to form a mini-batch of Tensor(s). + # However, in mmengine, if `collate_fn` is not defined in + # dataloader_cfg, `pseudo_collate` will only convert the list of + # samples into a dict without stacking the batch tensor. 
+ collate_fn_cfg = dataloader_cfg.pop('collate_fn', + dict(type='pseudo_collate')) + if isinstance(collate_fn_cfg, dict): + collate_fn_type = collate_fn_cfg.pop('type') + if isinstance(collate_fn_type, str): + collate_fn = FUNCTIONS.get(collate_fn_type) + else: + collate_fn = collate_fn_type + collate_fn = partial(collate_fn, **collate_fn_cfg) # type: ignore + elif callable(collate_fn_cfg): + collate_fn = collate_fn_cfg + else: + raise TypeError( + 'collate_fn should be a dict or callable object, but got ' + f'{collate_fn_cfg}') + data_loader = DataLoader( + dataset=dataset, + sampler=sampler if batch_sampler is None else None, + batch_sampler=batch_sampler, + collate_fn=collate_fn, + worker_init_fn=init_fn, + **dataloader_cfg) + return data_loader + + def build_train_loop(self, loop: Union[BaseLoop, Dict]) -> BaseLoop: + """Build training loop. + + Examples of ``loop``:: + + # `EpochBasedTrainLoop` will be used + loop = dict(by_epoch=True, max_epochs=3) + + # `IterBasedTrainLoop` will be used + loop = dict(by_epoch=False, max_epochs=3) + + # custom training loop + loop = dict(type='CustomTrainLoop', max_epochs=3) + + Args: + loop (BaseLoop or dict): A training loop or a dict to build + training loop. If ``loop`` is a training loop object, just + returns itself. + + Returns: + :obj:`BaseLoop`: Training loop object build from ``loop``. + """ + if isinstance(loop, BaseLoop): + return loop + elif not isinstance(loop, dict): + raise TypeError( + f'train_loop should be a Loop object or dict, but got {loop}') + + loop_cfg = copy.deepcopy(loop) + + if 'type' in loop_cfg and 'by_epoch' in loop_cfg: + raise RuntimeError( + 'Only one of `type` or `by_epoch` can exist in `loop_cfg`.') + + if 'type' in loop_cfg: + loop = LOOPS.build( + loop_cfg, + default_args=dict( + runner=self, dataloader=self._train_dataloader)) + else: + by_epoch = loop_cfg.pop('by_epoch') + if by_epoch: + loop = EpochBasedTrainLoop( + **loop_cfg, runner=self, dataloader=self._train_dataloader) + else: + loop = IterBasedTrainLoop( + **loop_cfg, runner=self, dataloader=self._train_dataloader) + return loop # type: ignore + + def build_val_loop(self, loop: Union[BaseLoop, Dict]) -> BaseLoop: + """Build validation loop. + + Examples of ``loop``: + + # `ValLoop` will be used + loop = dict() + + # custom validation loop + loop = dict(type='CustomValLoop') + + Args: + loop (BaseLoop or dict): A validation loop or a dict to build + validation loop. If ``loop`` is a validation loop object, just + returns itself. + + Returns: + :obj:`BaseLoop`: Validation loop object build from ``loop``. + """ + if isinstance(loop, BaseLoop): + return loop + elif not isinstance(loop, dict): + raise TypeError( + f'val_loop should be a Loop object or dict, but got {loop}') + + loop_cfg = copy.deepcopy(loop) + + if 'type' in loop_cfg: + loop = LOOPS.build( + loop_cfg, + default_args=dict( + runner=self, + dataloader=self._val_dataloader, + evaluator=self._val_evaluator)) + else: + loop = ValLoop( + **loop_cfg, + runner=self, + dataloader=self._val_dataloader, + evaluator=self._val_evaluator) # type: ignore + + return loop # type: ignore + + def build_test_loop(self, loop: Union[BaseLoop, Dict]) -> BaseLoop: + """Build test loop. + + Examples of ``loop``:: + + # `TestLoop` will be used + loop = dict() + + # custom test loop + loop = dict(type='CustomTestLoop') + + Args: + loop (BaseLoop or dict): A test loop or a dict to build test loop. + If ``loop`` is a test loop object, just returns itself. 
+ + Returns: + :obj:`BaseLoop`: Test loop object build from ``loop_cfg``. + """ + if isinstance(loop, BaseLoop): + return loop + elif not isinstance(loop, dict): + raise TypeError( + f'test_loop should be a Loop object or dict, but got {loop}') + + loop_cfg = copy.deepcopy(loop) # type: ignore + + if 'type' in loop_cfg: + loop = LOOPS.build( + loop_cfg, + default_args=dict( + runner=self, + dataloader=self._test_dataloader, + evaluator=self._test_evaluator)) + else: + loop = TestLoop( + **loop_cfg, + runner=self, + dataloader=self._test_dataloader, + evaluator=self._test_evaluator) # type: ignore + + return loop # type: ignore + + def build_log_processor( + self, log_processor: Union[LogProcessor, Dict]) -> LogProcessor: + """Build test log_processor. + + Examples of ``log_processor``: + + # `LogProcessor` will be used + log_processor = dict() + + # custom log_processor + log_processor = dict(type='CustomLogProcessor') + + Args: + log_processor (LogProcessor or dict): A log processor or a dict + to build log processor. If ``log_processor`` is a log processor + object, just returns itself. + + Returns: + :obj:`LogProcessor`: Log processor object build from + ``log_processor_cfg``. + """ + if isinstance(log_processor, LogProcessor): + return log_processor + elif not isinstance(log_processor, dict): + raise TypeError( + 'log processor should be a LogProcessor object or dict, but' + f'got {log_processor}') + + log_processor_cfg = copy.deepcopy(log_processor) # type: ignore + + if 'type' in log_processor_cfg: + log_processor = LOG_PROCESSORS.build(log_processor_cfg) + else: + log_processor = LogProcessor(**log_processor_cfg) # type: ignore + + return log_processor # type: ignore + + def get_hooks_info(self) -> str: + # Get hooks info in each stage + stage_hook_map: Dict[str, list] = {stage: [] for stage in Hook.stages} + for hook in self.hooks: + try: + priority = Priority(hook.priority).name # type: ignore + except ValueError: + priority = hook.priority # type: ignore + classname = hook.__class__.__name__ + hook_info = f'({priority:<12}) {classname:<35}' + for trigger_stage in hook.get_triggered_stages(): + stage_hook_map[trigger_stage].append(hook_info) + + stage_hook_infos = [] + for stage in Hook.stages: + hook_infos = stage_hook_map[stage] + if len(hook_infos) > 0: + info = f'{stage}:\n' + info += '\n'.join(hook_infos) + info += '\n -------------------- ' + stage_hook_infos.append(info) + return '\n'.join(stage_hook_infos) + + def load_or_resume(self) -> None: + """load or resume checkpoint.""" + if self._has_loaded: + return None + + # decide to load from checkpoint or resume from checkpoint + resume_from = None + if self._resume and self._load_from is None: + # auto resume from the latest checkpoint + resume_from = find_latest_checkpoint(self.work_dir) + self.logger.info( + f'Auto resumed from the latest checkpoint {resume_from}.') + elif self._resume and self._load_from is not None: + # resume from the specified checkpoint + resume_from = self._load_from + + if resume_from is not None: + self.resume(resume_from) + self._has_loaded = True + elif self._load_from is not None: + self.load_checkpoint(self._load_from) + self._has_loaded = True + + def train(self) -> nn.Module: + """Launch training. + + Returns: + nn.Module: The model after training. 
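+
+        Examples:
+            >>> # typical flow once a config is assembled (a sketch; `cfg`
+            >>> # must define model, train_dataloader, optim_wrapper, etc.)
+            >>> runner = Runner.from_cfg(cfg)
+            >>> runner.train()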
+ """ + if is_model_wrapper(self.model): + ori_model = self.model.module + else: + ori_model = self.model + assert hasattr(ori_model, 'train_step'), ( + 'If you want to train your model, please make sure your model ' + 'has implemented `train_step`.') + + if self._val_loop is not None: + assert hasattr(ori_model, 'val_step'), ( + 'If you want to validate your model, please make sure your ' + 'model has implemented `val_step`.') + + if self._train_loop is None: + raise RuntimeError( + '`self._train_loop` should not be None when calling train ' + 'method. Please provide `train_dataloader`, `train_cfg`, ' + '`optimizer` and `param_scheduler` arguments when ' + 'initializing runner.') + + self._train_loop = self.build_train_loop( + self._train_loop) # type: ignore + + # `build_optimizer` should be called before `build_param_scheduler` + # because the latter depends on the former + self.optim_wrapper = self.build_optim_wrapper(self.optim_wrapper) + # Automatically scaling lr by linear scaling rule + self.scale_lr(self.optim_wrapper, self.auto_scale_lr) + + if self.param_schedulers is not None: + self.param_schedulers = self.build_param_scheduler( # type: ignore + self.param_schedulers) # type: ignore + + if self._val_loop is not None: + self._val_loop = self.build_val_loop( + self._val_loop) # type: ignore + # TODO: add a contextmanager to avoid calling `before_run` many times + self.call_hook('before_run') + + # initialize the model weights + self._init_model_weights() + + # try to enable activation_checkpointing feature + modules = self.cfg.get('activation_checkpointing', None) + if modules is not None: + self.logger.info(f'Enabling the "activation_checkpointing" feature' + f' for sub-modules: {modules}') + turn_on_activation_checkpointing(ori_model, modules) + + # try to enable efficient_conv_bn_eval feature + modules = self.cfg.get('efficient_conv_bn_eval', None) + if modules is not None: + self.logger.info(f'Enabling the "efficient_conv_bn_eval" feature' + f' for sub-modules: {modules}') + turn_on_efficient_conv_bn_eval(ori_model, modules) + + # make sure checkpoint-related hooks are triggered after `before_run` + self.load_or_resume() + + # Initiate inner count of `optim_wrapper`. + self.optim_wrapper.initialize_count_status( + self.model, + self._train_loop.iter, # type: ignore + self._train_loop.max_iters) # type: ignore + + # Maybe compile the model according to options in self.cfg.compile + # This must be called **AFTER** model has been wrapped. + self._maybe_compile('train_step') + + model = self.train_loop.run() # type: ignore + self.call_hook('after_run') + return model + + def val(self) -> dict: + """Launch validation. + + Returns: + dict: A dict of metrics on validation set. + """ + if self._val_loop is None: + raise RuntimeError( + '`self._val_loop` should not be None when calling val method.' + 'Please provide `val_dataloader`, `val_cfg` and ' + '`val_evaluator` arguments when initializing runner.') + + self._val_loop = self.build_val_loop(self._val_loop) # type: ignore + + self.call_hook('before_run') + + # make sure checkpoint-related hooks are triggered after `before_run` + self.load_or_resume() + + metrics = self.val_loop.run() # type: ignore + self.call_hook('after_run') + return metrics + + def test(self) -> dict: + """Launch test. + + Returns: + dict: A dict of metrics on testing set. + """ + if self._test_loop is None: + raise RuntimeError( + '`self._test_loop` should not be None when calling test ' + 'method. 
Please provide `test_dataloader`, `test_cfg` and ' + '`test_evaluator` arguments when initializing runner.') + + self._test_loop = self.build_test_loop(self._test_loop) # type: ignore + + self.call_hook('before_run') + + # make sure checkpoint-related hooks are triggered after `before_run` + self.load_or_resume() + + metrics = self.test_loop.run() # type: ignore + self.call_hook('after_run') + return metrics + + def call_hook(self, fn_name: str, **kwargs) -> None: + """Call all hooks. + + Args: + fn_name (str): The function name in each hook to be called, such as + "before_train_epoch". + **kwargs: Keyword arguments passed to hook. + """ + for hook in self._hooks: + # support adding additional custom hook methods + if hasattr(hook, fn_name): + try: + getattr(hook, fn_name)(self, **kwargs) + except TypeError as e: + raise TypeError(f'{e} in {hook}') from None + + def register_hook( + self, + hook: Union[Hook, Dict], + priority: Optional[Union[str, int, Priority]] = None) -> None: + """Register a hook into the hook list. + + The hook will be inserted into a priority queue, with the specified + priority (See :class:`Priority` for details of priorities). + For hooks with the same priority, they will be triggered in the same + order as they are registered. + + Priority of hook will be decided with the following priority: + + - ``priority`` argument. If ``priority`` is given, it will be priority + of hook. + - If ``hook`` argument is a dict and ``priority`` in it, the priority + will be the value of ``hook['priority']``. + - If ``hook`` argument is a dict but ``priority`` not in it or ``hook`` + is an instance of ``hook``, the priority will be ``hook.priority``. + + Args: + hook (:obj:`Hook` or dict): The hook to be registered. + priority (int or str or :obj:`Priority`, optional): Hook priority. + Lower value means higher priority. + """ + if not isinstance(hook, (Hook, dict)): + raise TypeError( + f'hook should be an instance of Hook or dict, but got {hook}') + + _priority = None + if isinstance(hook, dict): + if 'priority' in hook: + _priority = hook.pop('priority') + + hook_obj = HOOKS.build(hook) + else: + hook_obj = hook + + if priority is not None: + hook_obj.priority = priority + elif _priority is not None: + hook_obj.priority = _priority + + inserted = False + for i in range(len(self._hooks) - 1, -1, -1): + if get_priority(hook_obj.priority) >= get_priority( + self._hooks[i].priority): + self._hooks.insert(i + 1, hook_obj) + inserted = True + break + if not inserted: + self._hooks.insert(0, hook_obj) + + def register_default_hooks( + self, + hooks: Optional[Dict[str, Union[Hook, Dict]]] = None) -> None: + """Register default hooks into hook list. + + ``hooks`` will be registered into runner to execute some default + actions like updating model parameters or saving checkpoints. 
+ + Default hooks and their priorities: + + +----------------------+-------------------------+ + | Hooks | Priority | + +======================+=========================+ + | RuntimeInfoHook | VERY_HIGH (10) | + +----------------------+-------------------------+ + | IterTimerHook | NORMAL (50) | + +----------------------+-------------------------+ + | DistSamplerSeedHook | NORMAL (50) | + +----------------------+-------------------------+ + | LoggerHook | BELOW_NORMAL (60) | + +----------------------+-------------------------+ + | ParamSchedulerHook | LOW (70) | + +----------------------+-------------------------+ + | CheckpointHook | VERY_LOW (90) | + +----------------------+-------------------------+ + + If ``hooks`` is None, above hooks will be registered by + default:: + + default_hooks = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + logger=dict(type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + ) + + If not None, ``hooks`` will be merged into ``default_hooks``. + If there are None value in default_hooks, the corresponding item will + be popped from ``default_hooks``:: + + hooks = dict(timer=None) + + The final registered default hooks will be :obj:`RuntimeInfoHook`, + :obj:`DistSamplerSeedHook`, :obj:`LoggerHook`, + :obj:`ParamSchedulerHook` and :obj:`CheckpointHook`. + + Args: + hooks (dict[str, Hook or dict], optional): Default hooks or configs + to be registered. + """ + default_hooks: dict = dict( + runtime_info=dict(type='RuntimeInfoHook'), + timer=dict(type='IterTimerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + logger=dict(type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=1), + ) + if hooks is not None: + for name, hook in hooks.items(): + if name in default_hooks and hook is None: + # remove hook from _default_hooks + default_hooks.pop(name) + else: + assert hook is not None + default_hooks[name] = hook + + for hook in default_hooks.values(): + self.register_hook(hook) + + def register_custom_hooks(self, hooks: List[Union[Hook, Dict]]) -> None: + """Register custom hooks into hook list. + + Args: + hooks (list[Hook | dict]): List of hooks or configs to be + registered. + """ + for hook in hooks: + self.register_hook(hook) + + def register_hooks( + self, + default_hooks: Optional[Dict[str, Union[Hook, Dict]]] = None, + custom_hooks: Optional[List[Union[Hook, Dict]]] = None) -> None: + """Register default hooks and custom hooks into hook list. + + Args: + default_hooks (dict[str, dict] or dict[str, Hook], optional): Hooks + to execute default actions like updating model parameters and + saving checkpoints. Defaults to None. + custom_hooks (list[dict] or list[Hook], optional): Hooks to execute + custom actions like visualizing images processed by pipeline. + Defaults to None. + """ + self.register_default_hooks(default_hooks) + + if custom_hooks is not None: + self.register_custom_hooks(custom_hooks) + + def resume(self, + filename: str, + resume_optimizer: bool = True, + resume_param_scheduler: bool = True, + map_location: Union[str, Callable] = 'default') -> None: + """Resume model from checkpoint. + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. + resume_optimizer (bool): Whether to resume optimizer state. + Defaults to True. 
+            resume_param_scheduler (bool): Whether to resume param scheduler
+                state. Defaults to True.
+            map_location (str or callable): A string or a callable function
+                specifying how to remap storage locations.
+                Defaults to 'default'.
+        """
+        if map_location == 'default':
+            device = get_device()
+            checkpoint = self.load_checkpoint(filename, map_location=device)
+        else:
+            checkpoint = self.load_checkpoint(
+                filename, map_location=map_location)
+
+        self.train_loop._epoch = checkpoint['meta']['epoch']
+        self.train_loop._iter = checkpoint['meta']['iter']
+
+        # check whether the number of GPUs used for the current experiment
+        # is consistent with that of the checkpoint being resumed from
+        if 'config' in checkpoint['meta']:
+            config = mmengine.Config.fromstring(
+                checkpoint['meta']['config'], file_format='.py')
+            previous_gpu_ids = config.get('gpu_ids', None)
+            if (previous_gpu_ids is not None and len(previous_gpu_ids) > 0
+                    and len(previous_gpu_ids) != self._world_size):
+                # TODO, should we modify the iteration?
+                if (self.auto_scale_lr is None
+                        or not self.auto_scale_lr.get('enable', False)):
+                    raise RuntimeError(
+                        'Number of GPUs used for the current experiment is '
+                        'not consistent with the checkpoint being resumed '
+                        'from. This will result in poor performance due to '
+                        'the learning rate. You must set the '
+                        '`auto_scale_lr` parameter for Runner and make '
+                        '`auto_scale_lr["enable"]=True`.')
+                else:
+                    self.logger.info(
+                        'Number of GPUs used for the current experiment is '
+                        'not consistent with resuming from checkpoint but '
+                        'the learning rate will be adjusted according to '
+                        f'the setting in auto_scale_lr={self.auto_scale_lr}')
+
+        # resume random seed
+        resumed_seed = checkpoint['meta'].get('seed', None)
+        current_seed = self._randomness_cfg.get('seed')
+        if resumed_seed is not None and resumed_seed != current_seed:
+            if current_seed is not None:
+                self.logger.warning(f'The value of random seed in the '
+                                    f'checkpoint "{resumed_seed}" is '
+                                    f'different from the value in '
+                                    f'`randomness` config "{current_seed}"')
+            self._randomness_cfg.update(seed=resumed_seed)
+            self.set_randomness(**self._randomness_cfg)
+
+        resumed_dataset_meta = checkpoint['meta'].get('dataset_meta', None)
+        dataset_meta = getattr(self.train_dataloader.dataset, 'metainfo',
+                               None)
+
+        # `resumed_dataset_meta` and `dataset_meta` could be objects like
+        # np.ndarray, which cannot be directly judged as equal or not,
+        # therefore we just compare their dumped results.
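+        # e.g. a metainfo holding a numpy array cannot be compared with `!=`
+        # unambiguously (the result is elementwise), so the serialized
+        # bytes are compared instead.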
+        if pickle.dumps(resumed_dataset_meta) != pickle.dumps(dataset_meta):
+            self.logger.warning(
+                'The dataset metainfo from the resumed checkpoint is '
+                'different from the current training dataset, please '
+                'check the correctness of the checkpoint or the training '
+                'dataset.')
+
+        self.message_hub.load_state_dict(checkpoint['message_hub'])
+
+        # resume optimizer
+        if 'optimizer' in checkpoint and resume_optimizer:
+            self.optim_wrapper = self.build_optim_wrapper(self.optim_wrapper)
+            self.optim_wrapper.load_state_dict(  # type: ignore
+                checkpoint['optimizer'])
+
+        # resume param scheduler
+        if resume_param_scheduler and self.param_schedulers is None:
+            self.logger.warning(
+                '`resume_param_scheduler` is True but `self.param_schedulers` '
+                'is None, so skip resuming parameter schedulers')
+            resume_param_scheduler = False
+        if 'param_schedulers' in checkpoint and resume_param_scheduler:
+            self.param_schedulers = self.build_param_scheduler(  # type: ignore
+                self.param_schedulers)  # type: ignore
+            if isinstance(self.param_schedulers, dict):
+                for name, schedulers in self.param_schedulers.items():
+                    for scheduler, ckpt_scheduler in zip(
+                            schedulers, checkpoint['param_schedulers'][name]):
+                        scheduler.load_state_dict(ckpt_scheduler)
+            else:
+                for scheduler, ckpt_scheduler in zip(
+                        self.param_schedulers,  # type: ignore
+                        checkpoint['param_schedulers']):
+                    scheduler.load_state_dict(ckpt_scheduler)
+
+        self._has_loaded = True
+
+        self.logger.info(f'resumed epoch: {self.epoch}, iter: {self.iter}')
+
+    def load_checkpoint(self,
+                        filename: str,
+                        map_location: Union[str, Callable] = 'cpu',
+                        strict: bool = False,
+                        revise_keys: list = [(r'^module.', '')]):
+        """Load checkpoint from given ``filename``.
+
+        Args:
+            filename (str): Accept local filepath, URL, ``torchvision://xxx``,
+                ``open-mmlab://xxx``.
+            map_location (str or callable): A string or a callable function
+                specifying how to remap storage locations.
+                Defaults to 'cpu'.
+            strict (bool): Whether to allow different params for the model
+                and checkpoint.
+            revise_keys (list): A list of customized keywords to modify the
+                state_dict in checkpoint. Each item is a (pattern, replacement)
+                pair of the regular expression operations. Defaults to strip
+                the prefix 'module.' by [(r'^module\\.', '')].
+        """
+        checkpoint = _load_checkpoint(filename, map_location=map_location)
+
+        # allow hooks to modify the loaded checkpoint before it is applied
+        self.call_hook('after_load_checkpoint', checkpoint=checkpoint)
+
+        if is_model_wrapper(self.model):
+            model = self.model.module
+        else:
+            model = self.model
+
+        checkpoint = _load_checkpoint_to_model(
+            model, checkpoint, strict, revise_keys=revise_keys)
+
+        self._has_loaded = True
+
+        self.logger.info(f'Load checkpoint from {filename}')
+
+        return checkpoint
+
+    @master_only
+    def save_checkpoint(
+        self,
+        out_dir: str,
+        filename: str,
+        file_client_args: Optional[dict] = None,
+        save_optimizer: bool = True,
+        save_param_scheduler: bool = True,
+        meta: Optional[dict] = None,
+        by_epoch: bool = True,
+        backend_args: Optional[dict] = None,
+    ):
+        """Save checkpoints.
+
+        ``CheckpointHook`` invokes this method to save checkpoints
+        periodically.
+
+        Args:
+            out_dir (str): The directory that checkpoints are saved.
+            filename (str): The checkpoint filename.
+            file_client_args (dict, optional): Arguments to instantiate a
+                FileClient. See :class:`mmengine.fileio.FileClient` for
+                details. Defaults to None. It will be deprecated in future.
+                Please use `backend_args` instead.
+ save_optimizer (bool): Whether to save the optimizer to + the checkpoint. Defaults to True. + save_param_scheduler (bool): Whether to save the param_scheduler + to the checkpoint. Defaults to True. + meta (dict, optional): The meta information to be saved in the + checkpoint. Defaults to None. + by_epoch (bool): Decide the number of epoch or iteration saved in + checkpoint. Defaults to True. + backend_args (dict, optional): Arguments to instantiate the + prefix of uri corresponding backend. Defaults to None. + New in v0.2.0. + """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError( + f'meta should be a dict or None, but got {type(meta)}') + + if by_epoch: + # self.epoch increments 1 after + # `self.call_hook('after_train_epoch)` but `save_checkpoint` is + # called by `after_train_epoch`` method of `CheckpointHook` so + # `epoch` should be `self.epoch + 1` + meta.setdefault('epoch', self.epoch + 1) + meta.setdefault('iter', self.iter) + else: + meta.setdefault('epoch', self.epoch) + meta.setdefault('iter', self.iter + 1) + + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at ' + 'the same time.') + + file_client = FileClient.infer_client(file_client_args, out_dir) + filepath = file_client.join_path(out_dir, filename) + else: + filepath = join_path( # type: ignore + out_dir, filename, backend_args=backend_args) + + meta.update( + cfg=self.cfg.pretty_text, + seed=self.seed, + experiment_name=self.experiment_name, + time=time.strftime('%Y%m%d_%H%M%S', time.localtime()), + mmengine_version=mmengine.__version__ + get_git_hash()) + + if hasattr(self.train_dataloader.dataset, 'metainfo'): + meta.update(dataset_meta=self.train_dataloader.dataset.metainfo) + + if is_model_wrapper(self.model): + model = self.model.module + else: + model = self.model + + checkpoint = { + 'meta': + meta, + 'state_dict': + weights_to_cpu(model.state_dict()), + 'message_hub': + apply_to(self.message_hub.state_dict(), + lambda x: hasattr(x, 'cpu'), lambda x: x.cpu()), + } + # save optimizer state dict to checkpoint + if save_optimizer: + if isinstance(self.optim_wrapper, OptimWrapper): + checkpoint['optimizer'] = apply_to( + self.optim_wrapper.state_dict(), + lambda x: hasattr(x, 'cpu'), lambda x: x.cpu()) + else: + raise TypeError( + 'self.optim_wrapper should be an `OptimWrapper` ' + 'or `OptimWrapperDict` instance, but got ' + f'{self.optim_wrapper}') + + # save param scheduler state dict + if save_param_scheduler and self.param_schedulers is None: + self.logger.warning( + '`save_param_scheduler` is True but `self.param_schedulers` ' + 'is None, so skip saving parameter schedulers') + save_param_scheduler = False + if save_param_scheduler: + if isinstance(self.param_schedulers, dict): + checkpoint['param_schedulers'] = dict() + for name, schedulers in self.param_schedulers.items(): + checkpoint['param_schedulers'][name] = [] + for scheduler in schedulers: + state_dict = scheduler.state_dict() + checkpoint['param_schedulers'][name].append(state_dict) + else: + checkpoint['param_schedulers'] = [] + for scheduler in self.param_schedulers: # type: ignore + state_dict = scheduler.state_dict() # type: ignore + checkpoint['param_schedulers'].append(state_dict) + + self.call_hook('before_save_checkpoint', checkpoint=checkpoint) + save_checkpoint( + checkpoint, + filepath, + 
file_client_args=file_client_args,
+            backend_args=backend_args)
+
+    @master_only
+    def dump_config(self) -> None:
+        """Dump config to `work_dir`."""
+        if self.cfg.filename is not None:
+            filename = osp.basename(self.cfg.filename)
+        else:
+            filename = f'{self.timestamp}.py'
+        self.cfg.dump(osp.join(self.work_dir, filename))
+
+    def _check_scheduler_cfg(
+            self, param_scheduler: Optional[Union[dict, list,
+                                                  _ParamScheduler]]) -> None:
+        """Parse `param_scheduler` to a list of parameter schedulers, or a
+        `dict` of which each value is a list of parameter schedulers.
+
+        If only one optimizer is used, the parsed config should be a
+        list of parameter scheduler configs or instances. If multiple
+        optimizers are used, the parsed config should be a `dict`.
+        Its keys should be consistent with the optimizer `dict` and its
+        values should be lists of parameter scheduler configs or instances.
+        See :meth:`build_param_scheduler` for more details.
+
+        Examples:
+            >>> # valid schedulers:
+            >>> # empty scheduler
+            >>> scheduler = None
+            >>> # Single scheduler
+            >>> scheduler = dict(type='MultiStepLR', milestones=[1, 2])
+            >>> # Single list of schedulers
+            >>> scheduler = [dict(type='MultiStepLR', milestones=[1, 2]),
+            >>>              dict(type='MultiStepLR', milestones=[2, 3])]
+            >>> # `dict` of schedulers
+            >>> scheduler = dict(linear1=dict(type='MultiStepLR', milestones=[1, 2]),
+            >>>                  linear2=dict(type='MultiStepLR', milestones=[1, 2]))
+            >>> # `dict` of `list` of schedulers
+            >>> scheduler = dict(linear1=[dict(type='MultiStepLR', milestones=[1, 2])],
+            >>>                  linear2=[dict(type='MultiStepLR', milestones=[1, 2])])
+            >>> # Single built scheduler
+            >>> from mmengine.optim import MultiStepLR
+            >>> scheduler = MultiStepLR(milestones=[1, 2], optimizer=optimizer)
+            >>> # Single built list of schedulers
+            >>> scheduler = [MultiStepLR(milestones=[1, 2], optimizer=optimizer)]
+            >>> # dict of built schedulers
+            >>> scheduler = dict(linear1=MultiStepLR(milestones=[1, 2], optimizer=optimizer),
+            >>>                  linear2=MultiStepLR(milestones=[1, 2], optimizer=optimizer))
+            >>> # dict of built lists of schedulers
+            >>> scheduler = dict(linear1=[MultiStepLR(milestones=[1, 2], optimizer=optimizer)],
+            >>>                  linear2=[MultiStepLR(milestones=[1, 2], optimizer=optimizer)])
+
+        Args:
+            param_scheduler (dict or list): The original parameter scheduler.
+        """  # noqa: E501
+        if param_scheduler is None:
+            return
+        if isinstance(param_scheduler, _ParamScheduler):
+            return
+        if is_seq_of(param_scheduler, _ParamScheduler):
+            return
+
+        if is_seq_of(param_scheduler, dict):
+            for _param_scheduler in param_scheduler:
+                assert 'type' in _param_scheduler, (
+                    'Each parameter scheduler should contain the key type, '
+                    f'but got {_param_scheduler}')
+        elif isinstance(param_scheduler, dict):
+            if 'type' not in param_scheduler:
+                for key, _param_scheduler in param_scheduler.items():
+                    assert isinstance(
+                        _param_scheduler,
+                        (dict, tuple, list, _ParamScheduler)), (
+                            'Each value of `param_scheduler` should be a '
+                            f'dict or a list, but got {_param_scheduler} with '
+                            f'type {type(_param_scheduler)}')
+
+        else:
+            raise TypeError(
+                '`param_scheduler` should be a `_ParamScheduler`, `dict`, '
+                f'list or a tuple, but got {type(param_scheduler)}. If '
+                '`param_scheduler` is a list of dict, it means a list of '
+                'scheduler configs for single optimizer. If it is a dict and '
+                'contains key `type`, it means a scheduler config for a '
+                'single optimizer. 
+                'If it does not contain key `type`, it '
+                'means multiple lists of schedulers for multiple optimizers.')
+
+    def _log_env(self, env_cfg: dict) -> None:
+        """Log environment information of the current task.
+
+        Args:
+            env_cfg (dict): The environment config of the runner.
+        """
+        # Collect and log environment information.
+        env = collect_env()
+        runtime_env = OrderedDict()
+        runtime_env.update(env_cfg)
+        runtime_env.update(self._randomness_cfg)
+        runtime_env['seed'] = self._seed
+        runtime_env['Distributed launcher'] = self._launcher
+        runtime_env['Distributed training'] = self._distributed
+        runtime_env['GPU number'] = self._world_size
+
+        env_info = '\n    ' + '\n    '.join(f'{k}: {v}'
+                                            for k, v in env.items())
+        runtime_env_info = '\n    ' + '\n    '.join(
+            f'{k}: {v}' for k, v in runtime_env.items())
+        dash_line = '-' * 60
+        self.logger.info('\n' + dash_line + '\nSystem environment:' +
+                         env_info + '\n'
+                         '\nRuntime environment:' + runtime_env_info + '\n' +
+                         dash_line + '\n')
+
+        if self.cfg._cfg_dict:
+            self.logger.info(f'Config:\n{self.cfg.pretty_text}')
+
+    def _maybe_compile(self, target: str) -> None:
+        """Use `torch.compile` to optimize the model or the wrapped model."""
+        compile_cfg = self.cfg.get('compile', None)
+        if compile_cfg is None:
+            # no compile options given, won't compile
+            return
+
+        if isinstance(compile_cfg, bool):
+            if not compile_cfg:
+                # compile=False, compilation is disabled
+                return
+            # compile=True, use default configurations
+            compile_cfg = dict()
+
+        assert digit_version(TORCH_VERSION) >= digit_version('2.0.0'), (
+            'PyTorch >= 2.0.0 is required to enable torch.compile')
+        assert isinstance(compile_cfg, dict), (
+            f'`compile` should be a dict or bool, got {type(compile_cfg)}')
+
+        func = getattr(self.model, target)
+        compiled_func = torch.compile(func, **compile_cfg)
+        setattr(self.model, target, compiled_func)
+        self.logger.info('Model has been "compiled". The first few iterations'
+                         ' will be slow, please be patient.')
diff --git a/head_extractor/build/lib/mmengine/runner/utils.py b/head_extractor/build/lib/mmengine/runner/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..b91025eb07b95c5e01419a3e01dac7a5fa695a7b
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/runner/utils.py
@@ -0,0 +1,108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+import random
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader
+
+from mmengine.device import is_cuda_available, is_musa_available
+from mmengine.dist import get_rank, sync_random_seed
+from mmengine.logging import print_log
+from mmengine.utils import digit_version, is_list_of
+from mmengine.utils.dl_utils import TORCH_VERSION
+
+
+def calc_dynamic_intervals(
+    start_interval: int,
+    dynamic_interval_list: Optional[List[Tuple[int, int]]] = None
+) -> Tuple[List[int], List[int]]:
+    """Calculate dynamic intervals.
+
+    Args:
+        start_interval (int): The interval used in the beginning.
+        dynamic_interval_list (List[Tuple[int, int]], optional): The
+            first element in each tuple is a milestone and the second
+            element is an interval. The interval is used after the
+            corresponding milestone. Defaults to None.
+
+    Returns:
+        Tuple[List[int], List[int]]: A list of milestones and their
+        corresponding intervals.
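+
+    Example (illustrative sketch of the return format):
+
+        >>> calc_dynamic_intervals(10, [(100, 5), (200, 2)])
+        ([0, 100, 200], [10, 5, 2])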
+ """ + if dynamic_interval_list is None: + return [0], [start_interval] + + assert is_list_of(dynamic_interval_list, tuple) + + dynamic_milestones = [0] + dynamic_milestones.extend( + [dynamic_interval[0] for dynamic_interval in dynamic_interval_list]) + dynamic_intervals = [start_interval] + dynamic_intervals.extend( + [dynamic_interval[1] for dynamic_interval in dynamic_interval_list]) + return dynamic_milestones, dynamic_intervals + + +def set_random_seed(seed: Optional[int] = None, + deterministic: bool = False, + diff_rank_seed: bool = False) -> int: + """Set random seed. + + Args: + seed (int, optional): Seed to be used. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Defaults to False. + diff_rank_seed (bool): Whether to add rank number to the random seed to + have different random seed in different threads. Defaults to False. + """ + if seed is None: + seed = sync_random_seed() + + if diff_rank_seed: + rank = get_rank() + seed += rank + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + # torch.cuda.manual_seed(seed) + if is_cuda_available(): + torch.cuda.manual_seed_all(seed) + elif is_musa_available(): + torch.musa.manual_seed_all(seed) + # os.environ['PYTHONHASHSEED'] = str(seed) + if deterministic: + if torch.backends.cudnn.benchmark: + print_log( + 'torch.backends.cudnn.benchmark is going to be set as ' + '`False` to cause cuDNN to deterministically select an ' + 'algorithm', + logger='current', + level=logging.WARNING) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + if digit_version(TORCH_VERSION) >= digit_version('1.10.0'): + torch.use_deterministic_algorithms(True) + return seed + + +def _get_batch_size(dataloader: dict): + if isinstance(dataloader, dict): + if 'batch_size' in dataloader: + return dataloader['batch_size'] + elif ('batch_sampler' in dataloader + and 'batch_size' in dataloader['batch_sampler']): + return dataloader['batch_sampler']['batch_size'] + else: + raise ValueError('Please set batch_size in `Dataloader` or ' + '`batch_sampler`') + elif isinstance(dataloader, DataLoader): + return dataloader.batch_sampler.batch_size + else: + raise ValueError('dataloader should be a dict or a Dataloader ' + f'instance, but got {type(dataloader)}') diff --git a/head_extractor/build/lib/mmengine/structures/__init__.py b/head_extractor/build/lib/mmengine/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d4d94fd1f78aa52186b6ebe8229ea70c95fc5a2f --- /dev/null +++ b/head_extractor/build/lib/mmengine/structures/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_data_element import BaseDataElement +from .instance_data import InstanceData +from .label_data import LabelData +from .pixel_data import PixelData + +__all__ = ['BaseDataElement', 'InstanceData', 'LabelData', 'PixelData'] diff --git a/head_extractor/build/lib/mmengine/structures/base_data_element.py b/head_extractor/build/lib/mmengine/structures/base_data_element.py new file mode 100644 index 0000000000000000000000000000000000000000..53bcd5babf8078e90f6dea0ff6a857f75cda0ae4 --- /dev/null +++ b/head_extractor/build/lib/mmengine/structures/base_data_element.py @@ -0,0 +1,639 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy
+from typing import Any, Iterator, Optional, Tuple, Type, Union
+
+import numpy as np
+import torch
+
+
+class BaseDataElement:
+    """A base data interface that supports Tensor-like and dict-like
+    operations.
+
+    Typical data elements are predicted results or ground-truth labels of a
+    task, such as predicted bboxes, instance masks, semantic
+    segmentation masks, etc. Because ground-truth labels and predicted results
+    often have similar properties (for example, the predicted bboxes and the
+    ground-truth bboxes), MMEngine uses the same abstract data interface to
+    encapsulate predicted results and ground-truth labels, and it is
+    recommended to use different naming conventions to distinguish them, such
+    as using ``gt_instances`` and ``pred_instances`` to distinguish between
+    labels and predicted results. Additionally, we distinguish data elements
+    at instance level, pixel level, and label level. Each of these types has
+    its own characteristics. Therefore, MMEngine defines the base class
+    ``BaseDataElement`` and implements ``InstanceData``, ``PixelData``, and
+    ``LabelData`` inheriting from ``BaseDataElement`` to represent different
+    types of ground-truth labels or predictions.
+
+    Another common data element is the data sample, which consists of input
+    data (such as an image) and its annotations and predictions. In general,
+    an image can have multiple types of annotations and/or predictions at the
+    same time (for example, both pixel-level semantic segmentation annotations
+    and instance-level detection bboxes annotations). All labels and
+    predictions of a training sample are often passed between Dataset, Model,
+    Visualizer, and Evaluator components. In order to simplify the interface
+    between components, we can treat them as one large data element and
+    encapsulate them. Such data elements are generally called XXDataSample in
+    OpenMMLab. Therefore, similar to `nn.Module`, a `BaseDataElement` may hold
+    other `BaseDataElement` instances as attributes. Such a class generally
+    encapsulates all the data of a sample in an algorithm library, and its
+    attributes generally are various types of data elements. For example,
+    MMDetection inherits from ``BaseDataElement`` (as ``DetDataSample``) to
+    encapsulate all the annotation and prediction data of a sample.
+
+    The attributes in ``BaseDataElement`` are divided into two parts,
+    the ``metainfo`` and the ``data`` respectively.
+
+    - ``metainfo``: Usually contains the
+      information about the image such as filename,
+      image_shape, pad_shape, etc. The attributes can be accessed or
+      modified by dict-like or object-like operations, such as
+      ``.`` (for data access and modification), ``in``, ``del``,
+      ``pop(str)``, ``get(str)``, ``metainfo_keys()``,
+      ``metainfo_values()``, ``metainfo_items()``, ``set_metainfo()`` (for
+      setting or changing key-value pairs in metainfo).
+
+    - ``data``: Annotations or model predictions are
+      stored. The attributes can be accessed or modified by
+      dict-like or object-like operations, such as
+      ``.``, ``in``, ``del``, ``pop(str)``, ``get(str)``, ``keys()``,
+      ``values()``, ``items()``. Users can also apply tensor-like
+      methods to all :obj:`torch.Tensor` in the ``data_fields``,
+      such as ``.cuda()``, ``.cpu()``, ``.numpy()``, ``.to()``,
+      ``to_tensor()``, ``.detach()``.
+
+    Args:
+        metainfo (dict, optional): A dict containing the meta information
+            of a single image, such as ``dict(img_shape=(512, 512, 3),
+            scale_factor=(1, 1, 1, 1))``. Defaults to None.
+        kwargs (dict, optional): A dict containing annotations of a single
+            image or model predictions. Defaults to None.
+
+    Examples:
+        >>> import torch
+        >>> from mmengine.structures import BaseDataElement
+        >>> gt_instances = BaseDataElement()
+        >>> bboxes = torch.rand((5, 4))
+        >>> scores = torch.rand((5,))
+        >>> img_id = 0
+        >>> img_shape = (800, 1333)
+        >>> gt_instances = BaseDataElement(
+        ...     metainfo=dict(img_id=img_id, img_shape=img_shape),
+        ...     bboxes=bboxes, scores=scores)
+        >>> gt_instances = BaseDataElement(
+        ...     metainfo=dict(img_id=img_id, img_shape=(640, 640)))
+
+        >>> # new
+        >>> gt_instances1 = gt_instances.new(
+        ...     metainfo=dict(img_id=1, img_shape=(640, 640)),
+        ...     bboxes=torch.rand((5, 4)),
+        ...     scores=torch.rand((5,)))
+        >>> gt_instances2 = gt_instances1.new()
+
+        >>> # add and process property
+        >>> gt_instances = BaseDataElement()
+        >>> gt_instances.set_metainfo(dict(img_id=9, img_shape=(100, 100)))
+        >>> assert 'img_shape' in gt_instances.metainfo_keys()
+        >>> assert 'img_shape' in gt_instances
+        >>> assert 'img_shape' not in gt_instances.keys()
+        >>> assert 'img_shape' in gt_instances.all_keys()
+        >>> print(gt_instances.img_shape)
+        (100, 100)
+        >>> gt_instances.scores = torch.rand((5,))
+        >>> assert 'scores' in gt_instances.keys()
+        >>> assert 'scores' in gt_instances
+        >>> assert 'scores' in gt_instances.all_keys()
+        >>> assert 'scores' not in gt_instances.metainfo_keys()
+        >>> print(gt_instances.scores)
+        tensor([0.5230, 0.7885, 0.2426, 0.3911, 0.4876])
+        >>> gt_instances.bboxes = torch.rand((5, 4))
+        >>> assert 'bboxes' in gt_instances.keys()
+        >>> assert 'bboxes' in gt_instances
+        >>> assert 'bboxes' in gt_instances.all_keys()
+        >>> assert 'bboxes' not in gt_instances.metainfo_keys()
+        >>> print(gt_instances.bboxes)
+        tensor([[0.0900, 0.0424, 0.1755, 0.4469],
+                [0.8648, 0.0592, 0.3484, 0.0913],
+                [0.5808, 0.1909, 0.6165, 0.7088],
+                [0.5490, 0.4209, 0.9416, 0.2374],
+                [0.3652, 0.1218, 0.8805, 0.7523]])
+
+        >>> # delete and change property
+        >>> gt_instances = BaseDataElement(
+        ...     metainfo=dict(img_id=0, img_shape=(640, 640)),
+        ...     bboxes=torch.rand((6, 4)), scores=torch.rand((6,)))
+        >>> gt_instances.set_metainfo(dict(img_shape=(1280, 1280)))
+        >>> gt_instances.img_shape  # (1280, 1280)
+        >>> gt_instances.bboxes = gt_instances.bboxes * 2
+        >>> gt_instances.get('img_shape', None)  # (1280, 1280)
+        >>> gt_instances.get('bboxes', None)  # 6x4 tensor
+        >>> del gt_instances.img_shape
+        >>> del gt_instances.bboxes
+        >>> assert 'img_shape' not in gt_instances
+        >>> assert 'bboxes' not in gt_instances
+        >>> gt_instances.pop('img_shape', None)  # None
+        >>> gt_instances.pop('bboxes', None)  # None
+
+        >>> # Tensor-like
+        >>> cuda_instances = gt_instances.cuda()
+        >>> cuda_instances = gt_instances.to('cuda:0')
+        >>> cpu_instances = cuda_instances.cpu()
+        >>> cpu_instances = cuda_instances.to('cpu')
+        >>> fp16_instances = cuda_instances.to(
+        ...     device=None, dtype=torch.float16, non_blocking=False,
+        ...     copy=False, memory_format=torch.preserve_format)
+        >>> cpu_instances = cuda_instances.detach()
+        >>> np_instances = cpu_instances.numpy()
+
+        >>> # print
+        >>> metainfo = dict(img_shape=(800, 1196, 3))
+        >>> gt_instances = BaseDataElement(
+        ...     metainfo=metainfo, det_labels=torch.LongTensor([0, 1, 2, 3]))
+        >>> sample = BaseDataElement(metainfo=metainfo,
+        ...                          gt_instances=gt_instances)
+        >>> print(sample)
+        <BaseDataElement(
+
+            META INFORMATION
+            img_shape: (800, 1196, 3)
+
+            DATA FIELDS
+            gt_instances: <BaseDataElement(
+                    ...
+                ) at 0x7f0fea49e130>
+        ) at ...>
+
+        >>> # inheritance
+        >>> class DetDataSample(BaseDataElement):
+        ...     @property
+        ...     def proposals(self):
+        ...         return self._proposals
+        ...     @proposals.setter
+        ...     def proposals(self, value):
+        ...         self.set_field(value, '_proposals', dtype=BaseDataElement)
+        ...     @proposals.deleter
+        ...     def proposals(self):
+        ...         del self._proposals
+        ...     @property
+        ...     def gt_instances(self):
+        ...         return self._gt_instances
+        ...     @gt_instances.setter
+        ...     def gt_instances(self, value):
+        ...         self.set_field(value, '_gt_instances',
+        ...                        dtype=BaseDataElement)
+        ...     @gt_instances.deleter
+        ...     def gt_instances(self):
+        ...         del self._gt_instances
+        ...     @property
+        ...     def pred_instances(self):
+        ...         return self._pred_instances
+        ...     @pred_instances.setter
+        ...     def pred_instances(self, value):
+        ...         self.set_field(value, '_pred_instances',
+        ...                        dtype=BaseDataElement)
+        ...     @pred_instances.deleter
+        ...     def pred_instances(self):
+        ...         del self._pred_instances
+        >>> det_sample = DetDataSample()
+        >>> proposals = BaseDataElement(bboxes=torch.rand((5, 4)))
+        >>> det_sample.proposals = proposals
+        >>> assert 'proposals' in det_sample
+        >>> assert det_sample.proposals == proposals
+        >>> del det_sample.proposals
+        >>> assert 'proposals' not in det_sample
+        >>> with self.assertRaises(AssertionError):
+        ...     det_sample.proposals = torch.rand((5, 4))
+    """
+
+    def __init__(self, *, metainfo: Optional[dict] = None, **kwargs) -> None:
+
+        self._metainfo_fields: set = set()
+        self._data_fields: set = set()
+
+        if metainfo is not None:
+            self.set_metainfo(metainfo=metainfo)
+        if kwargs:
+            self.set_data(kwargs)
+
+    def set_metainfo(self, metainfo: dict) -> None:
+        """Set or change key-value pairs in ``metainfo_field`` by parameter
+        ``metainfo``.
+
+        Args:
+            metainfo (dict): A dict containing the meta information
+                of an image, such as ``img_shape``, ``scale_factor``, etc.
+        """
+        assert isinstance(
+            metainfo,
+            dict), f'metainfo should be a ``dict`` but got {type(metainfo)}'
+        meta = copy.deepcopy(metainfo)
+        for k, v in meta.items():
+            self.set_field(name=k, value=v, field_type='metainfo', dtype=None)
+
+    def set_data(self, data: dict) -> None:
+        """Set or change key-value pairs in ``data_field`` by parameter
+        ``data``.
+
+        Args:
+            data (dict): A dict containing annotations of an image or
+                model predictions.
+        """
+        assert isinstance(data,
+                          dict), f'data should be a `dict` but got {data}'
+        for k, v in data.items():
+            # Use `setattr()` rather than `self.set_field` to allow `set_data`
+            # to set property methods.
+            setattr(self, k, v)
+
+    def update(self, instance: 'BaseDataElement') -> None:
+        """Update the BaseDataElement with the elements from another
+        BaseDataElement object.
+
+        Args:
+            instance (BaseDataElement): Another BaseDataElement object used
+                to update the current object.
+        """
+        assert isinstance(
+            instance, BaseDataElement
+        ), f'instance should be a `BaseDataElement` but got {type(instance)}'
+        self.set_metainfo(dict(instance.metainfo_items()))
+        self.set_data(dict(instance.items()))
+
+    def new(self,
+            *,
+            metainfo: Optional[dict] = None,
+            **kwargs) -> 'BaseDataElement':
+        """Return a new data element with the same type. If ``metainfo`` and
+        ``data`` are None, the new data element will have the same metainfo
+        and data. If ``metainfo`` or ``data`` is not None, the corresponding
+        field of the new result will be overwritten with the input value.
+
+        Args:
+            metainfo (dict, optional): A dict containing the meta information
+                of an image, such as ``img_shape``, ``scale_factor``, etc.
+                Defaults to None.
+            kwargs (dict): A dict containing annotations of an image or
+                model predictions.
+
+        Returns:
+            BaseDataElement: A new data element with the same type.
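+
+        Example (illustrative sketch):
+
+            >>> data = BaseDataElement(metainfo=dict(img_id=0),
+            ...                        scores=torch.rand((5,)))
+            >>> new_data = data.new(metainfo=dict(img_id=1))
+            >>> new_data.img_id  # metainfo overwritten
+            1
+            >>> len(new_data.scores)  # data copied from `data`
+            5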
+ """ + new_data = self.__class__() + + if metainfo is not None: + new_data.set_metainfo(metainfo) + else: + new_data.set_metainfo(dict(self.metainfo_items())) + if kwargs: + new_data.set_data(kwargs) + else: + new_data.set_data(dict(self.items())) + return new_data + + def clone(self): + """Deep copy the current data element. + + Returns: + BaseDataElement: The copy of current data element. + """ + clone_data = self.__class__() + clone_data.set_metainfo(dict(self.metainfo_items())) + clone_data.set_data(dict(self.items())) + return clone_data + + def keys(self) -> list: + """ + Returns: + list: Contains all keys in data_fields. + """ + # We assume that the name of the attribute related to property is + # '_' + the name of the property. We use this rule to filter out + # private keys. + # TODO: Use a more robust way to solve this problem + private_keys = { + '_' + key + for key in self._data_fields + if isinstance(getattr(type(self), key, None), property) + } + return list(self._data_fields - private_keys) + + def metainfo_keys(self) -> list: + """ + Returns: + list: Contains all keys in metainfo_fields. + """ + return list(self._metainfo_fields) + + def values(self) -> list: + """ + Returns: + list: Contains all values in data. + """ + return [getattr(self, k) for k in self.keys()] + + def metainfo_values(self) -> list: + """ + Returns: + list: Contains all values in metainfo. + """ + return [getattr(self, k) for k in self.metainfo_keys()] + + def all_keys(self) -> list: + """ + Returns: + list: Contains all keys in metainfo and data. + """ + return self.metainfo_keys() + self.keys() + + def all_values(self) -> list: + """ + Returns: + list: Contains all values in metainfo and data. + """ + return self.metainfo_values() + self.values() + + def all_items(self) -> Iterator[Tuple[str, Any]]: + """ + Returns: + iterator: An iterator object whose element is (key, value) tuple + pairs for ``metainfo`` and ``data``. + """ + for k in self.all_keys(): + yield (k, getattr(self, k)) + + def items(self) -> Iterator[Tuple[str, Any]]: + """ + Returns: + iterator: An iterator object whose element is (key, value) tuple + pairs for ``data``. + """ + for k in self.keys(): + yield (k, getattr(self, k)) + + def metainfo_items(self) -> Iterator[Tuple[str, Any]]: + """ + Returns: + iterator: An iterator object whose element is (key, value) tuple + pairs for ``metainfo``. + """ + for k in self.metainfo_keys(): + yield (k, getattr(self, k)) + + @property + def metainfo(self) -> dict: + """dict: A dict contains metainfo of current data element.""" + return dict(self.metainfo_items()) + + def __setattr__(self, name: str, value: Any): + """setattr is only used to set data.""" + if name in ('_metainfo_fields', '_data_fields'): + if not hasattr(self, name): + super().__setattr__(name, value) + else: + raise AttributeError(f'{name} has been used as a ' + 'private attribute, which is immutable.') + else: + self.set_field( + name=name, value=value, field_type='data', dtype=None) + + def __delattr__(self, item: str): + """Delete the item in dataelement. + + Args: + item (str): The key to delete. 
+ """ + if item in ('_metainfo_fields', '_data_fields'): + raise AttributeError(f'{item} has been used as a ' + 'private attribute, which is immutable.') + super().__delattr__(item) + if item in self._metainfo_fields: + self._metainfo_fields.remove(item) + elif item in self._data_fields: + self._data_fields.remove(item) + + # dict-like methods + __delitem__ = __delattr__ + + def get(self, key, default=None) -> Any: + """Get property in data and metainfo as the same as python.""" + # Use `getattr()` rather than `self.__dict__.get()` to allow getting + # properties. + return getattr(self, key, default) + + def pop(self, *args) -> Any: + """Pop property in data and metainfo as the same as python.""" + assert len(args) < 3, '``pop`` get more than 2 arguments' + name = args[0] + if name in self._metainfo_fields: + self._metainfo_fields.remove(args[0]) + return self.__dict__.pop(*args) + + elif name in self._data_fields: + self._data_fields.remove(args[0]) + return self.__dict__.pop(*args) + + # with default value + elif len(args) == 2: + return args[1] + else: + # don't just use 'self.__dict__.pop(*args)' for only popping key in + # metainfo or data + raise KeyError(f'{args[0]} is not contained in metainfo or data') + + def __contains__(self, item: str) -> bool: + """Whether the item is in dataelement. + + Args: + item (str): The key to inquire. + """ + return item in self._data_fields or item in self._metainfo_fields + + def set_field(self, + value: Any, + name: str, + dtype: Optional[Union[Type, Tuple[Type, ...]]] = None, + field_type: str = 'data') -> None: + """Special method for set union field, used as property.setter + functions.""" + assert field_type in ['metainfo', 'data'] + if dtype is not None: + assert isinstance( + value, + dtype), f'{value} should be a {dtype} but got {type(value)}' + + if field_type == 'metainfo': + if name in self._data_fields: + raise AttributeError( + f'Cannot set {name} to be a field of metainfo ' + f'because {name} is already a data field') + self._metainfo_fields.add(name) + else: + if name in self._metainfo_fields: + raise AttributeError( + f'Cannot set {name} to be a field of data ' + f'because {name} is already a metainfo field') + self._data_fields.add(name) + super().__setattr__(name, value) + + # Tensor-like methods + def to(self, *args, **kwargs) -> 'BaseDataElement': + """Apply same name function to all tensors in data_fields.""" + new_data = self.new() + for k, v in self.items(): + if hasattr(v, 'to'): + v = v.to(*args, **kwargs) + data = {k: v} + new_data.set_data(data) + return new_data + + # Tensor-like methods + def cpu(self) -> 'BaseDataElement': + """Convert all tensors to CPU in data.""" + new_data = self.new() + for k, v in self.items(): + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.cpu() + data = {k: v} + new_data.set_data(data) + return new_data + + # Tensor-like methods + def cuda(self) -> 'BaseDataElement': + """Convert all tensors to GPU in data.""" + new_data = self.new() + for k, v in self.items(): + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.cuda() + data = {k: v} + new_data.set_data(data) + return new_data + + # Tensor-like methods + def musa(self) -> 'BaseDataElement': + """Convert all tensors to musa in data.""" + new_data = self.new() + for k, v in self.items(): + if isinstance(v, (torch.Tensor, BaseDataElement)): + v = v.musa() + data = {k: v} + new_data.set_data(data) + return new_data + + # Tensor-like methods + def npu(self) -> 'BaseDataElement': + """Convert all tensors to NPU in data.""" + 
+        new_data = self.new()
+        for k, v in self.items():
+            if isinstance(v, (torch.Tensor, BaseDataElement)):
+                v = v.npu()
+                data = {k: v}
+                new_data.set_data(data)
+        return new_data
+
+    def mlu(self) -> 'BaseDataElement':
+        """Convert all tensors to MLU in data."""
+        new_data = self.new()
+        for k, v in self.items():
+            if isinstance(v, (torch.Tensor, BaseDataElement)):
+                v = v.mlu()
+                data = {k: v}
+                new_data.set_data(data)
+        return new_data
+
+    # Tensor-like methods
+    def detach(self) -> 'BaseDataElement':
+        """Detach all tensors in data."""
+        new_data = self.new()
+        for k, v in self.items():
+            if isinstance(v, (torch.Tensor, BaseDataElement)):
+                v = v.detach()
+                data = {k: v}
+                new_data.set_data(data)
+        return new_data
+
+    # Tensor-like methods
+    def numpy(self) -> 'BaseDataElement':
+        """Convert all tensors to np.ndarray in data."""
+        new_data = self.new()
+        for k, v in self.items():
+            if isinstance(v, (torch.Tensor, BaseDataElement)):
+                v = v.detach().cpu().numpy()
+                data = {k: v}
+                new_data.set_data(data)
+        return new_data
+
+    def to_tensor(self) -> 'BaseDataElement':
+        """Convert all np.ndarray to tensor in data."""
+        new_data = self.new()
+        for k, v in self.items():
+            data = {}
+            if isinstance(v, np.ndarray):
+                v = torch.from_numpy(v)
+                data[k] = v
+            elif isinstance(v, BaseDataElement):
+                v = v.to_tensor()
+                data[k] = v
+            new_data.set_data(data)
+        return new_data
+
+    def to_dict(self) -> dict:
+        """Convert BaseDataElement to dict."""
+        return {
+            k: v.to_dict() if isinstance(v, BaseDataElement) else v
+            for k, v in self.all_items()
+        }
+
+    def __repr__(self) -> str:
+        """Represent the object."""
+
+        def _addindent(s_: str, num_spaces: int) -> str:
+            """This func is modified from `pytorch` https://github.com/pytorch/
+            pytorch/blob/b17b2b1cc7b017c3daaeff8cc7ec0f514d42ec37/torch/nn/modu
+            les/module.py#L29.
+
+            Args:
+                s_ (str): The string to add spaces to.
+                num_spaces (int): The number of spaces to add.
+
+            Returns:
+                str: The string after adding indentation.
+            """
+            s = s_.split('\n')
+            # don't do anything for single-line stuff
+            if len(s) == 1:
+                return s_
+            first = s.pop(0)
+            s = [(num_spaces * ' ') + line for line in s]
+            s = '\n'.join(s)  # type: ignore
+            s = first + '\n' + s  # type: ignore
+            return s  # type: ignore
+
+        def dump(obj: Any) -> str:
+            """Represent the object.
+
+            Args:
+                obj (Any): The object to represent.
+
+            Returns:
+                str: The represented str.
+            """
+            _repr = ''
+            if isinstance(obj, dict):
+                for k, v in obj.items():
+                    _repr += f'\n{k}: {_addindent(dump(v), 4)}'
+            elif isinstance(obj, BaseDataElement):
+                _repr += '\n\n    META INFORMATION'
+                metainfo_items = dict(obj.metainfo_items())
+                _repr += _addindent(dump(metainfo_items), 4)
+                _repr += '\n\n    DATA FIELDS'
+                items = dict(obj.items())
+                _repr += _addindent(dump(items), 4)
+                classname = obj.__class__.__name__
+                _repr = f'<{classname}({_repr}\n) at {hex(id(obj))}>'
+            else:
+                _repr += repr(obj)
+            return _repr
+
+        return dump(self)
diff --git a/head_extractor/build/lib/mmengine/structures/instance_data.py b/head_extractor/build/lib/mmengine/structures/instance_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..369d445f282b460be57b73d3bf64f679a4d286f1
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/structures/instance_data.py
@@ -0,0 +1,311 @@
+# Copyright (c) OpenMMLab. All rights reserved.
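+# InstanceData stores per-instance fields (bboxes, labels, scores, ...) and
+# enforces that all data fields share the same length, so they can be
+# indexed, sliced, and concatenated together.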
+import itertools
+from collections.abc import Sized
+from typing import Any, List, Union
+
+import numpy as np
+import torch
+
+from mmengine.device import get_device
+from .base_data_element import BaseDataElement
+
+BoolTypeTensor: Union[Any]
+LongTypeTensor: Union[Any]
+
+if get_device() == 'npu':
+    BoolTypeTensor = Union[torch.BoolTensor, torch.npu.BoolTensor]
+    LongTypeTensor = Union[torch.LongTensor, torch.npu.LongTensor]
+elif get_device() == 'mlu':
+    BoolTypeTensor = Union[torch.BoolTensor, torch.mlu.BoolTensor]
+    LongTypeTensor = Union[torch.LongTensor, torch.mlu.LongTensor]
+elif get_device() == 'musa':
+    BoolTypeTensor = Union[torch.BoolTensor, torch.musa.BoolTensor]
+    LongTypeTensor = Union[torch.LongTensor, torch.musa.LongTensor]
+else:
+    BoolTypeTensor = Union[torch.BoolTensor, torch.cuda.BoolTensor]
+    LongTypeTensor = Union[torch.LongTensor, torch.cuda.LongTensor]
+
+IndexType: Union[Any] = Union[str, slice, int, list, LongTypeTensor,
+                              BoolTypeTensor, np.ndarray]
+
+
+# Modified from
+# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/data_structures/instance_data.py # noqa
+class InstanceData(BaseDataElement):
+    """Data structure for instance-level annotations or predictions.
+
+    Subclass of :class:`BaseDataElement`. All values in `data_fields`
+    should have the same length. This design refers to
+    https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/instances.py # noqa E501
+    InstanceData also supports extra functions ``index``, ``slice`` and
+    ``cat`` for data fields. The type of a value in a data field can be a
+    base data structure such as `torch.Tensor`, `numpy.ndarray`, `list`,
+    `str`, `tuple`, or a customized data structure that has ``__len__``,
+    ``__getitem__`` and ``cat`` attributes.
+
+    Examples:
+        >>> # custom data structure
+        >>> class TmpObject:
+        ...     def __init__(self, tmp) -> None:
+        ...         assert isinstance(tmp, list)
+        ...         self.tmp = tmp
+        ...     def __len__(self):
+        ...         return len(self.tmp)
+        ...     def __getitem__(self, item):
+        ...         if isinstance(item, int):
+        ...             if item >= len(self) or item < -len(self):  # type:ignore
+        ...                 raise IndexError(f'Index {item} out of range!')
+        ...             else:
+        ...                 # keep the dimension
+        ...                 item = slice(item, None, len(self))
+        ...         return TmpObject(self.tmp[item])
+        ...     @staticmethod
+        ...     def cat(tmp_objs):
+        ...         assert all(isinstance(results, TmpObject) for results in tmp_objs)
+        ...         if len(tmp_objs) == 1:
+        ...             return tmp_objs[0]
+        ...         tmp_list = [tmp_obj.tmp for tmp_obj in tmp_objs]
+        ...         tmp_list = list(itertools.chain(*tmp_list))
+        ...         new_data = TmpObject(tmp_list)
+        ...         return new_data
+        ...     def __repr__(self):
+        ...         return str(self.tmp)
+        >>> from mmengine.structures import InstanceData
+        >>> import numpy as np
+        >>> import torch
+        >>> img_meta = dict(img_shape=(800, 1196, 3), pad_shape=(800, 1216, 3))
+        >>> instance_data = InstanceData(metainfo=img_meta)
+        >>> 'img_shape' in instance_data
+        True
+        >>> instance_data.det_labels = torch.LongTensor([2, 3])
+        >>> instance_data["det_scores"] = torch.Tensor([0.8, 0.7])
+        >>> instance_data.bboxes = torch.rand((2, 4))
+        >>> instance_data.polygons = TmpObject([[1, 2, 3, 4], [5, 6, 7, 8]])
+        >>> len(instance_data)
+        2
+        >>> print(instance_data)
+        <InstanceData(...) at ...>
+        >>> sorted_results = instance_data[instance_data.det_scores.sort().indices]
+        >>> sorted_results.det_scores
+        tensor([0.7000, 0.8000])
+        >>> print(instance_data[instance_data.det_scores > 0.75])
+        <InstanceData(...) at ...>
+        >>> print(instance_data[instance_data.det_scores > 1])
+        <InstanceData(...) at ...>
+        >>> print(instance_data.cat([instance_data, instance_data]))
+        <InstanceData(...) at ...>
+    """
+
+    def __setattr__(self, name: str, value: Sized):
+        """setattr is only used to set data.
+
+        The value must have the attribute ``__len__`` and have the same
+        length as the ``InstanceData``.
+        """
+        if name in ('_metainfo_fields', '_data_fields'):
+            if not hasattr(self, name):
+                super().__setattr__(name, value)
+            else:
+                raise AttributeError(f'{name} has been used as a '
+                                     'private attribute, which is immutable.')
+
+        else:
+            assert isinstance(value,
+                              Sized), 'value must contain `__len__` attribute'
+
+            if len(self) > 0:
+                assert len(value) == len(self), (
+                    f'The length of values {len(value)} is not consistent '
+                    'with the length of this :obj:`InstanceData` '
+                    f'{len(self)}')
+            super().__setattr__(name, value)
+
+    __setitem__ = __setattr__
+
+    def __getitem__(self, item: IndexType) -> 'InstanceData':
+        """
+        Args:
+            item (str, int, list, :obj:`slice`, :obj:`numpy.ndarray`,
+                :obj:`torch.LongTensor`, :obj:`torch.BoolTensor`):
+                Get the corresponding values according to item.
+
+        Returns:
+            :obj:`InstanceData`: Corresponding values.
+        """
+        assert isinstance(item, IndexType.__args__)
+        if isinstance(item, list):
+            item = np.array(item)
+        if isinstance(item, np.ndarray):
+            # The default int type of numpy is platform dependent, int32 for
+            # windows and int64 for linux. `torch.Tensor` requires the index
+            # to be int64, therefore we simply convert it to int64 here.
+            # More details in https://github.com/numpy/numpy/issues/9464
+            item = item.astype(np.int64) if item.dtype == np.int32 else item
+            item = torch.from_numpy(item)
+
+        if isinstance(item, str):
+            return getattr(self, item)
+
+        if isinstance(item, int):
+            if item >= len(self) or item < -len(self):  # type:ignore
+                raise IndexError(f'Index {item} out of range!')
+            else:
+                # keep the dimension
+                item = slice(item, None, len(self))
+
+        new_data = self.__class__(metainfo=self.metainfo)
+        if isinstance(item, torch.Tensor):
+            assert item.dim() == 1, ('Only support getting the values '
+                                     'along the first dimension.')
+            if isinstance(item, BoolTypeTensor.__args__):
+                assert len(item) == len(self), (
+                    f'The shape of the input(BoolTensor) {len(item)} does '
+                    'not match the shape of the indexed tensor in '
+                    f'results_field {len(self)} at first dimension.')
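+
+            # Apply `item` to every data field; each value type (Tensor,
+            # ndarray, str/list/tuple, or a custom structure with `cat`)
+            # needs its own handling below.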
+            for k, v in self.items():
+                if isinstance(v, torch.Tensor):
+                    new_data[k] = v[item]
+                elif isinstance(v, np.ndarray):
+                    new_data[k] = v[item.cpu().numpy()]
+                elif isinstance(
+                        v, (str, list, tuple)) or (hasattr(v, '__getitem__')
+                                                   and hasattr(v, 'cat')):
+                    # convert to indexes from BoolTensor
+                    if isinstance(item, BoolTypeTensor.__args__):
+                        indexes = torch.nonzero(item).view(
+                            -1).cpu().numpy().tolist()
+                    else:
+                        indexes = item.cpu().numpy().tolist()
+                    slice_list = []
+                    if indexes:
+                        for index in indexes:
+                            slice_list.append(slice(index, None, len(v)))
+                    else:
+                        slice_list.append(slice(None, 0, None))
+                    r_list = [v[s] for s in slice_list]
+                    if isinstance(v, (str, list, tuple)):
+                        new_value = r_list[0]
+                        for r in r_list[1:]:
+                            new_value = new_value + r
+                    else:
+                        new_value = v.cat(r_list)
+                    new_data[k] = new_value
+                else:
+                    raise ValueError(
+                        f'The type of `{k}` is `{type(v)}`, which has no '
+                        'attribute of `cat`, so it does not '
+                        'support slicing with `bool`')
+
+        else:
+            # item is a slice
+            for k, v in self.items():
+                new_data[k] = v[item]
+        return new_data  # type:ignore
+
+    @staticmethod
+    def cat(instances_list: List['InstanceData']) -> 'InstanceData':
+        """Concat the instances of all :obj:`InstanceData` in the list.
+
+        Note: To ensure that cat returns as expected, make sure that
+        all elements in the list have exactly the same keys.
+
+        Args:
+            instances_list (list[:obj:`InstanceData`]): A list
+                of :obj:`InstanceData`.
+
+        Returns:
+            :obj:`InstanceData`
+        """
+        assert all(
+            isinstance(results, InstanceData) for results in instances_list)
+        assert len(instances_list) > 0
+        if len(instances_list) == 1:
+            return instances_list[0]
+
+        # metainfo and data_fields must be exactly the
+        # same for each element to avoid exceptions.
+        field_keys_list = [
+            instances.all_keys() for instances in instances_list
+        ]
+        assert len({len(field_keys) for field_keys in field_keys_list}) \
+            == 1 and len(set(itertools.chain(*field_keys_list))) \
+            == len(field_keys_list[0]), (
+                'There are different keys in `instances_list`, which may '
+                'cause the cat operation to fail. Please make sure all '
+                'elements in `instances_list` have exactly the same keys.')
+
+        new_data = instances_list[0].__class__(
+            metainfo=instances_list[0].metainfo)
+        for k in instances_list[0].keys():
+            values = [results[k] for results in instances_list]
+            v0 = values[0]
+            if isinstance(v0, torch.Tensor):
+                new_values = torch.cat(values, dim=0)
+            elif isinstance(v0, np.ndarray):
+                new_values = np.concatenate(values, axis=0)
+            elif isinstance(v0, (str, list, tuple)):
+                new_values = v0[:]
+                for v in values[1:]:
+                    new_values += v
+            elif hasattr(v0, 'cat'):
+                new_values = v0.cat(values)
+            else:
+                raise ValueError(
+                    f'The type of `{k}` is `{type(v0)}` which has no '
+                    'attribute of `cat`')
+            new_data[k] = new_values
+        return new_data  # type:ignore
+
+    def __len__(self) -> int:
+        """int: The length of InstanceData."""
+        if len(self._data_fields) > 0:
+            return len(self.values()[0])
+        else:
+            return 0
diff --git a/head_extractor/build/lib/mmengine/structures/label_data.py b/head_extractor/build/lib/mmengine/structures/label_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..de178e07a011d4b9d45844113f1052b6d6da01aa
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/structures/label_data.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
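+# LabelData stores label-level targets; the helpers below convert between
+# index labels (e.g. tensor([1, 3])) and one-hot vectors of length
+# `num_classes`.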
+
+import torch
+
+from .base_data_element import BaseDataElement
+
+
+class LabelData(BaseDataElement):
+    """Data structure for label-level annotations or predictions."""
+
+    @staticmethod
+    def onehot_to_label(onehot: torch.Tensor) -> torch.Tensor:
+        """Convert a one-hot input to a label.
+
+        Args:
+            onehot (torch.Tensor): The one-hot input. The format
+                of the input must be one-hot.
+
+        Returns:
+            torch.Tensor: The converted results.
+        """
+        assert isinstance(onehot, torch.Tensor)
+        if (onehot.ndim == 1 and onehot.max().item() <= 1
+                and onehot.min().item() >= 0):
+            return onehot.nonzero().squeeze(-1)
+        else:
+            raise ValueError(
+                'input is not one-hot and can not be converted to a label')
+
+    @staticmethod
+    def label_to_onehot(label: torch.Tensor, num_classes: int) -> torch.Tensor:
+        """Convert a label-format input to one-hot.
+
+        Args:
+            label (torch.Tensor): The label-format input. The format
+                of the input must be label-format.
+            num_classes (int): The number of classes.
+
+        Returns:
+            torch.Tensor: The converted results.
+        """
+        assert isinstance(label, torch.Tensor)
+        onehot = label.new_zeros((num_classes, ))
+        assert max(label, default=torch.tensor(0)).item() < num_classes
+        onehot[label] = 1
+        return onehot
diff --git a/head_extractor/build/lib/mmengine/structures/pixel_data.py b/head_extractor/build/lib/mmengine/structures/pixel_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..d550f5c0c6512306c94a9a6bf9c8b40fb6b5a89e
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/structures/pixel_data.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import List, Sequence, Union
+
+import numpy as np
+import torch
+
+from .base_data_element import BaseDataElement
+
+
+class PixelData(BaseDataElement):
+    """Data structure for pixel-level annotations or predictions.
+
+    All data items in ``data_fields`` of ``PixelData`` meet the following
+    requirements:
+
+    - They all have 3 dimensions in order of channel, height, and width.
+    - They should have the same height and width.
+
+    Examples:
+        >>> metainfo = dict(
+        ...     img_id=random.randint(0, 100),
+        ...     img_shape=(random.randint(400, 600), random.randint(400, 600)))
+        >>> image = np.random.randint(0, 255, (4, 20, 40))
+        >>> featmap = torch.randint(0, 255, (10, 20, 40))
+        >>> pixel_data = PixelData(metainfo=metainfo,
+        ...                        image=image,
+        ...                        featmap=featmap)
+        >>> print(pixel_data.shape)
+        (20, 40)
+
+        >>> # slice
+        >>> slice_data = pixel_data[10:20, 20:40]
+        >>> assert slice_data.shape == (10, 20)
+        >>> slice_data = pixel_data[10, 20]
+        >>> assert slice_data.shape == (1, 1)
+
+        >>> # set
+        >>> pixel_data.map3 = torch.randint(0, 255, (20, 40))
+        >>> assert tuple(pixel_data.map3.shape) == (1, 20, 40)
+        >>> with self.assertRaises(AssertionError):
+        ...     # The dimension must be 3 or 2
+        ...     pixel_data.map2 = torch.randint(0, 255, (1, 3, 20, 40))
+    """
+
+    def __setattr__(self, name: str, value: Union[torch.Tensor, np.ndarray]):
+        """Set attributes of ``PixelData``.
+
+        If the dimension of the value is 2 and its shape meets the
+        requirements, a channel dimension will be prepended automatically.
+
+        Args:
+            name (str): The key to access the value, stored in `PixelData`.
+            value (Union[torch.Tensor, np.ndarray]): The value to store.
+                The type of the value must be `torch.Tensor` or `np.ndarray`,
+                and its shape must meet the requirements of `PixelData`.
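+
+        Example (illustrative sketch of the auto-expansion):
+
+            >>> pd = PixelData()
+            >>> pd.seg = torch.zeros(20, 40)   # 2D input
+            >>> tuple(pd.seg.shape)            # a channel dim is prepended
+            (1, 20, 40)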
+ """ + if name in ('_metainfo_fields', '_data_fields'): + if not hasattr(self, name): + super().__setattr__(name, value) + else: + raise AttributeError(f'{name} has been used as a ' + 'private attribute, which is immutable.') + + else: + assert isinstance(value, (torch.Tensor, np.ndarray)), \ + f'Can not set {type(value)}, only support' \ + f' {(torch.Tensor, np.ndarray)}' + + if self.shape: + assert tuple(value.shape[-2:]) == self.shape, ( + 'The height and width of ' + f'values {tuple(value.shape[-2:])} is ' + 'not consistent with ' + 'the shape of this ' + ':obj:`PixelData` ' + f'{self.shape}') + assert value.ndim in [ + 2, 3 + ], f'The dim of value must be 2 or 3, but got {value.ndim}' + if value.ndim == 2: + value = value[None] + warnings.warn('The shape of value will convert from ' + f'{value.shape[-2:]} to {value.shape}') + super().__setattr__(name, value) + + # TODO torch.Long/bool + def __getitem__(self, item: Sequence[Union[int, slice]]) -> 'PixelData': + """ + Args: + item (Sequence[Union[int, slice]]): Get the corresponding values + according to item. + + Returns: + :obj:`PixelData`: Corresponding values. + """ + + new_data = self.__class__(metainfo=self.metainfo) + if isinstance(item, tuple): + + assert len(item) == 2, 'Only support to slice height and width' + tmp_item: List[slice] = list() + for index, single_item in enumerate(item[::-1]): + if isinstance(single_item, int): + tmp_item.insert( + 0, slice(single_item, None, self.shape[-index - 1])) + elif isinstance(single_item, slice): + tmp_item.insert(0, single_item) + else: + raise TypeError( + 'The type of element in input must be int or slice, ' + f'but got {type(single_item)}') + tmp_item.insert(0, slice(None, None, None)) + item = tuple(tmp_item) + for k, v in self.items(): + setattr(new_data, k, v[item]) + else: + raise TypeError( + f'Unsupported type {type(item)} for slicing PixelData') + return new_data + + @property + def shape(self): + """The shape of pixel data.""" + if len(self._data_fields) > 0: + return tuple(self.values()[0].shape[-2:]) + else: + return None + + # TODO padding, resize diff --git a/head_extractor/build/lib/mmengine/testing/__init__.py b/head_extractor/build/lib/mmengine/testing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7e4da354323ffe2401de2ab01ca9f4a51f932ef --- /dev/null +++ b/head_extractor/build/lib/mmengine/testing/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .compare import (assert_allclose, assert_attrs_equal, + assert_dict_contains_subset, assert_dict_has_keys, + assert_is_norm_layer, assert_keys_equal, + assert_params_all_zeros, check_python_script) +from .runner_test_case import RunnerTestCase + +__all__ = [ + 'assert_allclose', 'assert_dict_contains_subset', 'assert_keys_equal', + 'assert_attrs_equal', 'assert_dict_has_keys', 'assert_is_norm_layer', + 'assert_params_all_zeros', 'check_python_script', 'RunnerTestCase' +] diff --git a/head_extractor/build/lib/mmengine/testing/_internal/__init__.py b/head_extractor/build/lib/mmengine/testing/_internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f4528659a8ff0e342012da67cc5e88fe99c1afa4 --- /dev/null +++ b/head_extractor/build/lib/mmengine/testing/_internal/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .distributed import MultiProcessTestCase
+
+__all__ = ['MultiProcessTestCase']
diff --git a/head_extractor/build/lib/mmengine/testing/_internal/distributed.py b/head_extractor/build/lib/mmengine/testing/_internal/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..b795cc945672fa132bd12a7f8dce1ee12b5fd6bc
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/testing/_internal/distributed.py
@@ -0,0 +1,357 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Copyright (c) https://github.com/pytorch/pytorch
+# Modified from https://github.com/pytorch/pytorch/blob/master/torch/testing/_internal/common_distributed.py # noqa: E501
+import faulthandler
+import logging
+import multiprocessing
+import sys
+import tempfile
+import threading
+import time
+import traceback
+import types
+import unittest
+from enum import Enum
+from functools import wraps
+from typing import NamedTuple
+from unittest import TestCase
+
+import torch
+from torch.multiprocessing import active_children
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class TestSkip(NamedTuple):
+    exit_code: int
+    message: str
+
+
+TEST_SKIPS = {
+    'backend_unavailable':
+    TestSkip(10, 'Skipped because distributed backend is not available.'),
+    'no_cuda':
+    TestSkip(11, 'CUDA is not available.'),
+    'multi-gpu-2':
+    TestSkip(12, 'Need at least 2 CUDA devices'),
+    'generic':
+    TestSkip(
+        13, 'Test skipped at subprocess level, look at subprocess log for '
+        'skip reason'),
+}
+
+# [How does MultiProcessTestCase work?]
+# Each MultiProcessTestCase instance uses 1 + `world_size()` processes; by
+# default `world_size()` returns 2. Let's take `test_rpc_spawn.py` as an
+# example which inherits from this class. Its `setUp()` method calls into
+# `MultiProcessTestCase._spawn_processes()` which spawns `world_size()`
+# subprocesses. During the spawn, the main process passes the test name to
+# the subprocesses, and the name is acquired from self.id(). The subprocesses
+# then use the provided test function name to retrieve the function attribute
+# from the test instance and run it. The main process simply waits for all
+# subprocesses to join.
+
+
+class MultiProcessTestCase(TestCase):
+    MAIN_PROCESS_RANK = -1
+
+    # This exit code is used to indicate that the test code had an error and
+    # exited abnormally. There are certain tests that might use sys.exit() to
+    # simulate failures and in those cases, we can't have an exit code of 0,
+    # but we still want to ensure we didn't run into any other errors.
+    TEST_ERROR_EXIT_CODE = 10
+
+    # do not early terminate for distributed tests.
+    def _should_stop_test_suite(self) -> bool:
+        return False
+
+    def prepare_subprocess(self):
+        pass
+
+    @property
+    def world_size(self) -> int:
+        return 2
+
+    @property
+    def timeout(self) -> int:
+        return 1000
+
+    def join_or_run(self, fn):
+
+        @wraps(fn)
+        def wrapper(self):
+            if self.rank == self.MAIN_PROCESS_RANK:
+                self._join_processes(fn)
+            else:
+                fn()
+
+        return types.MethodType(wrapper, self)
+
+    # The main process spawns N subprocesses that run the test.
+    # The constructor patches the current instance's test method to
+    # assume the role of the main process and join its subprocesses,
+    # or run the underlying test function.
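+    #
+    # A minimal (hypothetical) subclass sketch, for illustration only:
+    #
+    #     class TestToyAllReduce(MultiProcessTestCase):
+    #         def setUp(self):
+    #             super().setUp()
+    #             self._spawn_processes()
+    #
+    #         def test_something(self):
+    #             # this body runs in each of the `world_size` subprocesses
+    #             ...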
+    def __init__(self, method_name: str = 'runTest') -> None:
+        super().__init__(method_name)
+        fn = getattr(self, method_name)
+        setattr(self, method_name, self.join_or_run(fn))
+
+    def setUp(self) -> None:
+        super().setUp()
+        self.skip_return_code_checks = []  # type: ignore[var-annotated]
+        self.processes = []  # type: ignore[var-annotated]
+        self.rank = self.MAIN_PROCESS_RANK
+        self.file_name = tempfile.NamedTemporaryFile(delete=False).name
+        # pid to pipe consisting of error message from process.
+        self.pid_to_pipe = {}  # type: ignore[var-annotated]
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        for p in self.processes:
+            p.terminate()
+        # Each Process instance holds a few open file descriptors. The
+        # unittest runner creates a new TestCase instance for each test method
+        # and keeps it alive until the end of the entire suite. We must thus
+        # reset the processes to prevent an effective file descriptor leak.
+        self.processes = []
+
+    def _current_test_name(self) -> str:
+        # self.id()
+        # e.g. '__main__.TestDistributed.TestAdditive.test_get_rank'
+        return self.id().split('.')[-1]
+
+    def _start_processes(self, proc) -> None:
+        self.processes = []
+        for rank in range(int(self.world_size)):
+            parent_conn, child_conn = torch.multiprocessing.Pipe()
+            process = proc(
+                target=self.__class__._run,
+                name='process ' + str(rank),
+                args=(rank, self._current_test_name(), self.file_name,
+                      child_conn),
+            )
+            process.start()
+            self.pid_to_pipe[process.pid] = parent_conn
+            self.processes.append(process)
+
+    def _spawn_processes(self) -> None:
+        proc = torch.multiprocessing.get_context('spawn').Process
+        self._start_processes(proc)
+
+    class Event(Enum):
+        GET_TRACEBACK = 1
+
+    @staticmethod
+    def _event_listener(parent_pipe, signal_pipe, rank: int):
+        while True:
+            ready_pipes = multiprocessing.connection.wait(
+                [parent_pipe, signal_pipe])
+
+            if parent_pipe in ready_pipes:
+
+                if parent_pipe.closed:
+                    return
+
+                event = parent_pipe.recv()
+
+                if event == MultiProcessTestCase.Event.GET_TRACEBACK:
+                    # Return traceback to the parent process.
+                    with tempfile.NamedTemporaryFile(mode='r+') as tmp_file:
+                        faulthandler.dump_traceback(tmp_file)
+                        # Flush buffers and seek to read from the beginning
+                        tmp_file.flush()
+                        tmp_file.seek(0)
+                        parent_pipe.send(tmp_file.read())
+
+            if signal_pipe in ready_pipes:
+                return
+
+    @classmethod
+    def _run(cls, rank: int, test_name: str, file_name: str,
+             parent_pipe) -> None:
+        self = cls(test_name)
+        try:
+            self.prepare_subprocess()
+        except Exception:
+            # `sys.exit` already raises SystemExit; no extra `raise` needed
+            sys.exit(MultiProcessTestCase.TEST_ERROR_EXIT_CODE)
+        self.rank = rank
+        self.file_name = file_name
+        self.run_test(test_name, parent_pipe)
+
+    def run_test(self, test_name: str, parent_pipe) -> None:
+        # Start event listener thread.
+        signal_recv_pipe, signal_send_pipe = torch.multiprocessing.Pipe(
+            duplex=False)
+        event_listener_thread = threading.Thread(
+            target=MultiProcessTestCase._event_listener,
+            args=(parent_pipe, signal_recv_pipe, self.rank),
+            daemon=True,
+        )
+        event_listener_thread.start()
+
+        # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
+        # We're retrieving the corresponding test and executing it.
+        try:
+            getattr(self, test_name)()
+        except unittest.SkipTest as se:
+            logger.info(f'Process {self.rank} skipping test {test_name} for '
+                        f'following reason: {str(se)}')
+            sys.exit(TEST_SKIPS['generic'].exit_code)
+        except Exception:
+            logger.error(
+                f'Caught exception: \n{traceback.format_exc()} exiting '
+                f'process {self.rank} with exit code: '
+                f'{MultiProcessTestCase.TEST_ERROR_EXIT_CODE}')
+            # Send error to parent process.
+            parent_pipe.send(traceback.format_exc())
+            sys.exit(MultiProcessTestCase.TEST_ERROR_EXIT_CODE)
+        finally:
+            if signal_send_pipe is not None:
+                signal_send_pipe.send(None)
+
+            assert event_listener_thread is not None
+            event_listener_thread.join()
+            # Close pipe after done with test.
+            parent_pipe.close()
+
+    def _get_timedout_process_traceback(self) -> None:
+        pipes = []
+        for i, process in enumerate(self.processes):
+            if process.exitcode is None:
+                pipe = self.pid_to_pipe[process.pid]
+                try:
+                    pipe.send(MultiProcessTestCase.Event.GET_TRACEBACK)
+                    pipes.append((i, pipe))
+                except ConnectionError as e:
+                    logger.error(
+                        'Encountered error while trying to get traceback '
+                        f'for process {i}: {e}')
+
+        # Wait for results.
+        for rank, pipe in pipes:
+            try:
+                # Wait for traceback
+                if pipe.poll(5):
+                    if pipe.closed:
+                        logger.info(
+                            f'Pipe closed for process {rank}, cannot '
+                            'retrieve traceback')
+                        continue
+
+                    traceback = pipe.recv()
+                    logger.error(f'Process {rank} timed out with traceback: '
+                                 f'\n\n{traceback}')
+                else:
+                    logger.error('Could not retrieve traceback for timed out '
+                                 f'process: {rank}')
+            except ConnectionError as e:
+                logger.error(
+                    'Encountered error while trying to get traceback for '
+                    f'process {rank}: {e}')
+
+    def _join_processes(self, fn) -> None:
+        start_time = time.time()
+        subprocess_error = False
+        try:
+            while True:
+                # check to see if any subprocess exited with an error early.
+                for (i, p) in enumerate(self.processes):
+                    # This is the exit code processes exit with if they
+                    # encountered an exception.
+                    if p.exitcode == MultiProcessTestCase.TEST_ERROR_EXIT_CODE:
+                        print(
+                            f'Process {i} terminated with exit code '
+                            f'{p.exitcode}, terminating remaining processes.')
+                        _active_children = active_children()
+                        for ac in _active_children:
+                            ac.terminate()
+                        subprocess_error = True
+                        break
+                if subprocess_error:
+                    break
+                # All processes have joined cleanly if they all have a valid
+                # exitcode
+                if all([p.exitcode is not None for p in self.processes]):
+                    break
+                # Check if we should time out the test. If so, we terminate
+                # each process.
+                elapsed = time.time() - start_time
+                if elapsed > self.timeout:
+                    self._get_timedout_process_traceback()
+                    print(f'Timing out after {self.timeout} seconds and '
+                          'killing subprocesses.')
+                    for p in self.processes:
+                        p.terminate()
+                    break
+                # Sleep to avoid excessive busy polling.
+                time.sleep(0.1)
+
+            elapsed_time = time.time() - start_time
+
+            if fn in self.skip_return_code_checks:
+                self._check_no_test_errors(elapsed_time)
+            else:
+                self._check_return_codes(elapsed_time)
+        finally:
+            # Close all pipes
+            for pid, pipe in self.pid_to_pipe.items():
+                pipe.close()
+
+    def _check_no_test_errors(self, elapsed_time) -> None:
+        """Checks that we didn't have any errors thrown in the child
+        processes."""
+        for i, p in enumerate(self.processes):
+            if p.exitcode is None:
+                raise RuntimeError(
+                    'Process {} timed out after {} seconds'.format(
+                        i, elapsed_time))
+            self.assertNotEqual(self.TEST_ERROR_EXIT_CODE, p.exitcode)
+
+    def _check_return_codes(self, elapsed_time) -> None:
+        """Checks that the return codes of all spawned processes match, and
+        skips tests if they returned a return code indicating a skipping
+        condition."""
+        first_process = self.processes[0]
+        # first, we check if there are errors in the actual processes
+        # (via TEST_ERROR_EXIT_CODE), and raise an exception for those.
+        # the reason we do this is to attempt to raise a more helpful error
+        # message than "Process x terminated/timed out"
+        # TODO: we should pipe the exception of the failed subprocess here.
+        # Currently, the actual exception is displayed as a logging output.
+        errored_processes = [
+            (i, p) for i, p in enumerate(self.processes)
+            if p.exitcode == MultiProcessTestCase.TEST_ERROR_EXIT_CODE
+        ]
+        if errored_processes:
+            error = ''
+            for i, process in errored_processes:
+                # Get error from pipe.
+                error_message = self.pid_to_pipe[process.pid].recv()
+                error += (
+                    'Process {} exited with error code {} and exception:\n{}\n'
+                    .format(i, MultiProcessTestCase.TEST_ERROR_EXIT_CODE,
+                            error_message))
+            raise RuntimeError(error)
+        # If no process exited uncleanly, we check for timeouts, and then
+        # ensure each process exited cleanly.
+        for i, p in enumerate(self.processes):
+            if p.exitcode is None:
+                raise RuntimeError(
+                    f'Process {i} terminated or timed out after '
+                    f'{elapsed_time} seconds')
+
+        for skip in TEST_SKIPS.values():
+            if first_process.exitcode == skip.exit_code:
+                raise unittest.SkipTest(skip.message)
+
+        # Skip the unittest since the raised error may not be caused by
+        # the tested function. For example, in a CI environment, the tested
+        # method could be terminated by a system signal due to the limited
+        # resources.
+        self.skipTest(f'Skip test {self._testMethodName} due to '
+                      'the program abort')
+
+    @property
+    def is_master(self) -> bool:
+        return self.rank == 0
diff --git a/head_extractor/build/lib/mmengine/testing/compare.py b/head_extractor/build/lib/mmengine/testing/compare.py
new file mode 100644
index 0000000000000000000000000000000000000000..14c7a97ba73ee98600102ab28d649b01aab8f3bc
--- /dev/null
+++ b/head_extractor/build/lib/mmengine/testing/compare.py
@@ -0,0 +1,188 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import sys
+from collections.abc import Iterable
+from runpy import run_path
+from shlex import split
+from typing import Any, Callable, Dict, List, Optional, Union
+from unittest.mock import patch
+
+from torch.nn import GroupNorm, LayerNorm
+from torch.testing import assert_allclose as _assert_allclose
+
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils import TORCH_VERSION
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm
+
+
+def assert_allclose(
+    actual: Any,
+    expected: Any,
+    rtol: Optional[float] = None,
+    atol: Optional[float] = None,
+    equal_nan: bool = True,
+    msg: Optional[Union[str, Callable]] = '',
+) -> None:
+    """Asserts that ``actual`` and ``expected`` are close. A wrapper function
+    of ``torch.testing.assert_allclose``.
+
+    Args:
+        actual (Any): Actual input.
+        expected (Any): Expected input.
+        rtol (Optional[float]): Relative tolerance. If specified, ``atol``
+            must also be specified. If omitted, default values based on the
+            :attr:`~torch.Tensor.dtype` are selected.
+        atol (Optional[float]): Absolute tolerance. If specified, ``rtol``
+            must also be specified. If omitted, default values based on the
+            :attr:`~torch.Tensor.dtype` are selected.
+        equal_nan (bool): If ``True``, two ``NaN`` values will be considered
+            equal.
+        msg (Optional[Union[str, Callable]]): Optional error message to use if
+            the values of corresponding tensors mismatch. Unused when PyTorch
+            < 1.6.
+    """
+    if 'parrots' not in TORCH_VERSION and \
+            digit_version(TORCH_VERSION) >= digit_version('1.6'):
+        _assert_allclose(
+            actual,
+            expected,
+            rtol=rtol,
+            atol=atol,
+            equal_nan=equal_nan,
+            msg=msg)
+    else:
+        # torch.testing.assert_allclose has no ``msg`` argument
+        # when PyTorch < 1.6
+        _assert_allclose(
+            actual, expected, rtol=rtol, atol=atol, equal_nan=equal_nan)
+
+
+def check_python_script(cmd):
+    """Run the python cmd script with `__main__`. The difference from
+    `os.system` is that this function executes code in the current process,
+    so that it can be tracked by coverage tools. Currently it supports two
+    forms:
+
+    - ./tests/data/scripts/hello.py zz
+    - python tests/data/scripts/hello.py zz
+    """
+    args = split(cmd)
+    if args[0] == 'python':
+        args = args[1:]
+    with patch.object(sys, 'argv', args):
+        run_path(args[0], run_name='__main__')
+
+
+def _any(judge_result):
+    """Since the built-in ``any`` works only when the elements of the iterable
+    are not themselves iterable, implement this recursive variant."""
+    if not isinstance(judge_result, Iterable):
+        return judge_result
+
+    try:
+        for element in judge_result:
+            if _any(element):
+                return True
+    except TypeError:
+        # Maybe encounter the case: torch.tensor(True) | torch.tensor(False)
+        if judge_result:
+            return True
+    return False
+
+
+def assert_dict_contains_subset(dict_obj: Dict[Any, Any],
+                                expected_subset: Dict[Any, Any]) -> bool:
+    """Check if the dict_obj contains the expected_subset.
+
+    Args:
+        dict_obj (Dict[Any, Any]): Dict object to be checked.
+        expected_subset (Dict[Any, Any]): Subset expected to be contained in
+            dict_obj.
+
+    Returns:
+        bool: Whether the dict_obj contains the expected_subset.
+    """
+
+    for key, value in expected_subset.items():
+        if key not in dict_obj.keys() or _any(dict_obj[key] != value):
+            return False
+    return True
+
+
+def assert_attrs_equal(obj: Any, expected_attrs: Dict[str, Any]) -> bool:
+    """Check if the attributes of a class object are correct.
+
+    Args:
+        obj (object): Class object to be checked.
+ expected_attrs (Dict[str, Any]): Dict of the expected attrs. + + Returns: + bool: Whether the attributes of the class object are correct. + """ + for attr, value in expected_attrs.items(): + if not hasattr(obj, attr) or _any(getattr(obj, attr) != value): + return False + return True + + +def assert_dict_has_keys(obj: Dict[str, Any], + expected_keys: List[str]) -> bool: + """Check if the obj has all the expected_keys. + + Args: + obj (Dict[str, Any]): Object to be checked. + expected_keys (List[str]): Keys expected to be contained in the keys + of the obj. + + Returns: + bool: Whether the obj has the expected keys. + """ + return set(expected_keys).issubset(set(obj.keys())) + + +def assert_keys_equal(result_keys: List[str], target_keys: List[str]) -> bool: + """Check if target_keys is equal to result_keys. + + Args: + result_keys (List[str]): Result keys to be checked. + target_keys (List[str]): Target keys to be checked. + + Returns: + bool: Whether target_keys is equal to result_keys. + """ + return set(result_keys) == set(target_keys) + + +def assert_is_norm_layer(module) -> bool: + """Check if the module is a norm layer. + + Args: + module (nn.Module): The module to be checked. + + Returns: + bool: Whether the module is a norm layer. + """ + + norm_layer_candidates = (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm) + return isinstance(module, norm_layer_candidates) + + +def assert_params_all_zeros(module) -> bool: + """Check if the parameters of the module are all zeros. + + Args: + module (nn.Module): The module to be checked. + + Returns: + bool: Whether the parameters of the module are all zeros. + """ + weight_data = module.weight.data + is_weight_zero = weight_data.allclose( + weight_data.new_zeros(weight_data.size())) + + if hasattr(module, 'bias') and module.bias is not None: + bias_data = module.bias.data + is_bias_zero = bias_data.allclose( + bias_data.new_zeros(bias_data.size())) + else: + is_bias_zero = True + + return is_weight_zero and is_bias_zero diff --git a/head_extractor/build/lib/mmengine/testing/runner_test_case.py b/head_extractor/build/lib/mmengine/testing/runner_test_case.py new file mode 100644 index 0000000000000000000000000000000000000000..f64594acef9a966279f94a31c50358ff57b81eac --- /dev/null +++ b/head_extractor/build/lib/mmengine/testing/runner_test_case.py @@ -0,0 +1,196 @@ +# Copyright (c) OpenMMLab. All rights reserved.
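+# A sketch of the intended usage (the test class below is hypothetical, not +# part of this module): subclasses inherit ready-made configs and a +# `build_runner` helper, so a unit test can be as short as: +# +# class ToyRunnerTest(RunnerTestCase): +#     def test_train(self): +#         runner = self.build_runner(self.epoch_based_cfg) +#         runner.train()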
+import copy +import logging +import os +import shutil +import tempfile +import time +from unittest import TestCase +from uuid import uuid4 + +import torch +import torch.nn as nn +from torch.distributed import destroy_process_group +from torch.utils.data import Dataset + +import mmengine.hooks # noqa F401 +import mmengine.optim # noqa F401 +from mmengine.config import Config +from mmengine.dist import is_distributed +from mmengine.evaluator import BaseMetric +from mmengine.logging import MessageHub, MMLogger +from mmengine.model import BaseModel +from mmengine.registry import DATASETS, METRICS, MODELS, DefaultScope +from mmengine.runner import Runner +from mmengine.visualization import Visualizer + + +class ToyModel(BaseModel): + + def __init__(self, data_preprocessor=None): + super().__init__(data_preprocessor=data_preprocessor) + self.linear1 = nn.Linear(2, 2) + self.linear2 = nn.Linear(2, 1) + + def forward(self, inputs, data_samples=None, mode='tensor'): + if isinstance(inputs, list): + inputs = torch.stack(inputs) + if isinstance(data_samples, list): + data_samples = torch.stack(data_samples) + outputs = self.linear1(inputs) + outputs = self.linear2(outputs) + + if mode == 'tensor': + return outputs + elif mode == 'loss': + loss = (data_samples - outputs).sum() + outputs = dict(loss=loss) + return outputs + elif mode == 'predict': + return outputs + + +class ToyDataset(Dataset): + METAINFO = dict() # type: ignore + data = torch.randn(12, 2) + label = torch.ones(12) + + @property + def metainfo(self): + return self.METAINFO + + def __len__(self): + return self.data.size(0) + + def __getitem__(self, index): + return dict(inputs=self.data[index], data_samples=self.label[index]) + + +class ToyMetric(BaseMetric): + + def __init__(self, collect_device='cpu', dummy_metrics=None): + super().__init__(collect_device=collect_device) + self.dummy_metrics = dummy_metrics + + def process(self, data_batch, predictions): + result = {'acc': 1} + self.results.append(result) + + def compute_metrics(self, results): + return dict(acc=1) + + +class RunnerTestCase(TestCase): + """A test case to build runner easily. + + `RunnerTestCase` will do the following things: + + 1. Registers a toy model, a toy metric, and a toy dataset, which can be + used to run the `Runner` successfully. + 2. Provides epoch based and iteration based cfg to build runner. + 3. Provides `build_runner` method to build runner easily. + 4. Clean the global variable used by the runner. + """ + dist_cfg = dict( + MASTER_ADDR='127.0.0.1', + MASTER_PORT=29600, + RANK='0', + WORLD_SIZE='1', + LOCAL_RANK='0') + + def setUp(self) -> None: + self.temp_dir = tempfile.TemporaryDirectory() + # Prevent from registering module with the same name by other unit + # test. 
These registries will be cleared in `tearDown` + MODELS.register_module(module=ToyModel, force=True) + METRICS.register_module(module=ToyMetric, force=True) + DATASETS.register_module(module=ToyDataset, force=True) + epoch_based_cfg = dict( + work_dir=self.temp_dir.name, + model=dict(type='ToyModel'), + train_dataloader=dict( + dataset=dict(type='ToyDataset'), + sampler=dict(type='DefaultSampler', shuffle=True), + batch_size=3, + num_workers=0), + val_dataloader=dict( + dataset=dict(type='ToyDataset'), + sampler=dict(type='DefaultSampler', shuffle=False), + batch_size=3, + num_workers=0), + val_evaluator=[dict(type='ToyMetric')], + test_dataloader=dict( + dataset=dict(type='ToyDataset'), + sampler=dict(type='DefaultSampler', shuffle=False), + batch_size=3, + num_workers=0), + test_evaluator=[dict(type='ToyMetric')], + optim_wrapper=dict(optimizer=dict(type='SGD', lr=0.1)), + train_cfg=dict(by_epoch=True, max_epochs=2, val_interval=1), + val_cfg=dict(), + test_cfg=dict(), + default_hooks=dict(logger=dict(type='LoggerHook', interval=1)), + custom_hooks=[], + env_cfg=dict(dist_cfg=dict(backend='nccl')), + experiment_name='test1') + self.epoch_based_cfg = Config(epoch_based_cfg) + + # prepare iter based cfg. + self.iter_based_cfg: Config = copy.deepcopy(self.epoch_based_cfg) + self.iter_based_cfg.train_dataloader = dict( + dataset=dict(type='ToyDataset'), + sampler=dict(type='InfiniteSampler', shuffle=True), + batch_size=3, + num_workers=0) + self.iter_based_cfg.log_processor = dict(by_epoch=False) + + self.iter_based_cfg.train_cfg = dict(by_epoch=False, max_iters=12) + self.iter_based_cfg.default_hooks = dict( + logger=dict(type='LoggerHook', interval=1), + checkpoint=dict( + type='CheckpointHook', interval=12, by_epoch=False)) + + def tearDown(self): + # `FileHandler` should be closed in Windows, otherwise we cannot + # delete the temporary directory + logging.shutdown() + MMLogger._instance_dict.clear() + Visualizer._instance_dict.clear() + DefaultScope._instance_dict.clear() + MessageHub._instance_dict.clear() + MODELS.module_dict.pop('ToyModel', None) + METRICS.module_dict.pop('ToyMetric', None) + DATASETS.module_dict.pop('ToyDataset', None) + self.temp_dir.cleanup() + if is_distributed(): + destroy_process_group() + + def build_runner(self, cfg: Config): + cfg.experiment_name = self.experiment_name + runner = Runner.from_cfg(cfg) + return runner + + @property + def experiment_name(self): + # Since runners could be built too fast to have a unique experiment + # name(timestamp is the same), here we use uuid to make sure each + # runner has the unique experiment name. 
+ return f'{self._testMethodName}_{time.time()} + ' \ + f'{uuid4()}' + + def setup_dist_env(self): + self.dist_cfg['MASTER_PORT'] += 1 + os.environ['MASTER_PORT'] = str(self.dist_cfg['MASTER_PORT']) + os.environ['MASTER_ADDR'] = self.dist_cfg['MASTER_ADDR'] + os.environ['RANK'] = self.dist_cfg['RANK'] + os.environ['WORLD_SIZE'] = self.dist_cfg['WORLD_SIZE'] + os.environ['LOCAL_RANK'] = self.dist_cfg['LOCAL_RANK'] + + def clear_work_dir(self): + logging.shutdown() + for filename in os.listdir(self.temp_dir.name): + filepath = os.path.join(self.temp_dir.name, filename) + if os.path.isfile(filepath): + os.remove(filepath) + else: + shutil.rmtree(filepath) diff --git a/head_extractor/build/lib/mmengine/utils/__init__.py b/head_extractor/build/lib/mmengine/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3de90999073d2fafc2738519e99847d9c930502e --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .manager import ManagerMeta, ManagerMixin +from .misc import (apply_to, check_prerequisites, concat_list, + deprecated_api_warning, deprecated_function, + get_object_from_string, has_method, + import_modules_from_strings, is_list_of, + is_method_overridden, is_seq_of, is_str, is_tuple_of, + iter_cast, list_cast, requires_executable, requires_package, + slice_list, to_1tuple, to_2tuple, to_3tuple, to_4tuple, + to_ntuple, tuple_cast) +from .package_utils import (call_command, get_installed_path, install_package, + is_installed) +from .path import (check_file_exist, fopen, is_abs, is_filepath, + mkdir_or_exist, scandir, symlink) +from .progressbar import (ProgressBar, track_iter_progress, + track_parallel_progress, track_progress) +from .progressbar_rich import track_progress_rich +from .timer import Timer, TimerError, check_time +from .version_utils import digit_version, get_git_hash + +__all__ = [ + 'is_str', 'iter_cast', 'list_cast', 'tuple_cast', 'is_seq_of', + 'is_list_of', 'is_tuple_of', 'slice_list', 'concat_list', + 'check_prerequisites', 'requires_package', 'requires_executable', + 'is_filepath', 'fopen', 'check_file_exist', 'mkdir_or_exist', 'symlink', + 'scandir', 'deprecated_api_warning', 'import_modules_from_strings', + 'to_1tuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'to_ntuple', + 'is_installed', 'call_command', 'get_installed_path', 'install_package', + 'is_abs', 'is_method_overridden', 'has_method', 'digit_version', + 'get_git_hash', 'ManagerMeta', 'ManagerMixin', 'Timer', 'check_time', + 'TimerError', 'ProgressBar', 'track_iter_progress', + 'track_parallel_progress', 'track_progress', 'deprecated_function', + 'apply_to', 'track_progress_rich', 'get_object_from_string' +] diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/__init__.py b/head_extractor/build/lib/mmengine/utils/dl_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..305ea898904e359d61e3aab8e51897a9d035b08e --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
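+# These re-exports define the public surface of `mmengine.utils.dl_utils`, +# so downstream code can write, for example (a sketch; the printed fields +# come from `collect_env` below): +# +# from mmengine.utils.dl_utils import TORCH_VERSION, collect_env +# print(TORCH_VERSION, collect_env()['Python'])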
+ +from .collect_env import collect_env +from .hub import load_url +from .misc import has_batch_norm, is_norm, mmcv_full_available, tensor2imgs +from .parrots_wrapper import TORCH_VERSION +from .setup_env import set_multi_processing +from .time_counter import TimeCounter +from .torch_ops import torch_meshgrid +from .trace import is_jit_tracing + +__all__ = [ + 'load_url', 'TORCH_VERSION', 'set_multi_processing', 'has_batch_norm', + 'is_norm', 'tensor2imgs', 'mmcv_full_available', 'collect_env', + 'torch_meshgrid', 'is_jit_tracing', 'TimeCounter' +] diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/collect_env.py b/head_extractor/build/lib/mmengine/utils/dl_utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee99abad2c654e975d11c0b315c14dca18171a7 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/collect_env.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This file holds some environment constants shared by other files.""" +import os +import os.path as osp +import subprocess +import sys +from collections import OrderedDict, defaultdict + +import numpy as np +import torch + +import mmengine +from mmengine.device import is_cuda_available, is_musa_available +from .parrots_wrapper import TORCH_VERSION, get_build_config, is_rocm_pytorch + + +def _get_cuda_home(): + if TORCH_VERSION == 'parrots': + from parrots.utils.build_extension import CUDA_HOME + else: + if is_rocm_pytorch(): + from torch.utils.cpp_extension import ROCM_HOME + CUDA_HOME = ROCM_HOME + else: + from torch.utils.cpp_extension import CUDA_HOME + return CUDA_HOME + + +def _get_musa_home(): + return os.environ.get('MUSA_HOME') + + +def collect_env(): + """Collect the information of the running environments. + + Returns: + dict: The environment information. The following fields are contained. + + - sys.platform: The variable of ``sys.platform``. + - Python: Python version. + - CUDA available: Bool, indicating if CUDA is available. + - GPU devices: Device type of each GPU. + - CUDA_HOME (optional): The env var ``CUDA_HOME``. + - NVCC (optional): NVCC version. + - GCC: GCC version, "n/a" if GCC is not installed. + - MSVC: Microsoft Visual C++ Compiler version, Windows only. + - PyTorch: PyTorch version. + - PyTorch compiling details: The output of \ ``torch.__config__.show()``. + - TorchVision (optional): TorchVision version. + - OpenCV (optional): OpenCV version. + - MMEngine: MMEngine version.
+ """ + from distutils import errors + + env_info = OrderedDict() + env_info['sys.platform'] = sys.platform + env_info['Python'] = sys.version.replace('\n', '') + + cuda_available = is_cuda_available() + musa_available = is_musa_available() + env_info['CUDA available'] = cuda_available + env_info['MUSA available'] = musa_available + env_info['numpy_random_seed'] = np.random.get_state()[1][0] + + if cuda_available: + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, device_ids in devices.items(): + env_info['GPU ' + ','.join(device_ids)] = name + + CUDA_HOME = _get_cuda_home() + env_info['CUDA_HOME'] = CUDA_HOME + + if CUDA_HOME is not None and osp.isdir(CUDA_HOME): + if CUDA_HOME == '/opt/rocm': + try: + nvcc = osp.join(CUDA_HOME, 'hip/bin/hipcc') + nvcc = subprocess.check_output( + f'"{nvcc}" --version', shell=True) + nvcc = nvcc.decode('utf-8').strip() + release = nvcc.rfind('HIP version:') + build = nvcc.rfind('') + nvcc = nvcc[release:build].strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + else: + try: + nvcc = osp.join(CUDA_HOME, 'bin/nvcc') + nvcc = subprocess.check_output(f'"{nvcc}" -V', shell=True) + nvcc = nvcc.decode('utf-8').strip() + release = nvcc.rfind('Cuda compilation tools') + build = nvcc.rfind('Build ') + nvcc = nvcc[release:build].strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + env_info['NVCC'] = nvcc + elif musa_available: + devices = defaultdict(list) + for k in range(torch.musa.device_count()): + devices[torch.musa.get_device_name(k)].append(str(k)) + for name, device_ids in devices.items(): + env_info['GPU ' + ','.join(device_ids)] = name + + MUSA_HOME = _get_musa_home() + env_info['MUSA_HOME'] = MUSA_HOME + + if MUSA_HOME is not None and osp.isdir(MUSA_HOME): + try: + mcc = osp.join(MUSA_HOME, 'bin/mcc') + subprocess.check_output(f'"{mcc}" -v', shell=True) + except subprocess.SubprocessError: + mcc = 'Not Available' + env_info['mcc'] = mcc + try: + # Check C++ Compiler. + # For Unix-like, sysconfig has 'CC' variable like 'gcc -pthread ...', + # indicating the compiler used, we use this to get the compiler name + import io + import sysconfig + cc = sysconfig.get_config_var('CC') + if cc: + cc = osp.basename(cc.split()[0]) + cc_info = subprocess.check_output(f'{cc} --version', shell=True) + env_info['GCC'] = cc_info.decode('utf-8').partition( + '\n')[0].strip() + else: + # on Windows, cl.exe is not in PATH. We need to find the path. + # distutils.ccompiler.new_compiler() returns a msvccompiler + # object and after initialization, path to cl.exe is found. + import locale + import os + from distutils.ccompiler import new_compiler + ccompiler = new_compiler() + ccompiler.initialize() + cc = subprocess.check_output( + f'{ccompiler.cc}', stderr=subprocess.STDOUT, shell=True) + encoding = os.device_encoding( + sys.stdout.fileno()) or locale.getpreferredencoding() + env_info['MSVC'] = cc.decode(encoding).partition('\n')[0].strip() + env_info['GCC'] = 'n/a' + except (subprocess.CalledProcessError, errors.DistutilsPlatformError): + env_info['GCC'] = 'n/a' + except io.UnsupportedOperation as e: + # JupyterLab on Windows changes sys.stdout, which has no `fileno` attr + # Refer to: https://github.com/open-mmlab/mmengine/issues/931 + # TODO: find a solution to get compiler info in Windows JupyterLab, + # while preserving backward-compatibility in other systems. 
+ env_info['MSVC'] = f'n/a, reason: {str(e)}' + + env_info['PyTorch'] = torch.__version__ + env_info['PyTorch compiling details'] = get_build_config() + + try: + import torchvision + env_info['TorchVision'] = torchvision.__version__ + except ModuleNotFoundError: + pass + + try: + import cv2 + env_info['OpenCV'] = cv2.__version__ + except ImportError: + pass + + env_info['MMEngine'] = mmengine.__version__ + + return env_info diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/hub.py b/head_extractor/build/lib/mmengine/utils/dl_utils/hub.py new file mode 100644 index 0000000000000000000000000000000000000000..cf555ac766a964f8f1bcd192db9bb398a65b3c99 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/hub.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# The 1.6 release of PyTorch switched torch.save to use a new zipfile-based +# file format. It will cause RuntimeError when a checkpoint was saved in +# torch >= 1.6.0 but loaded in torch < 1.7.0. +# More details at https://github.com/open-mmlab/mmpose/issues/904 + +from ..path import mkdir_or_exist +from ..version_utils import digit_version +from .parrots_wrapper import TORCH_VERSION + +if TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) < digit_version( + '1.7.0'): + # Modified from https://github.com/pytorch/pytorch/blob/master/torch/hub.py + import os + import sys + import warnings + import zipfile + from urllib.parse import urlparse + + import torch + from torch.hub import HASH_REGEX, _get_torch_home, download_url_to_file + + # Hub used to support automatically extracting from zipfiles manually + # compressed by users. The legacy zip format expects only one file from + # torch.save() < 1.6 in the zip. We should remove this support since + # zipfile is now the default format for torch.save(). + def _is_legacy_zip_format(filename): + if zipfile.is_zipfile(filename): + infolist = zipfile.ZipFile(filename).infolist() + return len(infolist) == 1 and not infolist[0].is_dir() + return False + + def _legacy_zip_load(filename, model_dir, map_location): + warnings.warn( + 'Falling back to the old format < 1.6. This support will' + ' be deprecated in favor of default zipfile format ' + 'introduced in 1.6. Please redo torch.save() to save it ' + 'in the new zipfile format.', DeprecationWarning) + # Note: extractall() defaults to overwrite file if exists. No need to + # clean up beforehand. We deliberately don't handle tarfile here + # since our legacy serialization format was in tar. + # E.g. resnet18-5c106cde.pth which is widely used. + with zipfile.ZipFile(filename) as f: + members = f.infolist() + if len(members) != 1: + raise RuntimeError( + 'Only one file(not dir) is allowed in the zipfile') + f.extractall(model_dir) + extracted_name = members[0].filename + extracted_file = os.path.join(model_dir, extracted_name) + return torch.load(extracted_file, map_location=map_location) + + def load_url(url, + model_dir=None, + map_location=None, + progress=True, + check_hash=False, + file_name=None): + r"""Loads the Torch serialized object at the given URL. + If the downloaded file is a zip file, it will be automatically + decompressed. + If the object is already present in `model_dir`, it's deserialized and + returned. + The default value of ``model_dir`` is ``<hub_dir>/checkpoints`` where + ``hub_dir`` is the directory returned by :func:`~torch.hub.get_dir`.
+ Args: + url (str): URL of the object to download + model_dir (str, optional): directory in which to save the object + map_location (optional): a function or a dict specifying how to + remap storage locations (see torch.load) + progress (bool, optional): whether or not to display a progress bar + to stderr. Defaults to True + check_hash (bool, optional): If True, the filename part of the URL + should follow the naming convention ``filename-<sha256>.ext`` + where ``<sha256>`` is the first eight or more digits of the + SHA256 hash of the contents of the file. The hash is used to + ensure unique names and to verify the contents of the file. + Defaults to False + file_name (str, optional): name for the downloaded file. Filename + from ``url`` will be used if not set. Defaults to None. + Example: + >>> url = ('https://s3.amazonaws.com/pytorch/models/resnet18-5c106' + ... 'cde.pth') + >>> state_dict = torch.hub.load_state_dict_from_url(url) + """ + # Issue warning to move data if old env is set + if os.getenv('TORCH_MODEL_ZOO'): + warnings.warn( + 'TORCH_MODEL_ZOO is deprecated, please use env ' + 'TORCH_HOME instead', DeprecationWarning) + + if model_dir is None: + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, 'checkpoints') + + mkdir_or_exist(model_dir) + + parts = urlparse(url) + filename = os.path.basename(parts.path) + if file_name is not None: + filename = file_name + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file): + sys.stderr.write('Downloading: "{}" to {}\n'.format( + url, cached_file)) + hash_prefix = None + if check_hash: + r = HASH_REGEX.search(filename) # r is Optional[Match[str]] + hash_prefix = r.group(1) if r else None + download_url_to_file( + url, cached_file, hash_prefix, progress=progress) + + if _is_legacy_zip_format(cached_file): + return _legacy_zip_load(cached_file, model_dir, map_location) + + try: + return torch.load(cached_file, map_location=map_location) + except RuntimeError as error: + if digit_version(TORCH_VERSION) < digit_version('1.5.0'): + warnings.warn( + f'If the error is the same as "{cached_file} is a zip ' + 'archive (did you mean to use torch.jit.load()?)", you can' + ' upgrade your torch to 1.5.0 or higher (current torch ' + f'version is {TORCH_VERSION}). The error was raised ' + 'because the checkpoint was saved in torch>=1.6.0 but ' + 'loaded in torch<1.5.') + raise error +else: + from torch.utils.model_zoo import load_url # type: ignore # noqa: F401 diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/misc.py b/head_extractor/build/lib/mmengine/utils/dl_utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..ce52d22c3b225abafdc11c43ad1083f3b93b75e0 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/misc.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pkgutil +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn + +from ..misc import is_tuple_of +from .parrots_wrapper import _BatchNorm, _InstanceNorm + + +def is_norm(layer: nn.Module, + exclude: Optional[Union[type, Tuple[type]]] = None) -> bool: + """Check if a layer is a normalization layer. + + Args: + layer (nn.Module): The layer to be checked. + exclude (type, tuple[type], optional): Types to be excluded. + + Returns: + bool: Whether the layer is a norm layer.
+ """ + if exclude is not None: + if not isinstance(exclude, tuple): + exclude = (exclude, ) + if not is_tuple_of(exclude, type): + raise TypeError( + f'"exclude" must be either None or type or a tuple of types, ' + f'but got {type(exclude)}: {exclude}') + + if exclude and isinstance(layer, exclude): + return False + + all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm) + return isinstance(layer, all_norm_bases) + + +def tensor2imgs(tensor: torch.Tensor, + mean: Optional[Tuple[float, float, float]] = None, + std: Optional[Tuple[float, float, float]] = None, + to_bgr: bool = True): + """Convert tensor to 3-channel images or 1-channel gray images. + + Args: + tensor (torch.Tensor): Tensor that contains multiple images, shape ( + N, C, H, W). :math:`C` can be either 3 or 1. If C is 3, the format + should be RGB. + mean (tuple[float], optional): Mean of images. If None, + (0, 0, 0) will be used for tensor with 3-channel, + while (0, ) for tensor with 1-channel. Defaults to None. + std (tuple[float], optional): Standard deviation of images. If None, + (1, 1, 1) will be used for tensor with 3-channel, + while (1, ) for tensor with 1-channel. Defaults to None. + to_bgr (bool): For the tensor with 3 channel, convert its format to + BGR. For the tensor with 1 channel, it must be False. Defaults to + True. + + Returns: + list[np.ndarray]: A list that contains multiple images. + """ + + assert torch.is_tensor(tensor) and tensor.ndim == 4 + channels = tensor.size(1) + assert channels in [1, 3] + if mean is None: + mean = (0, ) * channels + if std is None: + std = (1, ) * channels + assert (channels == len(mean) == len(std) == 3) or \ + (channels == len(mean) == len(std) == 1 and not to_bgr) + mean = tensor.new_tensor(mean).view(1, -1) + std = tensor.new_tensor(std).view(1, -1) + tensor = tensor.permute(0, 2, 3, 1) * std + mean + imgs = tensor.detach().cpu().numpy() + if to_bgr and channels == 3: + imgs = imgs[:, :, :, (2, 1, 0)] # RGB2BGR + imgs = [np.ascontiguousarray(img) for img in imgs] + return imgs + + +def has_batch_norm(model: nn.Module) -> bool: + """Detect whether model has a BatchNormalization layer. + + Args: + model (nn.Module): training model. + + Returns: + bool: whether model has a BatchNormalization layer + """ + if isinstance(model, _BatchNorm): + return True + for m in model.children(): + if has_batch_norm(m): + return True + return False + + +def mmcv_full_available() -> bool: + """Check whether mmcv-full is installed. + + Returns: + bool: True if mmcv-full is installed else False. + """ + try: + import mmcv # noqa: F401 + except ImportError: + return False + ext_loader = pkgutil.find_loader('mmcv._ext') + return ext_loader is not None diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/parrots_wrapper.py b/head_extractor/build/lib/mmengine/utils/dl_utils/parrots_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..9bd8e5443acbb8bc168e0599d41860f09882fcd6 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/parrots_wrapper.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from functools import partial +from typing import Optional + +import torch + +TORCH_VERSION = torch.__version__ + + +def is_rocm_pytorch() -> bool: + """Check whether the PyTorch is compiled on ROCm.""" + is_rocm = False + if TORCH_VERSION != 'parrots': + try: + from torch.utils.cpp_extension import ROCM_HOME + is_rocm = True if ((torch.version.hip is not None) and + (ROCM_HOME is not None)) else False + except ImportError: + pass + return is_rocm + + +def _get_cuda_home() -> Optional[str]: + """Obtain the path of CUDA home.""" + if TORCH_VERSION == 'parrots': + from parrots.utils.build_extension import CUDA_HOME + else: + if is_rocm_pytorch(): + from torch.utils.cpp_extension import ROCM_HOME + CUDA_HOME = ROCM_HOME + else: + from torch.utils.cpp_extension import CUDA_HOME + return CUDA_HOME + + +def get_build_config(): + """Obtain the build information of PyTorch or Parrots.""" + if TORCH_VERSION == 'parrots': + from parrots.config import get_build_info + return get_build_info() + else: + return torch.__config__.show() + + +def _get_conv() -> tuple: + """A wrapper to obtain base classes of Conv layers from PyTorch or + Parrots.""" + if TORCH_VERSION == 'parrots': + from parrots.nn.modules.conv import _ConvNd, _ConvTransposeMixin + else: + from torch.nn.modules.conv import _ConvNd, _ConvTransposeMixin + return _ConvNd, _ConvTransposeMixin + + +def _get_dataloader() -> tuple: + """A wrapper to obtain DataLoader class from PyTorch or Parrots.""" + if TORCH_VERSION == 'parrots': + from torch.utils.data import DataLoader, PoolDataLoader + else: + from torch.utils.data import DataLoader + PoolDataLoader = DataLoader + return DataLoader, PoolDataLoader + + +def _get_extension(): + """A wrapper to obtain extension class from PyTorch or Parrots.""" + if TORCH_VERSION == 'parrots': + from parrots.utils.build_extension import BuildExtension, Extension + CppExtension = partial(Extension, cuda=False) + CUDAExtension = partial(Extension, cuda=True) + else: + from torch.utils.cpp_extension import (BuildExtension, CppExtension, + CUDAExtension) + return BuildExtension, CppExtension, CUDAExtension + + +def _get_pool() -> tuple: + """A wrapper to obtain base classes of pooling layers from PyTorch or + Parrots.""" + if TORCH_VERSION == 'parrots': + from parrots.nn.modules.pool import (_AdaptiveAvgPoolNd, + _AdaptiveMaxPoolNd, _AvgPoolNd, + _MaxPoolNd) + else: + from torch.nn.modules.pooling import (_AdaptiveAvgPoolNd, + _AdaptiveMaxPoolNd, _AvgPoolNd, + _MaxPoolNd) + return _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd + + +def _get_norm() -> tuple: + """A wrapper to obtain base classes of normalization layers from PyTorch or + Parrots.""" + if TORCH_VERSION == 'parrots': + from parrots.nn.modules.batchnorm import _BatchNorm, _InstanceNorm + SyncBatchNorm_ = torch.nn.SyncBatchNorm2d + else: + from torch.nn.modules.batchnorm import _BatchNorm + from torch.nn.modules.instancenorm import _InstanceNorm + SyncBatchNorm_ = torch.nn.SyncBatchNorm + return _BatchNorm, _InstanceNorm, SyncBatchNorm_ + + +_ConvNd, _ConvTransposeMixin = _get_conv() +DataLoader, PoolDataLoader = _get_dataloader() +_BatchNorm, _InstanceNorm, SyncBatchNorm_ = _get_norm() +_AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd = _get_pool() + + +class SyncBatchNorm(SyncBatchNorm_): # type: ignore + + def _check_input_dim(self, input): + if TORCH_VERSION == 'parrots': + if input.dim() < 2: + raise ValueError( + f'expected at least 2D input (got {input.dim()}D input)') + else: + super()._check_input_dim(input) diff 
--git a/head_extractor/build/lib/mmengine/utils/dl_utils/setup_env.py b/head_extractor/build/lib/mmengine/utils/dl_utils/setup_env.py new file mode 100644 index 0000000000000000000000000000000000000000..8c23a56a1342b6dc4312e6e98ef1d356e84faef4 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/setup_env.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import platform +import warnings + +import torch.multiprocessing as mp + + +def set_multi_processing(mp_start_method: str = 'fork', + opencv_num_threads: int = 0, + distributed: bool = False) -> None: + """Set multi-processing related environment. + + Args: + mp_start_method (str): Set the method which should be used to start + child processes. Defaults to 'fork'. + opencv_num_threads (int): Number of threads for opencv. + Defaults to 0. + distributed (bool): True if distributed environment. + Defaults to False. + """ + # set multi-process start method as `fork` to speed up the training + if platform.system() != 'Windows': + current_method = mp.get_start_method(allow_none=True) + if (current_method is not None and current_method != mp_start_method): + warnings.warn( + f'Multi-processing start method `{mp_start_method}` is ' + f'different from the previous setting `{current_method}`.' + f'It will be force set to `{mp_start_method}`. You can ' + 'change this behavior by changing `mp_start_method` in ' + 'your config.') + mp.set_start_method(mp_start_method, force=True) + + try: + import cv2 + + # disable opencv multithreading to avoid system being overloaded + cv2.setNumThreads(opencv_num_threads) + except ImportError: + pass + + # setup OMP threads + # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa + if 'OMP_NUM_THREADS' not in os.environ and distributed: + omp_num_threads = 1 + warnings.warn( + 'Setting OMP_NUM_THREADS environment variable for each process' + f' to be {omp_num_threads} in default, to avoid your system ' + 'being overloaded, please further tune the variable for ' + 'optimal performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and distributed: + mkl_num_threads = 1 + warnings.warn( + 'Setting MKL_NUM_THREADS environment variable for each process' + f' to be {mkl_num_threads} in default, to avoid your system ' + 'being overloaded, please further tune the variable for ' + 'optimal performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/time_counter.py b/head_extractor/build/lib/mmengine/utils/dl_utils/time_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..e4a155dd725b2cb453553eb33752fc816e933b39 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/time_counter.py @@ -0,0 +1,140 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time +from typing import Optional, Union + +import torch + +from mmengine.device import is_cuda_available, is_musa_available +from mmengine.dist.utils import master_only +from mmengine.logging import MMLogger, print_log + + +class TimeCounter: + """A tool that counts the average running time of a function or a method. + Users can use it as a decorator or context manager to calculate the average + running time of code blocks. + + Args: + log_interval (int): The interval of logging. Defaults to 1. 
+ warmup_interval (int): The interval of warmup. Defaults to 1. + with_sync (bool): Whether to synchronize cuda. Defaults to True. + tag (str, optional): Function tag. Used to distinguish between + different functions or methods being called. Defaults to None. + logger (MMLogger, optional): Formatted logger used to record messages. + Defaults to None. + + Examples: + >>> import time + >>> from mmengine.utils.dl_utils import TimeCounter + >>> @TimeCounter() + ... def fun1(): + ... time.sleep(0.1) + ... fun1() + [fun1]-time per run averaged in the past 1 runs: 100.0 ms + + >>> @TimeCounter(log_interval=2, tag='fun') + ... def fun2(): + ... time.sleep(0.2) + >>> for _ in range(3): + ... fun2() + [fun]-time per run averaged in the past 2 runs: 200.0 ms + + >>> with TimeCounter(tag='fun3'): + ... time.sleep(0.3) + [fun3]-time per run averaged in the past 1 runs: 300.0 ms + """ + + instance_dict: dict = dict() + + log_interval: int + warmup_interval: int + logger: Optional[MMLogger] + __count: int + __pure_inf_time: float + + def __new__(cls, + log_interval: int = 1, + warmup_interval: int = 1, + with_sync: bool = True, + tag: Optional[str] = None, + logger: Optional[MMLogger] = None): + assert warmup_interval >= 1 + if tag is not None and tag in cls.instance_dict: + return cls.instance_dict[tag] + + instance = super().__new__(cls) + cls.instance_dict[tag] = instance + + instance.log_interval = log_interval + instance.warmup_interval = warmup_interval + instance.with_sync = with_sync + instance.tag = tag + instance.logger = logger + + instance.__count = 0 + instance.__pure_inf_time = 0. + instance.__start_time = 0. + + return instance + + @master_only + def __call__(self, fn): + if self.tag is None: + self.tag = fn.__name__ + + def wrapper(*args, **kwargs): + self.__count += 1 + + if self.with_sync: + if is_cuda_available(): + torch.cuda.synchronize() + elif is_musa_available(): + torch.musa.synchronize() + start_time = time.perf_counter() + + result = fn(*args, **kwargs) + + if self.with_sync: + if is_cuda_available(): + torch.cuda.synchronize() + elif is_musa_available(): + torch.musa.synchronize() + elapsed = time.perf_counter() - start_time + self.print_time(elapsed) + + return result + + return wrapper + + @master_only + def __enter__(self): + assert self.tag is not None, 'In order to clearly distinguish ' \ + 'printing information in different ' \ + 'contexts, please specify the ' \ + 'tag parameter' + + self.__count += 1 + + if self.with_sync and torch.cuda.is_available(): + torch.cuda.synchronize() + self.__start_time = time.perf_counter() + + @master_only + def __exit__(self, exc_type, exc_val, exc_tb): + if self.with_sync and torch.cuda.is_available(): + torch.cuda.synchronize() + elapsed = time.perf_counter() - self.__start_time + self.print_time(elapsed) + + def print_time(self, elapsed: Union[int, float]) -> None: + """Print times per count.""" + if self.__count >= self.warmup_interval: + self.__pure_inf_time += elapsed + + if self.__count % self.log_interval == 0: + times_per_count = 1000 * self.__pure_inf_time / ( + self.__count - self.warmup_interval + 1) + print_log( + f'[{self.tag}]-time per run averaged in the past ' + f'{self.__count} runs: {times_per_count:.1f} ms', + self.logger) diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/torch_ops.py b/head_extractor/build/lib/mmengine/utils/dl_utils/torch_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..2550ae6986e0fcfe7627b96eb575f26ef601c935 --- /dev/null +++
b/head_extractor/build/lib/mmengine/utils/dl_utils/torch_ops.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from ..version_utils import digit_version +from .parrots_wrapper import TORCH_VERSION + +_torch_version_meshgrid_indexing = ( + 'parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.10.0a0')) + + +def torch_meshgrid(*tensors): + """A wrapper of torch.meshgrid to be compatible with different PyTorch + versions. + + Since PyTorch 1.10.0a0, torch.meshgrid supports the ``indexing`` argument. + So we implement a wrapper here to avoid the warning when using a + high-version PyTorch and avoid compatibility issues with previous versions + of PyTorch. + + Args: + tensors (List[Tensor]): List of scalars or 1 dimensional tensors. + + Returns: + Sequence[Tensor]: Sequence of meshgrid tensors. + """ + if _torch_version_meshgrid_indexing: + return torch.meshgrid(*tensors, indexing='ij') + else: + return torch.meshgrid(*tensors) # Uses indexing='ij' by default diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/trace.py b/head_extractor/build/lib/mmengine/utils/dl_utils/trace.py new file mode 100644 index 0000000000000000000000000000000000000000..c12bebf5d12ef7b26f1361a7e54fc120364db469 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/trace.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch + +from ..version_utils import digit_version + + +def is_jit_tracing() -> bool: + if (torch.__version__ != 'parrots' + and digit_version(torch.__version__) >= digit_version('1.6.0')): + on_trace = torch.jit.is_tracing() + # In PyTorch 1.6, torch.jit.is_tracing has a bug. + # Refer to https://github.com/pytorch/pytorch/issues/42448 + if isinstance(on_trace, bool): + return on_trace + else: + return torch._C._is_tracing() + else: + warnings.warn( + 'torch.jit.is_tracing is only supported after v1.6.0. ' + 'Therefore is_tracing returns False automatically. Please ' + 'set on_trace manually if you are using trace.', UserWarning) + return False diff --git a/head_extractor/build/lib/mmengine/utils/dl_utils/visualize.py b/head_extractor/build/lib/mmengine/utils/dl_utils/visualize.py new file mode 100644 index 0000000000000000000000000000000000000000..f3361e1d50a4dafb8518d6bbd66f9131b441bd80 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/dl_utils/visualize.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved.
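+# This module patches optimizer updates and the runtime-info hook so that +# `fake_run` can drive a full `Runner` loop cheaply, just to exercise a +# visualization backend. A sketch of invoking it (the config path is +# hypothetical, and `fake_run` mutates the config it receives): +# +# from mmengine.config import Config +# cfg = Config.fromfile('configs/toy_train_cfg.py')  # hypothetical file +# fake_run(cfg)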
+from unittest.mock import patch + +import torch +import torch.nn as nn + +from mmengine.model import BaseModel +from mmengine.registry import MODELS + + +@MODELS.register_module() +class ToyModel(BaseModel): + + def __init__(self, *args, **kwargs): + super().__init__() + self.conv = nn.Conv2d(1, 1, 1) + + def forward(self, *args, **kwargs): + return {'loss': torch.tensor(0.0)} + + +def update_params_step(self, loss): + pass + + +def runtimeinfo_step(self, runner, batch_idx, data_batch=None): + runner.message_hub.update_info('iter', runner.iter) + lr_dict = runner.optim_wrapper.get_lr() + for name, lr in lr_dict.items(): + runner.message_hub.update_scalar(f'train/{name}', lr[0]) + + momentum_dict = runner.optim_wrapper.get_momentum() + for name, momentum in momentum_dict.items(): + runner.message_hub.update_scalar(f'train/{name}', momentum[0]) + + +@patch('mmengine.optim.optimizer.OptimWrapper.update_params', + update_params_step) +@patch('mmengine.hooks.RuntimeInfoHook.before_train_iter', runtimeinfo_step) +def fake_run(cfg): + from mmengine.runner import Runner + cfg.pop('model') + cfg.pop('visualizer') + cfg.pop('val_dataloader') + cfg.pop('val_evaluator') + cfg.pop('val_cfg') + cfg.pop('test_dataloader') + cfg.pop('test_evaluator') + cfg.pop('test_cfg') + extra_cfg = dict( + model=dict(type='ToyModel'), + visualizer=dict( + type='Visualizer', + vis_backends=[ + dict(type='TensorboardVisBackend', save_dir='temp_dir') + ]), + ) + cfg.merge_from_dict(extra_cfg) + # build the runner from config + runner = Runner.from_cfg(cfg) + + # start training + runner.train() diff --git a/head_extractor/build/lib/mmengine/utils/manager.py b/head_extractor/build/lib/mmengine/utils/manager.py new file mode 100644 index 0000000000000000000000000000000000000000..70b45f2d8e94b192d070a596f7c3a132097ccc36 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/manager.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import threading +import warnings +from collections import OrderedDict +from typing import Type, TypeVar + +_lock = threading.RLock() +T = TypeVar('T') + + +def _acquire_lock() -> None: + """Acquire the module-level lock for serializing access to shared data. + + This should be released with _release_lock(). + """ + if _lock: + _lock.acquire() + + +def _release_lock() -> None: + """Release the module-level lock acquired by calling _acquire_lock().""" + if _lock: + _lock.release() + + +class ManagerMeta(type): + """The metaclass for globally accessible classes. + + The subclasses inheriting from ``ManagerMeta`` will manage their + own ``_instance_dict`` and root instances. The constructors of subclasses + must contain the ``name`` argument. + + Examples: + >>> class SubClass1(metaclass=ManagerMeta): + >>> def __init__(self, *args, **kwargs): + >>> pass + AssertionError: <class '__main__.SubClass1'>.__init__ must have the + name argument. + >>> class SubClass2(metaclass=ManagerMeta): + >>> def __init__(self, name): + >>> pass + >>> # valid format. + """ + + def __init__(cls, *args): + cls._instance_dict = OrderedDict() + params = inspect.getfullargspec(cls) + params_names = params[0] if params[0] else [] + assert 'name' in params_names, f'{cls} must have the `name` argument' + super().__init__(*args) + + +class ManagerMixin(metaclass=ManagerMeta): + """``ManagerMixin`` is the base class for classes that have global access + requirements. + + The subclasses inheriting from ``ManagerMixin`` can get their + global instances.
+ + Examples: + >>> class GlobalAccessible(ManagerMixin): + >>> def __init__(self, name=''): + >>> super().__init__(name) + >>> + >>> GlobalAccessible.get_instance('name') + >>> instance_1 = GlobalAccessible.get_instance('name') + >>> instance_2 = GlobalAccessible.get_instance('name') + >>> assert id(instance_1) == id(instance_2) + + Args: + name (str): Name of the instance. Defaults to ''. + """ + + def __init__(self, name: str = '', **kwargs): + assert isinstance(name, str) and name, \ + 'name argument must be a non-empty string.' + self._instance_name = name + + @classmethod + def get_instance(cls: Type[T], name: str, **kwargs) -> T: + """Get subclass instance by name if the name exists. + + If corresponding name instance has not been created, ``get_instance`` + will create an instance, otherwise ``get_instance`` will return the + corresponding instance. + + Examples: + >>> instance1 = GlobalAccessible.get_instance('name1') + >>> # Create name1 instance. + >>> instance1.instance_name + name1 + >>> instance2 = GlobalAccessible.get_instance('name1') + >>> # Get name1 instance. + >>> assert id(instance1) == id(instance2) + + Args: + name (str): Name of instance. + + Returns: + object: Corresponding name instance, the latest instance, or root + instance. + """ + _acquire_lock() + assert isinstance(name, str), \ + f'type of name should be str, but got {type(name)}' + instance_dict = cls._instance_dict # type: ignore + # Get the instance by name. + if name not in instance_dict: + instance = cls(name=name, **kwargs) # type: ignore + instance_dict[name] = instance # type: ignore + elif kwargs: + warnings.warn( + f'{cls} instance named {name} has been created, ' + 'the method `get_instance` should not accept any other ' + 'arguments') + # Get latest instantiated instance or root instance. + _release_lock() + return instance_dict[name] + + @classmethod + def get_current_instance(cls): + """Get latest created instance. + + Before calling ``get_current_instance``, the subclass must have called + ``get_instance(xxx)`` at least once. + + Examples: + >>> instance = GlobalAccessible.get_current_instance() + RuntimeError: Before calling GlobalAccessible.get_current_instance(), you should call get_instance(name=xxx) at least once. + >>> instance = GlobalAccessible.get_instance('name1') + >>> instance.instance_name + name1 + >>> instance = GlobalAccessible.get_current_instance() + >>> instance.instance_name + name1 + + Returns: + object: Latest created instance. + """ + _acquire_lock() + if not cls._instance_dict: + raise RuntimeError( + f'Before calling {cls.__name__}.get_current_instance(), you ' + 'should call get_instance(name=xxx) at least once.') + name = next(iter(reversed(cls._instance_dict))) + _release_lock() + return cls._instance_dict[name] + + @classmethod + def check_instance_created(cls, name: str) -> bool: + """Check whether the name corresponding instance exists. + + Args: + name (str): Name of instance. + + Returns: + bool: Whether the name corresponding instance exists. + """ + return name in cls._instance_dict + + @property + def instance_name(self) -> str: + """Get the name of instance. + + Returns: + str: Name of instance. + """ + return self._instance_name diff --git a/head_extractor/build/lib/mmengine/utils/misc.py b/head_extractor/build/lib/mmengine/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..15c1f89faecbe69a8ad61c1d376193580a72243d --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/misc.py @@ -0,0 +1,543 @@ +# Copyright (c) OpenMMLab. All rights reserved.
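+# A few illustrative calls for the helpers defined below (values are +# arbitrary; each result follows directly from the implementations): +# +# to_2tuple(3)                       # -> (3, 3) +# is_seq_of(['a', 'b'], str)         # -> True +# slice_list([1, 2, 3, 4], [1, 3])   # -> [[1], [2, 3, 4]] +# concat_list([[1, 2], [3]])         # -> [1, 2, 3]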
+import collections.abc +import functools +import itertools +import logging +import re +import subprocess +import textwrap +import warnings +from collections import abc +from importlib import import_module +from inspect import getfullargspec, ismodule +from itertools import repeat +from typing import Any, Callable, Optional, Type, Union + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +def is_str(x): + """Whether the input is a string instance. + + Note: This method is deprecated since python 2 is no longer supported. + """ + return isinstance(x, str) + + +def import_modules_from_strings(imports, allow_failed_imports=False): + """Import modules from the given list of strings. + + Args: + imports (list | str | None): The given module names to be imported. + allow_failed_imports (bool): If True, the failed imports will return + None. Otherwise, an ImportError is raised. Defaults to False. + + Returns: + list[module] | module | None: The imported modules. + + Examples: + >>> osp, sys = import_modules_from_strings( + ... ['os.path', 'sys']) + >>> import os.path as osp_ + >>> import sys as sys_ + >>> assert osp == osp_ + >>> assert sys == sys_ + """ + if not imports: + return + single_import = False + if isinstance(imports, str): + single_import = True + imports = [imports] + if not isinstance(imports, list): + raise TypeError( + f'custom_imports must be a list but got type {type(imports)}') + imported = [] + for imp in imports: + if not isinstance(imp, str): + raise TypeError( + f'{imp} is of type {type(imp)} and cannot be imported.') + try: + imported_tmp = import_module(imp) + except ImportError: + if allow_failed_imports: + warnings.warn(f'{imp} failed to import and is ignored.', + UserWarning) + imported_tmp = None + else: + raise ImportError(f'Failed to import {imp}') + imported.append(imported_tmp) + if single_import: + imported = imported[0] + return imported + + +def iter_cast(inputs, dst_type, return_type=None): + """Cast elements of an iterable object into some type. + + Args: + inputs (Iterable): The input object. + dst_type (type): Destination type. + return_type (type, optional): If specified, the output object will be + converted to this type, otherwise an iterator. + + Returns: + iterator or specified type: The converted object. + """ + if not isinstance(inputs, abc.Iterable): + raise TypeError('inputs must be an iterable object') + if not isinstance(dst_type, type): + raise TypeError('"dst_type" must be a valid type') + + out_iterable = map(dst_type, inputs) + + if return_type is None: + return out_iterable + else: + return return_type(out_iterable) + + +def list_cast(inputs, dst_type): + """Cast elements of an iterable object into a list of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=list) + + +def tuple_cast(inputs, dst_type): + """Cast elements of an iterable object into a tuple of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=tuple) + + +def is_seq_of(seq: Any, + expected_type: Union[Type, tuple], + seq_type: Type = None) -> bool: + """Check whether it is a sequence of some type. + + Args: + seq (Sequence): The sequence to be checked. + expected_type (type or tuple): Expected type of sequence items.
+ seq_type (type, optional): Expected sequence type. Defaults to None. + + Returns: + bool: Return True if ``seq`` is valid else False. + + Examples: + >>> from mmengine.utils import is_seq_of + >>> seq = ['a', 'b', 'c'] + >>> is_seq_of(seq, str) + True + >>> is_seq_of(seq, int) + False + """ + if seq_type is None: + exp_seq_type = abc.Sequence + else: + assert isinstance(seq_type, type) + exp_seq_type = seq_type + if not isinstance(seq, exp_seq_type): + return False + for item in seq: + if not isinstance(item, expected_type): + return False + return True + + +def is_list_of(seq, expected_type): + """Check whether it is a list of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=list) + + +def is_tuple_of(seq, expected_type): + """Check whether it is a tuple of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=tuple) + + +def slice_list(in_list, lens): + """Slice a list into several sub lists by a list of given length. + + Args: + in_list (list): The list to be sliced. + lens(int or list): The expected length of each out list. + + Returns: + list: A list of sliced list. + """ + if isinstance(lens, int): + assert len(in_list) % lens == 0 + lens = [lens] * int(len(in_list) / lens) + if not isinstance(lens, list): + raise TypeError('"indices" must be an integer or a list of integers') + elif sum(lens) != len(in_list): + raise ValueError('sum of lens and list length does not ' + f'match: {sum(lens)} != {len(in_list)}') + out_list = [] + idx = 0 + for i in range(len(lens)): + out_list.append(in_list[idx:idx + lens[i]]) + idx += lens[i] + return out_list + + +def concat_list(in_list): + """Concatenate a list of list into a single list. + + Args: + in_list (list): The list of list to be merged. + + Returns: + list: The concatenated flat list. + """ + return list(itertools.chain(*in_list)) + + +def apply_to(data: Any, expr: Callable, apply_func: Callable): + """Apply function to each element in dict, list or tuple that matches with + the expression. + + For examples, if you want to convert each element in a list of dict from + `np.ndarray` to `Tensor`. You can use the following code: + + Examples: + >>> from mmengine.utils import apply_to + >>> import numpy as np + >>> import torch + >>> data = dict(array=[np.array(1)]) # {'array': [array(1)]} + >>> result = apply_to(data, lambda x: isinstance(x, np.ndarray), lambda x: torch.from_numpy(x)) + >>> print(result) # {'array': [tensor(1)]} + + Args: + data (Any): Data to be applied. + expr (Callable): Expression to tell which data should be applied with + the function. It should return a boolean. + apply_func (Callable): Function applied to data. + + Returns: + Any: The data after applying. 
+ """ # noqa: E501 + if isinstance(data, dict): + # Keep the original dict type + res = type(data)() + for key, value in data.items(): + res[key] = apply_to(value, expr, apply_func) + return res + elif isinstance(data, tuple) and hasattr(data, '_fields'): + # namedtuple + return type(data)(*(apply_to(sample, expr, apply_func) for sample in data)) # type: ignore # noqa: E501 # yapf:disable + elif isinstance(data, (tuple, list)): + return type(data)(apply_to(sample, expr, apply_func) for sample in data) # type: ignore # noqa: E501 # yapf:disable + elif expr(data): + return apply_func(data) + else: + return data + + +def check_prerequisites( + prerequisites, + checker, + msg_tmpl='Prerequisites "{}" are required in method "{}" but not ' + 'found, please install them first.'): # yapf: disable + """A decorator factory to check if prerequisites are satisfied. + + Args: + prerequisites (str of list[str]): Prerequisites to be checked. + checker (callable): The checker method that returns True if a + prerequisite is meet, False otherwise. + msg_tmpl (str): The message template with two variables. + + Returns: + decorator: A specific decorator. + """ + + def wrap(func): + + @functools.wraps(func) + def wrapped_func(*args, **kwargs): + requirements = [prerequisites] if isinstance( + prerequisites, str) else prerequisites + missing = [] + for item in requirements: + if not checker(item): + missing.append(item) + if missing: + print(msg_tmpl.format(', '.join(missing), func.__name__)) + raise RuntimeError('Prerequisites not meet.') + else: + return func(*args, **kwargs) + + return wrapped_func + + return wrap + + +def _check_py_package(package): + try: + import_module(package) + except ImportError: + return False + else: + return True + + +def _check_executable(cmd): + if subprocess.call(f'which {cmd}', shell=True) != 0: + return False + else: + return True + + +def requires_package(prerequisites): + """A decorator to check if some python packages are installed. + + Example: + >>> @requires_package('numpy') + >>> func(arg1, args): + >>> return numpy.zeros(1) + array([0.]) + >>> @requires_package(['numpy', 'non_package']) + >>> func(arg1, args): + >>> return numpy.zeros(1) + ImportError + """ + return check_prerequisites(prerequisites, checker=_check_py_package) + + +def requires_executable(prerequisites): + """A decorator to check if some executable files are installed. + + Example: + >>> @requires_executable('ffmpeg') + >>> func(arg1, args): + >>> print(1) + 1 + """ + return check_prerequisites(prerequisites, checker=_check_executable) + + +def deprecated_api_warning(name_dict: dict, + cls_name: Optional[str] = None) -> Callable: + """A decorator to check if some arguments are deprecate and try to replace + deprecate src_arg_name to dst_arg_name. + + Args: + name_dict(dict): + key (str): Deprecate argument names. + val (str): Expected argument names. + + Returns: + func: New function. 
+ """ + + def api_warning_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get name of the function + func_name = old_func.__name__ + if cls_name is not None: + func_name = f'{cls_name}.{func_name}' + if args: + arg_names = args_info.args[:len(args)] + for src_arg_name, dst_arg_name in name_dict.items(): + if src_arg_name in arg_names: + warnings.warn( + f'"{src_arg_name}" is deprecated in ' + f'`{func_name}`, please use "{dst_arg_name}" ' + 'instead', DeprecationWarning) + arg_names[arg_names.index(src_arg_name)] = dst_arg_name + if kwargs: + for src_arg_name, dst_arg_name in name_dict.items(): + if src_arg_name in kwargs: + assert dst_arg_name not in kwargs, ( + f'The expected behavior is to replace ' + f'the deprecated key `{src_arg_name}` to ' + f'new key `{dst_arg_name}`, but got them ' + f'in the arguments at the same time, which ' + f'is confusing. `{src_arg_name} will be ' + f'deprecated in the future, please ' + f'use `{dst_arg_name}` instead.') + + warnings.warn( + f'"{src_arg_name}" is deprecated in ' + f'`{func_name}`, please use "{dst_arg_name}" ' + 'instead', DeprecationWarning) + kwargs[dst_arg_name] = kwargs.pop(src_arg_name) + + # apply converted arguments to the decorated method + output = old_func(*args, **kwargs) + return output + + return new_func + + return api_warning_wrapper + + +def is_method_overridden(method: str, base_class: type, + derived_class: Union[type, Any]) -> bool: + """Check if a method of base class is overridden in derived class. + + Args: + method (str): the method name to check. + base_class (type): the class of the base class. + derived_class (type | Any): the class or instance of the derived class. + """ + assert isinstance(base_class, type), \ + "base_class doesn't accept instance, Please pass class instead." + + if not isinstance(derived_class, type): + derived_class = derived_class.__class__ + + base_method = getattr(base_class, method) + derived_method = getattr(derived_class, method) + return derived_method != base_method + + +def has_method(obj: object, method: str) -> bool: + """Check whether the object has a method. + + Args: + method (str): The method name to check. + obj (object): The object to check. + + Returns: + bool: True if the object has the method else False. + """ + return hasattr(obj, method) and callable(getattr(obj, method)) + + +def deprecated_function(since: str, removed_in: str, + instructions: str) -> Callable: + """Marks functions as deprecated. + + Throw a warning when a deprecated function is called, and add a note in the + docstring. Modified from https://github.com/pytorch/pytorch/blob/master/torch/onnx/_deprecation.py + + Args: + since (str): The version when the function was first deprecated. + removed_in (str): The version when the function will be removed. + instructions (str): The action users should take. + + Returns: + Callable: A new function, which will be deprecated soon. + """ # noqa: E501 + from mmengine import print_log + + def decorator(function): + + @functools.wraps(function) + def wrapper(*args, **kwargs): + print_log( + f"'{function.__module__}.{function.__name__}' " + f'is deprecated in version {since} and will be ' + f'removed in version {removed_in}. Please {instructions}.', + logger='current', + level=logging.WARNING, + ) + return function(*args, **kwargs) + + indent = ' ' + # Add a deprecation note to the docstring. 
+ docstring = function.__doc__ or '' + # Add a note to the docstring. + deprecation_note = textwrap.dedent(f"""\ + .. deprecated:: {since} + Deprecated and will be removed in version {removed_in}. + Please {instructions}. + """) + # Split docstring at first occurrence of newline + pattern = '\n\n' + summary_and_body = re.split(pattern, docstring, 1) + + if len(summary_and_body) > 1: + summary, body = summary_and_body + body = textwrap.indent(textwrap.dedent(body), indent) + summary = '\n'.join( + [textwrap.dedent(string) for string in summary.split('\n')]) + summary = textwrap.indent(summary, prefix=indent) + # Dedent the body. We cannot do this with the presence of the + # summary because the body contains leading whitespaces when the + # summary does not. + new_docstring_parts = [ + deprecation_note, '\n\n', summary, '\n\n', body + ] + else: + summary = summary_and_body[0] + summary = '\n'.join( + [textwrap.dedent(string) for string in summary.split('\n')]) + summary = textwrap.indent(summary, prefix=indent) + new_docstring_parts = [deprecation_note, '\n\n', summary] + + wrapper.__doc__ = ''.join(new_docstring_parts) + + return wrapper + + return decorator + + +def get_object_from_string(obj_name: str): + """Get object from name. + + Args: + obj_name (str): The name of the object. + + Examples: + >>> get_object_from_string('torch.optim.sgd.SGD') + >>> torch.optim.sgd.SGD + """ + parts = iter(obj_name.split('.')) + module_name = next(parts) + # import module + while True: + try: + module = import_module(module_name) + part = next(parts) + # mmcv.ops has nms.py and nms function at the same time. So the + # function will have a higher priority + obj = getattr(module, part, None) + if obj is not None and not ismodule(obj): + break + module_name = f'{module_name}.{part}' + except StopIteration: + # if obj is a module + return module + except ImportError: + return None + + # get class or attribute from module + obj = module + while True: + try: + obj = getattr(obj, part) + part = next(parts) + except StopIteration: + return obj + except AttributeError: + return None diff --git a/head_extractor/build/lib/mmengine/utils/package_utils.py b/head_extractor/build/lib/mmengine/utils/package_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b224625f1334f8d2952a5ffb7dd0bce4c2bd53cf --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/package_utils.py @@ -0,0 +1,103 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import subprocess + + +def is_installed(package: str) -> bool: + """Check package whether installed. + + Args: + package (str): Name of package to be checked. + """ + # When executing `import mmengine.runner`, + # pkg_resources will be imported and it takes too much time. + # Therefore, import it in function scope to save time. + import importlib.util + + import pkg_resources + from pkg_resources import get_distribution + + # refresh the pkg_resources + # more datails at https://github.com/pypa/setuptools/issues/373 + importlib.reload(pkg_resources) + try: + get_distribution(package) + return True + except pkg_resources.DistributionNotFound: + spec = importlib.util.find_spec(package) + if spec is None: + return False + elif spec.origin is not None: + return True + else: + return False + + +def get_installed_path(package: str) -> str: + """Get installed path of package. + + Args: + package (str): Name of package. 
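+
+    Returns:
+        str: The installed path of the package.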
+ + Example: + >>> get_installed_path('mmcls') + >>> '.../lib/python3.7/site-packages/mmcls' + """ + import importlib.util + + from pkg_resources import DistributionNotFound, get_distribution + + # if the package name is not the same as module name, module name should be + # inferred. For example, mmcv-full is the package name, but mmcv is module + # name. If we want to get the installed path of mmcv-full, we should concat + # the pkg.location and module name + try: + pkg = get_distribution(package) + except DistributionNotFound as e: + # if the package is not installed, package path set in PYTHONPATH + # can be detected by `find_spec` + spec = importlib.util.find_spec(package) + if spec is not None: + if spec.origin is not None: + return osp.dirname(spec.origin) + else: + # `get_installed_path` cannot get the installed path of + # namespace packages + raise RuntimeError( + f'{package} is a namespace package, which is invalid ' + 'for `get_install_path`') + else: + raise e + + possible_path = osp.join(pkg.location, package) + if osp.exists(possible_path): + return possible_path + else: + return osp.join(pkg.location, package2module(package)) + + +def package2module(package: str): + """Infer module name from package. + + Args: + package (str): Package to infer module name. + """ + from pkg_resources import get_distribution + pkg = get_distribution(package) + if pkg.has_metadata('top_level.txt'): + module_name = pkg.get_metadata('top_level.txt').split('\n')[0] + return module_name + else: + raise ValueError(f'can not infer the module name of {package}') + + +def call_command(cmd: list) -> None: + try: + subprocess.check_call(cmd) + except Exception as e: + raise e # type: ignore + + +def install_package(package: str): + if not is_installed(package): + call_command(['python', '-m', 'pip', 'install', package]) diff --git a/head_extractor/build/lib/mmengine/utils/path.py b/head_extractor/build/lib/mmengine/utils/path.py new file mode 100644 index 0000000000000000000000000000000000000000..307d053f2fd9fddf0bb52d4e0081cdd32e5a6364 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/path.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +from pathlib import Path + +from .misc import is_str + + +def is_filepath(x): + return is_str(x) or isinstance(x, Path) + + +def fopen(filepath, *args, **kwargs): + if is_str(filepath): + return open(filepath, *args, **kwargs) + elif isinstance(filepath, Path): + return filepath.open(*args, **kwargs) + raise ValueError('`filepath` should be a string or a Path') + + +def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): + if not osp.isfile(filename): + raise FileNotFoundError(msg_tmpl.format(filename)) + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == '': + return + dir_name = osp.expanduser(dir_name) + os.makedirs(dir_name, mode=mode, exist_ok=True) + + +def symlink(src, dst, overwrite=True, **kwargs): + if os.path.lexists(dst) and overwrite: + os.remove(dst) + os.symlink(src, dst, **kwargs) + + +def scandir(dir_path, suffix=None, recursive=False, case_sensitive=True): + """Scan a directory to find the interested files. + + Args: + dir_path (str | :obj:`Path`): Path of the directory. + suffix (str | tuple(str), optional): File suffix that we are + interested in. Defaults to None. + recursive (bool, optional): If set to True, recursively scan the + directory. Defaults to False. + case_sensitive (bool, optional) : If set to False, ignore the case of + suffix. Defaults to True. 
+ + Returns: + A generator for all the interested files with relative paths. + """ + if isinstance(dir_path, (str, Path)): + dir_path = str(dir_path) + else: + raise TypeError('"dir_path" must be a string or Path object') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + + if suffix is not None and not case_sensitive: + suffix = suffix.lower() if isinstance(suffix, str) else tuple( + item.lower() for item in suffix) + + root = dir_path + + def _scandir(dir_path, suffix, recursive, case_sensitive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + rel_path = osp.relpath(entry.path, root) + _rel_path = rel_path if case_sensitive else rel_path.lower() + if suffix is None or _rel_path.endswith(suffix): + yield rel_path + elif recursive and os.path.isdir(entry.path): + # scan recursively if entry.path is a directory + yield from _scandir(entry.path, suffix, recursive, + case_sensitive) + + return _scandir(dir_path, suffix, recursive, case_sensitive) + + +def find_vcs_root(path, markers=('.git', )): + """Finds the root directory (including itself) of specified markers. + + Args: + path (str): Path of directory or file. + markers (list[str], optional): List of file or directory names. + + Returns: + The directory contained one of the markers or None if not found. + """ + if osp.isfile(path): + path = osp.dirname(path) + + prev, cur = None, osp.abspath(osp.expanduser(path)) + while cur != prev: + if any(osp.exists(osp.join(cur, marker)) for marker in markers): + return cur + prev, cur = cur, osp.split(cur)[0] + return None + + +def is_abs(path: str) -> bool: + """Check if path is an absolute path in different backends. + + Args: + path (str): path of directory or file. + + Returns: + bool: whether path is an absolute path. + """ + if osp.isabs(path) or path.startswith(('http://', 'https://', 's3://')): + return True + else: + return False diff --git a/head_extractor/build/lib/mmengine/utils/progressbar.py b/head_extractor/build/lib/mmengine/utils/progressbar.py new file mode 100644 index 0000000000000000000000000000000000000000..36172f04dd5a469387e19f16fbad24ea3ae3b7ce --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/progressbar.py @@ -0,0 +1,247 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import sys +from collections.abc import Iterable +from multiprocessing import Pool +from shutil import get_terminal_size +from typing import Callable, Sequence + +from .timer import Timer + + +class ProgressBar: + """A progress bar which can print the progress. + + Args: + task_num (int): Number of total steps. Defaults to 0. + bar_width (int): Width of the progress bar. Defaults to 50. + start (bool): Whether to start the progress bar in the constructor. + Defaults to True. + file (callable): Progress bar output mode. Defaults to "sys.stdout". 
+ + Examples: + >>> import mmengine + >>> import time + >>> bar = mmengine.ProgressBar(10) + >>> for i in range(10): + >>> bar.update() + >>> time.sleep(1) + """ + + def __init__(self, + task_num: int = 0, + bar_width: int = 50, + start: bool = True, + file=sys.stdout): + self.task_num = task_num + self.bar_width = bar_width + self.completed = 0 + self.file = file + if start: + self.start() + + @property + def terminal_width(self): + width, _ = get_terminal_size() + return width + + def start(self): + if self.task_num > 0: + self.file.write(f'[{" " * self.bar_width}] 0/{self.task_num}, ' + 'elapsed: 0s, ETA:') + else: + self.file.write('completed: 0, elapsed: 0s') + self.file.flush() + self.timer = Timer() + + def update(self, num_tasks: int = 1): + """update progressbar. + + Args: + num_tasks (int): Update step size. + """ + assert num_tasks > 0 + self.completed += num_tasks + elapsed = self.timer.since_start() + if elapsed > 0: + fps = self.completed / elapsed + else: + fps = float('inf') + if self.task_num > 0: + percentage = self.completed / float(self.task_num) + eta = int(elapsed * (1 - percentage) / percentage + 0.5) + msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \ + f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \ + f'ETA: {eta:5}s' + + bar_width = min(self.bar_width, + int(self.terminal_width - len(msg)) + 2, + int(self.terminal_width * 0.6)) + bar_width = max(2, bar_width) + mark_width = int(bar_width * percentage) + bar_chars = '>' * mark_width + ' ' * (bar_width - mark_width) + self.file.write(msg.format(bar_chars)) + else: + self.file.write( + f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,' + f' {fps:.1f} tasks/s') + self.file.flush() + + +def track_progress(func: Callable, + tasks: Sequence, + bar_width: int = 50, + file=sys.stdout, + **kwargs): + """Track the progress of tasks execution with a progress bar. + + Tasks are done with a simple for-loop. + + Args: + func (callable): The function to be applied to each task. + tasks (Sequence): If tasks is a tuple, it must contain two elements, + the first being the tasks to be completed and the other being the + number of tasks. If it is not a tuple, it represents the tasks to + be completed. + bar_width (int): Width of progress bar. + + Returns: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] # type: ignore + elif isinstance(tasks, Sequence): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be a tuple object or a sequence object, but got ' + f'{type(tasks)}') + prog_bar = ProgressBar(task_num, bar_width, file=file) + results = [] + for task in tasks: + results.append(func(task, **kwargs)) + prog_bar.update() + prog_bar.file.write('\n') + return results + + +def init_pool(process_num, initializer=None, initargs=None): + if initializer is None: + return Pool(process_num) + elif initargs is None: + return Pool(process_num, initializer) + else: + if not isinstance(initargs, tuple): + raise TypeError('"initargs" must be a tuple') + return Pool(process_num, initializer, initargs) + + +def track_parallel_progress(func: Callable, + tasks: Sequence, + nproc: int, + initializer: Callable = None, + initargs: tuple = None, + bar_width: int = 50, + chunksize: int = 1, + skip_first: bool = False, + keep_order: bool = True, + file=sys.stdout): + """Track the progress of parallel task execution with a progress bar. 
+ + The built-in :mod:`multiprocessing` module is used for process pools and + tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. + + Args: + func (callable): The function to be applied to each task. + tasks (Sequence): If tasks is a tuple, it must contain two elements, + the first being the tasks to be completed and the other being the + number of tasks. If it is not a tuple, it represents the tasks to + be completed. + nproc (int): Process (worker) number. + initializer (None or callable): Refer to :class:`multiprocessing.Pool` + for details. + initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for + details. + chunksize (int): Refer to :class:`multiprocessing.Pool` for details. + bar_width (int): Width of progress bar. + skip_first (bool): Whether to skip the first sample for each worker + when estimating fps, since the initialization step may takes + longer. + keep_order (bool): If True, :func:`Pool.imap` is used, otherwise + :func:`Pool.imap_unordered` is used. + + Returns: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] # type: ignore + elif isinstance(tasks, Sequence): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be a tuple object or a sequence object, but got ' + f'{type(tasks)}') + pool = init_pool(nproc, initializer, initargs) + start = not skip_first + task_num -= nproc * chunksize * int(skip_first) + prog_bar = ProgressBar(task_num, bar_width, start, file=file) + results = [] + if keep_order: + gen = pool.imap(func, tasks, chunksize) + else: + gen = pool.imap_unordered(func, tasks, chunksize) + for result in gen: + results.append(result) + if skip_first: + if len(results) < nproc * chunksize: + continue + elif len(results) == nproc * chunksize: + prog_bar.start() + continue + prog_bar.update() + prog_bar.file.write('\n') + pool.close() + pool.join() + return results + + +def track_iter_progress(tasks: Sequence, bar_width: int = 50, file=sys.stdout): + """Track the progress of tasks iteration or enumeration with a progress + bar. + + Tasks are yielded with a simple for-loop. + + Args: + tasks (Sequence): If tasks is a tuple, it must contain two elements, + the first being the tasks to be completed and the other being the + number of tasks. If it is not a tuple, it represents the tasks to + be completed. + bar_width (int): Width of progress bar. + + Yields: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] # type: ignore + elif isinstance(tasks, Sequence): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be a tuple object or a sequence object, but got ' + f'{type(tasks)}') + prog_bar = ProgressBar(task_num, bar_width, file=file) + for task in tasks: + yield task + prog_bar.update() + prog_bar.file.write('\n') diff --git a/head_extractor/build/lib/mmengine/utils/progressbar_rich.py b/head_extractor/build/lib/mmengine/utils/progressbar_rich.py new file mode 100644 index 0000000000000000000000000000000000000000..c126866ba9988022ef1f3ac9cb5deb78d9a963f9 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/progressbar_rich.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
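+# Rich-based counterparts to the plain-text progress helpers above;
+# ``track_progress_rich`` renders the bar with the ``rich`` library.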
+from multiprocessing import Pool
+from typing import Callable, Iterable, Sized
+
+from rich.progress import (BarColumn, MofNCompleteColumn, Progress, Task,
+                           TaskProgressColumn, TextColumn, TimeRemainingColumn)
+from rich.text import Text
+
+
+class _Worker:
+    """Function wrapper for ``track_progress_rich``"""
+
+    def __init__(self, func) -> None:
+        self.func = func
+
+    def __call__(self, inputs):
+        inputs, idx = inputs
+        if not isinstance(inputs, (tuple, list)):
+            inputs = (inputs, )
+
+        return self.func(*inputs), idx
+
+
+class _SkipFirstTimeRemainingColumn(TimeRemainingColumn):
+    """Skip calculating remaining time for the first few times.
+
+    Args:
+        skip_times (int): The number of times to skip. Defaults to 0.
+    """
+
+    def __init__(self, *args, skip_times=0, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.skip_times = skip_times
+
+    def render(self, task: Task) -> Text:
+        """Show time remaining."""
+        if task.completed <= self.skip_times:
+            return Text('-:--:--', style='progress.remaining')
+        return super().render(task)
+
+
+def _tasks_with_index(tasks):
+    """Add index to tasks."""
+    for idx, task in enumerate(tasks):
+        yield task, idx
+
+
+def track_progress_rich(func: Callable,
+                        tasks: Iterable = tuple(),
+                        task_num: int = None,
+                        nproc: int = 1,
+                        chunksize: int = 1,
+                        description: str = 'Processing',
+                        color: str = 'blue') -> list:
+    """Track the progress of parallel task execution with a progress bar. The
+    built-in :mod:`multiprocessing` module is used for process pools and tasks
+    are done with :func:`Pool.map` or :func:`Pool.imap_unordered`.
+
+    Args:
+        func (callable): The function to be applied to each task.
+        tasks (Iterable or Sized): A tuple of tasks. There are several cases
+            for different format tasks:
+            - When ``func`` accepts no arguments: tasks should be an empty
+              tuple, and ``task_num`` must be specified.
+            - When ``func`` accepts only one argument: tasks should be a tuple
+              containing the argument.
+            - When ``func`` accepts multiple arguments: tasks should be a
+              tuple, with each element representing a set of arguments.
+              If an element is a ``dict``, it will be parsed as a set of
+              keyword-only arguments.
+            Defaults to an empty tuple.
+        task_num (int, optional): If ``tasks`` is an iterator which does not
+            have length, the number of tasks can be provided by ``task_num``.
+            Defaults to None.
+        nproc (int): Process (worker) number. If nproc is 1, a single
+            process is used. Defaults to 1.
+        chunksize (int): Refer to :class:`multiprocessing.Pool` for details.
+            Defaults to 1.
+        description (str): The description of progress bar.
+            Defaults to "Processing".
+        color (str): The color of progress bar. Defaults to "blue".
+
+    Examples:
+        >>> import time
+
+        >>> def func(x):
+        ...    time.sleep(1)
+        ...    return x**2
+        >>> track_progress_rich(func, range(10), nproc=2)
+
+    Returns:
+        list: The task results.
+ """ + if not callable(func): + raise TypeError('func must be a callable object') + if not isinstance(tasks, Iterable): + raise TypeError( + f'tasks must be an iterable object, but got {type(tasks)}') + if isinstance(tasks, Sized): + if len(tasks) == 0: + if task_num is None: + raise ValueError('If tasks is an empty iterable, ' + 'task_num must be set') + else: + tasks = tuple(tuple() for _ in range(task_num)) + else: + if task_num is not None and task_num != len(tasks): + raise ValueError('task_num does not match the length of tasks') + task_num = len(tasks) + + if nproc <= 0: + raise ValueError('nproc must be a positive number') + + skip_times = nproc * chunksize if nproc > 1 else 0 + prog_bar = Progress( + TextColumn('{task.description}'), + BarColumn(), + _SkipFirstTimeRemainingColumn(skip_times=skip_times), + MofNCompleteColumn(), + TaskProgressColumn(show_speed=True), + ) + + worker = _Worker(func) + task_id = prog_bar.add_task( + total=task_num, color=color, description=description) + tasks = _tasks_with_index(tasks) + + # Use single process when nproc is 1, else use multiprocess. + with prog_bar: + if nproc == 1: + results = [] + for task in tasks: + results.append(worker(task)[0]) + prog_bar.update(task_id, advance=1, refresh=True) + else: + with Pool(nproc) as pool: + results = [] + unordered_results = [] + gen = pool.imap_unordered(worker, tasks, chunksize) + try: + for result in gen: + result, idx = result + unordered_results.append((result, idx)) + results.append(None) + prog_bar.update(task_id, advance=1, refresh=True) + except Exception as e: + prog_bar.stop() + raise e + for result, idx in unordered_results: + results[idx] = result + return results diff --git a/head_extractor/build/lib/mmengine/utils/timer.py b/head_extractor/build/lib/mmengine/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..087a969cfabe30ce0ed3080fd6eb6b81e232502f --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/timer.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from time import time + + +class TimerError(Exception): + + def __init__(self, message): + self.message = message + super().__init__(message) + + +class Timer: + """A flexible Timer class. + + Examples: + >>> import time + >>> import mmcv + >>> with mmcv.Timer(): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + 1.000 + >>> with mmcv.Timer(print_tmpl='it takes {:.1f} seconds'): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + it takes 1.0 seconds + >>> timer = mmcv.Timer() + >>> time.sleep(0.5) + >>> print(timer.since_start()) + 0.500 + >>> time.sleep(0.5) + >>> print(timer.since_last_check()) + 0.500 + >>> print(timer.since_start()) + 1.000 + """ + + def __init__(self, start=True, print_tmpl=None): + self._is_running = False + self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}' + if start: + self.start() + + @property + def is_running(self): + """bool: indicate whether the timer is running""" + return self._is_running + + def __enter__(self): + self.start() + return self + + def __exit__(self, type, value, traceback): + print(self.print_tmpl.format(self.since_last_check())) + self._is_running = False + + def start(self): + """Start the timer.""" + if not self._is_running: + self._t_start = time() + self._is_running = True + self._t_last = time() + + def since_start(self): + """Total time since the timer is started. + + Returns: + float: Time in seconds. 
+ """ + if not self._is_running: + raise TimerError('timer is not running') + self._t_last = time() + return self._t_last - self._t_start + + def since_last_check(self): + """Time since the last checking. + + Either :func:`since_start` or :func:`since_last_check` is a checking + operation. + + Returns: + float: Time in seconds. + """ + if not self._is_running: + raise TimerError('timer is not running') + dur = time() - self._t_last + self._t_last = time() + return dur + + +_g_timers = {} # global timers + + +def check_time(timer_id): + """Add check points in a single line. + + This method is suitable for running a task on a list of items. A timer will + be registered when the method is called for the first time. + + Examples: + >>> import time + >>> import mmcv + >>> for i in range(1, 6): + >>> # simulate a code block + >>> time.sleep(i) + >>> mmcv.check_time('task1') + 2.000 + 3.000 + 4.000 + 5.000 + + Args: + str: Timer identifier. + """ + if timer_id not in _g_timers: + _g_timers[timer_id] = Timer() + return 0 + else: + return _g_timers[timer_id].since_last_check() diff --git a/head_extractor/build/lib/mmengine/utils/version_utils.py b/head_extractor/build/lib/mmengine/utils/version_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..620180547a0d78746186b6f61e8c704be618c8c4 --- /dev/null +++ b/head_extractor/build/lib/mmengine/utils/version_utils.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import subprocess +import warnings + +from packaging.version import parse + + +def digit_version(version_str: str, length: int = 4): + """Convert a version string into a tuple of integers. + + This method is usually used for comparing two versions. For pre-release + versions: alpha < beta < rc. + + Args: + version_str (str): The version string. + length (int): The maximum number of version levels. Defaults to 4. + + Returns: + tuple[int]: The version info in digits (integers). + """ + assert 'parrots' not in version_str + version = parse(version_str) + assert version.release, f'failed to parse version {version_str}' + release = list(version.release) + release = release[:length] + if len(release) < length: + release = release + [0] * (length - len(release)) + if version.is_prerelease: + mapping = {'a': -3, 'b': -2, 'rc': -1} + val = -4 + # version.pre can be None + if version.pre: + if version.pre[0] not in mapping: + warnings.warn(f'unknown prerelease version {version.pre[0]}, ' + 'version checking may go wrong') + else: + val = mapping[version.pre[0]] + release.extend([val, version.pre[-1]]) + else: + release.extend([val, 0]) + + elif version.is_postrelease: + release.extend([1, version.post]) # type: ignore + else: + release.extend([0, 0]) + return tuple(release) + + +def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out, err = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + env=env).communicate() + return out + + +def get_git_hash(fallback='unknown', digits=None): + """Get the git hash of the current repo. + + Args: + fallback (str, optional): The fallback string when git hash is + unavailable. Defaults to 'unknown'. + digits (int, optional): kept digits of the hash. Defaults to None, + meaning all digits are kept. + + Returns: + str: Git commit hash. 
+ """ + + if digits is not None and not isinstance(digits, int): + raise TypeError('digits must be None or an integer') + + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) + sha = out.strip().decode('ascii') + if digits is not None: + sha = sha[:digits] + except OSError: + sha = fallback + + return sha diff --git a/head_extractor/build/lib/mmengine/version.py b/head_extractor/build/lib/mmengine/version.py new file mode 100644 index 0000000000000000000000000000000000000000..d9e981bdb3534b0fc92f8b16b48bcaf7afe1268a --- /dev/null +++ b/head_extractor/build/lib/mmengine/version.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +__version__ = '0.10.5' + + +def parse_version_info(version_str): + """Parse the version information. + + Args: + version_str (str): version string like '0.1.0'. + + Returns: + tuple: version information contains major, minor, micro version. + """ + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/head_extractor/build/lib/mmengine/visualization/__init__.py b/head_extractor/build/lib/mmengine/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f59452c5402b1c16a952c15a5b3648dc675e11d --- /dev/null +++ b/head_extractor/build/lib/mmengine/visualization/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .vis_backend import (AimVisBackend, BaseVisBackend, ClearMLVisBackend, + DVCLiveVisBackend, LocalVisBackend, MLflowVisBackend, + NeptuneVisBackend, TensorboardVisBackend, + WandbVisBackend) +from .visualizer import Visualizer + +__all__ = [ + 'Visualizer', 'BaseVisBackend', 'LocalVisBackend', 'WandbVisBackend', + 'TensorboardVisBackend', 'MLflowVisBackend', 'ClearMLVisBackend', + 'NeptuneVisBackend', 'DVCLiveVisBackend', 'AimVisBackend' +] diff --git a/head_extractor/build/lib/mmengine/visualization/utils.py b/head_extractor/build/lib/mmengine/visualization/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3e6b7d8ba9cb90e464426a1c140cc0815aeac103 --- /dev/null +++ b/head_extractor/build/lib/mmengine/visualization/utils.py @@ -0,0 +1,244 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union + +import cv2 +import numpy as np +import torch + +if TYPE_CHECKING: + from matplotlib.backends.backend_agg import FigureCanvasAgg + + +def tensor2ndarray(value: Union[np.ndarray, torch.Tensor]) -> np.ndarray: + """If the type of value is torch.Tensor, convert the value to np.ndarray. + + Args: + value (np.ndarray, torch.Tensor): value. + + Returns: + Any: value. + """ + if isinstance(value, torch.Tensor): + value = value.detach().cpu().numpy() + return value + + +def value2list(value: Any, valid_type: Union[Type, Tuple[Type, ...]], + expand_dim: int) -> List[Any]: + """If the type of ``value`` is ``valid_type``, convert the value to list + and expand to ``expand_dim``. + + Args: + value (Any): value. + valid_type (Union[Type, Tuple[Type, ...]): valid type. + expand_dim (int): expand dim. + + Returns: + List[Any]: value. 
+ """ + if isinstance(value, valid_type): + value = [value] * expand_dim + return value + + +def check_type(name: str, value: Any, + valid_type: Union[Type, Tuple[Type, ...]]) -> None: + """Check whether the type of value is in ``valid_type``. + + Args: + name (str): value name. + value (Any): value. + valid_type (Type, Tuple[Type, ...]): expected type. + """ + if not isinstance(value, valid_type): + raise TypeError(f'`{name}` should be {valid_type} ' + f' but got {type(value)}') + + +def check_length(name: str, value: Any, valid_length: int) -> None: + """If type of the ``value`` is list, check whether its length is equal with + or greater than ``valid_length``. + + Args: + name (str): value name. + value (Any): value. + valid_length (int): expected length. + """ + if isinstance(value, list): + if len(value) < valid_length: + raise AssertionError( + f'The length of {name} must equal with or ' + f'greater than {valid_length}, but got {len(value)}') + + +def check_type_and_length(name: str, value: Any, + valid_type: Union[Type, Tuple[Type, ...]], + valid_length: int) -> None: + """Check whether the type of value is in ``valid_type``. If type of the + ``value`` is list, check whether its length is equal with or greater than + ``valid_length``. + + Args: + value (Any): value. + legal_type (Type, Tuple[Type, ...]): legal type. + valid_length (int): expected length. + + Returns: + List[Any]: value. + """ + check_type(name, value, valid_type) + check_length(name, value, valid_length) + + +def color_val_matplotlib( + colors: Union[str, tuple, List[Union[str, tuple]]] +) -> Union[str, tuple, List[Union[str, tuple]]]: + """Convert various input in RGB order to normalized RGB matplotlib color + tuples, + Args: + colors (Union[str, tuple, List[Union[str, tuple]]]): Color inputs + Returns: + Union[str, tuple, List[Union[str, tuple]]]: A tuple of 3 normalized + floats indicating RGB channels. + """ + if isinstance(colors, str): + return colors + elif isinstance(colors, tuple): + assert len(colors) == 3 + for channel in colors: + assert 0 <= channel <= 255 + colors = [channel / 255 for channel in colors] + return tuple(colors) + elif isinstance(colors, list): + colors = [ + color_val_matplotlib(color) # type:ignore + for color in colors + ] + return colors + else: + raise TypeError(f'Invalid type for color: {type(colors)}') + + +def color_str2rgb(color: str) -> tuple: + """Convert Matplotlib str color to an RGB color which range is 0 to 255, + silently dropping the alpha channel. + + Args: + color (str): Matplotlib color. + + Returns: + tuple: RGB color. + """ + import matplotlib + rgb_color: tuple = matplotlib.colors.to_rgb(color) + rgb_color = tuple(int(c * 255) for c in rgb_color) + return rgb_color + + +def convert_overlay_heatmap(feat_map: Union[np.ndarray, torch.Tensor], + img: Optional[np.ndarray] = None, + alpha: float = 0.5) -> np.ndarray: + """Convert feat_map to heatmap and overlay on image, if image is not None. + + Args: + feat_map (np.ndarray, torch.Tensor): The feat_map to convert + with of shape (H, W), where H is the image height and W is + the image width. + img (np.ndarray, optional): The origin image. The format + should be RGB. Defaults to None. + alpha (float): The transparency of featmap. Defaults to 0.5. 
+ + Returns: + np.ndarray: heatmap + """ + assert feat_map.ndim == 2 or (feat_map.ndim == 3 + and feat_map.shape[0] in [1, 3]) + if isinstance(feat_map, torch.Tensor): + feat_map = feat_map.detach().cpu().numpy() + + if feat_map.ndim == 3: + feat_map = feat_map.transpose(1, 2, 0) + + norm_img = np.zeros(feat_map.shape) + norm_img = cv2.normalize(feat_map, norm_img, 0, 255, cv2.NORM_MINMAX) + norm_img = np.asarray(norm_img, dtype=np.uint8) + heat_img = cv2.applyColorMap(norm_img, cv2.COLORMAP_JET) + heat_img = cv2.cvtColor(heat_img, cv2.COLOR_BGR2RGB) + if img is not None: + heat_img = cv2.addWeighted(img, 1 - alpha, heat_img, alpha, 0) + return heat_img + + +def wait_continue(figure, timeout: float = 0, continue_key: str = ' ') -> int: + """Show the image and wait for the user's input. + + This implementation refers to + https://github.com/matplotlib/matplotlib/blob/v3.5.x/lib/matplotlib/_blocking_input.py + + Args: + timeout (float): If positive, continue after ``timeout`` seconds. + Defaults to 0. + continue_key (str): The key for users to continue. Defaults to + the space key. + + Returns: + int: If zero, means time out or the user pressed ``continue_key``, + and if one, means the user closed the show figure. + """ # noqa: E501 + import matplotlib.pyplot as plt + from matplotlib.backend_bases import CloseEvent + is_inline = 'inline' in plt.get_backend() + if is_inline: + # If use inline backend, interactive input and timeout is no use. + return 0 + + if figure.canvas.manager: # type: ignore + # Ensure that the figure is shown + figure.show() # type: ignore + + while True: + + # Connect the events to the handler function call. + event = None + + def handler(ev): + # Set external event variable + nonlocal event + # Qt backend may fire two events at the same time, + # use a condition to avoid missing close event. + event = ev if not isinstance(event, CloseEvent) else event + figure.canvas.stop_event_loop() + + cids = [ + figure.canvas.mpl_connect(name, handler) # type: ignore + for name in ('key_press_event', 'close_event') + ] + + try: + figure.canvas.start_event_loop(timeout) # type: ignore + finally: # Run even on exception like ctrl-c. + # Disconnect the callbacks. + for cid in cids: + figure.canvas.mpl_disconnect(cid) # type: ignore + + if isinstance(event, CloseEvent): + return 1 # Quit for close. + elif event is None or event.key == continue_key: + return 0 # Quit for continue. + + +def img_from_canvas(canvas: 'FigureCanvasAgg') -> np.ndarray: + """Get RGB image from ``FigureCanvasAgg``. + + Args: + canvas (FigureCanvasAgg): The canvas to get image. + + Returns: + np.ndarray: the output of image in RGB. + """ # noqa: E501 + s, (width, height) = canvas.print_to_buffer() + buffer = np.frombuffer(s, dtype='uint8') + img_rgba = buffer.reshape(height, width, 4) + rgb, alpha = np.split(img_rgba, [3], axis=2) + return rgb.astype('uint8') diff --git a/head_extractor/build/lib/mmengine/visualization/vis_backend.py b/head_extractor/build/lib/mmengine/visualization/vis_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..f74eab1fcd91100fd1d9a2ce384471e0e0e5dc94 --- /dev/null +++ b/head_extractor/build/lib/mmengine/visualization/vis_backend.py @@ -0,0 +1,1448 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
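+# Visualization backends: each backend below subclasses ``BaseVisBackend``
+# and defers heavy setup to ``_init_env`` via the ``force_init_env``
+# decorator, so a backend only initializes when its API is first used.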
+import copy +import functools +import logging +import os +import os.path as osp +import platform +import warnings +from abc import ABCMeta, abstractmethod +from collections.abc import MutableMapping +from typing import Any, Callable, List, Optional, Sequence, Union + +import cv2 +import numpy as np +import torch + +from mmengine.config import Config, ConfigDict +from mmengine.fileio import dump +from mmengine.hooks.logger_hook import SUFFIX_TYPE +from mmengine.logging import MMLogger, print_log +from mmengine.registry import VISBACKENDS +from mmengine.utils import digit_version, scandir +from mmengine.utils.dl_utils import TORCH_VERSION + + +def force_init_env(old_func: Callable) -> Any: + """Those methods decorated by ``force_init_env`` will be forced to call + ``_init_env`` if the instance has not been fully initiated. This function + will decorated all the `add_xxx` method and `experiment` method, because + `VisBackend` is initialized only when used its API. + + Args: + old_func (Callable): Decorated function, make sure the first arg is an + instance with ``_init_env`` method. + + Returns: + Any: Depends on old_func. + """ + + @functools.wraps(old_func) + def wrapper(obj: object, *args, **kwargs): + # The instance must have `_init_env` method. + if not hasattr(obj, '_init_env'): + raise AttributeError(f'{type(obj)} does not have _init_env ' + 'method.') + # If instance does not have `_env_initialized` attribute or + # `_env_initialized` is False, call `_init_env` and set + # `_env_initialized` to True + if not getattr(obj, '_env_initialized', False): + print_log( + 'Attribute `_env_initialized` is not defined in ' + f'{type(obj)} or `{type(obj)}._env_initialized is ' + 'False, `_init_env` will be called and ' + f'{type(obj)}._env_initialized will be set to True', + logger='current', + level=logging.DEBUG) + obj._init_env() # type: ignore + obj._env_initialized = True # type: ignore + + return old_func(obj, *args, **kwargs) + + return wrapper + + +class BaseVisBackend(metaclass=ABCMeta): + """Base class for visualization backend. + + All backends must inherit ``BaseVisBackend`` and implement + the required functions. + + Args: + save_dir (str, optional): The root directory to save + the files produced by the backend. + """ + + def __init__(self, save_dir: str): + self._save_dir = save_dir + self._env_initialized = False + + @property + @abstractmethod + def experiment(self) -> Any: + """Return the experiment object associated with this visualization + backend. + + The experiment attribute can get the visualization backend, such as + wandb, tensorboard. If you want to write other data, such as writing a + table, you can directly get the visualization backend through + experiment. + """ + pass + + @abstractmethod + def _init_env(self) -> Any: + """Setup env for VisBackend.""" + pass + + def add_config(self, config: Config, **kwargs) -> None: + """Record the config. + + Args: + config (Config): The Config object + """ + pass + + def add_graph(self, model: torch.nn.Module, data_batch: Sequence[dict], + **kwargs) -> None: + """Record the model graph. + + Args: + model (torch.nn.Module): Model to draw. + data_batch (Sequence[dict]): Batch of data from dataloader. + """ + pass + + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. Defaults to None. + step (int): Global step value to record. Defaults to 0. 
+ """ + pass + + def add_scalar(self, + name: str, + value: Union[int, float], + step: int = 0, + **kwargs) -> None: + """Record the scalar. + + Args: + name (str): The scalar identifier. + value (int, float): Value to save. + step (int): Global step value to record. Defaults to 0. + """ + pass + + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalars' data. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record. Defaults to 0. + file_path (str, optional): The scalar's data will be + saved to the `file_path` file at the same time + if the `file_path` parameter is specified. + Defaults to None. + """ + pass + + def close(self) -> None: + """close an opened object.""" + pass + + +@VISBACKENDS.register_module() +class LocalVisBackend(BaseVisBackend): + """Local visualization backend class. + + It can write image, config, scalars, etc. + to the local hard disk. You can get the drawing backend + through the experiment property for custom drawing. + + Examples: + >>> from mmengine.visualization import LocalVisBackend + >>> import numpy as np + >>> local_vis_backend = LocalVisBackend(save_dir='temp_dir') + >>> img = np.random.randint(0, 256, size=(10, 10, 3)) + >>> local_vis_backend.add_image('img', img) + >>> local_vis_backend.add_scalar('mAP', 0.6) + >>> local_vis_backend.add_scalars({'loss': [1, 2, 3], 'acc': 0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> local_vis_backend.add_config(cfg) + + Args: + save_dir (str, optional): The root directory to save the files + produced by the visualizer. If it is none, it means no data + is stored. + img_save_dir (str): The directory to save images. + Defaults to 'vis_image'. + config_save_file (str): The file name to save config. + Defaults to 'config.py'. + scalar_save_file (str): The file name to save scalar values. + Defaults to 'scalars.json'. + """ + + def __init__(self, + save_dir: str, + img_save_dir: str = 'vis_image', + config_save_file: str = 'config.py', + scalar_save_file: str = 'scalars.json'): + assert config_save_file.split('.')[-1] == 'py' + assert scalar_save_file.split('.')[-1] == 'json' + super().__init__(save_dir) + self._img_save_dir = img_save_dir + self._config_save_file = config_save_file + self._scalar_save_file = scalar_save_file + + def _init_env(self): + """Init save dir.""" + if not os.path.exists(self._save_dir): + os.makedirs(self._save_dir, exist_ok=True) + self._img_save_dir = osp.join( + self._save_dir, # type: ignore + self._img_save_dir) + self._config_save_file = osp.join( + self._save_dir, # type: ignore + self._config_save_file) + self._scalar_save_file = osp.join( + self._save_dir, # type: ignore + self._scalar_save_file) + + @property # type: ignore + @force_init_env + def experiment(self) -> 'LocalVisBackend': + """Return the experiment object associated with this visualization + backend.""" + return self + + @force_init_env + def add_config(self, config: Config, **kwargs) -> None: + """Record the config to disk. + + Args: + config (Config): The Config object + """ + assert isinstance(config, Config) + config.dump(self._config_save_file) + + @force_init_env + def add_image(self, + name: str, + image: np.array, + step: int = 0, + **kwargs) -> None: + """Record the image to disk. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. Defaults to None. 
+ step (int): Global step value to record. Defaults to 0. + """ + assert image.dtype == np.uint8 + drawn_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + os.makedirs(self._img_save_dir, exist_ok=True) + save_file_name = f'{name}_{step}.png' + cv2.imwrite(osp.join(self._img_save_dir, save_file_name), drawn_image) + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float, torch.Tensor, np.ndarray], + step: int = 0, + **kwargs) -> None: + """Record the scalar data to disk. + + Args: + name (str): The scalar identifier. + value (int, float, torch.Tensor, np.ndarray): Value to save. + step (int): Global step value to record. Defaults to 0. + """ + if isinstance(value, torch.Tensor): + value = value.item() + self._dump({name: value, 'step': step}, self._scalar_save_file, 'json') + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalars to disk. + + The scalar dict will be written to the default and + specified files if ``file_path`` is specified. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. The value must be dumped + into json format. + step (int): Global step value to record. Defaults to 0. + file_path (str, optional): The scalar's data will be + saved to the ``file_path`` file at the same time + if the ``file_path`` parameter is specified. + Defaults to None. + """ + assert isinstance(scalar_dict, dict) + scalar_dict = copy.deepcopy(scalar_dict) + scalar_dict.setdefault('step', step) + + if file_path is not None: + assert file_path.split('.')[-1] == 'json' + new_save_file_path = osp.join( + self._save_dir, # type: ignore + file_path) + assert new_save_file_path != self._scalar_save_file, \ + '``file_path`` and ``scalar_save_file`` have the ' \ + 'same name, please set ``file_path`` to another value' + self._dump(scalar_dict, new_save_file_path, 'json') + self._dump(scalar_dict, self._scalar_save_file, 'json') + + def _dump(self, value_dict: dict, file_path: str, + file_format: str) -> None: + """dump dict to file. + + Args: + value_dict (dict) : The dict data to saved. + file_path (str): The file path to save data. + file_format (str): The file format to save data. + """ + with open(file_path, 'a+') as f: + dump(value_dict, f, file_format=file_format) + f.write('\n') + + +@VISBACKENDS.register_module() +class WandbVisBackend(BaseVisBackend): + """Wandb visualization backend class. + + Examples: + >>> from mmengine.visualization import WandbVisBackend + >>> import numpy as np + >>> wandb_vis_backend = WandbVisBackend() + >>> img=np.random.randint(0, 256, size=(10, 10, 3)) + >>> wandb_vis_backend.add_image('img', img) + >>> wandb_vis_backend.add_scaler('mAP', 0.6) + >>> wandb_vis_backend.add_scalars({'loss': [1, 2, 3],'acc': 0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> wandb_vis_backend.add_config(cfg) + + Args: + save_dir (str, optional): The root directory to save the files + produced by the visualizer. + init_kwargs (dict, optional): wandb initialization + input parameters. + See `wandb.init `_ for + details. Defaults to None. + define_metric_cfg (dict or list[dict], optional): + When a dict is set, it is a dict of metrics and summary for + ``wandb.define_metric``. + The key is metric and the value is summary. + When a list is set, each dict should be a valid argument of + the ``define_metric``. 
+ For example, ``define_metric_cfg={'coco/bbox_mAP': 'max'}``, + means the maximum value of ``coco/bbox_mAP`` is logged on wandb UI. + When ``define_metric_cfg=[dict(name='loss', + step_metric='epoch')]``, + the "loss" will be plotted against the epoch. + See `wandb define_metric `_ for details. + Defaults to None. + commit (bool, optional) Save the metrics dict to the wandb server + and increment the step. If false `wandb.log` just updates the + current metrics dict with the row argument and metrics won't be + saved until `wandb.log` is called with `commit=True`. + Defaults to True. + log_code_name (str, optional) The name of code artifact. + By default, the artifact will be named + source-$PROJECT_ID-$ENTRYPOINT_RELPATH. See + `wandb log_code `_ + for details. Defaults to None. + `New in version 0.3.0.` + watch_kwargs (optional, dict): Agurments for ``wandb.watch``. + `New in version 0.4.0.` + """ + + def __init__(self, + save_dir: str, + init_kwargs: Optional[dict] = None, + define_metric_cfg: Union[dict, list, None] = None, + commit: Optional[bool] = True, + log_code_name: Optional[str] = None, + watch_kwargs: Optional[dict] = None): + super().__init__(save_dir) + self._init_kwargs = init_kwargs + self._define_metric_cfg = define_metric_cfg + self._commit = commit + self._log_code_name = log_code_name + self._watch_kwargs = watch_kwargs if watch_kwargs is not None else {} + + def _init_env(self): + """Setup env for wandb.""" + if not os.path.exists(self._save_dir): + os.makedirs(self._save_dir, exist_ok=True) # type: ignore + if self._init_kwargs is None: + self._init_kwargs = {'dir': self._save_dir} + else: + self._init_kwargs.setdefault('dir', self._save_dir) + try: + import wandb + except ImportError: + raise ImportError( + 'Please run "pip install wandb" to install wandb') + + wandb.init(**self._init_kwargs) + if self._define_metric_cfg is not None: + if isinstance(self._define_metric_cfg, dict): + for metric, summary in self._define_metric_cfg.items(): + wandb.define_metric(metric, summary=summary) + elif isinstance(self._define_metric_cfg, list): + for metric_cfg in self._define_metric_cfg: + wandb.define_metric(**metric_cfg) + else: + raise ValueError('define_metric_cfg should be dict or list') + self._wandb = wandb + + @property # type: ignore + @force_init_env + def experiment(self): + """Return wandb object. + + The experiment attribute can get the wandb backend, If you want to + write other data, such as writing a table, you can directly get the + wandb backend through experiment. + """ + return self._wandb + + @force_init_env + def add_config(self, config: Config, **kwargs) -> None: + """Record the config to wandb. + + Args: + config (Config): The Config object + """ + assert isinstance(self._init_kwargs, dict) + allow_val_change = self._init_kwargs.get('allow_val_change', False) + self._wandb.config.update( + config.to_dict(), allow_val_change=allow_val_change) + self._wandb.run.log_code(name=self._log_code_name) + + @force_init_env + def add_graph(self, model: torch.nn.Module, data_batch: Sequence[dict], + **kwargs) -> None: + """Record the model graph. + + Args: + model (torch.nn.Module): Model to draw. + data_batch (Sequence[dict]): Batch of data from dataloader. + """ + self._wandb.watch(model, **self._watch_kwargs) + + @force_init_env + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image to wandb. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. 
The format + should be RGB. + step (int): Useless parameter. Wandb does not + need this parameter. Defaults to 0. + """ + image = self._wandb.Image(image) + self._wandb.log({name: image}, commit=self._commit) + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float, torch.Tensor, np.ndarray], + step: int = 0, + **kwargs) -> None: + """Record the scalar data to wandb. + + Args: + name (str): The scalar identifier. + value (int, float, torch.Tensor, np.ndarray): Value to save. + step (int): Useless parameter. Wandb does not + need this parameter. Defaults to 0. + """ + self._wandb.log({name: value}, commit=self._commit) + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalar's data to wandb. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Useless parameter. Wandb does not + need this parameter. Defaults to 0. + file_path (str, optional): Useless parameter. Just for + interface unification. Defaults to None. + """ + self._wandb.log(scalar_dict, commit=self._commit) + + def close(self) -> None: + """close an opened wandb object.""" + if hasattr(self, '_wandb'): + self._wandb.join() + + +@VISBACKENDS.register_module() +class TensorboardVisBackend(BaseVisBackend): + """Tensorboard visualization backend class. + + It can write images, config, scalars, etc. to a + tensorboard file. + + Examples: + >>> from mmengine.visualization import TensorboardVisBackend + >>> import numpy as np + >>> vis_backend = TensorboardVisBackend(save_dir='temp_dir') + >>> img = np.random.randint(0, 256, size=(10, 10, 3)) + >>> vis_backend.add_image('img', img) + >>> vis_backend.add_scaler('mAP', 0.6) + >>> vis_backend.add_scalars({'loss': 0.1,'acc':0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> vis_backend.add_config(cfg) + + Args: + save_dir (str): The root directory to save the files + produced by the backend. + """ + + def __init__(self, save_dir: str): + super().__init__(save_dir) + + def _init_env(self): + """Setup env for Tensorboard.""" + if not os.path.exists(self._save_dir): + os.makedirs(self._save_dir, exist_ok=True) # type: ignore + if TORCH_VERSION == 'parrots': + try: + from tensorboardX import SummaryWriter + except ImportError: + raise ImportError('Please install tensorboardX to use ' + 'TensorboardLoggerHook.') + else: + try: + from torch.utils.tensorboard import SummaryWriter + except ImportError: + raise ImportError( + 'Please run "pip install future tensorboard" to install ' + 'the dependencies to use torch.utils.tensorboard ' + '(applicable to PyTorch 1.1 or higher)') + self._tensorboard = SummaryWriter(self._save_dir) + + @property # type: ignore + @force_init_env + def experiment(self): + """Return Tensorboard object.""" + return self._tensorboard + + @force_init_env + def add_config(self, config: Config, **kwargs) -> None: + """Record the config to tensorboard. + + Args: + config (Config): The Config object + """ + self._tensorboard.add_text('config', config.pretty_text) + + @force_init_env + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image to tensorboard. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. + step (int): Global step value to record. Defaults to 0. 
+ """ + self._tensorboard.add_image(name, image, step, dataformats='HWC') + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float, torch.Tensor, np.ndarray], + step: int = 0, + **kwargs) -> None: + """Record the scalar data to tensorboard. + + Args: + name (str): The scalar identifier. + value (int, float, torch.Tensor, np.ndarray): Value to save. + step (int): Global step value to record. Defaults to 0. + """ + if isinstance(value, + (int, float, torch.Tensor, np.ndarray, np.number)): + self._tensorboard.add_scalar(name, value, step) + else: + warnings.warn(f'Got {type(value)}, but numpy array, torch tensor, ' + f'int or float are expected. skip it!') + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalar's data to tensorboard. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record. Defaults to 0. + file_path (str, optional): Useless parameter. Just for + interface unification. Defaults to None. + """ + assert isinstance(scalar_dict, dict) + assert 'step' not in scalar_dict, 'Please set it directly ' \ + 'through the step parameter' + for key, value in scalar_dict.items(): + self.add_scalar(key, value, step) + + def close(self): + """close an opened tensorboard object.""" + if hasattr(self, '_tensorboard'): + self._tensorboard.close() + + +@VISBACKENDS.register_module() +class MLflowVisBackend(BaseVisBackend): + """MLflow visualization backend class. + + It can write images, config, scalars, etc. to a + mlflow file. + + Examples: + >>> from mmengine.visualization import MLflowVisBackend + >>> from mmengine import Config + >>> import numpy as np + >>> vis_backend = MLflowVisBackend(save_dir='temp_dir') + >>> img = np.random.randint(0, 256, size=(10, 10, 3)) + >>> vis_backend.add_image('img.png', img) + >>> vis_backend.add_scalar('mAP', 0.6) + >>> vis_backend.add_scalars({'loss': 0.1,'acc':0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> vis_backend.add_config(cfg) + + Args: + save_dir (str): The root directory to save the files + produced by the backend. + exp_name (str, optional): The experiment name. Defaults to None. + run_name (str, optional): The run name. Defaults to None. + tags (dict, optional): The tags to be added to the experiment. + Defaults to None. + params (dict, optional): The params to be added to the experiment. + Defaults to None. + tracking_uri (str, optional): The tracking uri. Defaults to None. + artifact_suffix (Tuple[str] or str, optional): The artifact suffix. + Defaults to ('.json', '.log', '.py', 'yaml'). + tracked_config_keys (dict, optional): The top level keys of config that + will be added to the experiment. If it is None, which means all + the config will be added. Defaults to None. + `New in version 0.7.4.` + artifact_location (str, optional): The location to store run artifacts. + If None, the server picks an appropriate default. + Defaults to None. 
+ `New in version 0.10.4.` + """ + + def __init__(self, + save_dir: str, + exp_name: Optional[str] = None, + run_name: Optional[str] = None, + tags: Optional[dict] = None, + params: Optional[dict] = None, + tracking_uri: Optional[str] = None, + artifact_suffix: SUFFIX_TYPE = ('.json', '.log', '.py', + 'yaml'), + tracked_config_keys: Optional[dict] = None, + artifact_location: Optional[str] = None): + super().__init__(save_dir) + self._exp_name = exp_name + self._run_name = run_name + self._tags = tags + self._params = params + self._tracking_uri = tracking_uri + self._artifact_suffix = artifact_suffix + self._tracked_config_keys = tracked_config_keys + self._artifact_location = artifact_location + + def _init_env(self): + """Setup env for MLflow.""" + if not os.path.exists(self._save_dir): + os.makedirs(self._save_dir, exist_ok=True) # type: ignore + + try: + import mlflow + except ImportError: + raise ImportError( + 'Please run "pip install mlflow" to install mlflow' + ) # type: ignore + self._mlflow = mlflow + + # when mlflow is imported, a default logger is created. + # at this time, the default logger's stream is None + # so the stream is reopened only when the stream is None + # or the stream is closed + logger = MMLogger.get_current_instance() + for handler in logger.handlers: + if handler.stream is None or handler.stream.closed: + handler.stream = open(handler.baseFilename, 'a') + + if self._tracking_uri is not None: + logger.warning( + 'Please make sure that the mlflow server is running.') + self._mlflow.set_tracking_uri(self._tracking_uri) + else: + if os.name == 'nt': + file_url = f'file:\\{os.path.abspath(self._save_dir)}' + else: + file_url = f'file://{os.path.abspath(self._save_dir)}' + self._mlflow.set_tracking_uri(file_url) + + self._exp_name = self._exp_name or 'Default' + + if self._mlflow.get_experiment_by_name(self._exp_name) is None: + self._mlflow.create_experiment( + self._exp_name, artifact_location=self._artifact_location) + + self._mlflow.set_experiment(self._exp_name) + + if self._run_name is not None: + self._mlflow.set_tag('mlflow.runName', self._run_name) + if self._tags is not None: + self._mlflow.set_tags(self._tags) + if self._params is not None: + self._mlflow.log_params(self._params) + + @property # type: ignore + @force_init_env + def experiment(self): + """Return MLflow object.""" + return self._mlflow + + @force_init_env + def add_config(self, config: Config, **kwargs) -> None: + """Record the config to mlflow. + + Args: + config (Config): The Config object + """ + self.cfg = config + if self._tracked_config_keys is None: + self._mlflow.log_params(self._flatten(self.cfg.to_dict())) + else: + tracked_cfg = dict() + for k in self._tracked_config_keys: + tracked_cfg[k] = self.cfg[k] + self._mlflow.log_params(self._flatten(tracked_cfg)) + self._mlflow.log_text(self.cfg.pretty_text, 'config.py') + + @force_init_env + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image to mlflow. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. + step (int): Global step value to record. Default to 0. + """ + self._mlflow.log_image(image, name) + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float, torch.Tensor, np.ndarray], + step: int = 0, + **kwargs) -> None: + """Record the scalar data to mlflow. + + Args: + name (str): The scalar identifier. + value (int, float, torch.Tensor, np.ndarray): Value to save. 
+ step (int): Global step value to record. Default to 0. + """ + self._mlflow.log_metric(name, value, step) + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalar's data to mlflow. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record. Default to 0. + file_path (str, optional): Useless parameter. Just for + interface unification. Defaults to None. + """ + assert isinstance(scalar_dict, dict) + assert 'step' not in scalar_dict, 'Please set it directly ' \ + 'through the step parameter' + self._mlflow.log_metrics(scalar_dict, step) + + def close(self) -> None: + """Close the mlflow.""" + if not hasattr(self, '_mlflow'): + return + + file_paths = dict() + for filename in scandir(self.cfg.work_dir, self._artifact_suffix, + True): + file_path = osp.join(self.cfg.work_dir, filename) + relative_path = os.path.relpath(file_path, self.cfg.work_dir) + dir_path = os.path.dirname(relative_path) + file_paths[file_path] = dir_path + + for file_path, dir_path in file_paths.items(): + self._mlflow.log_artifact(file_path, dir_path) + + self._mlflow.end_run() + + def _flatten(self, d, parent_key='', sep='.') -> dict: + """Flatten the dict.""" + items = dict() + for k, v in d.items(): + new_key = parent_key + sep + k if parent_key else k + if isinstance(v, MutableMapping): + items.update(self._flatten(v, new_key, sep=sep)) + elif isinstance(v, list): + if any(isinstance(x, dict) for x in v): + for i, x in enumerate(v): + items.update( + self._flatten(x, new_key + sep + str(i), sep=sep)) + else: + items[new_key] = v + else: + items[new_key] = v + return items + + +@VISBACKENDS.register_module() +class ClearMLVisBackend(BaseVisBackend): + """Clearml visualization backend class. It requires `clearml`_ to be + installed. + + Examples: + >>> from mmengine.visualization import ClearMLVisBackend + >>> from mmengine import Config + >>> import numpy as np + >>> vis_backend = ClearMLVisBackend(save_dir='temp_dir') + >>> img = np.random.randint(0, 256, size=(10, 10, 3)) + >>> vis_backend.add_image('img.png', img) + >>> vis_backend.add_scalar('mAP', 0.6) + >>> vis_backend.add_scalars({'loss': 0.1,'acc':0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> vis_backend.add_config(cfg) + + Args: + save_dir (str, optional): Useless parameter. Just for + interface unification. Defaults to None. + init_kwargs (dict, optional): A dict contains the arguments of + ``clearml.Task.init`` . See `taskinit`_ for more details. + Defaults to None + artifact_suffix (Tuple[str] or str): The artifact suffix. + Defaults to ('.py', 'pth'). + + .. _clearml: + https://clear.ml/docs/latest/docs/ + + .. 
_taskinit: + https://clear.ml/docs/latest/docs/references/sdk/task/#taskinit + """ + + def __init__(self, + save_dir: Optional[str] = None, + init_kwargs: Optional[dict] = None, + artifact_suffix: SUFFIX_TYPE = ('.py', '.pth')): + super().__init__(save_dir) # type: ignore + self._init_kwargs = init_kwargs + self._artifact_suffix = artifact_suffix + + def _init_env(self) -> None: + try: + import clearml + except ImportError: + raise ImportError( + 'Please run "pip install clearml" to install clearml') + + task_kwargs = self._init_kwargs or {} + self._clearml = clearml + self._task = self._clearml.Task.init(**task_kwargs) + self._logger = self._task.get_logger() + + @property # type: ignore + @force_init_env + def experiment(self): + """Return clearml object.""" + return self._clearml + + @force_init_env + def add_config(self, config: Config, **kwargs) -> None: + """Record the config to clearml. + + Args: + config (Config): The Config object + """ + self.cfg = config + self._task.connect_configuration(config.to_dict()) + + @force_init_env + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image to clearml. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. + step (int): Global step value to record. Defaults to 0. + """ + self._logger.report_image( + title=name, series=name, iteration=step, image=image) + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float, torch.Tensor, np.ndarray], + step: int = 0, + **kwargs) -> None: + """Record the scalar data to clearml. + + Args: + name (str): The scalar identifier. + value (int, float, torch.Tensor, np.ndarray): Value to save. + step (int): Global step value to record. Defaults to 0. + """ + self._logger.report_scalar( + title=name, series=name, value=value, iteration=step) + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalar's data to clearml. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record. Defaults to 0. + file_path (str, optional): Useless parameter. Just for + interface unification. Defaults to None. + """ + assert 'step' not in scalar_dict, 'Please set it directly ' \ + 'through the step parameter' + for key, value in scalar_dict.items(): + self._logger.report_scalar( + title=key, series=key, value=value, iteration=step) + + def close(self) -> None: + """Close the clearml.""" + if not hasattr(self, '_clearml'): + return + + file_paths: List[str] = list() + if (hasattr(self, 'cfg') + and osp.isdir(getattr(self.cfg, 'work_dir', ''))): + for filename in scandir(self.cfg.work_dir, self._artifact_suffix, + False): + file_path = osp.join(self.cfg.work_dir, filename) + file_paths.append(file_path) + + for file_path in file_paths: + self._task.upload_artifact(os.path.basename(file_path), file_path) + self._task.close() + + +@VISBACKENDS.register_module() +class NeptuneVisBackend(BaseVisBackend): + """Neptune visualization backend class. 
+ + Examples: + >>> from mmengine.visualization import NeptuneVisBackend + >>> from mmengine import Config + >>> import numpy as np + >>> init_kwargs = {'project': 'your_project_name'} + >>> neptune_vis_backend = NeptuneVisBackend(init_kwargs=init_kwargs) + >>> img = np.random.randint(0, 256, size=(10, 10, 3)) + >>> neptune_vis_backend.add_image('img', img) + >>> neptune_vis_backend.add_scalar('mAP', 0.6) + >>> neptune_vis_backend.add_scalars({'loss': 0.1, 'acc': 0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> neptune_vis_backend.add_config(cfg) + + Note: + `New in version 0.9.0.` + + Args: + save_dir (str, optional): The root directory to save the files + produced by the visualizer. NeptuneVisBackend does + not require this argument. Defaults to None. + init_kwargs (dict, optional): Neptune initialization parameters. + Defaults to None. + + - project (str): Name of a project in a form of + `namespace/project_name`. If `project` is not specified, + the value of `NEPTUNE_PROJECT` environment variable + will be taken. + - api_token (str): User's API token. If `api_token` is not specified, + the value of `NEPTUNE_API_TOKEN` environment variable will + be taken. Note: It is strongly recommended to use + `NEPTUNE_API_TOKEN` environment variable rather than + placing your API token here. + + If 'project' and 'api_token' are not specified in `init_kwargs`, + the 'mode' will be set to 'offline'. + See `neptune.init_run + `_ for + details. + """ + + def __init__(self, + save_dir: Optional[str] = None, + init_kwargs: Optional[dict] = None): + super().__init__(save_dir) # type:ignore + self._init_kwargs = init_kwargs + + def _init_env(self): + """Setup env for neptune.""" + try: + import neptune + except ImportError: + raise ImportError( + 'Please run "pip install -U neptune" to install neptune') + if self._init_kwargs is None: + self._init_kwargs = {'mode': 'offline'} + + self._neptune = neptune.init_run(**self._init_kwargs) + + @property # type: ignore + @force_init_env + def experiment(self): + """Return Neptune object.""" + return self._neptune + + @force_init_env + def add_config(self, config: Config, **kwargs) -> None: + """Record the config to neptune. + + Args: + config (Config): The Config object + """ + from neptune.types import File + self._neptune['config'].upload(File.from_content(config.pretty_text)) + + @force_init_env + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. + step (int): Global step value to record. Defaults to 0. + """ + from neptune.types import File + + # values in the array need to be in the [0, 1] range + img = image.astype(np.float32) / 255.0 + self._neptune['images'].append( + File.as_image(img), name=name, step=step) + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float], + step: int = 0, + **kwargs) -> None: + """Record the scalar. + + Args: + name (str): The scalar identifier. + value (int, float): Value to save. + step (int): Global step value to record. Defaults to 0. + """ + self._neptune[name].append(value, step=step) + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalars' data. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record.
Defaults to 0. + file_path (str, optional): The scalar's data will be + saved to the `file_path` file at the same time + if the `file_path` parameter is specified. + Defaults to None. + """ + assert isinstance(scalar_dict, dict) + assert 'step' not in scalar_dict, 'Please set it directly ' \ + 'through the step parameter' + + for k, v in scalar_dict.items(): + self._neptune[k].append(v, step=step) + + def close(self) -> None: + """close an opened object.""" + if hasattr(self, '_neptune'): + self._neptune.stop() + + +@VISBACKENDS.register_module() +class DVCLiveVisBackend(BaseVisBackend): + """DVCLive visualization backend class. + + Examples: + >>> from mmengine.visualization import DVCLiveVisBackend + >>> import numpy as np + >>> dvclive_vis_backend = DVCLiveVisBackend(save_dir='temp_dir') + >>> img=np.random.randint(0, 256, size=(10, 10, 3)) + >>> dvclive_vis_backend.add_image('img', img) + >>> dvclive_vis_backend.add_scalar('mAP', 0.6) + >>> dvclive_vis_backend.add_scalars({'loss': 0.1, 'acc': 0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> dvclive_vis_backend.add_config(cfg) + + Note: + `New in version 0.9.0.` + + Args: + save_dir (str, optional): The root directory to save the files + produced by the visualizer. + artifact_suffix (Tuple[str] or str, optional): The artifact suffix. + Defaults to ('.json', '.py', 'yaml'). + init_kwargs (dict, optional): DVCLive initialization parameters. + See `DVCLive `_ for details. + Defaults to None. + """ + + def __init__(self, + save_dir: str, + artifact_suffix: SUFFIX_TYPE = ('.json', '.py', 'yaml'), + init_kwargs: Optional[dict] = None): + super().__init__(save_dir) + self._artifact_suffix = artifact_suffix + self._init_kwargs = init_kwargs + + def _init_env(self): + """Setup env for dvclive.""" + if digit_version(platform.python_version()) < digit_version('3.8'): + raise RuntimeError('Please use Python 3.8 or higher version ' + 'to use DVCLiveVisBackend.') + + try: + import pygit2 + from dvclive import Live + except ImportError: + raise ImportError( + 'Please run "pip install dvclive" to install dvclive') + # if no git info, init dvc without git to avoid SCMError + try: + path = pygit2.discover_repository(os.fspath(os.curdir), True, '') + pygit2.Repository(path).default_signature + except KeyError: + os.system('dvc init -f --no-scm') + + if self._init_kwargs is None: + self._init_kwargs = {} + self._init_kwargs.setdefault('dir', self._save_dir) + self._init_kwargs.setdefault('save_dvc_exp', True) + self._init_kwargs.setdefault('cache_images', True) + + self._dvclive = Live(**self._init_kwargs) + + @property # type: ignore + @force_init_env + def experiment(self): + """Return dvclive object. + + The experiment attribute can get the dvclive backend, If you want to + write other data, such as writing a table, you can directly get the + dvclive backend through experiment. + """ + return self._dvclive + + @force_init_env + def add_config(self, config: Config, **kwargs) -> None: + """Record the config to dvclive. + + Args: + config (Config): The Config object + """ + assert isinstance(config, Config) + self.cfg = config + self._dvclive.log_params(self._to_dvc_paramlike(self.cfg.to_dict())) + + @force_init_env + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image to dvclive. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. + step (int): Useless parameter. Dvclive does not + need this parameter. Defaults to 0. 
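+ + Example (a minimal sketch; assumes dvclive is installed; note + the uint8 requirement enforced below): + + >>> import numpy as np + >>> from mmengine.visualization import DVCLiveVisBackend + >>> vis_backend = DVCLiveVisBackend(save_dir='temp_dir') + >>> img = np.random.randint(0, 256, size=(10, 10, 3), dtype=np.uint8) + >>> vis_backend.add_image('img', img) # logged as 'img.png'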
+ """ + assert image.dtype == np.uint8 + save_file_name = f'{name}.png' + + self._dvclive.log_image(save_file_name, image) + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float, torch.Tensor, np.ndarray], + step: int = 0, + **kwargs) -> None: + """Record the scalar data to dvclive. + + Args: + name (str): The scalar identifier. + value (int, float, torch.Tensor, np.ndarray): Value to save. + step (int): Global step value to record. Defaults to 0. + """ + if isinstance(value, torch.Tensor): + value = value.numpy() + self._dvclive.step = step + self._dvclive.log_metric(name, value) + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalar's data to dvclive. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record. Defaults to 0. + file_path (str, optional): Useless parameter. Just for + interface unification. Defaults to None. + """ + for key, value in scalar_dict.items(): + self.add_scalar(key, value, step, **kwargs) + + def close(self) -> None: + """close an opened dvclive object.""" + if not hasattr(self, '_dvclive'): + return + + file_paths = dict() + for filename in scandir(self._save_dir, self._artifact_suffix, True): + file_path = osp.join(self._save_dir, filename) + relative_path = os.path.relpath(file_path, self._save_dir) + dir_path = os.path.dirname(relative_path) + file_paths[file_path] = dir_path + + for file_path, dir_path in file_paths.items(): + self._dvclive.log_artifact(file_path, dir_path) + + self._dvclive.end() + + def _to_dvc_paramlike(self, + value: Union[int, float, dict, list, tuple, Config, + ConfigDict, torch.Tensor, np.ndarray]): + """Convert the input value to a DVC `ParamLike` recursively. + + Or the `log_params` method of dvclive will raise an error. + """ + + if isinstance(value, (dict, Config, ConfigDict)): + return {k: self._to_dvc_paramlike(v) for k, v in value.items()} + elif isinstance(value, (tuple, list)): + return [self._to_dvc_paramlike(item) for item in value] + elif isinstance(value, (torch.Tensor, np.ndarray)): + return value.tolist() + elif isinstance(value, np.generic): + return value.item() + else: + return value + + +@VISBACKENDS.register_module() +class AimVisBackend(BaseVisBackend): + """Aim visualization backend class. + + Examples: + >>> from mmengine.visualization import AimVisBackend + >>> import numpy as np + >>> aim_vis_backend = AimVisBackend() + >>> img=np.random.randint(0, 256, size=(10, 10, 3)) + >>> aim_vis_backend.add_image('img', img) + >>> aim_vis_backend.add_scalar('mAP', 0.6) + >>> aim_vis_backend.add_scalars({'loss': 0.1, 'acc': 0.8}) + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> aim_vis_backend.add_config(cfg) + + Note: + 1. `New in version 0.9.0.` + 2. Refer to + `Github issue `_ , + Aim is not unable to be install on Windows for now. + + Args: + save_dir (str, optional): The root directory to save the files + produced by the visualizer. + init_kwargs (dict, optional): Aim initialization parameters. See + `Aim `_ + for details. Defaults to None. 
+ """ + + def __init__(self, + save_dir: Optional[str] = None, + init_kwargs: Optional[dict] = None): + super().__init__(save_dir) # type:ignore + self._init_kwargs = init_kwargs + + def _init_env(self): + """Setup env for Aim.""" + try: + from aim import Run + except ImportError: + raise ImportError('Please run "pip install aim" to install aim') + + from datetime import datetime + + if self._save_dir is not None: + path_list = os.path.normpath(self._save_dir).split(os.sep) + exp_name = f'{path_list[-2]}_{path_list[-1]}' + else: + exp_name = datetime.now().strftime('%Y%m%d_%H%M%S') + + if self._init_kwargs is None: + self._init_kwargs = {} + self._init_kwargs.setdefault('experiment', exp_name) + self._aim_run = Run(**self._init_kwargs) + + @property # type: ignore + @force_init_env + def experiment(self): + """Return Aim object.""" + return self._aim_run + + @force_init_env + def add_config(self, config, **kwargs) -> None: + """Record the config to Aim. + + Args: + config (Config): The Config object + """ + if isinstance(config, Config): + config = config.to_dict() + self._aim_run['hparams'] = config + + @force_init_env + def add_image(self, + name: str, + image: np.ndarray, + step: int = 0, + **kwargs) -> None: + """Record the image. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to be saved. The format + should be RGB. Defaults to None. + step (int): Global step value to record. Defaults to 0. + """ + from aim import Image + self._aim_run.track(name=name, value=Image(image), step=step) + + @force_init_env + def add_scalar(self, + name: str, + value: Union[int, float, torch.Tensor, np.ndarray], + step: int = 0, + **kwargs) -> None: + """Record the scalar data to Aim. + + Args: + name (str): The scalar identifier. + value (int, float, torch.Tensor, np.ndarray): Value to save. + step (int): Global step value to record. Default to 0. + """ + self._aim_run.track(name=name, value=value, step=step) + + @force_init_env + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalar's data to wandb. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record. Default to 0. + file_path (str, optional): Useless parameter. Just for + interface unification. Defaults to None. + """ + for key, value in scalar_dict.items(): + self._aim_run.track(name=key, value=value, step=step) + + def close(self) -> None: + """Close the Aim.""" + if not hasattr(self, '_aim_run'): + return + + self._aim_run.close() diff --git a/head_extractor/build/lib/mmengine/visualization/visualizer.py b/head_extractor/build/lib/mmengine/visualization/visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..0e90c184a6e186e8304395f8dd57ba86b0c62804 --- /dev/null +++ b/head_extractor/build/lib/mmengine/visualization/visualizer.py @@ -0,0 +1,1186 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import inspect +import os.path as osp +import warnings +from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union + +if TYPE_CHECKING: + from matplotlib.font_manager import FontProperties + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F + +from mmengine.config import Config +from mmengine.dist import master_only +from mmengine.registry import VISBACKENDS, VISUALIZERS +from mmengine.structures import BaseDataElement +from mmengine.utils import ManagerMixin, is_seq_of +from mmengine.visualization.utils import (check_type, check_type_and_length, + color_str2rgb, color_val_matplotlib, + convert_overlay_heatmap, + img_from_canvas, tensor2ndarray, + value2list, wait_continue) +from mmengine.visualization.vis_backend import BaseVisBackend + +VisBackendsType = Union[List[Union[List, BaseDataElement]], BaseDataElement, + dict, None] + + +@VISUALIZERS.register_module() +class Visualizer(ManagerMixin): + """MMEngine provides a Visualizer class that uses the ``Matplotlib`` + library as the backend. It has the following functions: + + - Basic drawing methods + + - draw_bboxes: draw single or multiple bounding boxes + - draw_texts: draw single or multiple text boxes + - draw_points: draw single or multiple points + - draw_lines: draw single or multiple line segments + - draw_circles: draw single or multiple circles + - draw_polygons: draw single or multiple polygons + - draw_binary_masks: draw single or multiple binary masks + - draw_featmap: draw feature map + + - Basic visualizer backend methods + + - add_configs: write config to all vis storage backends + - add_graph: write model graph to all vis storage backends + - add_image: write image to all vis storage backends + - add_scalar: write scalar to all vis storage backends + - add_scalars: write scalars to all vis storage backends + - add_datasample: write datasample to all vis storage \ + backends. The abstract drawing interface used by the user + + - Basic info methods + + - set_image: sets the original image data + - get_image: get the image data in Numpy format after drawing + - show: visualization + - close: close all resources that have been opened + - get_backend: get the specified vis backend + + + All the basic drawing methods support chain calls, which is convenient for + overlaydrawing and display. Each downstream algorithm library can inherit + ``Visualizer`` and implement the add_datasample logic. For example, + ``DetLocalVisualizer`` in MMDetection inherits from ``Visualizer`` + and implements functions, such as visual detection boxes, instance masks, + and semantic segmentation maps in the add_datasample interface. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): the origin image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + fig_save_cfg (dict): Keyword parameters of figure for saving. + Defaults to empty dict. + fig_show_cfg (dict): Keyword parameters of figure for showing. + Defaults to empty dict. 
+ + Examples: + >>> # Basic info methods + >>> vis = Visualizer() + >>> vis.set_image(image) + >>> vis.get_image() + >>> vis.show() + + >>> # Basic drawing methods + >>> vis = Visualizer(image=image) + >>> vis.draw_bboxes(np.array([0, 0, 1, 1]), edge_colors='g') + >>> vis.draw_bboxes(bboxes=np.array([[1, 1, 2, 2], [2, 2, 3, 3]]), + >>> edge_colors=['g', 'r']) + >>> vis.draw_lines(x_datas=np.array([1, 3]), + >>> y_datas=np.array([1, 3]), + >>> colors='r', line_widths=1) + >>> vis.draw_lines(x_datas=np.array([[1, 3], [2, 4]]), + >>> y_datas=np.array([[1, 3], [2, 4]]), + >>> colors=['r', 'r'], line_widths=[1, 2]) + >>> vis.draw_texts(texts='MMEngine', + >>> positions=np.array([2, 2]), + >>> colors='b') + >>> vis.draw_texts(texts=['MMEngine', 'OpenMMLab'], + >>> positions=np.array([[2, 2], [5, 5]]), + >>> colors=['b', 'b']) + >>> vis.draw_circles(center=np.array([2, 2]), radius=np.array([1])) + >>> vis.draw_circles(center=np.array([[2, 2], [3, 5]]), + >>> radius=np.array([1, 2]), edge_colors=['g', 'r']) + >>> square = np.array([[0, 0], [100, 0], [100, 100], [0, 100]]) + >>> vis.draw_polygons(polygons=square, edge_colors='g') + >>> squares = [np.array([[0, 0], [100, 0], [100, 100], [0, 100]]), + >>> np.array([[0, 0], [50, 0], [50, 50], [0, 50]])] + >>> vis.draw_polygons(polygons=squares, edge_colors=['g', 'r']) + >>> vis.draw_binary_masks(binary_mask, alphas=0.6) + >>> heatmap = vis.draw_featmap(featmap, img, + >>> channel_reduction='select_max') + >>> heatmap = vis.draw_featmap(featmap, img, channel_reduction=None, + >>> topk=8, arrangement=(4, 2)) + >>> heatmap = vis.draw_featmap(featmap, img, channel_reduction=None, + >>> topk=-1) + + >>> # chain calls + >>> vis.draw_bboxes().draw_texts().draw_circles().draw_binary_masks() + + >>> # Backend related methods + >>> vis = Visualizer(vis_backends=[dict(type='LocalVisBackend')], + >>> save_dir='temp_dir') + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> vis.add_config(cfg) + >>> image = np.random.randint(0, 256, size=(10, 10, 3)).astype(np.uint8) + >>> vis.add_image('image', image) + >>> vis.add_scalar('mAP', 0.6) + >>> vis.add_scalars({'loss': 0.1, 'acc': 0.8}) + + >>> # inherit + >>> class DetLocalVisualizer(Visualizer): + >>> def add_datasample(self, + >>> name, + >>> image: np.ndarray, + >>> gt_sample: + >>> Optional['BaseDataElement'] = None, + >>> pred_sample: + >>> Optional['BaseDataElement'] = None, + >>> draw_gt: bool = True, + >>> draw_pred: bool = True, + >>> show: bool = False, + >>> wait_time: int = 0, + >>> step: int = 0) -> None: + >>> pass + """ + + def __init__( + self, + name='visualizer', + image: Optional[np.ndarray] = None, + vis_backends: VisBackendsType = None, + save_dir: Optional[str] = None, + fig_save_cfg=dict(frameon=False), + fig_show_cfg=dict(frameon=False) + ) -> None: + super().__init__(name) + self._dataset_meta: Optional[dict] = None + self._vis_backends: Dict[str, BaseVisBackend] = {} + + if vis_backends is None: + vis_backends = [] + + if isinstance(vis_backends, (dict, BaseVisBackend)): + vis_backends = [vis_backends] # type: ignore + + if not is_seq_of(vis_backends, (dict, BaseVisBackend)): + raise TypeError('vis_backends must be a list of dicts or a list ' + 'of BaseBackend instances') + if save_dir is not None: + save_dir = osp.join(save_dir, 'vis_data') + + for vis_backend in vis_backends: # type: ignore + name = None + if isinstance(vis_backend, dict): + name = vis_backend.pop('name', None) + vis_backend.setdefault('save_dir', save_dir) + vis_backend = VISBACKENDS.build(vis_backend) + + # If vis_backend requires
`save_dir` (with no default value) + # but is initialized with None, then don't add this + # vis_backend to the visualizer. + save_dir_arg = inspect.signature( + vis_backend.__class__.__init__).parameters.get('save_dir') + if (save_dir_arg is not None + and save_dir_arg.default is save_dir_arg.empty + and getattr(vis_backend, '_save_dir') is None): + warnings.warn(f'Failed to add {vis_backend.__class__}, ' + 'please provide the `save_dir` argument.') + continue + + type_name = vis_backend.__class__.__name__ + name = name or type_name + + if name in self._vis_backends: + raise RuntimeError(f'vis_backend name {name} already exists') + self._vis_backends[name] = vis_backend # type: ignore + + self.fig_save = None + self.fig_save_cfg = fig_save_cfg + self.fig_show_cfg = fig_show_cfg + + (self.fig_save_canvas, self.fig_save, + self.ax_save) = self._initialize_fig(fig_save_cfg) + self.dpi = self.fig_save.get_dpi() + + if image is not None: + self.set_image(image) + + @property # type: ignore + @master_only + def dataset_meta(self) -> Optional[dict]: + """Optional[dict]: Meta info of the dataset.""" + return self._dataset_meta + + @dataset_meta.setter # type: ignore + @master_only + def dataset_meta(self, dataset_meta: dict) -> None: + """Set the dataset meta info to the Visualizer.""" + self._dataset_meta = dataset_meta + + @master_only + def show(self, + drawn_img: Optional[np.ndarray] = None, + win_name: str = 'image', + wait_time: float = 0., + continue_key: str = ' ', + backend: str = 'matplotlib') -> None: + """Show the drawn image. + + Args: + drawn_img (np.ndarray, optional): The image to show. If drawn_img + is None, it will show the image got by Visualizer. Defaults + to None. + win_name (str): The image title. Defaults to 'image'. + wait_time (float): Delay in seconds. 0 is the special + value that means "forever". Defaults to 0. + continue_key (str): The key for users to continue. Defaults to + the space key. + backend (str): The backend to show the image. Defaults to + 'matplotlib'. `New in version 0.7.3.` + """ + if backend == 'matplotlib': + import matplotlib.pyplot as plt + is_inline = 'inline' in plt.get_backend() + img = self.get_image() if drawn_img is None else drawn_img + self._init_manager(win_name) + fig = self.manager.canvas.figure + # remove white edges by set subplot margin + fig.subplots_adjust(left=0, right=1, bottom=0, top=1) + fig.clear() + ax = fig.add_subplot() + ax.axis(False) + ax.imshow(img) + self.manager.canvas.draw() + + # Find a better way for inline to show the image + if is_inline: + return fig + wait_continue(fig, timeout=wait_time, continue_key=continue_key) + elif backend == 'cv2': + # Keep images are shown in the same window, and the title of window + # will be updated with `win_name`. + cv2.namedWindow(winname=f'{id(self)}') + cv2.setWindowTitle(f'{id(self)}', win_name) + cv2.imshow( + str(id(self)), + self.get_image() if drawn_img is None else drawn_img) + cv2.waitKey(int(np.ceil(wait_time * 1000))) + else: + raise ValueError('backend should be "matplotlib" or "cv2", ' + f'but got {backend} instead') + + @master_only + def set_image(self, image: np.ndarray) -> None: + """Set the image to draw. + + Args: + image (np.ndarray): The image to draw. 
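+ + Example (a minimal sketch; any RGB array castable to uint8 + works): + + >>> import numpy as np + >>> vis = Visualizer() + >>> img = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8) + >>> vis.set_image(img) + >>> drawn = vis.get_image() # RGB ndarray of the current canvas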
+ """ + assert image is not None + image = image.astype('uint8') + self._image = image + self.width, self.height = image.shape[1], image.shape[0] + self._default_font_size = max( + np.sqrt(self.height * self.width) // 90, 10) + + # add a small 1e-2 to avoid precision lost due to matplotlib's + # truncation (https://github.com/matplotlib/matplotlib/issues/15363) + self.fig_save.set_size_inches( # type: ignore + (self.width + 1e-2) / self.dpi, (self.height + 1e-2) / self.dpi) + # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig) + self.ax_save.cla() + self.ax_save.axis(False) + self.ax_save.imshow( + image, + extent=(0, self.width, self.height, 0), + interpolation='none') + + @master_only + def get_image(self) -> np.ndarray: + """Get the drawn image. The format is RGB. + + Returns: + np.ndarray: the drawn image which channel is RGB. + """ + assert self._image is not None, 'Please set image using `set_image`' + return img_from_canvas(self.fig_save_canvas) # type: ignore + + def _initialize_fig(self, fig_cfg) -> tuple: + """Build figure according to fig_cfg. + + Args: + fig_cfg (dict): The config to build figure. + + Returns: + tuple: build canvas figure and axes. + """ + from matplotlib.backends.backend_agg import FigureCanvasAgg + from matplotlib.figure import Figure + fig = Figure(**fig_cfg) + ax = fig.add_subplot() + ax.axis(False) + + # remove white edges by set subplot margin + fig.subplots_adjust(left=0, right=1, bottom=0, top=1) + canvas = FigureCanvasAgg(fig) + return canvas, fig, ax + + def _init_manager(self, win_name: str) -> None: + """Initialize the matplot manager. + + Args: + win_name (str): The window name. + """ + from matplotlib.figure import Figure + from matplotlib.pyplot import new_figure_manager + if getattr(self, 'manager', None) is None: + self.manager = new_figure_manager( + num=1, FigureClass=Figure, **self.fig_show_cfg) + + try: + self.manager.set_window_title(win_name) + except Exception: + self.manager = new_figure_manager( + num=1, FigureClass=Figure, **self.fig_show_cfg) + self.manager.set_window_title(win_name) + + @master_only + def get_backend(self, name) -> 'BaseVisBackend': + """get vis backend by name. + + Args: + name (str): The name of vis backend + + Returns: + BaseVisBackend: The vis backend. + """ + return self._vis_backends.get(name) # type: ignore + + def _is_posion_valid(self, position: np.ndarray) -> bool: + """Judge whether the position is in image. + + Args: + position (np.ndarray): The position to judge which last dim must + be two and the format is [x, y]. + + Returns: + bool: Whether the position is in image. + """ + flag = (position[..., 0] < self.width).all() and \ + (position[..., 0] >= 0).all() and \ + (position[..., 1] < self.height).all() and \ + (position[..., 1] >= 0).all() + return flag + + @master_only + def draw_points(self, + positions: Union[np.ndarray, torch.Tensor], + colors: Union[str, tuple, List[str], List[tuple]] = 'g', + marker: Optional[str] = None, + sizes: Optional[Union[np.ndarray, torch.Tensor]] = None): + """Draw single or multiple points. + + Args: + positions (Union[np.ndarray, torch.Tensor]): Positions to draw. + colors (Union[str, tuple, List[str], List[tuple]]): The colors + of points. ``colors`` can have the same length with points or + just single value. If ``colors`` is single value, all the + points will have the same colors. Reference to + https://matplotlib.org/stable/gallery/color/named_colors.html + for more details. Defaults to 'g. + marker (str, optional): The marker style. 
+ See :mod:`matplotlib.markers` for more information about + marker styles. Defaults to None. + sizes (Optional[Union[np.ndarray, torch.Tensor]]): The marker size. + Defaults to None. + """ + check_type('positions', positions, (np.ndarray, torch.Tensor)) + positions = tensor2ndarray(positions) + + if len(positions.shape) == 1: + positions = positions[None] + assert positions.shape[-1] == 2, ( + 'The shape of `positions` should be (N, 2), ' + f'but got {positions.shape}') + colors = color_val_matplotlib(colors) # type: ignore + self.ax_save.scatter( + positions[:, 0], positions[:, 1], c=colors, s=sizes, marker=marker) + return self + + @master_only + def draw_texts( + self, + texts: Union[str, List[str]], + positions: Union[np.ndarray, torch.Tensor], + font_sizes: Optional[Union[int, List[int]]] = None, + colors: Union[str, tuple, List[str], List[tuple]] = 'g', + vertical_alignments: Union[str, List[str]] = 'top', + horizontal_alignments: Union[str, List[str]] = 'left', + font_families: Union[str, List[str]] = 'sans-serif', + bboxes: Optional[Union[dict, List[dict]]] = None, + font_properties: Optional[Union['FontProperties', + List['FontProperties']]] = None + ) -> 'Visualizer': + """Draw single or multiple text boxes. + + Args: + texts (Union[str, List[str]]): Texts to draw. + positions (Union[np.ndarray, torch.Tensor]): The position to draw + the texts, which should have the same length with texts and + each dim contain x and y. + font_sizes (Union[int, List[int]], optional): The font size of + texts. ``font_sizes`` can have the same length with texts or + just single value. If ``font_sizes`` is single value, all the + texts will have the same font size. Defaults to None. + colors (Union[str, tuple, List[str], List[tuple]]): The colors + of texts. ``colors`` can have the same length with texts or + just single value. If ``colors`` is single value, all the + texts will have the same colors. Reference to + https://matplotlib.org/stable/gallery/color/named_colors.html + for more details. Defaults to 'g. + vertical_alignments (Union[str, List[str]]): The verticalalignment + of texts. verticalalignment controls whether the y positional + argument for the text indicates the bottom, center or top side + of the text bounding box. + ``vertical_alignments`` can have the same length with + texts or just single value. If ``vertical_alignments`` is + single value, all the texts will have the same + verticalalignment. verticalalignment can be 'center' or + 'top', 'bottom' or 'baseline'. Defaults to 'top'. + horizontal_alignments (Union[str, List[str]]): The + horizontalalignment of texts. Horizontalalignment controls + whether the x positional argument for the text indicates the + left, center or right side of the text bounding box. + ``horizontal_alignments`` can have + the same length with texts or just single value. + If ``horizontal_alignments`` is single value, all the texts + will have the same horizontalalignment. Horizontalalignment + can be 'center','right' or 'left'. Defaults to 'left'. + font_families (Union[str, List[str]]): The font family of + texts. ``font_families`` can have the same length with texts or + just single value. If ``font_families`` is single value, all + the texts will have the same font family. + font_familiy can be 'serif', 'sans-serif', 'cursive', 'fantasy' + or 'monospace'. Defaults to 'sans-serif'. + bboxes (Union[dict, List[dict]], optional): The bounding box of the + texts. If bboxes is None, there are no bounding box around + texts. 
``bboxes`` can have the same length with texts or + just single value. If ``bboxes`` is single value, all + the texts will have the same bbox. Reference to + https://matplotlib.org/stable/api/_as_gen/matplotlib.patches.FancyBboxPatch.html#matplotlib.patches.FancyBboxPatch + for more details. Defaults to None. + font_properties (Union[FontProperties, List[FontProperties]], optional): + The font properties of texts. FontProperties is + a ``font_manager.FontProperties()`` object. + If you want to draw Chinese texts, you need to prepare + a font file that can show Chinese characters properly. + For example: `simhei.ttf`, `simsun.ttc`, `simkai.ttf` and so on. + Then set ``font_properties=matplotlib.font_manager.FontProperties(fname='path/to/font_file')`` + ``font_properties`` can have the same length with texts or + just single value. If ``font_properties`` is single value, + all the texts will have the same font properties. + Defaults to None. + `New in version 0.6.0.` + """ # noqa: E501 + from matplotlib.font_manager import FontProperties + check_type('texts', texts, (str, list)) + if isinstance(texts, str): + texts = [texts] + num_text = len(texts) + check_type('positions', positions, (np.ndarray, torch.Tensor)) + positions = tensor2ndarray(positions) + if len(positions.shape) == 1: + positions = positions[None] + assert positions.shape == (num_text, 2), ( + '`positions` should have the shape of ' + f'({num_text}, 2), but got {positions.shape}') + if not self._is_posion_valid(positions): + warnings.warn( + 'Warning: The text is out of bounds,' + ' the drawn text may not be in the image', UserWarning) + positions = positions.tolist() + + if font_sizes is None: + font_sizes = self._default_font_size + check_type_and_length('font_sizes', font_sizes, (int, float, list), + num_text) + font_sizes = value2list(font_sizes, (int, float), num_text) + + check_type_and_length('colors', colors, (str, tuple, list), num_text) + colors = value2list(colors, (str, tuple), num_text) + colors = color_val_matplotlib(colors) # type: ignore + + check_type_and_length('vertical_alignments', vertical_alignments, + (str, list), num_text) + vertical_alignments = value2list(vertical_alignments, str, num_text) + + check_type_and_length('horizontal_alignments', horizontal_alignments, + (str, list), num_text) + horizontal_alignments = value2list(horizontal_alignments, str, + num_text) + + check_type_and_length('font_families', font_families, (str, list), + num_text) + font_families = value2list(font_families, str, num_text) + + if font_properties is None: + font_properties = [None for _ in range(num_text)] # type: ignore + else: + check_type_and_length('font_properties', font_properties, + (FontProperties, list), num_text) + font_properties = value2list(font_properties, FontProperties, + num_text) + + if bboxes is None: + bboxes = [None for _ in range(num_text)] # type: ignore + else: + check_type_and_length('bboxes', bboxes, (dict, list), num_text) + bboxes = value2list(bboxes, dict, num_text) + + for i in range(num_text): + self.ax_save.text( + positions[i][0], + positions[i][1], + texts[i], + size=font_sizes[i], # type: ignore + bbox=bboxes[i], # type: ignore + verticalalignment=vertical_alignments[i], + horizontalalignment=horizontal_alignments[i], + family=font_families[i], + fontproperties=font_properties[i], + color=colors[i]) + return self + + @master_only + def draw_lines( + self, + x_datas: Union[np.ndarray, torch.Tensor], + y_datas: Union[np.ndarray, torch.Tensor], + colors: Union[str, tuple, List[str], 
List[tuple]] = 'g', + line_styles: Union[str, List[str]] = '-', + line_widths: Union[Union[int, float], List[Union[int, float]]] = 2 + ) -> 'Visualizer': + """Draw single or multiple line segments. + + Args: + x_datas (Union[np.ndarray, torch.Tensor]): The x coordinate of + each line's start and end points. + y_datas (Union[np.ndarray, torch.Tensor]): The y coordinate of + each line's start and end points. + colors (Union[str, tuple, List[str], List[tuple]]): The colors of + lines. ``colors`` can have the same length with lines or just + single value. If ``colors`` is single value, all the lines + will have the same colors. Reference to + https://matplotlib.org/stable/gallery/color/named_colors.html + for more details. Defaults to 'g'. + line_styles (Union[str, List[str]]): The linestyle + of lines. ``line_styles`` can have the same length with + texts or just single value. If ``line_styles`` is single + value, all the lines will have the same linestyle. + Reference to + https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle + for more details. Defaults to '-'. + line_widths (Union[Union[int, float], List[Union[int, float]]]): + The linewidth of lines. ``line_widths`` can have + the same length with lines or just single value. + If ``line_widths`` is single value, all the lines will + have the same linewidth. Defaults to 2. + """ + from matplotlib.collections import LineCollection + check_type('x_datas', x_datas, (np.ndarray, torch.Tensor)) + x_datas = tensor2ndarray(x_datas) + check_type('y_datas', y_datas, (np.ndarray, torch.Tensor)) + y_datas = tensor2ndarray(y_datas) + assert x_datas.shape == y_datas.shape, ( + '`x_datas` and `y_datas` should have the same shape') + assert x_datas.shape[-1] == 2, ( + f'The shape of `x_datas` should be (N, 2), but got {x_datas.shape}' + ) + if len(x_datas.shape) == 1: + x_datas = x_datas[None] + y_datas = y_datas[None] + colors = color_val_matplotlib(colors) # type: ignore + lines = np.concatenate( + (x_datas.reshape(-1, 2, 1), y_datas.reshape(-1, 2, 1)), axis=-1) + if not self._is_posion_valid(lines): + warnings.warn( + 'Warning: The line is out of bounds,' + ' the drawn line may not be in the image', UserWarning) + line_collect = LineCollection( + lines.tolist(), + colors=colors, + linestyles=line_styles, + linewidths=line_widths) + self.ax_save.add_collection(line_collect) + return self + + @master_only + def draw_circles( + self, + center: Union[np.ndarray, torch.Tensor], + radius: Union[np.ndarray, torch.Tensor], + edge_colors: Union[str, tuple, List[str], List[tuple]] = 'g', + line_styles: Union[str, List[str]] = '-', + line_widths: Union[Union[int, float], List[Union[int, float]]] = 2, + face_colors: Union[str, tuple, List[str], List[tuple]] = 'none', + alpha: Union[float, int] = 0.8, + ) -> 'Visualizer': + """Draw single or multiple circles. + + Args: + center (Union[np.ndarray, torch.Tensor]): The center coordinates + of the circles, with each row in [x, y] format. + radius (Union[np.ndarray, torch.Tensor]): The radius of + each circle. + edge_colors (Union[str, tuple, List[str], List[tuple]]): The + colors of circles. ``colors`` can have the same length with + lines or just single value. If ``colors`` is single value, + all the lines will have the same colors. Reference to + https://matplotlib.org/stable/gallery/color/named_colors.html + for more details. Defaults to 'g'. + line_styles (Union[str, List[str]]): The linestyle + of lines.
``line_styles`` can have the same length with + texts or just single value. If ``line_styles`` is single + value, all the lines will have the same linestyle. + Reference to + https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle + for more details. Defaults to '-'. + line_widths (Union[Union[int, float], List[Union[int, float]]]): + The linewidth of lines. ``line_widths`` can have + the same length with lines or just single value. + If ``line_widths`` is single value, all the lines will + have the same linewidth. Defaults to 2. + face_colors (Union[str, tuple, List[str], List[tuple]]): + The face colors. Defaults to None. + alpha (Union[int, float]): The transparency of circles. + Defaults to 0.8. + """ + from matplotlib.collections import PatchCollection + from matplotlib.patches import Circle + check_type('center', center, (np.ndarray, torch.Tensor)) + center = tensor2ndarray(center) + check_type('radius', radius, (np.ndarray, torch.Tensor)) + radius = tensor2ndarray(radius) + if len(center.shape) == 1: + center = center[None] + assert center.shape == (radius.shape[0], 2), ( + 'The shape of `center` should be (radius.shape, 2), ' + f'but got {center.shape}') + if not (self._is_posion_valid(center - + np.tile(radius.reshape((-1, 1)), (1, 2))) + and self._is_posion_valid( + center + np.tile(radius.reshape((-1, 1)), (1, 2)))): + warnings.warn( + 'Warning: The circle is out of bounds,' + ' the drawn circle may not be in the image', UserWarning) + + center = center.tolist() + radius = radius.tolist() + edge_colors = color_val_matplotlib(edge_colors) # type: ignore + face_colors = color_val_matplotlib(face_colors) # type: ignore + circles = [] + for i in range(len(center)): + circles.append(Circle(tuple(center[i]), radius[i])) + + if isinstance(line_widths, (int, float)): + line_widths = [line_widths] * len(circles) + line_widths = [ + min(max(linewidth, 1), self._default_font_size / 4) + for linewidth in line_widths + ] + p = PatchCollection( + circles, + alpha=alpha, + facecolors=face_colors, + edgecolors=edge_colors, + linewidths=line_widths, + linestyles=line_styles) + self.ax_save.add_collection(p) + return self + + @master_only + def draw_bboxes( + self, + bboxes: Union[np.ndarray, torch.Tensor], + edge_colors: Union[str, tuple, List[str], List[tuple]] = 'g', + line_styles: Union[str, List[str]] = '-', + line_widths: Union[Union[int, float], List[Union[int, float]]] = 2, + face_colors: Union[str, tuple, List[str], List[tuple]] = 'none', + alpha: Union[int, float] = 0.8, + ) -> 'Visualizer': + """Draw single or multiple bboxes. + + Args: + bboxes (Union[np.ndarray, torch.Tensor]): The bboxes to draw with + the format of(x1,y1,x2,y2). + edge_colors (Union[str, tuple, List[str], List[tuple]]): The + colors of bboxes. ``colors`` can have the same length with + lines or just single value. If ``colors`` is single value, all + the lines will have the same colors. Refer to `matplotlib. + colors` for full list of formats that are accepted. + Defaults to 'g'. + line_styles (Union[str, List[str]]): The linestyle + of lines. ``line_styles`` can have the same length with + texts or just single value. If ``line_styles`` is single + value, all the lines will have the same linestyle. + Reference to + https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle + for more details. Defaults to '-'. 
+ line_widths (Union[Union[int, float], List[Union[int, float]]]): + The linewidth of lines. ``line_widths`` can have + the same length with lines or just single value. + If ``line_widths`` is single value, all the lines will + have the same linewidth. Defaults to 2. + face_colors (Union[str, tuple, List[str], List[tuple]]): + The face colors. Defaults to 'none'. + alpha (Union[int, float]): The transparency of bboxes. + Defaults to 0.8. + """ + check_type('bboxes', bboxes, (np.ndarray, torch.Tensor)) + bboxes = tensor2ndarray(bboxes) + + if len(bboxes.shape) == 1: + bboxes = bboxes[None] + assert bboxes.shape[-1] == 4, ( + f'The shape of `bboxes` should be (N, 4), but got {bboxes.shape}') + + assert (bboxes[:, 0] <= bboxes[:, 2]).all() and (bboxes[:, 1] <= + bboxes[:, 3]).all() + if not self._is_posion_valid(bboxes.reshape((-1, 2, 2))): + warnings.warn( + 'Warning: The bbox is out of bounds,' + ' the drawn bbox may not be in the image', UserWarning) + poly = np.stack( + (bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 1], + bboxes[:, 2], bboxes[:, 3], bboxes[:, 0], bboxes[:, 3]), + axis=-1).reshape(-1, 4, 2) + poly = [p for p in poly] + return self.draw_polygons( + poly, + alpha=alpha, + edge_colors=edge_colors, + line_styles=line_styles, + line_widths=line_widths, + face_colors=face_colors) + + @master_only + def draw_polygons( + self, + polygons: Union[Union[np.ndarray, torch.Tensor], + List[Union[np.ndarray, torch.Tensor]]], + edge_colors: Union[str, tuple, List[str], List[tuple]] = 'g', + line_styles: Union[str, List[str]] = '-', + line_widths: Union[Union[int, float], List[Union[int, float]]] = 2, + face_colors: Union[str, tuple, List[str], List[tuple]] = 'none', + alpha: Union[int, float] = 0.8, + ) -> 'Visualizer': + """Draw single or multiple polygons. + + Args: + polygons (Union[Union[np.ndarray, torch.Tensor],\ + List[Union[np.ndarray, torch.Tensor]]]): The polygons to draw + with the format of (x1,y1,x2,y2,...,xn,yn). + edge_colors (Union[str, tuple, List[str], List[tuple]]): The + colors of polygons. ``colors`` can have the same length with + lines or just single value. If ``colors`` is single value, + all the lines will have the same colors. Refer to + `matplotlib.colors` for full list of formats that are accepted. + Defaults to 'g'. + line_styles (Union[str, List[str]]): The linestyle + of lines. ``line_styles`` can have the same length with + texts or just single value. If ``line_styles`` is single + value, all the lines will have the same linestyle. + Reference to + https://matplotlib.org/stable/api/collections_api.html?highlight=collection#matplotlib.collections.AsteriskPolygonCollection.set_linestyle + for more details. Defaults to '-'. + line_widths (Union[Union[int, float], List[Union[int, float]]]): + The linewidth of lines. ``line_widths`` can have + the same length with lines or just single value. + If ``line_widths`` is single value, all the lines will + have the same linewidth. Defaults to 2. + face_colors (Union[str, tuple, List[str], List[tuple]]): + The face colors. Defaults to 'none'. + alpha (Union[int, float]): The transparency of polygons. + Defaults to 0.8.
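+ + Example (a minimal sketch; assumes ``vis`` is a Visualizer + whose image was set via ``set_image``): + + >>> square = np.array([[0, 0], [100, 0], [100, 100], [0, 100]]) + >>> vis.draw_polygons(square, edge_colors='g', line_widths=2)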
+ """ + from matplotlib.collections import PolyCollection + check_type('polygons', polygons, (list, np.ndarray, torch.Tensor)) + edge_colors = color_val_matplotlib(edge_colors) # type: ignore + face_colors = color_val_matplotlib(face_colors) # type: ignore + + if isinstance(polygons, (np.ndarray, torch.Tensor)): + polygons = [polygons] + if isinstance(polygons, list): + for polygon in polygons: + assert polygon.shape[1] == 2, ( + 'The shape of each polygon in `polygons` should be (M, 2),' + f' but got {polygon.shape}') + polygons = [tensor2ndarray(polygon) for polygon in polygons] + for polygon in polygons: + if not self._is_posion_valid(polygon): + warnings.warn( + 'Warning: The polygon is out of bounds,' + ' the drawn polygon may not be in the image', UserWarning) + if isinstance(line_widths, (int, float)): + line_widths = [line_widths] * len(polygons) + line_widths = [ + min(max(linewidth, 1), self._default_font_size / 4) + for linewidth in line_widths + ] + polygon_collection = PolyCollection( + polygons, + alpha=alpha, + facecolor=face_colors, + linestyles=line_styles, + edgecolors=edge_colors, + linewidths=line_widths) + + self.ax_save.add_collection(polygon_collection) + return self + + @master_only + def draw_binary_masks( + self, + binary_masks: Union[np.ndarray, torch.Tensor], + colors: Union[str, tuple, List[str], List[tuple]] = 'g', + alphas: Union[float, List[float]] = 0.8) -> 'Visualizer': + """Draw single or multiple binary masks. + + Args: + binary_masks (np.ndarray, torch.Tensor): The binary_masks to draw + with of shape (N, H, W), where H is the image height and W is + the image width. Each value in the array is either a 0 or 1 + value of uint8 type. + colors (np.ndarray): The colors which binary_masks will convert to. + ``colors`` can have the same length with binary_masks or just + single value. If ``colors`` is single value, all the + binary_masks will convert to the same colors. The colors format + is RGB. Defaults to np.array([0, 255, 0]). + alphas (Union[int, List[int]]): The transparency of masks. + Defaults to 0.8. + """ + check_type('binary_masks', binary_masks, (np.ndarray, torch.Tensor)) + binary_masks = tensor2ndarray(binary_masks) + assert binary_masks.dtype == np.bool_, ( + 'The dtype of binary_masks should be np.bool_, ' + f'but got {binary_masks.dtype}') + binary_masks = binary_masks.astype('uint8') * 255 + img = self.get_image() + if binary_masks.ndim == 2: + binary_masks = binary_masks[None] + assert img.shape[:2] == binary_masks.shape[ + 1:], '`binary_masks` must have ' \ + 'the same shape with image' + binary_mask_len = binary_masks.shape[0] + + check_type_and_length('colors', colors, (str, tuple, list), + binary_mask_len) + colors = value2list(colors, (str, tuple), binary_mask_len) + colors = [ + color_str2rgb(color) if isinstance(color, str) else color + for color in colors + ] + for color in colors: + assert len(color) == 3 + for channel in color: + assert 0 <= channel <= 255 # type: ignore + + if isinstance(alphas, float): + alphas = [alphas] * binary_mask_len + + for binary_mask, color, alpha in zip(binary_masks, colors, alphas): + binary_mask_complement = cv2.bitwise_not(binary_mask) + rgb = np.zeros_like(img) + rgb[...] 
= color + rgb = cv2.bitwise_and(rgb, rgb, mask=binary_mask) + img_complement = cv2.bitwise_and( + img, img, mask=binary_mask_complement) + rgb = rgb + img_complement + img = cv2.addWeighted(img, 1 - alpha, rgb, alpha, 0) + self.ax_save.imshow( + img, + extent=(0, self.width, self.height, 0), + interpolation='nearest') + return self + + @staticmethod + @master_only + def draw_featmap(featmap: torch.Tensor, + overlaid_image: Optional[np.ndarray] = None, + channel_reduction: Optional[str] = 'squeeze_mean', + topk: int = 20, + arrangement: Tuple[int, int] = (4, 5), + resize_shape: Optional[tuple] = None, + alpha: float = 0.5) -> np.ndarray: + """Draw featmap. + + - If `overlaid_image` is not None, the final output image will be the + weighted sum of img and featmap. + + - If `resize_shape` is specified, `featmap` and `overlaid_image` + are interpolated. + + - If `resize_shape` is None and `overlaid_image` is not None, + the feature map will be interpolated to the spatial size of the image + in the case where the spatial dimensions of `overlaid_image` and + `featmap` are different. + + - If `channel_reduction` is "squeeze_mean" and "select_max", + it will compress featmap to single channel image and weighted + sum to `overlaid_image`. + + - If `channel_reduction` is None + + - If topk <= 0, featmap is assert to be one or three + channel and treated as image and will be weighted sum + to ``overlaid_image``. + - If topk > 0, it will select topk channel to show by the sum of + each channel. At the same time, you can specify the `arrangement` + to set the window layout. + + Args: + featmap (torch.Tensor): The featmap to draw which format is + (C, H, W). + overlaid_image (np.ndarray, optional): The overlaid image. + Defaults to None. + channel_reduction (str, optional): Reduce multiple channels to a + single channel. The optional value is 'squeeze_mean' + or 'select_max'. Defaults to 'squeeze_mean'. + topk (int): If channel_reduction is not None and topk > 0, + it will select topk channel to show by the sum of each channel. + if topk <= 0, tensor_chw is assert to be one or three. + Defaults to 20. + arrangement (Tuple[int, int]): The arrangement of featmap when + channel_reduction is None and topk > 0. Defaults to (4, 5). + resize_shape (tuple, optional): The shape to scale the feature map. + Defaults to None. + alpha (Union[int, List[int]]): The transparency of featmap. + Defaults to 0.5. + + Returns: + np.ndarray: RGB image. + """ + import matplotlib.pyplot as plt + assert isinstance(featmap, + torch.Tensor), (f'`featmap` should be torch.Tensor,' + f' but got {type(featmap)}') + assert featmap.ndim == 3, f'Input dimension must be 3, ' \ + f'but got {featmap.ndim}' + featmap = featmap.detach().cpu() + + if overlaid_image is not None: + if overlaid_image.ndim == 2: + overlaid_image = cv2.cvtColor(overlaid_image, + cv2.COLOR_GRAY2RGB) + + if overlaid_image.shape[:2] != featmap.shape[1:]: + warnings.warn( + f'Since the spatial dimensions of ' + f'overlaid_image: {overlaid_image.shape[:2]} and ' + f'featmap: {featmap.shape[1:]} are not same, ' + f'the feature map will be interpolated. 
' + f'This may cause mismatch problems !') + if resize_shape is None: + featmap = F.interpolate( + featmap[None], + overlaid_image.shape[:2], + mode='bilinear', + align_corners=False)[0] + + if resize_shape is not None: + featmap = F.interpolate( + featmap[None], + resize_shape, + mode='bilinear', + align_corners=False)[0] + if overlaid_image is not None: + overlaid_image = cv2.resize(overlaid_image, resize_shape[::-1]) + + if channel_reduction is not None: + assert channel_reduction in [ + 'squeeze_mean', 'select_max'], \ + f'Mode only support "squeeze_mean", "select_max", ' \ + f'but got {channel_reduction}' + if channel_reduction == 'select_max': + sum_channel_featmap = torch.sum(featmap, dim=(1, 2)) + _, indices = torch.topk(sum_channel_featmap, 1) + feat_map = featmap[indices] + else: + feat_map = torch.mean(featmap, dim=0) + return convert_overlay_heatmap(feat_map, overlaid_image, alpha) + elif topk <= 0: + featmap_channel = featmap.shape[0] + assert featmap_channel in [ + 1, 3 + ], ('The input tensor channel dimension must be 1 or 3 ' + 'when topk is less than 1, but the channel ' + f'dimension you input is {featmap_channel}, you can use the' + ' channel_reduction parameter or set topk greater than ' + '0 to solve the error') + return convert_overlay_heatmap(featmap, overlaid_image, alpha) + else: + row, col = arrangement + channel, height, width = featmap.shape + assert row * col >= topk, 'The product of row and col in ' \ + 'the `arrangement` is less than ' \ + 'topk, please set the ' \ + '`arrangement` correctly' + + # Extract the feature map of topk + topk = min(channel, topk) + sum_channel_featmap = torch.sum(featmap, dim=(1, 2)) + _, indices = torch.topk(sum_channel_featmap, topk) + topk_featmap = featmap[indices] + + fig = plt.figure(frameon=False) + # Set the window layout + fig.subplots_adjust( + left=0, right=1, bottom=0, top=1, wspace=0, hspace=0) + dpi = fig.get_dpi() + fig.set_size_inches((width * col + 1e-2) / dpi, + (height * row + 1e-2) / dpi) + for i in range(topk): + axes = fig.add_subplot(row, col, i + 1) + axes.axis('off') + axes.text(2, 15, f'channel: {indices[i]}', fontsize=10) + axes.imshow( + convert_overlay_heatmap(topk_featmap[i], overlaid_image, + alpha)) + image = img_from_canvas(fig.canvas) + plt.close(fig) + return image + + @master_only + def add_config(self, config: Config, **kwargs): + """Record the config. + + Args: + config (Config): The Config object. + """ + for vis_backend in self._vis_backends.values(): + vis_backend.add_config(config, **kwargs) + + @master_only + def add_graph(self, model: torch.nn.Module, data_batch: Sequence[dict], + **kwargs) -> None: + """Record the model graph. + + Args: + model (torch.nn.Module): Model to draw. + data_batch (Sequence[dict]): Batch of data from dataloader. + """ + for vis_backend in self._vis_backends.values(): + vis_backend.add_graph(model, data_batch, **kwargs) + + @master_only + def add_image(self, name: str, image: np.ndarray, step: int = 0) -> None: + """Record the image. + + Args: + name (str): The image identifier. + image (np.ndarray, optional): The image to be saved. The format + should be RGB. Defaults to None. + step (int): Global step value to record. Defaults to 0. + """ + for vis_backend in self._vis_backends.values(): + vis_backend.add_image(name, image, step) # type: ignore + + @master_only + def add_scalar(self, + name: str, + value: Union[int, float], + step: int = 0, + **kwargs) -> None: + """Record the scalar data. + + Args: + name (str): The scalar identifier. 
+ value (float, int): Value to save. + step (int): Global step value to record. Defaults to 0. + """ + for vis_backend in self._vis_backends.values(): + vis_backend.add_scalar(name, value, step, **kwargs) # type: ignore + + @master_only + def add_scalars(self, + scalar_dict: dict, + step: int = 0, + file_path: Optional[str] = None, + **kwargs) -> None: + """Record the scalars' data. + + Args: + scalar_dict (dict): Key-value pair storing the tag and + corresponding values. + step (int): Global step value to record. Defaults to 0. + file_path (str, optional): The scalar's data will be + saved to the `file_path` file at the same time + if the `file_path` parameter is specified. + Defaults to None. + """ + for vis_backend in self._vis_backends.values(): + vis_backend.add_scalars(scalar_dict, step, file_path, **kwargs) + + @master_only + def add_datasample(self, + name, + image: np.ndarray, + data_sample: Optional['BaseDataElement'] = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: int = 0, + step: int = 0) -> None: + """Draw datasample.""" + pass + + def close(self) -> None: + """close an opened object.""" + for vis_backend in self._vis_backends.values(): + vis_backend.close() + + @classmethod + def get_instance(cls, name: str, **kwargs) -> 'Visualizer': + """Make subclass can get latest created instance by + ``Visualizer.get_current_instance()``. + + Downstream codebase may need to get the latest created instance + without knowing the specific Visualizer type. For example, mmdetection + builds visualizer in runner and some component which cannot access + runner wants to get latest created visualizer. In this case, + the component does not know which type of visualizer has been built + and cannot get target instance. Therefore, :class:`Visualizer` + overrides the :meth:`get_instance` and its subclass will register + the created instance to :attr:`_instance_dict` additionally. + :meth:`get_current_instance` will return the latest created subclass + instance. + + Examples: + >>> class DetLocalVisualizer(Visualizer): + >>> def __init__(self, name): + >>> super().__init__(name) + >>> + >>> visualizer1 = DetLocalVisualizer.get_instance('name1') + >>> visualizer2 = Visualizer.get_current_instance() + >>> visualizer3 = DetLocalVisualizer.get_current_instance() + >>> assert id(visualizer1) == id(visualizer2) == id(visualizer3) + + Args: + name (str): Name of instance. + + Returns: + object: Corresponding name instance. + """ + instance = super().get_instance(name, **kwargs) + Visualizer._instance_dict[name] = instance + return instance diff --git a/head_extractor/build/lib/mmseg/__init__.py b/head_extractor/build/lib/mmseg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..092a6f161b4801b433a09709b7babfe1074073f7 --- /dev/null +++ b/head_extractor/build/lib/mmseg/__init__.py @@ -0,0 +1,74 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import mmengine +from packaging.version import parse + +from .version import __version__, version_info + +MMCV_MIN = '2.0.0rc4' +MMCV_MAX = '2.2.0' +MMENGINE_MIN = '0.5.0' +MMENGINE_MAX = '1.0.0' + + +def digit_version(version_str: str, length: int = 4): + """Convert a version string into a tuple of integers. + + This method is usually used for comparing two versions. For pre-release + versions: alpha < beta < rc. + + Args: + version_str (str): The version string. + length (int): The maximum number of version levels. Default: 4. 
+ + Returns: + tuple[int]: The version info in digits (integers). + """ + version = parse(version_str) + assert version.release, f'failed to parse version {version_str}' + release = list(version.release) + release = release[:length] + if len(release) < length: + release = release + [0] * (length - len(release)) + if version.is_prerelease: + mapping = {'a': -3, 'b': -2, 'rc': -1} + val = -4 + # version.pre can be None + if version.pre: + if version.pre[0] not in mapping: + warnings.warn(f'unknown prerelease version {version.pre[0]}, ' + 'version checking may go wrong') + else: + val = mapping[version.pre[0]] + release.extend([val, version.pre[-1]]) + else: + release.extend([val, 0]) + + elif version.is_postrelease: + release.extend([1, version.post]) + else: + release.extend([0, 0]) + return tuple(release) + + +mmcv_min_version = digit_version(MMCV_MIN) +mmcv_max_version = digit_version(MMCV_MAX) +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_min_version <= mmcv_version <= mmcv_max_version), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>=2.0.0rc4.' + +mmengine_min_version = digit_version(MMENGINE_MIN) +mmengine_max_version = digit_version(MMENGINE_MAX) +mmengine_version = digit_version(mmengine.__version__) + +assert (mmengine_min_version <= mmengine_version < mmengine_max_version), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_min_version}, '\ + f'<{mmengine_max_version}.' + +__all__ = ['__version__', 'version_info', 'digit_version'] diff --git a/head_extractor/build/lib/mmseg/apis/__init__.py b/head_extractor/build/lib/mmseg/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b50a266319c9cf74cb8b13afcff564248c058732 --- /dev/null +++ b/head_extractor/build/lib/mmseg/apis/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .inference import inference_model, init_model, show_result_pyplot +from .mmseg_inferencer import MMSegInferencer +from .remote_sense_inferencer import RSImage, RSInferencer + +__all__ = [ + 'init_model', 'inference_model', 'show_result_pyplot', 'MMSegInferencer', + 'RSInferencer', 'RSImage' +] diff --git a/head_extractor/build/lib/mmseg/apis/inference.py b/head_extractor/build/lib/mmseg/apis/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e7be9a269972ed8f7f601a2410a7797db08f835f --- /dev/null +++ b/head_extractor/build/lib/mmseg/apis/inference.py @@ -0,0 +1,190 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from pathlib import Path +from typing import Optional, Union + +import mmcv +import numpy as np +import torch +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.utils import mkdir_or_exist + +from mmseg.models import BaseSegmentor +from mmseg.registry import MODELS +from mmseg.structures import SegDataSample +from mmseg.utils import SampleList, dataset_aliases, get_classes, get_palette +from mmseg.visualization import SegLocalVisualizer +from .utils import ImageType, _preprare_data + + +def init_model(config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + device: str = 'cuda:0', + cfg_options: Optional[dict] = None): + """Initialize a segmentor from config file. + + Args: + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. + checkpoint (str, optional): Checkpoint path. 
If left as None, the model + will not load any weights. + device (str, optional): CPU/CUDA device option. Defaults to 'cuda:0'. + Use 'cpu' for loading the model on CPU. + cfg_options (dict, optional): Options to override some settings in + the used config. + Returns: + nn.Module: The constructed segmentor. + """ + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): + raise TypeError('config must be a filename or Config object, ' + 'but got {}'.format(type(config))) + if cfg_options is not None: + config.merge_from_dict(cfg_options) + if config.model.type == 'EncoderDecoder': + if 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + elif config.model.type == 'MultimodalEncoderDecoder': + for k, v in config.model.items(): + if isinstance(v, dict) and 'init_cfg' in v: + config.model[k].init_cfg = None + config.model.pretrained = None + config.model.train_cfg = None + init_default_scope(config.get('default_scope', 'mmseg')) + model = MODELS.build(config.model) + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + dataset_meta = checkpoint['meta'].get('dataset_meta', None) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint.get('meta', {}): + # mmseg 1.x + model.dataset_meta = dataset_meta + elif 'CLASSES' in checkpoint.get('meta', {}): + # < mmseg 1.x + classes = checkpoint['meta']['CLASSES'] + palette = checkpoint['meta']['PALETTE'] + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, classes and palette will be ' + 'set according to num_classes.') + num_classes = model.decode_head.num_classes + dataset_name = None + for name in dataset_aliases.keys(): + if len(get_classes(name)) == num_classes: + dataset_name = name + break + if dataset_name is None: + warnings.warn( + 'No suitable dataset found, use Cityscapes by default') + dataset_name = 'cityscapes' + model.dataset_meta = { + 'classes': get_classes(dataset_name), + 'palette': get_palette(dataset_name) + } + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +def inference_model(model: BaseSegmentor, + img: ImageType) -> Union[SegDataSample, SampleList]: + """Inference image(s) with the segmentor. + + Args: + model (nn.Module): The loaded segmentor. + img (str/ndarray or list[str/ndarray]): Either image files or loaded + images. + + Returns: + :obj:`SegDataSample` or list[:obj:`SegDataSample`]: + If img is a list or tuple, a list of results of the same length + will be returned, otherwise the segmentation result is returned + directly. + """ + # prepare data + data, is_batch = _preprare_data(img, model) + + # data = data.to(dtype=torch.float16) + # forward the model + with torch.no_grad(): + results = model.test_step(data) + + return results if is_batch else results[0] + + +def show_result_pyplot(model: BaseSegmentor, + img: Union[str, np.ndarray], + result: SegDataSample, + opacity: float = 0.5, + title: str = '', + draw_gt: bool = True, + draw_pred: bool = True, + wait_time: float = 0, + show: bool = True, + with_labels: Optional[bool] = True, + save_dir=None, + out_file=None): + """Visualize the segmentation results on the image. + + Args: + model (nn.Module): The loaded segmentor. + img (str or np.ndarray): Image filename or loaded image.
+ result (SegDataSample): The prediction SegDataSample result. + opacity (float): Opacity of the painted segmentation map. + Defaults to 0.5. Must be in the (0, 1] range. + title (str): The title of the pyplot figure. + Defaults to ''. + draw_gt (bool): Whether to draw the GT SegDataSample. + Defaults to True. + draw_pred (bool): Whether to draw the prediction SegDataSample. + Defaults to True. + wait_time (float): The interval of show (s). 0 is the special value + that means "forever". Defaults to 0. + show (bool): Whether to display the drawn image. + Defaults to True. + with_labels (bool, optional): Add semantic labels in the + visualization result. Defaults to True. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + out_file (str, optional): Path to the output file. Defaults to None. + + Returns: + np.ndarray: the drawn image whose channel order is RGB. + """ + if hasattr(model, 'module'): + model = model.module + if isinstance(img, str): + image = mmcv.imread(img, channel_order='rgb') + else: + image = img + if save_dir is not None: + mkdir_or_exist(save_dir) + # init visualizer + visualizer = SegLocalVisualizer( + vis_backends=[dict(type='LocalVisBackend')], + save_dir=save_dir, + alpha=opacity) + visualizer.dataset_meta = dict( + classes=model.dataset_meta['classes'], + palette=model.dataset_meta['palette']) + visualizer.add_datasample( + name=title, + image=image, + data_sample=result, + draw_gt=draw_gt, + draw_pred=draw_pred, + wait_time=wait_time, + out_file=out_file, + show=show, + with_labels=with_labels) + vis_img = visualizer.get_image() + + return vis_img diff --git a/head_extractor/build/lib/mmseg/apis/mmseg_inferencer.py b/head_extractor/build/lib/mmseg/apis/mmseg_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..02a198b516a71c1f5a0833955607ba4ecc05bf13 --- /dev/null +++ b/head_extractor/build/lib/mmseg/apis/mmseg_inferencer.py @@ -0,0 +1,382 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import List, Optional, Sequence, Union + +import mmcv +import mmengine +import numpy as np +import torch +import torch.nn as nn +from mmcv.transforms import Compose +from mmengine.infer.infer import BaseInferencer, ModelType +from mmengine.model import revert_sync_batchnorm +from mmengine.registry import init_default_scope +from mmengine.runner.checkpoint import _load_checkpoint_to_model +from PIL import Image + +from mmseg.structures import SegDataSample +from mmseg.utils import ConfigType, SampleList, get_classes, get_palette +from mmseg.visualization import SegLocalVisualizer + +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = Union[SegDataSample, SampleList] + + +class MMSegInferencer(BaseInferencer): + """Semantic segmentation inferencer, provides inference and visualization + interfaces. Note: MMEngine >= 0.5.0 is required. + + Args: + model (str, optional): Path to the config file or the model name + defined in the metafile. Taking the mmseg metafile as an example, + the `model` could be + "fcn_r50-d8_4xb2-40k_cityscapes-512x1024", and the weights of the + model will be downloaded automatically. If a config file is used, + like "configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py", + `weights` should be defined. + weights (str, optional): Path to the checkpoint. If it is not specified + and model is a model name of the metafile, the weights will be + loaded from the metafile. Defaults to None.
+ classes (list, optional): Input classes for result rendering; as the + prediction of a segmentation model is a segment map with label + indices, `classes` is a list whose items correspond to the + label indices. If classes is not defined, the visualizer will take + `cityscapes` classes by default. Defaults to None. + palette (list, optional): Input palette for result rendering, which is + a list of colors corresponding to the classes. If palette is + not defined, the visualizer will take the `cityscapes` palette by + default. Defaults to None. + dataset_name (str, optional): Dataset name or alias; the visualizer + will use the meta information of the dataset, i.e. classes and + palette, but the `classes` and `palette` have higher priority. + Defaults to None. + device (str, optional): Device to run inference. If None, the available + device will be automatically used. Defaults to None. + scope (str, optional): The scope of the model. Defaults to 'mmseg'. + """ # noqa + + preprocess_kwargs: set = set() + forward_kwargs: set = {'mode', 'out_dir'} + visualize_kwargs: set = { + 'show', 'wait_time', 'img_out_dir', 'opacity', 'return_vis', + 'with_labels' + } + postprocess_kwargs: set = {'pred_out_dir', 'return_datasample'} + + def __init__(self, + model: Union[ModelType, str], + weights: Optional[str] = None, + classes: Optional[Union[str, List]] = None, + palette: Optional[Union[str, List]] = None, + dataset_name: Optional[str] = None, + device: Optional[str] = None, + scope: Optional[str] = 'mmseg') -> None: + # A global counter tracking the number of images processed, for + # naming of the output images + self.num_visualized_imgs = 0 + self.num_pred_imgs = 0 + init_default_scope(scope if scope else 'mmseg') + super().__init__( + model=model, weights=weights, device=device, scope=scope) + + if device == 'cpu' or not torch.cuda.is_available(): + self.model = revert_sync_batchnorm(self.model) + + assert isinstance(self.visualizer, SegLocalVisualizer) + self.visualizer.set_dataset_meta(classes, palette, dataset_name) + + def _load_weights_to_model(self, model: nn.Module, + checkpoint: Optional[dict], + cfg: Optional[ConfigType]) -> None: + """Loading model weights and meta information from cfg and checkpoint. + + Subclasses could override this method to load extra meta information + from ``checkpoint`` and ``cfg`` to model. + + Args: + model (nn.Module): Model to load weights and meta information. + checkpoint (dict, optional): The loaded checkpoint. + cfg (Config or ConfigDict, optional): The loaded config.
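+
+        Examples:
+            An illustrative sketch of the two checkpoint meta layouts
+            handled in the body below (class names and colors are made up):
+
+            >>> # mmsegmentation 1.x style
+            >>> ckpt = {'meta': {'dataset_meta': {
+            ...     'classes': ('road', 'car'),
+            ...     'palette': [[128, 64, 128], [0, 0, 142]]}}}
+            >>> # mmsegmentation 0.x style
+            >>> ckpt = {'meta': {'CLASSES': ('road', 'car'),
+            ...                  'PALETTE': [[128, 64, 128], [0, 0, 142]]}}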
+ """ + + if checkpoint is not None: + _load_checkpoint_to_model(model, checkpoint) + checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmsegmentation 1.x + model.dataset_meta = { + 'classes': checkpoint_meta['dataset_meta'].get('classes'), + 'palette': checkpoint_meta['dataset_meta'].get('palette') + } + elif 'CLASSES' in checkpoint_meta: + # mmsegmentation 0.x + classes = checkpoint_meta['CLASSES'] + palette = checkpoint_meta.get('PALETTE', None) + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use classes of Cityscapes by ' + 'default.') + model.dataset_meta = { + 'classes': get_classes('cityscapes'), + 'palette': get_palette('cityscapes') + } + else: + warnings.warn('Checkpoint is not loaded, and the inference ' + 'result is calculated by the randomly initialized ' + 'model!') + warnings.warn( + 'weights is None, use cityscapes classes by default.') + model.dataset_meta = { + 'classes': get_classes('cityscapes'), + 'palette': get_palette('cityscapes') + } + + def __call__(self, + inputs: InputsType, + return_datasamples: bool = False, + batch_size: int = 1, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + out_dir: str = '', + img_out_dir: str = 'vis', + pred_out_dir: str = 'pred', + **kwargs) -> dict: + """Call the inferencer. + + Args: + inputs (Union[list, str, np.ndarray]): Inputs for the inferencer. + return_datasamples (bool): Whether to return results as + :obj:`SegDataSample`. Defaults to False. + batch_size (int): Batch size. Defaults to 1. + show (bool): Whether to display the rendering color segmentation + mask in a popup window. Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + out_dir (str): Output directory of inference results. Defaults + to ''. + img_out_dir (str): Subdirectory of `out_dir`, used to save + rendering color segmentation mask, so `out_dir` must be defined + if you would like to save predicted mask. Defaults to 'vis'. + pred_out_dir (str): Subdirectory of `out_dir`, used to save + predicted mask file, so `out_dir` must be defined if you would + like to save predicted mask. Defaults to 'pred'. + + **kwargs: Other keyword arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + + Returns: + dict: Inference and visualization results. + """ + + if out_dir != '': + pred_out_dir = osp.join(out_dir, pred_out_dir) + img_out_dir = osp.join(out_dir, img_out_dir) + else: + pred_out_dir = '' + img_out_dir = '' + + return super().__call__( + inputs=inputs, + return_datasamples=return_datasamples, + batch_size=batch_size, + show=show, + wait_time=wait_time, + img_out_dir=img_out_dir, + pred_out_dir=pred_out_dir, + return_vis=return_vis, + **kwargs) + + def visualize(self, + inputs: list, + preds: List[dict], + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + img_out_dir: str = '', + opacity: float = 0.8, + with_labels: Optional[bool] = True) -> List[np.ndarray]: + """Visualize predictions. + + Args: + inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`. + preds (Any): Predictions of the model. + show (bool): Whether to display the image in a popup window. 
+ Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + img_out_dir (str): Output directory of the rendered prediction, i.e. + the color segmentation mask. Defaults to ''. + opacity (float): The transparency of the segmentation mask. + Defaults to 0.8. + return_vis (bool): Whether to return the visualization results. + Defaults to False. + with_labels (bool, optional): Add semantic labels in the + visualization result. Defaults to True. + + Returns: + List[np.ndarray]: Visualization results. + """ + if not show and img_out_dir == '' and not return_vis: + return None + if self.visualizer is None: + raise ValueError('Visualization needs the "visualizer" term ' + 'defined in the config, but got None.') + + self.visualizer.set_dataset_meta(**self.model.dataset_meta) + self.visualizer.alpha = opacity + + results = [] + + for single_input, pred in zip(inputs, preds): + if isinstance(single_input, str): + img_bytes = mmengine.fileio.get(single_input) + img = mmcv.imfrombytes(img_bytes) + img = img[:, :, ::-1] + img_name = osp.basename(single_input) + elif isinstance(single_input, np.ndarray): + img = single_input.copy() + img_num = str(self.num_visualized_imgs).zfill(8) + '_vis' + img_name = f'{img_num}.jpg' + else: + raise ValueError('Unsupported input type: ' + f'{type(single_input)}') + + out_file = osp.join(img_out_dir, img_name) if img_out_dir != ''\ + else None + + self.visualizer.add_datasample( + img_name, + img, + pred, + show=show, + wait_time=wait_time, + draw_gt=False, + draw_pred=True, + out_file=out_file, + with_labels=with_labels) + if return_vis: + results.append(self.visualizer.get_image()) + self.num_visualized_imgs += 1 + + return results if return_vis else None + + def postprocess(self, + preds: PredType, + visualization: List[np.ndarray], + return_datasample: bool = False, + pred_out_dir: str = '') -> dict: + """Process the predictions and visualization results from ``forward`` + and ``visualize``. + + This method should be responsible for the following tasks: + + 1. Pack the predictions and visualization results and return them. + 2. Save the predictions, if needed. + + Args: + preds (List[Dict]): Predictions of the model. + visualization (List[np.ndarray]): The list of rendered color + segmentation masks. + return_datasample (bool): Whether to return results as datasamples. + Defaults to False. + pred_out_dir: File to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Inference and visualization results with key ``predictions`` + and ``visualization`` + + - ``visualization (Any)``: Returned by :meth:`visualize` + - ``predictions`` (List[np.ndarray], np.ndarray): Returned by + :meth:`forward` and processed in :meth:`postprocess`. + If ``return_datasample=False``, it will be the segmentation mask + with label indices.
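+
+        Examples:
+            An illustrative sketch of the returned structure for a single
+            semantic segmentation input (shapes are schematic):
+
+            >>> results = inferencer.postprocess(preds, visualization)
+            >>> results['predictions']    # (H, W) array of label indices
+            >>> results['visualization']  # (H, W, 3) rendered RGB mask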
+ """ + if return_datasample: + if len(preds) == 1: + return preds[0] + else: + return preds + + results_dict = {} + + results_dict['predictions'] = [] + results_dict['visualization'] = [] + + for i, pred in enumerate(preds): + pred_data = dict() + if 'pred_sem_seg' in pred.keys(): + pred_data['sem_seg'] = pred.pred_sem_seg.numpy().data[0] + elif 'pred_depth_map' in pred.keys(): + pred_data['depth_map'] = pred.pred_depth_map.numpy().data[0] + + if visualization is not None: + vis = visualization[i] + results_dict['visualization'].append(vis) + if pred_out_dir != '': + mmengine.mkdir_or_exist(pred_out_dir) + for key, data in pred_data.items(): + post_fix = '_pred.png' if key == 'sem_seg' else '_pred.npy' + img_name = str(self.num_pred_imgs).zfill(8) + post_fix + img_path = osp.join(pred_out_dir, img_name) + if key == 'sem_seg': + output = Image.fromarray(data.astype(np.uint8)) + output.save(img_path) + else: + np.save(img_path, data) + pred_data = next(iter(pred_data.values())) + results_dict['predictions'].append(pred_data) + self.num_pred_imgs += 1 + + if len(results_dict['predictions']) == 1: + results_dict['predictions'] = results_dict['predictions'][0] + if visualization is not None: + results_dict['visualization'] = \ + results_dict['visualization'][0] + return results_dict + + def _init_pipeline(self, cfg: ConfigType) -> Compose: + """Initialize the test pipeline. + + Return a pipeline to handle various input data, such as ``str``, + ``np.ndarray``. It is an abstract method in BaseInferencer, and should + be implemented in subclasses. + + The returned pipeline will be used to process a single data. + It will be used in :meth:`preprocess` like this: + + .. code-block:: python + def preprocess(self, inputs, batch_size, **kwargs): + ... + dataset = map(self.pipeline, dataset) + ... + """ + pipeline_cfg = cfg.test_dataloader.dataset.pipeline + # Loading annotations is also not applicable + for transform in ('LoadAnnotations', 'LoadDepthAnnotation'): + idx = self._get_transform_idx(pipeline_cfg, transform) + if idx != -1: + del pipeline_cfg[idx] + + load_img_idx = self._get_transform_idx(pipeline_cfg, + 'LoadImageFromFile') + if load_img_idx == -1: + raise ValueError( + 'LoadImageFromFile is not found in the test pipeline') + pipeline_cfg[load_img_idx]['type'] = 'InferencerLoader' + return Compose(pipeline_cfg) + + def _get_transform_idx(self, pipeline_cfg: ConfigType, name: str) -> int: + """Returns the index of the transform in a pipeline. + + If the transform is not found, returns -1. + """ + for i, transform in enumerate(pipeline_cfg): + if transform['type'] == name: + return i + return -1 diff --git a/head_extractor/build/lib/mmseg/apis/remote_sense_inferencer.py b/head_extractor/build/lib/mmseg/apis/remote_sense_inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..6726c6ae3464b3911f7e69b14a0baf35cffc66d0 --- /dev/null +++ b/head_extractor/build/lib/mmseg/apis/remote_sense_inferencer.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import threading +from queue import Queue +from typing import List, Optional, Tuple + +import numpy as np +import torch +from mmengine import Config +from mmengine.model import BaseModel +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint + +try: + from osgeo import gdal +except ImportError: + gdal = None + +from mmseg.registry import MODELS +from .utils import _preprare_data + + +class RSImage: + """Remote sensing image class. 
+ + Args: + img (str or gdal.Dataset): Image file path or gdal.Dataset. + """ + + def __init__(self, image): + self.dataset = gdal.Open(image, gdal.GA_ReadOnly) if isinstance( + image, str) else image + assert isinstance(self.dataset, gdal.Dataset), \ + f'{image} is not a image' + self.width = self.dataset.RasterXSize + self.height = self.dataset.RasterYSize + self.channel = self.dataset.RasterCount + self.trans = self.dataset.GetGeoTransform() + self.proj = self.dataset.GetProjection() + self.band_list = [] + self.band_list.extend( + self.dataset.GetRasterBand(c + 1) for c in range(self.channel)) + self.grids = [] + + def read(self, grid: Optional[List] = None) -> np.ndarray: + """Read image data. If grid is None, read the whole image. + + Args: + grid (Optional[List], optional): Grid to read. Defaults to None. + Returns: + np.ndarray: Image data. + """ + if grid is None: + return np.einsum('ijk->jki', self.dataset.ReadAsArray()) + assert len( + grid) >= 4, 'grid must be a list containing at least 4 elements' + data = self.dataset.ReadAsArray(*grid[:4]) + if data.ndim == 2: + data = data[np.newaxis, ...] + return np.einsum('ijk->jki', data) + + def write(self, data: Optional[np.ndarray], grid: Optional[List] = None): + """Write image data. + + Args: + grid (Optional[List], optional): Grid to write. Defaults to None. + data (Optional[np.ndarray], optional): Data to write. + Defaults to None. + + Raises: + ValueError: Either grid or data must be provided. + """ + if grid is not None: + assert len(grid) == 8, 'grid must be a list of 8 elements' + for band in self.band_list: + band.WriteArray( + data[grid[5]:grid[5] + grid[7], grid[4]:grid[4] + grid[6]], + grid[0] + grid[4], grid[1] + grid[5]) + elif data is not None: + for i in range(self.channel): + self.band_list[i].WriteArray(data[..., i]) + else: + raise ValueError('Either grid or data must be provided.') + + def create_seg_map(self, output_path: Optional[str] = None): + if output_path is None: + output_path = 'output_label.tif' + driver = gdal.GetDriverByName('GTiff') + seg_map = driver.Create(output_path, self.width, self.height, 1, + gdal.GDT_Byte) + seg_map.SetGeoTransform(self.trans) + seg_map.SetProjection(self.proj) + seg_map_img = RSImage(seg_map) + seg_map_img.path = output_path + return seg_map_img + + def create_grids(self, + window_size: Tuple[int, int], + stride: Tuple[int, int] = (0, 0)): + """Create grids for image inference. + + Args: + window_size (Tuple[int, int]): the size of the sliding window. + stride (Tuple[int, int], optional): the stride of the sliding + window. Defaults to (0, 0). + + Raises: + AssertionError: window_size must be a tuple of 2 elements. + AssertionError: stride must be a tuple of 2 elements. 
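+
+        Examples:
+            A worked sketch of the grid layout produced below; each grid is
+            [x_off, y_off, x_size, y_size, x_crop_off, y_crop_off,
+            x_crop_size, y_crop_size], and ``rs_img`` is a hypothetical
+            1024x1024 ``RSImage``:
+
+            >>> # 512x512 window, stride defaulting to the window size
+            >>> # -> four non-overlapping grids
+            >>> rs_img.create_grids((512, 512))
+            >>> rs_img.grids[0]
+            [0, 0, 512, 512, 0, 0, 512, 512]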
+ """ + assert len( + window_size) == 2, 'window_size must be a tuple of 2 elements' + assert len(stride) == 2, 'stride must be a tuple of 2 elements' + win_w, win_h = window_size + stride_x, stride_y = stride + + stride_x = win_w if stride_x == 0 else stride_x + stride_y = win_h if stride_y == 0 else stride_y + + x_half_overlap = (win_w - stride_x + 1) // 2 + y_half_overlap = (win_h - stride_y + 1) // 2 + + for y in range(0, self.height, stride_y): + y_end = y + win_h >= self.height + y_offset = self.height - win_h if y_end else y + y_size = win_h + y_crop_off = 0 if y_offset == 0 else y_half_overlap + y_crop_size = y_size if y_end else win_h - y_crop_off + + for x in range(0, self.width, stride_x): + x_end = x + win_w >= self.width + x_offset = self.width - win_w if x_end else x + x_size = win_w + x_crop_off = 0 if x_offset == 0 else x_half_overlap + x_crop_size = x_size if x_end else win_w - x_crop_off + + self.grids.append([ + x_offset, y_offset, x_size, y_size, x_crop_off, y_crop_off, + x_crop_size, y_crop_size + ]) + + +class RSInferencer: + """Remote sensing inference class. + + Args: + model (BaseModel): The loaded model. + batch_size (int, optional): Batch size. Defaults to 1. + thread (int, optional): Number of threads. Defaults to 1. + """ + + def __init__(self, model: BaseModel, batch_size: int = 1, thread: int = 1): + self.model = model + self.batch_size = batch_size + self.END_FLAG = object() + self.read_buffer = Queue(self.batch_size) + self.write_buffer = Queue(self.batch_size) + self.thread = thread + + @classmethod + def from_config_path(cls, + config_path: str, + checkpoint_path: str, + batch_size: int = 1, + thread: int = 1, + device: Optional[str] = 'cpu'): + """Initialize a segmentor from config file. + + Args: + config_path (str): Config file path. + checkpoint_path (str): Checkpoint path. + batch_size (int, optional): Batch size. Defaults to 1. + """ + init_default_scope('mmseg') + cfg = Config.fromfile(config_path) + model = MODELS.build(cfg.model) + model.cfg = cfg + load_checkpoint(model, checkpoint_path, map_location='cpu') + model.to(device) + model.eval() + return cls(model, batch_size, thread) + + @classmethod + def from_model(cls, + model: BaseModel, + checkpoint_path: Optional[str] = None, + batch_size: int = 1, + thread: int = 1, + device: Optional[str] = 'cpu'): + """Initialize a segmentor from model. + + Args: + model (BaseModel): The loaded model. + checkpoint_path (Optional[str]): Checkpoint path. + batch_size (int, optional): Batch size. Defaults to 1. + """ + if checkpoint_path is not None: + load_checkpoint(model, checkpoint_path, map_location='cpu') + model.to(device) + return cls(model, batch_size, thread) + + def read(self, + image: RSImage, + window_size: Tuple[int, int], + strides: Tuple[int, int] = (0, 0)): + """Load image data to read buffer. + + Args: + image (RSImage): The image to read. + window_size (Tuple[int, int]): The size of the sliding window. + strides (Tuple[int, int], optional): The stride of the sliding + window. Defaults to (0, 0). 
+ """ + image.create_grids(window_size, strides) + for grid in image.grids: + self.read_buffer.put([grid, image.read(grid=grid)]) + self.read_buffer.put(self.END_FLAG) + + def inference(self): + """Inference image data from read buffer and put the result to write + buffer.""" + while True: + item = self.read_buffer.get() + if item == self.END_FLAG: + self.read_buffer.put(self.END_FLAG) + self.write_buffer.put(item) + break + data, _ = _preprare_data(item[1], self.model) + with torch.no_grad(): + result = self.model.test_step(data) + item[1] = result[0].pred_sem_seg.cpu().data.numpy()[0] + self.write_buffer.put(item) + self.read_buffer.task_done() + + def write(self, image: RSImage, output_path: Optional[str] = None): + """Write image data from write buffer. + + Args: + image (RSImage): The image to write. + output_path (Optional[str], optional): The path to save the + segmentation map. Defaults to None. + """ + seg_map = image.create_seg_map(output_path) + while True: + item = self.write_buffer.get() + if item == self.END_FLAG: + break + seg_map.write(data=item[1], grid=item[0]) + self.write_buffer.task_done() + + def run(self, + image: RSImage, + window_size: Tuple[int, int], + strides: Tuple[int, int] = (0, 0), + output_path: Optional[str] = None): + """Run inference with multi-threading. + + Args: + image (RSImage): The image to inference. + window_size (Tuple[int, int]): The size of the sliding window. + strides (Tuple[int, int], optional): The stride of the sliding + window. Defaults to (0, 0). + output_path (Optional[str], optional): The path to save the + segmentation map. Defaults to None. + """ + read_thread = threading.Thread( + target=self.read, args=(image, window_size, strides)) + read_thread.start() + inference_threads = [] + for _ in range(self.thread): + inference_thread = threading.Thread(target=self.inference) + inference_thread.start() + inference_threads.append(inference_thread) + write_thread = threading.Thread( + target=self.write, args=(image, output_path)) + write_thread.start() + read_thread.join() + for inference_thread in inference_threads: + inference_thread.join() + write_thread.join() diff --git a/head_extractor/build/lib/mmseg/apis/utils.py b/head_extractor/build/lib/mmseg/apis/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..805196dc4644b8d7918be5da341cd4d12b81fc79 --- /dev/null +++ b/head_extractor/build/lib/mmseg/apis/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from collections import defaultdict +from typing import Sequence, Union + +import numpy as np +from mmengine.dataset import Compose +from mmengine.model import BaseModel + +import torch +import cv2 + +ImageType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]] + + +def _preprare_data(imgs: ImageType, model: BaseModel): + + cfg = model.cfg + for t in cfg.test_pipeline: + if t.get('type') == 'LoadAnnotations': + cfg.test_pipeline.remove(t) + + is_batch = True + if not isinstance(imgs, (list, tuple)): + imgs = [imgs] + is_batch = False + + if isinstance(imgs[0], np.ndarray): + cfg.test_pipeline[0]['type'] = 'LoadImageFromNDArray' + + # TODO: Consider using the singleton pattern to avoid building + # a pipeline for each inference + pipeline = Compose(cfg.test_pipeline) + + data = defaultdict(list) + for img in imgs: + if isinstance(img, np.ndarray): + data_ = dict(img=img) + else: + data_ = dict(img_path=img) + data_ = pipeline(data_) + data['inputs'].append(data_['inputs']) + data['data_samples'].append(data_['data_samples']) + # data['inputs'].append(data_['inputs'].to(dtype=torch.float16)) + # data['data_samples'].append(data_['data_samples'].to(dtype=torch.float16)) + + return data, is_batch diff --git a/head_extractor/build/lib/mmseg/configs/_base_/datasets/loveda.py b/head_extractor/build/lib/mmseg/configs/_base_/datasets/loveda.py new file mode 100644 index 0000000000000000000000000000000000000000..eb3d358fe437f46d1ce0e5a4c9b6a6c101b2e297 --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/datasets/loveda.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import (RandomFlip, RandomResize, Resize, + TestTimeAug) +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler + +from mmseg.datasets.loveda import LoveDADataset +from mmseg.datasets.transforms.formatting import PackSegInputs +from mmseg.datasets.transforms.loading import LoadAnnotations +from mmseg.datasets.transforms.transforms import (PhotoMetricDistortion, + RandomCrop) +from mmseg.evaluation import IoUMetric + +# dataset settings +dataset_type = LoveDADataset +data_root = 'data/loveDA' +crop_size = (512, 512) +train_pipeline = [ + dict(type=LoadImageFromFile), + dict(type=LoadAnnotations, reduce_zero_label=True), + dict( + type=RandomResize, + scale=(2048, 512), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type=RandomCrop, crop_size=crop_size, cat_max_ratio=0.75), + dict(type=RandomFlip, prob=0.5), + dict(type=PhotoMetricDistortion), + dict(type=PackSegInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile), + dict(type=Resize, scale=(1024, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type=LoadAnnotations, reduce_zero_label=True), + dict(type=PackSegInputs) +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type=LoadImageFromFile, backend_args=None), + dict( + type=TestTimeAug, + transforms=[[ + dict(type=Resize, scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type=RandomFlip, prob=0., direction='horizontal'), + dict(type=RandomFlip, prob=1., direction='horizontal') + ], [dict(type=LoadAnnotations)], + [dict(type=PackSegInputs)]]) +] +train_dataloader = dict( + batch_size=2, + num_workers=12, + persistent_workers=True, + sampler=dict(type=InfiniteSampler, shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, 
+ data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + pipeline=test_pipeline)) + +test_dataloader = val_dataloader +val_evaluator = dict(type=IoUMetric, iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/head_extractor/build/lib/mmseg/configs/_base_/datasets/potsdam.py b/head_extractor/build/lib/mmseg/configs/_base_/datasets/potsdam.py new file mode 100644 index 0000000000000000000000000000000000000000..33a4ebfd8f841ff502828c0adbc60c963e154897 --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/datasets/potsdam.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.transforms.loading import LoadImageFromFile +from mmcv.transforms.processing import (RandomFlip, RandomResize, Resize, + TestTimeAug) +from mmengine.dataset.sampler import DefaultSampler, InfiniteSampler + +from mmseg.datasets.potsdam import PotsdamDataset +from mmseg.datasets.transforms.formatting import PackSegInputs +from mmseg.datasets.transforms.loading import LoadAnnotations +from mmseg.datasets.transforms.transforms import (PhotoMetricDistortion, + RandomCrop) +from mmseg.evaluation import IoUMetric + +# dataset settings +dataset_type = PotsdamDataset +data_root = 'data/potsdam' +crop_size = (512, 512) +train_pipeline = [ + dict(type=LoadImageFromFile), + dict(type=LoadAnnotations, reduce_zero_label=True), + dict( + type=RandomResize, + scale=(512, 512), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type=RandomCrop, crop_size=crop_size, cat_max_ratio=0.75), + dict(type=RandomFlip, prob=0.5), + dict(type=PhotoMetricDistortion), + dict(type=PackSegInputs) +] +test_pipeline = [ + dict(type=LoadImageFromFile), + dict(type=Resize, scale=(512, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type=LoadAnnotations, reduce_zero_label=True), + dict(type=PackSegInputs) +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type=LoadImageFromFile, backend_args=None), + dict( + type=TestTimeAug, + transforms=[[ + dict(type=Resize, scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type=RandomFlip, prob=0., direction='horizontal'), + dict(type=RandomFlip, prob=1., direction='horizontal') + ], [dict(type=LoadAnnotations)], + [dict(type=PackSegInputs)]]) +] + +train_dataloader = dict( + batch_size=2, + num_workers=4, + persistent_workers=True, + sampler=dict(type=InfiniteSampler, shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type=DefaultSampler, shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type=IoUMetric, iou_metrics=['mIoU']) # 'mDice', 'mFscore' +test_evaluator = val_evaluator diff --git a/head_extractor/build/lib/mmseg/configs/_base_/default_runtime.py 
b/head_extractor/build/lib/mmseg/configs/_base_/default_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..c90502096b36abeda6be0840da67c255bd223d9f --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/default_runtime.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmengine.visualization import LocalVisBackend + +from mmseg.models import SegTTAModel +from mmseg.visualization import SegLocalVisualizer + +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +vis_backends = [dict(type=LocalVisBackend)] +visualizer = dict( + type=SegLocalVisualizer, vis_backends=vis_backends, name='visualizer') +log_processor = dict(by_epoch=False) +log_level = 'INFO' +load_from = None +resume = False + +tta_model = dict(type=SegTTAModel) +default_scope = None diff --git a/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_160k.py b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_160k.py new file mode 100644 index 0000000000000000000000000000000000000000..294d6ee3f534f39676f2fa7cdb99bc4321320351 --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_160k.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import PolyLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim.sgd import SGD + +from mmseg.engine import SegVisualizationHook + +# optimizer +optimizer = dict( + type=SGD, + # lr=0.01, + # momentum=0.9, + # weight_decay=0.0005 +) + +optim_wrapper = dict(type=OptimWrapper, optimizer=optimizer, clip_grad=None) + +# learning policy +param_scheduler = [ + dict( + type=PolyLR, + eta_min=1e-4, + power=0.9, + begin=0, + end=160000, + by_epoch=False) +] +# training schedule for 160k + +train_cfg = dict(type=IterBasedTrainLoop, max_iters=160000, val_interval=8000) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, by_epoch=False, interval=8000), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=SegVisualizationHook)) diff --git a/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_20k.py b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_20k.py new file mode 100644 index 0000000000000000000000000000000000000000..255300a1f4e4161deb80a34298b734a80f685f92 --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_20k.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
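+# A minimal sketch (not part of the original config) of the closed-form
+# poly decay that the PolyLR scheduler below approximates; kept as a
+# comment so it does not become a config field:
+#
+#     def poly_lr(base_lr, step, max_steps=20000, power=0.9, eta_min=1e-4):
+#         # lr decays from base_lr at step 0 to eta_min at max_steps
+#         coeff = (1 - step / max_steps) ** power
+#         return (base_lr - eta_min) * coeff + eta_min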
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import PolyLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim.sgd import SGD + +from mmseg.engine import SegVisualizationHook + +# optimizer +optimizer = dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type=OptimWrapper, optimizer=optimizer, clip_grad=None) + +# learning policy +param_scheduler = [ + dict( + type=PolyLR, + eta_min=1e-4, + power=0.9, + begin=0, + end=20000, + by_epoch=False) +] +# training schedule for 20k +train_cfg = dict(type=IterBasedTrainLoop, max_iters=20000, val_interval=2000) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, by_epoch=False, interval=2000), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=SegVisualizationHook)) diff --git a/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_240k.py b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_240k.py new file mode 100644 index 0000000000000000000000000000000000000000..cf9e5d3c94e2d70992a7f66b4a38ee3a0ba8c8f8 --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_240k.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import PolyLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +# from mmengine.runner.loops import EpochBasedTrainLoop +from torch.optim.sgd import SGD + +from mmseg.engine import SegVisualizationHook + +optimizer = dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type=OptimWrapper, optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type=PolyLR, + eta_min=1e-4, + power=0.9, + begin=0, + end=240000, + by_epoch=False) +] +# training schedule for 240k +train_cfg = dict(type=IterBasedTrainLoop, max_iters=240000, val_interval=24000) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, by_epoch=False, interval=24000), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=SegVisualizationHook)) diff --git a/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_25k.py b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_25k.py new file mode 100644 index 0000000000000000000000000000000000000000..8a3ebf405e431973d4dc9ef541f424dedf5f83ce --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_25k.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
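+# Rough shape of the composite schedule defined below (a reading aid, not
+# part of the original config), with the base lr of 0.01:
+#
+#     iters     0 .. 12000   LinearLR ramps 3e-2 * 0.01 -> 0.01
+#     iters 12000 .. 24000   PolyLRRatio decays 0.01 -> 3e-2 * 0.01
+#     iters 24000 .. 25000   ConstantLR holds the final value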
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import ConstantLR, LinearLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +# from mmengine.runner.loops import EpochBasedTrainLoop +from torch.optim.adamw import AdamW + +from mmseg.engine import SegVisualizationHook +from mmseg.engine.schedulers import PolyLRRatio + +# optimizer +optimizer = dict(type=AdamW, lr=0.01, weight_decay=0.1) + +optim_wrapper = dict(type=OptimWrapper, optimizer=optimizer, clip_grad=None) +# learning policy + +# learning policy +param_scheduler = [ + dict(type=LinearLR, start_factor=3e-2, begin=0, end=12000, by_epoch=False), + dict( + type=PolyLRRatio, + eta_min_ratio=3e-2, + power=0.9, + begin=12000, + end=24000, + by_epoch=False), + dict(type=ConstantLR, by_epoch=False, factor=1, begin=24000, end=25000) +] + +# training schedule for 25k +train_cfg = dict(type=IterBasedTrainLoop, max_iters=25000, val_interval=1000) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) + +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, by_epoch=False, interval=1000), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=SegVisualizationHook)) diff --git a/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_320k.py b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_320k.py new file mode 100644 index 0000000000000000000000000000000000000000..dae323ec48adf33f67d91d5843f787a63bcac90c --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_320k.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import PolyLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +# from mmengine.runner.loops import EpochBasedTrainLoop +from torch.optim.sgd import SGD + +from mmseg.engine import SegVisualizationHook + +# optimizer +optimizer = dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type=OptimWrapper, optimizer=optimizer, clip_grad=None) + +# learning policy +param_scheduler = [ + dict( + type=PolyLR, + eta_min=1e-4, + power=0.9, + begin=0, + end=320000, + by_epoch=False) +] +# training schedule for 320k +train_cfg = dict(type=IterBasedTrainLoop, max_iters=320000, val_interval=32000) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, by_epoch=False, interval=32000), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=SegVisualizationHook)) diff --git a/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_40k.py b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_40k.py new file mode 100644 index 0000000000000000000000000000000000000000..b4b2ea42b5c13f588497b654023e7b539fadc565 --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_40k.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper +from mmengine.optim.scheduler.lr_scheduler import PolyLR +from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop +from torch.optim.sgd import SGD + +from mmseg.engine import SegVisualizationHook + +# optimizer +optimizer = dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type=OptimWrapper, optimizer=optimizer, clip_grad=None) + +param_scheduler = [ + dict( + type=PolyLR, + eta_min=1e-4, + power=0.9, + begin=0, + end=40000, + by_epoch=False) +] +# training schedule for 40k +train_cfg = dict(type=IterBasedTrainLoop, max_iters=40000, val_interval=4000) +val_cfg = dict(type=ValLoop) +test_cfg = dict(type=TestLoop) +default_hooks = dict( + timer=dict(type=IterTimerHook), + logger=dict(type=LoggerHook, interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type=ParamSchedulerHook), + checkpoint=dict(type=CheckpointHook, by_epoch=False, interval=4000), + sampler_seed=dict(type=DistSamplerSeedHook), + visualization=dict(type=SegVisualizationHook)) diff --git a/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_80k.py b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_80k.py new file mode 100644 index 0000000000000000000000000000000000000000..3e711ca891564993f86eba0af6a45ff4ab1543dd --- /dev/null +++ b/head_extractor/build/lib/mmseg/configs/_base_/schedules/schedule_80k.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
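+# Note: ``lr``, ``momentum`` and ``weight_decay`` are commented out in the
+# optimizer below, so SGD falls back to library defaults; depending on the
+# installed PyTorch version, SGD may require an explicit ``lr``. A hedged
+# example of restoring the settings used by the other schedules:
+#
+#     optimizer = dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0005)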
+from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
+                            LoggerHook, ParamSchedulerHook)
+from mmengine.optim.optimizer.optimizer_wrapper import OptimWrapper
+from mmengine.optim.scheduler.lr_scheduler import PolyLR
+from mmengine.runner.loops import IterBasedTrainLoop, TestLoop, ValLoop
+from torch.optim.sgd import SGD
+
+from mmseg.engine import SegVisualizationHook
+
+# optimizer
+# torch's SGD cannot be built without `lr`; the values below match the
+# sibling 40k/320k schedules.
+optimizer = dict(type=SGD, lr=0.01, momentum=0.9, weight_decay=0.0005)
+
+optim_wrapper = dict(type=OptimWrapper, optimizer=optimizer, clip_grad=None)
+
+# learning policy
+param_scheduler = [
+    dict(
+        type=PolyLR,
+        eta_min=1e-4,
+        power=0.9,
+        begin=0,
+        end=80000,
+        by_epoch=False)
+]
+# training schedule for 80k
+train_cfg = dict(type=IterBasedTrainLoop, max_iters=80000, val_interval=8000)
+val_cfg = dict(type=ValLoop)
+test_cfg = dict(type=TestLoop)
+
+default_hooks = dict(
+    timer=dict(type=IterTimerHook),
+    logger=dict(type=LoggerHook, interval=50, log_metric_by_epoch=False),
+    param_scheduler=dict(type=ParamSchedulerHook),
+    checkpoint=dict(type=CheckpointHook, by_epoch=False, interval=8000),
+    sampler_seed=dict(type=DistSamplerSeedHook),
+    visualization=dict(type=SegVisualizationHook))
diff --git a/head_extractor/build/lib/mmseg/datasets/__init__.py b/head_extractor/build/lib/mmseg/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..528bdac67fe371b14a7c252b87090fbfecce78c5
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/datasets/__init__.py
@@ -0,0 +1,78 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# yapf: disable
+from .ade import ADE20KDataset
+from .basesegdataset import BaseCDDataset, BaseSegDataset
+from .bdd100k import BDD100KDataset
+from .chase_db1 import ChaseDB1Dataset
+from .cityscapes import CityscapesDataset
+from .coco_stuff import COCOStuffDataset
+from .dark_zurich import DarkZurichDataset
+from .dataset_wrappers import MultiImageMixDataset
+from .decathlon import DecathlonDataset
+from .drive import DRIVEDataset
+from .dsdl import DSDLSegDataset
+from .hrf import HRFDataset
+from .hsi_drive import HSIDrive20Dataset
+from .isaid import iSAIDDataset
+from .isprs import ISPRSDataset
+from .levir import LEVIRCDDataset
+from .lip import LIPDataset
+from .loveda import LoveDADataset
+from .mapillary import MapillaryDataset_v1, MapillaryDataset_v2
+from .night_driving import NightDrivingDataset
+from .nyu import NYUDataset
+from .pascal_context import PascalContextDataset, PascalContextDataset59
+from .potsdam import PotsdamDataset
+from .refuge import REFUGEDataset
+from .stare import STAREDataset
+from .synapse import SynapseDataset
+from .fashion import FashionDataset
+from .fashion_3category import fashion3categoryDataset
+from .deep_fashion import DeepFashionDataset
+from .human_parsing import HumanParsingDataset
+from .human_union import HumanUnionDataset
+from .deepfashion_10k import DeepFashion10KDataset
+from .imaterialist import iMaterialistDataset
+from .imaterialist_5cat import iMaterialist_5Cat_Dataset
+from .union_new import UnionNewKDataset
+from .union_new_add_mask import UnionNewAddMaskDataset
+
+# yapf: disable
+from .transforms import (CLAHE, AdjustGamma, Albu, BioMedical3DPad,
+                         BioMedical3DRandomCrop, BioMedical3DRandomFlip,
+                         BioMedicalGaussianBlur, BioMedicalGaussianNoise,
+                         BioMedicalRandomGamma, ConcatCDInput, GenerateEdge,
+                         LoadAnnotations, LoadBiomedicalAnnotation,
+                         LoadBiomedicalData, LoadBiomedicalImageFromFile,
+                         LoadImageFromNDArray, LoadMultipleRSImageFromFile,
+
LoadSingleRSImageFromFile, PackSegInputs, + PhotoMetricDistortion, RandomCrop, RandomCutOut, + RandomMosaic, RandomRotate, RandomRotFlip, Rerange, + ResizeShortestEdge, ResizeToMultiple, RGB2Gray, + SegRescale) +from .voc import PascalVOCDataset + +# yapf: enable +__all__ = [ + 'BaseSegDataset', 'BioMedical3DRandomCrop', 'BioMedical3DRandomFlip', + 'CityscapesDataset', 'PascalVOCDataset', 'ADE20KDataset', + 'PascalContextDataset', 'PascalContextDataset59', 'ChaseDB1Dataset', + 'DRIVEDataset', 'HRFDataset', 'STAREDataset', 'DarkZurichDataset', + 'NightDrivingDataset', 'COCOStuffDataset', 'LoveDADataset', + 'MultiImageMixDataset', 'iSAIDDataset', 'ISPRSDataset', 'PotsdamDataset', + 'LoadAnnotations', 'RandomCrop', 'SegRescale', 'PhotoMetricDistortion', + 'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray', + 'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple', + 'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile', + 'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge', + 'DecathlonDataset', 'LIPDataset', 'ResizeShortestEdge', + 'BioMedicalGaussianNoise', 'BioMedicalGaussianBlur', + 'BioMedicalRandomGamma', 'BioMedical3DPad', 'RandomRotFlip', + 'SynapseDataset', 'REFUGEDataset', 'MapillaryDataset_v1', + 'MapillaryDataset_v2', 'Albu', 'LEVIRCDDataset', + 'LoadMultipleRSImageFromFile', 'LoadSingleRSImageFromFile', + 'ConcatCDInput', 'BaseCDDataset', 'DSDLSegDataset', 'BDD100KDataset', + 'NYUDataset', 'HSIDrive20Dataset', 'FashionDataset', 'fashion3categoryDataset', 'DeepFashionDataset', + 'HumanParsingDataset', 'HumanUnionDataset', 'DeepFashion10KDataset', 'iMaterialistDataset', 'iMaterialist_5Cat_Dataset', + 'UnionNewKDataset', 'UnionNewAddMaskDataset' +] diff --git a/head_extractor/build/lib/mmseg/datasets/ade.py b/head_extractor/build/lib/mmseg/datasets/ade.py new file mode 100644 index 0000000000000000000000000000000000000000..e9bdae7421205f25d39441381d6492e9208a4714 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/ade.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class ADE20KDataset(BaseSegDataset): + """ADE20K dataset. + + In segmentation map annotation for ADE20K, 0 stands for background, which + is not included in 150 categories. ``reduce_zero_label`` is fixed to True. + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to + '.png'. 
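+
+    A minimal usage sketch (the paths follow the conventional
+    ADEChallengeData2016 layout and are illustrative, not enforced here)::
+
+        >>> from mmseg.datasets import ADE20KDataset
+        >>> ds = ADE20KDataset(
+        ...     data_root='data/ade/ADEChallengeData2016',
+        ...     data_prefix=dict(img_path='images/training',
+        ...                      seg_map_path='annotations/training'),
+        ...     pipeline=[])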
+ """ + METAINFO = dict( + classes=('wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', + 'bed ', 'windowpane', 'grass', 'cabinet', 'sidewalk', + 'person', 'earth', 'door', 'table', 'mountain', 'plant', + 'curtain', 'chair', 'car', 'water', 'painting', 'sofa', + 'shelf', 'house', 'sea', 'mirror', 'rug', 'field', 'armchair', + 'seat', 'fence', 'desk', 'rock', 'wardrobe', 'lamp', + 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', + 'path', 'stairs', 'runway', 'case', 'pool table', 'pillow', + 'screen door', 'stairway', 'river', 'bridge', 'bookcase', + 'blind', 'coffee table', 'toilet', 'flower', 'book', 'hill', + 'bench', 'countertop', 'stove', 'palm', 'kitchen island', + 'computer', 'swivel chair', 'boat', 'bar', 'arcade machine', + 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', + 'television receiver', 'airplane', 'dirt track', 'apparel', + 'pole', 'land', 'bannister', 'escalator', 'ottoman', 'bottle', + 'buffet', 'poster', 'stage', 'van', 'ship', 'fountain', + 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', + 'tent', 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', + 'step', 'tank', 'trade name', 'microwave', 'pot', 'animal', + 'bicycle', 'lake', 'dishwasher', 'screen', 'blanket', + 'sculpture', 'hood', 'sconce', 'vase', 'traffic light', + 'tray', 'ashcan', 'fan', 'pier', 'crt screen', 'plate', + 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag'), + palette=[[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 
235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/basesegdataset.py b/head_extractor/build/lib/mmseg/datasets/basesegdataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4668c1f561961fb27642fb7c1ac702f626cbb7 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/basesegdataset.py @@ -0,0 +1,552 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from typing import Callable, Dict, List, Optional, Sequence, Union + +import mmengine +import mmengine.fileio as fileio +import numpy as np +from mmengine.dataset import BaseDataset, Compose + +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class BaseSegDataset(BaseDataset): + """Custom dataset for semantic segmentation. An example of file structure + is as followed. + + .. code-block:: none + + ├── data + │ ├── my_dataset + │ │ ├── img_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{img_suffix} + │ │ │ │ ├── yyy{img_suffix} + │ │ │ │ ├── zzz{img_suffix} + │ │ │ ├── val + │ │ ├── ann_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{seg_map_suffix} + │ │ │ │ ├── yyy{seg_map_suffix} + │ │ │ │ ├── zzz{seg_map_suffix} + │ │ │ ├── val + + The img/gt_semantic_seg pair of BaseSegDataset should be of the same + except suffix. A valid img/gt_semantic_seg filename pair should be like + ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included + in the suffix). If split is given, then ``xxx`` is specified in txt file. + Otherwise, all files in ``img_dir/``and ``ann_dir`` will be loaded. + Please refer to ``docs/en/tutorials/new_dataset.md`` for more details. + + + Args: + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as + specify classes to load. Defaults to None. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to None. + data_prefix (dict, optional): Prefix for training data. Defaults to + dict(img_path=None, seg_map_path=None). + img_suffix (str): Suffix of images. Default: '.jpg' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + filter_cfg (dict, optional): Config for filter data. Defaults to None. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. 
In some cases, such as visualization, only the meta
+            information of the dataset is needed and it is not necessary to
+            load the annotation file. ``Basedataset`` can skip loading
+            annotations to save time by setting ``lazy_init=True``.
+            Defaults to False.
+        max_refetch (int, optional): The maximum number of extra cycles to
+            fetch a valid image if ``Basedataset.prepare_data`` gets a None
+            img. Defaults to 1000.
+        ignore_index (int): The label index to be ignored. Defaults to 255.
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Defaults to False.
+        backend_args (dict, optional): Arguments to instantiate a file
+            backend. See
+            https://mmengine.readthedocs.io/en/latest/api/fileio.htm
+            for details. Defaults to None.
+            Note: mmcv>=2.0.0rc4 and mmengine>=0.2.0 are required.
+    """
+    METAINFO: dict = dict()
+
+    def __init__(self,
+                 ann_file: str = '',
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = None,
+                 data_prefix: dict = dict(img_path='', seg_map_path=''),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 max_refetch: int = 1000,
+                 ignore_index: int = 255,
+                 reduce_zero_label: bool = False,
+                 backend_args: Optional[dict] = None) -> None:
+
+        self.img_suffix = img_suffix
+        self.seg_map_suffix = seg_map_suffix
+        self.ignore_index = ignore_index
+        self.reduce_zero_label = reduce_zero_label
+        self.backend_args = backend_args.copy() if backend_args else None
+
+        self.data_root = data_root
+        self.data_prefix = copy.copy(data_prefix)
+        self.ann_file = ann_file
+        self.filter_cfg = copy.deepcopy(filter_cfg)
+        self._indices = indices
+        self.serialize_data = serialize_data
+        self.test_mode = test_mode
+        self.max_refetch = max_refetch
+        self.data_list: List[dict] = []
+        self.data_bytes: np.ndarray
+
+        # Set meta information.
+        self._metainfo = self._load_metainfo(copy.deepcopy(metainfo))
+
+        # Get label map for custom classes
+        new_classes = self._metainfo.get('classes', None)
+        self.label_map = self.get_label_map(new_classes)
+        self._metainfo.update(
+            dict(
+                label_map=self.label_map,
+                reduce_zero_label=self.reduce_zero_label))
+
+        # Update palette based on label map or generate palette
+        # if it is not defined
+        updated_palette = self._update_palette()
+        self._metainfo.update(dict(palette=updated_palette))
+
+        # Join paths.
+        if self.data_root is not None:
+            self._join_prefix()
+
+        # Build pipeline.
+        self.pipeline = Compose(pipeline)
+        # Fully initialize the dataset.
+        if not lazy_init:
+            self.full_init()
+
+        if test_mode:
+            assert self._metainfo.get('classes') is not None, \
+                'dataset metainfo `classes` should be specified when testing'
+
+    @classmethod
+    def get_label_map(cls,
+                      new_classes: Optional[Sequence] = None
+                      ) -> Union[Dict, None]:
+        """Get the label mapping.
+
+        The ``label_map`` is a dictionary, its keys are the old label ids and
+        its values are the new label ids, and it is used for changing pixel
+        labels in load_annotations. ``label_map`` is not None if and only if
+        the old classes in cls.METAINFO are not equal to the new classes in
+        self._metainfo and neither of them is None.
+
+        Args:
+            new_classes (list, tuple, optional): The new class names from
+                metainfo. Defaults to None.
+ + + Returns: + dict, optional: The mapping from old classes in cls.METAINFO to + new classes in self._metainfo + """ + old_classes = cls.METAINFO.get('classes', None) + if (new_classes is not None and old_classes is not None + and list(new_classes) != list(old_classes)): + + label_map = {} + if not set(new_classes).issubset(cls.METAINFO['classes']): + raise ValueError( + f'new classes {new_classes} is not a ' + f'subset of classes {old_classes} in METAINFO.') + for i, c in enumerate(old_classes): + if c not in new_classes: + label_map[i] = 255 + else: + label_map[i] = new_classes.index(c) + return label_map + else: + return None + + def _update_palette(self) -> list: + """Update palette after loading metainfo. + + If length of palette is equal to classes, just return the palette. + If palette is not defined, it will randomly generate a palette. + If classes is updated by customer, it will return the subset of + palette. + + Returns: + Sequence: Palette for current dataset. + """ + palette = self._metainfo.get('palette', []) + classes = self._metainfo.get('classes', []) + # palette does match classes + if len(palette) == len(classes): + return palette + + if len(palette) == 0: + # Get random state before set seed, and restore + # random state later. + # It will prevent loss of randomness, as the palette + # may be different in each iteration if not specified. + # See: https://github.com/open-mmlab/mmdetection/issues/5844 + state = np.random.get_state() + np.random.seed(42) + # random palette + new_palette = np.random.randint( + 0, 255, size=(len(classes), 3)).tolist() + np.random.set_state(state) + elif len(palette) >= len(classes) and self.label_map is not None: + new_palette = [] + # return subset of palette + for old_id, new_id in sorted( + self.label_map.items(), key=lambda x: x[1]): + if new_id != 255: + new_palette.append(palette[old_id]) + new_palette = type(palette)(new_palette) + else: + raise ValueError('palette does not match classes ' + f'as metainfo is {self._metainfo}.') + return new_palette + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + list[dict]: All data info of dataset. 
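+
+        A sketch of a single returned entry (paths are illustrative)::
+
+            dict(
+                img_path='img_dir/train/xxx.jpg',
+                seg_map_path='ann_dir/train/xxx.png',
+                label_map=None,
+                reduce_zero_label=False,
+                seg_fields=[])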
+ """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + ann_dir = self.data_prefix.get('seg_map_path', None) + if not osp.isdir(self.ann_file) and self.ann_file: + assert osp.isfile(self.ann_file), \ + f'Failed to load `ann_file` {self.ann_file}' + lines = mmengine.list_from_file( + self.ann_file, backend_args=self.backend_args) + for line in lines: + img_name = line.strip() + data_info = dict( + img_path=osp.join(img_dir, img_name + self.img_suffix)) + if ann_dir is not None: + seg_map = img_name + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_info['reduce_zero_label'] = self.reduce_zero_label + data_info['seg_fields'] = [] + data_list.append(data_info) + else: + _suffix_len = len(self.img_suffix) + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + data_info = dict(img_path=osp.join(img_dir, img)) + if ann_dir is not None: + seg_map = img[:-_suffix_len] + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_info['reduce_zero_label'] = self.reduce_zero_label + data_info['seg_fields'] = [] + data_list.append(data_info) + data_list = sorted(data_list, key=lambda x: x['img_path']) + return data_list + + +@DATASETS.register_module() +class BaseCDDataset(BaseDataset): + """Custom dataset for change detection. An example of file structure is as + followed. + + .. code-block:: none + + ├── data + │ ├── my_dataset + │ │ ├── img_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{img_suffix} + │ │ │ │ ├── yyy{img_suffix} + │ │ │ │ ├── zzz{img_suffix} + │ │ │ ├── val + │ │ ├── img_dir2 + │ │ │ ├── train + │ │ │ │ ├── xxx{img_suffix} + │ │ │ │ ├── yyy{img_suffix} + │ │ │ │ ├── zzz{img_suffix} + │ │ │ ├── val + │ │ ├── ann_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{seg_map_suffix} + │ │ │ │ ├── yyy{seg_map_suffix} + │ │ │ │ ├── zzz{seg_map_suffix} + │ │ │ ├── val + + The image names in img_dir and img_dir2 should be consistent. + The img/gt_semantic_seg pair of BaseSegDataset should be of the same + except suffix. A valid img/gt_semantic_seg filename pair should be like + ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included + in the suffix). If split is given, then ``xxx`` is specified in txt file. + Otherwise, all files in ``img_dir/``and ``ann_dir`` will be loaded. + Please refer to ``docs/en/tutorials/new_dataset.md`` for more details. + + + Args: + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as + specify classes to load. Defaults to None. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to None. + data_prefix (dict, optional): Prefix for training data. Defaults to + dict(img_path=None, img_path2=None, seg_map_path=None). + img_suffix (str): Suffix of images. Default: '.jpg' + img_suffix2 (str): Suffix of images. Default: '.jpg' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + filter_cfg (dict, optional): Config for filter data. Defaults to None. + indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. 
+ serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=True``. Defaults to False. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Defaults to 1000. + ignore_index (int): The label index to be ignored. Default: 255 + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + METAINFO: dict = dict() + + def __init__(self, + ann_file: str = '', + img_suffix='.jpg', + img_suffix2='.jpg', + seg_map_suffix='.png', + metainfo: Optional[dict] = None, + data_root: Optional[str] = None, + data_prefix: dict = dict( + img_path='', img_path2='', seg_map_path=''), + filter_cfg: Optional[dict] = None, + indices: Optional[Union[int, Sequence[int]]] = None, + serialize_data: bool = True, + pipeline: List[Union[dict, Callable]] = [], + test_mode: bool = False, + lazy_init: bool = False, + max_refetch: int = 1000, + ignore_index: int = 255, + reduce_zero_label: bool = False, + backend_args: Optional[dict] = None) -> None: + + self.img_suffix = img_suffix + self.img_suffix2 = img_suffix2 + self.seg_map_suffix = seg_map_suffix + self.ignore_index = ignore_index + self.reduce_zero_label = reduce_zero_label + self.backend_args = backend_args.copy() if backend_args else None + + self.data_root = data_root + self.data_prefix = copy.copy(data_prefix) + self.ann_file = ann_file + self.filter_cfg = copy.deepcopy(filter_cfg) + self._indices = indices + self.serialize_data = serialize_data + self.test_mode = test_mode + self.max_refetch = max_refetch + self.data_list: List[dict] = [] + self.data_bytes: np.ndarray + + # Set meta information. + self._metainfo = self._load_metainfo(copy.deepcopy(metainfo)) + + # Get label map for custom classes + new_classes = self._metainfo.get('classes', None) + self.label_map = self.get_label_map(new_classes) + self._metainfo.update( + dict( + label_map=self.label_map, + reduce_zero_label=self.reduce_zero_label)) + + # Update palette based on label map or generate palette + # if it is not defined + updated_palette = self._update_palette() + self._metainfo.update(dict(palette=updated_palette)) + + # Join paths. + if self.data_root is not None: + self._join_prefix() + + # Build pipeline. + self.pipeline = Compose(pipeline) + # Full initialize the dataset. + if not lazy_init: + self.full_init() + + if test_mode: + assert self._metainfo.get('classes') is not None, \ + 'dataset metainfo `classes` should be specified when testing' + + @classmethod + def get_label_map(cls, + new_classes: Optional[Sequence] = None + ) -> Union[Dict, None]: + """Require label mapping. 
+ + The ``label_map`` is a dictionary, its keys are the old label ids and + its values are the new label ids, and is used for changing pixel + labels in load_annotations. If and only if old classes in cls.METAINFO + is not equal to new classes in self._metainfo and nether of them is not + None, `label_map` is not None. + + Args: + new_classes (list, tuple, optional): The new classes name from + metainfo. Default to None. + + + Returns: + dict, optional: The mapping from old classes in cls.METAINFO to + new classes in self._metainfo + """ + old_classes = cls.METAINFO.get('classes', None) + if (new_classes is not None and old_classes is not None + and list(new_classes) != list(old_classes)): + + label_map = {} + if not set(new_classes).issubset(cls.METAINFO['classes']): + raise ValueError( + f'new classes {new_classes} is not a ' + f'subset of classes {old_classes} in METAINFO.') + for i, c in enumerate(old_classes): + if c not in new_classes: + label_map[i] = 255 + else: + label_map[i] = new_classes.index(c) + return label_map + else: + return None + + def _update_palette(self) -> list: + """Update palette after loading metainfo. + + If length of palette is equal to classes, just return the palette. + If palette is not defined, it will randomly generate a palette. + If classes is updated by customer, it will return the subset of + palette. + + Returns: + Sequence: Palette for current dataset. + """ + palette = self._metainfo.get('palette', []) + classes = self._metainfo.get('classes', []) + # palette does match classes + if len(palette) == len(classes): + return palette + + if len(palette) == 0: + # Get random state before set seed, and restore + # random state later. + # It will prevent loss of randomness, as the palette + # may be different in each iteration if not specified. + # See: https://github.com/open-mmlab/mmdetection/issues/5844 + state = np.random.get_state() + np.random.seed(42) + # random palette + new_palette = np.random.randint( + 0, 255, size=(len(classes), 3)).tolist() + np.random.set_state(state) + elif len(palette) >= len(classes) and self.label_map is not None: + new_palette = [] + # return subset of palette + for old_id, new_id in sorted( + self.label_map.items(), key=lambda x: x[1]): + if new_id != 255: + new_palette.append(palette[old_id]) + new_palette = type(palette)(new_palette) + else: + raise ValueError('palette does not match classes ' + f'as metainfo is {self._metainfo}.') + return new_palette + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + list[dict]: All data info of dataset. + """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + img_dir2 = self.data_prefix.get('img_path2', None) + ann_dir = self.data_prefix.get('seg_map_path', None) + if osp.isfile(self.ann_file): + lines = mmengine.list_from_file( + self.ann_file, backend_args=self.backend_args) + for line in lines: + img_name = line.strip() + if '.' 
in osp.basename(img_name): + img_name, img_ext = osp.splitext(img_name) + self.img_suffix = img_ext + self.img_suffix2 = img_ext + data_info = dict( + img_path=osp.join(img_dir, img_name + self.img_suffix), + img_path2=osp.join(img_dir2, img_name + self.img_suffix2)) + + if ann_dir is not None: + seg_map = img_name + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_info['reduce_zero_label'] = self.reduce_zero_label + data_info['seg_fields'] = [] + data_list.append(data_info) + else: + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + if '.' in osp.basename(img): + img, img_ext = osp.splitext(img) + self.img_suffix = img_ext + self.img_suffix2 = img_ext + data_info = dict( + img_path=osp.join(img_dir, img + self.img_suffix), + img_path2=osp.join(img_dir2, img + self.img_suffix2)) + if ann_dir is not None: + seg_map = img + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_info['reduce_zero_label'] = self.reduce_zero_label + data_info['seg_fields'] = [] + data_list.append(data_info) + data_list = sorted(data_list, key=lambda x: x['img_path']) + return data_list diff --git a/head_extractor/build/lib/mmseg/datasets/bdd100k.py b/head_extractor/build/lib/mmseg/datasets/bdd100k.py new file mode 100644 index 0000000000000000000000000000000000000000..8ae70b5cb29f2b34c5804129c85622bfcca6767d --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/bdd100k.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmseg.datasets.basesegdataset import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class BDD100KDataset(BaseSegDataset): + METAINFO = dict( + classes=('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', + 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', + 'motorcycle', 'bicycle'), + palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, + 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], + [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70], + [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/chase_db1.py b/head_extractor/build/lib/mmseg/datasets/chase_db1.py new file mode 100644 index 0000000000000000000000000000000000000000..626ddf75e9a2a10a09ca1f298f12f4290268d504 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/chase_db1.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class ChaseDB1Dataset(BaseSegDataset): + """Chase_db1 dataset. + + In segmentation map annotation for Chase_db1, 0 stands for background, + which is included in 2 categories. ``reduce_zero_label`` is fixed to False. + The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '_1stHO.png'. 
+ """ + METAINFO = dict( + classes=('background', 'vessel'), + palette=[[120, 120, 120], [6, 230, 230]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='_1stHO.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/head_extractor/build/lib/mmseg/datasets/cityscapes.py b/head_extractor/build/lib/mmseg/datasets/cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..f494d62424a39581961ab705b3308e7e07bee110 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/cityscapes.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class CityscapesDataset(BaseSegDataset): + """Cityscapes dataset. + + The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is + fixed to '_gtFine_labelTrainIds.png' for Cityscapes dataset. + """ + METAINFO = dict( + classes=('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', + 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', + 'motorcycle', 'bicycle'), + palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, + 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], + [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70], + [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]]) + + def __init__(self, + img_suffix='_leftImg8bit.png', + seg_map_suffix='_gtFine_labelTrainIds.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/coco_stuff.py b/head_extractor/build/lib/mmseg/datasets/coco_stuff.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1574d9702330cc5b10bab084841df61e7121ff --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/coco_stuff.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class COCOStuffDataset(BaseSegDataset): + """COCO-Stuff dataset. + + In segmentation map annotation for COCO-Stuff, Train-IDs of the 10k version + are from 1 to 171, where 0 is the ignore index, and Train-ID of COCO Stuff + 164k is from 0 to 170, where 255 is the ignore index. So, they are all 171 + semantic categories. ``reduce_zero_label`` is set to True and False for the + 10k and 164k versions, respectively. The ``img_suffix`` is fixed to '.jpg', + and ``seg_map_suffix`` is fixed to '.png'. 
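+
+    A hedged config sketch of the two variants (the trailing ``...`` stands
+    for the usual ``data_root``/``data_prefix``/``pipeline`` keys)::
+
+        # 10k release: Train-IDs start at 1, so label zero is reduced
+        dict(type='COCOStuffDataset', reduce_zero_label=True, ...)
+        # 164k release: Train-IDs already start at 0
+        dict(type='COCOStuffDataset', reduce_zero_label=False, ...)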
+ """ + METAINFO = dict( + classes=( + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', + 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'branch', 'bridge', 'building-other', 'bush', 'cabinet', + 'cage', 'cardboard', 'carpet', 'ceiling-other', 'ceiling-tile', + 'cloth', 'clothes', 'clouds', 'counter', 'cupboard', 'curtain', + 'desk-stuff', 'dirt', 'door-stuff', 'fence', 'floor-marble', + 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood', 'flower', + 'fog', 'food-other', 'fruit', 'furniture-other', 'grass', 'gravel', + 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat', 'metal', + 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net', + 'paper', 'pavement', 'pillow', 'plant-other', 'plastic', + 'platform', 'playingfield', 'railing', 'railroad', 'river', 'road', + 'rock', 'roof', 'rug', 'salad', 'sand', 'sea', 'shelf', + 'sky-other', 'skyscraper', 'snow', 'solid-other', 'stairs', + 'stone', 'straw', 'structural-other', 'table', 'tent', + 'textile-other', 'towel', 'tree', 'vegetable', 'wall-brick', + 'wall-concrete', 'wall-other', 'wall-panel', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'waterdrops', + 'window-blind', 'window-other', 'wood'), + palette=[[0, 192, 64], [0, 192, 64], [0, 64, 96], [128, 192, 192], + [0, 64, 64], [0, 192, 224], [0, 192, 192], [128, 192, 64], + [0, 192, 96], [128, 192, 64], [128, 32, 192], [0, 0, 224], + [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192], + [0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192], + [128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128], + [64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160], + [0, 32, 0], [0, 128, 128], [64, 128, 160], [128, 160, 0], + [0, 128, 0], [192, 128, 32], [128, 96, 128], [0, 0, 128], + [64, 0, 32], [0, 224, 128], [128, 0, 0], [192, 0, 160], + [0, 96, 128], [128, 128, 128], [64, 0, 160], [128, 224, 128], + [128, 128, 64], [192, 0, 32], [128, 96, 0], [128, 0, 192], + [0, 128, 32], [64, 224, 0], [0, 0, 64], [128, 128, 160], + [64, 96, 0], [0, 128, 192], [0, 128, 160], [192, 224, 0], + [0, 128, 64], [128, 128, 32], [192, 32, 128], [0, 64, 192], + [0, 0, 32], [64, 160, 128], [128, 64, 64], [128, 0, 160], + [64, 32, 128], [128, 192, 192], [0, 0, 160], [192, 160, 128], + [128, 192, 0], [128, 0, 96], [192, 32, 0], [128, 64, 128], + [64, 128, 96], [64, 160, 0], [0, 64, 0], [192, 128, 224], + [64, 32, 0], [0, 192, 128], [64, 128, 224], [192, 160, 0], + [0, 192, 0], [192, 128, 96], [192, 96, 128], [0, 64, 128], + [64, 0, 96], [64, 224, 128], [128, 64, 0], [192, 0, 224], + [64, 96, 128], [128, 192, 128], [64, 0, 224], [192, 224, 128], + [128, 192, 64], [192, 0, 96], [192, 96, 0], [128, 64, 192], + [0, 128, 96], [0, 224, 0], [64, 64, 64], [128, 128, 224], + [0, 96, 0], [64, 192, 
192], [0, 128, 224], [128, 224, 0], + [64, 192, 64], [128, 128, 96], [128, 32, 128], [64, 0, 192], + [0, 64, 96], [0, 160, 128], [192, 0, 64], [128, 64, 224], + [0, 32, 128], [192, 128, 192], [0, 64, 224], [128, 160, 128], + [192, 128, 0], [128, 64, 32], [128, 32, 64], [192, 0, 128], + [64, 192, 32], [0, 160, 64], [64, 0, 0], [192, 192, 160], + [0, 32, 64], [64, 128, 128], [64, 192, 160], [128, 160, 64], + [64, 128, 0], [192, 192, 32], [128, 96, 192], [64, 0, 128], + [64, 64, 32], [0, 224, 192], [192, 0, 0], [192, 64, 160], + [0, 96, 192], [192, 128, 128], [64, 64, 160], [128, 224, 192], + [192, 128, 64], [192, 64, 32], [128, 96, 64], [192, 0, 192], + [0, 192, 32], [64, 224, 64], [64, 0, 64], [128, 192, 160], + [64, 96, 64], [64, 128, 192], [0, 192, 160], [192, 224, 64], + [64, 128, 64], [128, 192, 32], [192, 32, 192], [64, 64, 192], + [0, 64, 32], [64, 160, 192], [192, 64, 64], [128, 64, 160], + [64, 32, 192], [192, 192, 192], [0, 64, 160], [192, 160, 192], + [192, 192, 0], [128, 64, 96], [192, 32, 64], [192, 64, 128], + [64, 192, 96], [64, 160, 64], [64, 64, 0]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='_labelTrainIds.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/dark_zurich.py b/head_extractor/build/lib/mmseg/datasets/dark_zurich.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5393fa9e5047e81790f91829cfe4b7f33cc707 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/dark_zurich.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .cityscapes import CityscapesDataset + + +@DATASETS.register_module() +class DarkZurichDataset(CityscapesDataset): + """DarkZurichDataset dataset.""" + + def __init__(self, + img_suffix='_rgb_anon.png', + seg_map_suffix='_gt_labelTrainIds.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/dataset_wrappers.py b/head_extractor/build/lib/mmseg/datasets/dataset_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..082c116ff4582ecc7064dba1aba3c164dd556af5 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/dataset_wrappers.py @@ -0,0 +1,136 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections +import copy +from typing import List, Optional, Sequence, Union + +from mmengine.dataset import ConcatDataset, force_full_init + +from mmseg.registry import DATASETS, TRANSFORMS + + +@DATASETS.register_module() +class MultiImageMixDataset: + """A wrapper of multiple images mixed dataset. + + Suitable for training on multiple images mixed data augmentation like + mosaic and mixup. + + Args: + dataset (ConcatDataset or dict): The dataset to be mixed. + pipeline (Sequence[dict]): Sequence of transform object or + config dict to be composed. + skip_type_keys (list[str], optional): Sequence of type string to + be skip pipeline. Default to None. 
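+
+    A minimal wrapper sketch (the inner dataset and transform arguments are
+    illustrative)::
+
+        train_dataset = dict(
+            type='MultiImageMixDataset',
+            dataset=dict(
+                type='ADE20KDataset',
+                data_root='data/ade/ADEChallengeData2016',
+                data_prefix=dict(img_path='images/training',
+                                 seg_map_path='annotations/training'),
+                pipeline=[]),
+            pipeline=[dict(type='RandomMosaic', prob=1.0)])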
+ """ + + def __init__(self, + dataset: Union[ConcatDataset, dict], + pipeline: Sequence[dict], + skip_type_keys: Optional[List[str]] = None, + lazy_init: bool = False) -> None: + assert isinstance(pipeline, collections.abc.Sequence) + + if isinstance(dataset, dict): + self.dataset = DATASETS.build(dataset) + elif isinstance(dataset, ConcatDataset): + self.dataset = dataset + else: + raise TypeError( + 'elements in datasets sequence should be config or ' + f'`ConcatDataset` instance, but got {type(dataset)}') + + if skip_type_keys is not None: + assert all([ + isinstance(skip_type_key, str) + for skip_type_key in skip_type_keys + ]) + self._skip_type_keys = skip_type_keys + + self.pipeline = [] + self.pipeline_types = [] + for transform in pipeline: + if isinstance(transform, dict): + self.pipeline_types.append(transform['type']) + transform = TRANSFORMS.build(transform) + self.pipeline.append(transform) + else: + raise TypeError('pipeline must be a dict') + + self._metainfo = self.dataset.metainfo + self.num_samples = len(self.dataset) + + self._fully_initialized = False + if not lazy_init: + self.full_init() + + @property + def metainfo(self) -> dict: + """Get the meta information of the multi-image-mixed dataset. + + Returns: + dict: The meta information of multi-image-mixed dataset. + """ + return copy.deepcopy(self._metainfo) + + def full_init(self): + """Loop to ``full_init`` each dataset.""" + if self._fully_initialized: + return + + self.dataset.full_init() + self._ori_len = len(self.dataset) + self._fully_initialized = True + + @force_full_init + def get_data_info(self, idx: int) -> dict: + """Get annotation by index. + + Args: + idx (int): Global index of ``ConcatDataset``. + + Returns: + dict: The idx-th annotation of the datasets. + """ + return self.dataset.get_data_info(idx) + + @force_full_init + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + results = copy.deepcopy(self.dataset[idx]) + for (transform, transform_type) in zip(self.pipeline, + self.pipeline_types): + if self._skip_type_keys is not None and \ + transform_type in self._skip_type_keys: + continue + + if hasattr(transform, 'get_indices'): + indices = transform.get_indices(self.dataset) + if not isinstance(indices, collections.abc.Sequence): + indices = [indices] + mix_results = [ + copy.deepcopy(self.dataset[index]) for index in indices + ] + results['mix_results'] = mix_results + + results = transform(results) + + if 'mix_results' in results: + results.pop('mix_results') + + return results + + def update_skip_type_keys(self, skip_type_keys): + """Update skip_type_keys. + + It is called by an external hook. + + Args: + skip_type_keys (list[str], optional): Sequence of type + string to be skip pipeline. + """ + assert all([ + isinstance(skip_type_key, str) for skip_type_key in skip_type_keys + ]) + self._skip_type_keys = skip_type_keys diff --git a/head_extractor/build/lib/mmseg/datasets/decathlon.py b/head_extractor/build/lib/mmseg/datasets/decathlon.py new file mode 100644 index 0000000000000000000000000000000000000000..26aa4ef0d7f44e55d4400ed6151ea1f6cb3930ec --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/decathlon.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os.path as osp +from typing import List + +from mmengine.fileio import load + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class DecathlonDataset(BaseSegDataset): + """Dataset for Dacathlon dataset. 
+ + The dataset.json format is shown as follows + + .. code-block:: none + + { + "name": "BRATS", + "tensorImageSize": "4D", + "modality": + { + "0": "FLAIR", + "1": "T1w", + "2": "t1gd", + "3": "T2w" + }, + "labels": { + "0": "background", + "1": "edema", + "2": "non-enhancing tumor", + "3": "enhancing tumour" + }, + "numTraining": 484, + "numTest": 266, + "training": + [ + { + "image": "./imagesTr/BRATS_306.nii.gz" + "label": "./labelsTr/BRATS_306.nii.gz" + ... + } + ] + "test": + [ + "./imagesTs/BRATS_557.nii.gz" + ... + ] + } + """ + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + list[dict]: All data info of dataset. + """ + # `self.ann_file` denotes the absolute annotation file path if + # `self.root=None` or relative path if `self.root=/path/to/data/`. + annotations = load(self.ann_file) + if not isinstance(annotations, dict): + raise TypeError(f'The annotations loaded from annotation file ' + f'should be a dict, but got {type(annotations)}!') + raw_data_list = annotations[ + 'training'] if not self.test_mode else annotations['test'] + data_list = [] + for raw_data_info in raw_data_list: + # `2:` works for removing './' in file path, which will break + # loading from cloud storage. + if isinstance(raw_data_info, dict): + data_info = dict( + img_path=osp.join(self.data_root, raw_data_info['image'] + [2:])) + data_info['seg_map_path'] = osp.join( + self.data_root, raw_data_info['label'][2:]) + else: + data_info = dict( + img_path=osp.join(self.data_root, raw_data_info)[2:]) + data_info['label_map'] = self.label_map + data_info['reduce_zero_label'] = self.reduce_zero_label + data_info['seg_fields'] = [] + data_list.append(data_info) + annotations.pop('training') + annotations.pop('test') + + metainfo = copy.deepcopy(annotations) + metainfo['classes'] = [*metainfo['labels'].values()] + # Meta information load from annotation file will not influence the + # existed meta information load from `BaseDataset.METAINFO` and + # `metainfo` arguments defined in constructor. + for k, v in metainfo.items(): + self._metainfo.setdefault(k, v) + + return data_list diff --git a/head_extractor/build/lib/mmseg/datasets/deep_fashion.py b/head_extractor/build/lib/mmseg/datasets/deep_fashion.py new file mode 100644 index 0000000000000000000000000000000000000000..ba8914c5db9608c7abcffab8bfd5cf11da280742 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/deep_fashion.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class DeepFashionDataset(BaseSegDataset): + """ + Deep Fashion dataset. 
+ """ + METAINFO = dict( + classes=('background', + 'sleeve top', 'long sleeve top', 'short sleeve outwear', 'long sleeve outwear', + 'vest', 'sling', 'shorts', 'trousers', 'skirt', 'short sleeve dress', + 'long sleeve dress', 'vest dress', 'sling dress'), + palette=[[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], + [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], + [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], + [64, 0, 128], [66, 66, 66]]) + + def __init__(self, + # ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + # ann_file=ann_file, + **kwargs) + # assert fileio.exists(self.data_prefix['img_path'], + # self.backend_args) and osp.isfile(self.ann_file) diff --git a/head_extractor/build/lib/mmseg/datasets/deepfashion_10k.py b/head_extractor/build/lib/mmseg/datasets/deepfashion_10k.py new file mode 100644 index 0000000000000000000000000000000000000000..054926636bf443aa709d39e3ca66e2092f76a4af --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/deepfashion_10k.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class DeepFashion10KDataset(BaseSegDataset): + """ + Deep Fashion 10k dataset. + """ + METAINFO = dict( + classes=( + 'background', + 'top', + 'outer', + 'skirt', + 'dress', + 'pants', + 'leggings', + 'headwear', + 'eyeglass', + 'neckwear', + 'belt', + 'footwear', + 'bag', + 'hair', + 'face', + 'skin', + 'ring', + 'wrist wearing', + 'socks', + 'gloves', + 'necklace', + 'rompers', + 'earrings', + 'tie'), + + palette=[ + [0, 0, 0], [255, 0, 0], [0, 128, 0], [0, 0, 255], + [0, 128, 128], [238, 130, 238], [128, 128, 128], [255, 255, 0], + [255, 153, 18], [255, 125, 64], [127, 255, 0], [175, 238, 238], + [138, 43, 226], [210, 105, 30], [0, 0, 139], [72, 61, 139], + [255, 20, 147], [255, 192, 203], [205, 92, 92], [32, 178, 170], + [132, 112, 255], [160, 82, 45], [255, 222, 173], [240, 230, 140], + ],) + + # palette=[ + # [0, 0, 0], [0, 192, 64], [0, 64, 96], [128, 192, 192], + # [0, 64, 64], [0, 192, 224], [0, 192, 192], [128, 192, 64], + # [0, 192, 96], [128, 192, 64], [128, 32, 192], [0, 0, 224], + # [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192], + # [0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192], + # [128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128], + # ],) + + def __init__(self, + # ann_file, + img_suffix='.png', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/drive.py b/head_extractor/build/lib/mmseg/datasets/drive.py new file mode 100644 index 0000000000000000000000000000000000000000..76c0160a6b6bf4a56ff135620ff0b08dc086d1d9 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/drive.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class DRIVEDataset(BaseSegDataset): + """DRIVE dataset. + + In segmentation map annotation for DRIVE, 0 stands for background, which is + included in 2 categories. ``reduce_zero_label`` is fixed to False. 
The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '_manual1.png'. + """ + METAINFO = dict( + classes=('background', 'vessel'), + palette=[[120, 120, 120], [6, 230, 230]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='_manual1.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/head_extractor/build/lib/mmseg/datasets/dsdl.py b/head_extractor/build/lib/mmseg/datasets/dsdl.py new file mode 100644 index 0000000000000000000000000000000000000000..bf7e4e61b5fdd4bcb34617c8e53b93829def443a --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/dsdl.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +from typing import Dict, List, Optional, Sequence, Union + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + +try: + from dsdl.dataset import DSDLDataset +except ImportError: + DSDLDataset = None + + +@DATASETS.register_module() +class DSDLSegDataset(BaseSegDataset): + """Dataset for dsdl segmentation. + + Args: + specific_key_path(dict): Path of specific key which can not + be loaded by it's field name. + pre_transform(dict): pre-transform functions before loading. + used_labels(sequence): list of actual used classes in train steps, + this must be subset of class domain. + """ + + METAINFO = {} + + def __init__(self, + specific_key_path: Dict = {}, + pre_transform: Dict = {}, + used_labels: Optional[Sequence] = None, + **kwargs) -> None: + + if DSDLDataset is None: + raise RuntimeError( + 'Package dsdl is not installed. Please run "pip install dsdl".' + ) + self.used_labels = used_labels + + loc_config = dict(type='LocalFileReader', working_dir='') + if kwargs.get('data_root'): + kwargs['ann_file'] = os.path.join(kwargs['data_root'], + kwargs['ann_file']) + required_fields = ['Image', 'LabelMap'] + + self.dsdldataset = DSDLDataset( + dsdl_yaml=kwargs['ann_file'], + location_config=loc_config, + required_fields=required_fields, + specific_key_path=specific_key_path, + transform=pre_transform, + ) + BaseSegDataset.__init__(self, **kwargs) + + def load_data_list(self) -> List[Dict]: + """Load data info from a dsdl yaml file named as ``self.ann_file`` + + Returns: + List[dict]: A list of data list. + """ + + if self.used_labels: + self._metainfo['classes'] = tuple(self.used_labels) + self.label_map = self.get_label_map(self.used_labels) + else: + self._metainfo['classes'] = tuple(['background'] + + self.dsdldataset.class_names) + data_list = [] + + for i, data in enumerate(self.dsdldataset): + datainfo = dict( + img_path=os.path.join(self.data_prefix['img_path'], + data['Image'][0].location), + seg_map_path=os.path.join(self.data_prefix['seg_map_path'], + data['LabelMap'][0].location), + label_map=self.label_map, + reduce_zero_label=self.reduce_zero_label, + seg_fields=[], + ) + data_list.append(datainfo) + + return data_list + + def get_label_map(self, + new_classes: Optional[Sequence] = None + ) -> Union[Dict, None]: + """Require label mapping. + + The ``label_map`` is a dictionary, its keys are the old label ids and + its values are the new label ids, and is used for changing pixel + labels in load_annotations. If and only if old classes in class_dom + is not equal to new classes in args and nether of them is not + None, `label_map` is not None. 
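+
+        For example, with ``class_dom`` classes ``['background', 'road',
+        'car']`` and ``used_labels=['car']``, the resulting ``label_map``
+        would be ``{0: 255, 1: 255, 2: 0}``.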
+ Args: + new_classes (list, tuple, optional): The new classes name from + metainfo. Default to None. + Returns: + dict, optional: The mapping from old classes to new classes. + """ + old_classes = ['background'] + self.dsdldataset.class_names + if (new_classes is not None and old_classes is not None + and list(new_classes) != list(old_classes)): + + label_map = {} + if not set(new_classes).issubset(old_classes): + raise ValueError( + f'new classes {new_classes} is not a ' + f'subset of classes {old_classes} in class_dom.') + for i, c in enumerate(old_classes): + if c not in new_classes: + label_map[i] = 255 + else: + label_map[i] = new_classes.index(c) + return label_map + else: + return None diff --git a/head_extractor/build/lib/mmseg/datasets/fashion.py b/head_extractor/build/lib/mmseg/datasets/fashion.py new file mode 100644 index 0000000000000000000000000000000000000000..7bb4c8d8fe4fb1e1021bcad53b9a8728a53ef3d4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/fashion.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class FashionDataset(BaseSegDataset): + """imaterialist_fashion dataset. + + The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is + fixed to '_gtFine_labelTrainIds.png' for Cityscapes dataset. + """ + METAINFO = dict( + classes=("shirt, blouse", "top, t-shirt, sweatshirt", "sweater", + "cardigan", "jacket", "vest", "pants", "shorts", "skirt", + "coat", "dress", "jumpsuit", "cape", "glasses", "hat", + "headband, head covering, hair accessory", "tie", "glove", + "watch", "belt", "leg warmer", "tights, stockings", "sock", + "shoe", "bag, wallet", "scarf", "umbrella", "hood", "collar", + "lapel", "epaulette", "sleeve", "pocket", "neckline", "buckle", + "zipper", "applique", "bead", "bow", "flower", "fringe", + "ribbon", "rivet", "ruffle", "sequin", "tassel",), + palette=[[205, 124, 84], [117, 64, 134], [66, 75, 191], [152, 102, 149], + [68, 20, 174], [104, 106, 176], [26, 123, 233], [65, 148, 108], + [90, 227, 255], [53, 74, 138], [174, 5, 217], [36, 9, 3], [175, 93, 71], + [86, 96, 239], [221, 101, 166], [156, 227, 224], [186, 223, 138], + [151, 121, 189], [118, 43, 207], [137, 157, 76], [224, 160, 18], + [100, 109, 226], [88, 31, 162], [101, 153, 76], [140, 252, 51], + [121, 107, 19], [228, 250, 222], [251, 148, 245], [155, 29, 0], + [99, 246, 138], [182, 66, 5], [103, 232, 180], [50, 75, 12], + [79, 181, 229], [172, 98, 94], [19, 137, 226], [191, 182, 104], + [141, 97, 101], [216, 134, 90], [31, 33, 23], [255, 224, 125], + [199, 82, 200], [196, 10, 110], [244, 144, 145], [232, 145, 29], [51, 185, 206]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='_label.jpg', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/fashion_3category.py b/head_extractor/build/lib/mmseg/datasets/fashion_3category.py new file mode 100644 index 0000000000000000000000000000000000000000..a78f157331d88418d021349b731c7bf8f061f3b4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/fashion_3category.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class fashion3categoryDataset(BaseSegDataset): + """imaterialist_fashion dataset. 
Simplified to 3 categories (plus background). + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png' for the Fashion dataset. + """ + METAINFO = dict( + classes=('background', 'upper_body', 'lower_body', 'whole_body'), + palette=[[0, 0, 0], [255, 0, 0], [0, 255, 0], [0, 0, 255]]) + # palette=[[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs)
diff --git a/head_extractor/build/lib/mmseg/datasets/hrf.py b/head_extractor/build/lib/mmseg/datasets/hrf.py new file mode 100644 index 0000000000000000000000000000000000000000..fd669cce26420b7e2c810ecace247a9e09350a5d --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/hrf.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class HRFDataset(BaseSegDataset): + """HRF dataset. + + In segmentation map annotation for HRF, 0 stands for background, which is + included in 2 categories. ``reduce_zero_label`` is fixed to False. The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '.png'. + """ + METAINFO = dict( + classes=('background', 'vessel'), + palette=[[120, 120, 120], [6, 230, 230]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args)
diff --git a/head_extractor/build/lib/mmseg/datasets/hsi_drive.py b/head_extractor/build/lib/mmseg/datasets/hsi_drive.py new file mode 100644 index 0000000000000000000000000000000000000000..3d46a86629e6535ee510d11bfa24caeb9fa4ab95 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/hsi_drive.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + +classes_exp = ('unlabelled', 'road', 'road marks', 'vegetation', + 'painted metal', 'sky', 'concrete', 'pedestrian', 'water', + 'unpainted metal', 'glass') +palette_exp = [[0, 0, 0], [77, 77, 77], [255, 255, 255], [0, 255, 0], + [255, 0, 0], [0, 0, 255], [102, 51, 0], [255, 255, 0], + [0, 207, 250], [255, 166, 0], [0, 204, 204]] + + +@DATASETS.register_module() +class HSIDrive20Dataset(BaseSegDataset): + """HSI-Drive v2.0 (https://ieeexplore.ieee.org/document/10371793), the + updated version of HSI-Drive + (https://ieeexplore.ieee.org/document/9575298), is a structured dataset for + the research and development of automated driving systems (ADS) supported + by hyperspectral imaging (HSI). It contains per-pixel manually annotated + images selected from videos recorded in real driving conditions and has + been organized according to four parameters: season, daytime, road type, + and weather conditions. + + The video sequences have been captured with a small-size 25-band VNIR + (Visible-NearInfraRed) snapshot hyperspectral camera mounted on a driving + automobile.
As a consequence, you need to modify the in_channels parameter + of your model from 3 (RGB images) to 25 (HSI images) as it is done in + configs/unet/unet-s5-d16_fcn_4xb4-160k_hsidrive-192x384.py + + Apart from the abovementioned articles, additional information is provided + in the website (https://ipaccess.ehu.eus/HSI-Drive/) from where you can + download the dataset and also visualize some examples of segmented videos. + """ + + METAINFO = dict(classes=classes_exp, palette=palette_exp) + + def __init__(self, + img_suffix='.npy', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/human_parsing.py b/head_extractor/build/lib/mmseg/datasets/human_parsing.py new file mode 100644 index 0000000000000000000000000000000000000000..62938b39af1f8cfdb47f65a69a056b434c98450d --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/human_parsing.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class HumanParsingDataset(BaseSegDataset): + """ + Human Parsing dataset. + """ + METAINFO = dict( + classes=( + 'Background', 'shirt, blouse', 'top, t-shirt, sweatshirt', 'sweater', 'cardigan', 'jacket', 'vest', 'pants', 'shorts', 'skirt', 'coat', 'dress', 'jumpsuit', 'cape', 'glasses', 'hat', 'headband, head covering, hair accessory', 'tie', 'glove', 'watch', 'belt', 'leg warmer', 'tights, stockings', 'sock', 'shoe', 'bag, wallet', 'scarf', 'umbrella', 'hood', 'collar', 'lapel', 'epaulette', 'sleeve', 'pocket', 'neckline', 'buckle', 'zipper', 'applique', 'bead', 'bow', 'flower', 'fringe', 'ribbon', 'rivet', 'ruffle', 'sequin', 'tassel', 'Hair', 'Sunglasses', 'Upper-clothes', 'Left-shoe', 'Right-shoe', 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm'), + + palette=[ + [0, 0, 0], [255, 0, 0], [0, 128, 0], [0, 0, 255], + [0, 128, 128], [238, 130, 238], [128, 128, 128], [255, 255, 0], + [255, 153, 18], [255, 125, 64], [127, 255, 0], [175, 238, 238], + [138, 43, 226], [210, 105, 30], [0, 0, 139], [72, 61, 139], + [255, 20, 147], [255, 192, 203], [205, 92, 92], [32, 178, 170], + [132, 112, 255], [160, 82, 45], [255, 222, 173], + [240, 230, 140], [173, 216, 230], [250, 128, 114], [107, 142, 35], + [72, 209, 204], [199, 21, 133], [25, 25, 112], [123, 104, 238], + [0, 250, 154], [34, 139, 34], [219, 112, 147], [240, 128, 128], + [143, 188, 143], [47, 79, 79], [188, 143, 143], [100, 149, 237], + [102, 205, 170], [255, 160, 122], [147, 112, 219], [60, 179, 113], + [139, 0, 139], [255, 215, 0], [233, 150, 122], [0, 206, 209], + [148, 0, 211], [144, 238, 144], [255, 105, 180], [30, 144, 255], + [255, 140, 0], [153, 50, 204], [220, 20, 60], [46, 139, 87], + [240, 230, 155], [255, 99, 71] + ]) + + # palette=[ + # [0, 0, 0], [0, 192, 64], [0, 64, 96], [128, 192, 192], + # [0, 64, 64], [0, 192, 224], [0, 192, 192], [128, 192, 64], + # [0, 192, 96], [128, 192, 64], [128, 32, 192], [0, 0, 224], + # [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192], + # [0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192], + # [128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128], + # ],) + + def __init__(self, + # ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) diff --git 
a/head_extractor/build/lib/mmseg/datasets/human_union.py b/head_extractor/build/lib/mmseg/datasets/human_union.py new file mode 100644 index 0000000000000000000000000000000000000000..d68b49afb8c4de8344131013e8d6ff8afba038ff --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/human_union.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class HumanUnionDataset(BaseSegDataset): + """ + Human Union dataset. + """ + METAINFO = dict( + classes=( + 'background', + 'hat', + 'hair', + 'sunglasses', + 'upper-clothes', + 'skirt', + 'pants', + 'dress', + 'belt', + 'shoes', + 'face', + 'legs', + 'arms', + 'bag', + 'scarf', + 'glove', + 'socks', + 'jumpsuits',), + + palette=[ + [0, 0, 0], + [128, 0, 0], + [0, 128, 0], + [128, 128, 0], + [0, 0, 128], + [128, 0, 128], + [0, 128, 128], + [128, 128, 128], + [64, 0, 0], + [192, 0, 0], + [64, 128, 0], + [192, 128, 0], + [64, 0, 128], + [66, 66, 66], + [123, 66, 123], + [22, 33, 44], + [77, 88, 99], + [23, 24, 77]],) + + def __init__(self, + # ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/imaterialist.py b/head_extractor/build/lib/mmseg/datasets/imaterialist.py new file mode 100644 index 0000000000000000000000000000000000000000..cf6c3210e5bc52acae2e76728e1bd6dc87fa4ed3 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/imaterialist.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class iMaterialistDataset(BaseSegDataset): + """ + iMaterialist 2019 dataset. 
+ """ + METAINFO = dict( + classes=( + 'background', + 'shirt, blouse', + 'top, t-shirt, sweatshirt', + 'sweater', + 'cardigan', + "jacket", + "vest", + "pants", + "shorts", + "skirt", + "coat", + "dress", + "jumpsuit", + "cape", + "glasses", + "hat", + "headband, head covering, hair accessory", + "tie", + "glove", + "watch", + "belt", + "leg warmer", + "tights, stockings", + "sock", + "shoe", + "bag, wallet", + "scarf", + "umbrella", + "hood", + "collar", + "lapel", + "epaulette", + "sleeve", + "pocket", + "neckline", + "buckle", + "zipper", + "applique", + "bead", + "bow", + "flower", + "fringe", + "ribbon", + "rivet", + "ruffle", + "sequin", + "tassel", + ), + + palette=[ + [0, 0, 0], + [234, 191, 155], [186, 99, 123], [46, 100, 157], [154, 71, 196], + [15, 185, 171], [13, 89, 100], [67, 216, 41], [212, 139, 166], + [10, 101, 73], [198, 51, 168], [38, 174, 154], [150, 192, 158], + [194, 243, 120], [10, 224, 173], [214, 94, 149], [211, 126, 18], + [96, 7, 165], [255, 35, 14], [83, 127, 78], [106, 23, 51], + [41, 244, 224], [38, 86, 244], [244, 234, 150], [233, 247, 180], + [222, 117, 26], [2, 90, 51], [27, 176, 90], [178, 160, 25], + [75, 52, 236], [119, 65, 186], [163, 254, 113], [39, 140, 118], + [235, 112, 193], [134, 107, 77], [57, 169, 93], [251, 104, 47], + [224, 14, 49], [20, 123, 134], [178, 32, 212], [116, 194, 248], + [211, 196, 233], [93, 36, 29], [113, 99, 55], [5, 7, 250], + [172, 174, 41], [101, 98, 209], + ],) + + def __init__(self, + # ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/imaterialist_5cat.py b/head_extractor/build/lib/mmseg/datasets/imaterialist_5cat.py new file mode 100644 index 0000000000000000000000000000000000000000..b92e0aa2aa67d727979109b28ed545e37d658cd1 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/imaterialist_5cat.py @@ -0,0 +1,37 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class iMaterialist_5Cat_Dataset(BaseSegDataset): + """ + iMaterialist 2019 dataset. + """ + METAINFO = dict( + classes=( + 'background', + 'upperbody', + 'lowerbody', + 'head_related', + 'others' + ), + + palette=[ + [0, 0, 0], + [0, 0, 255], [255, 0, 0], [0, 255, 0], [128, 0, 196], + ],) + + def __init__(self, + # ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/isaid.py b/head_extractor/build/lib/mmseg/datasets/isaid.py new file mode 100644 index 0000000000000000000000000000000000000000..61942ec1ea33e76c65c22d8e7fc71fb8194841dd --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/isaid.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class iSAIDDataset(BaseSegDataset): + """ iSAID: A Large-scale Dataset for Instance Segmentation in Aerial Images + In segmentation map annotation for iSAID dataset, which is included + in 16 categories. ``reduce_zero_label`` is fixed to False. The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '_manual1.png'. 
+ """ + + METAINFO = dict( + classes=('background', 'ship', 'store_tank', 'baseball_diamond', + 'tennis_court', 'basketball_court', 'Ground_Track_Field', + 'Bridge', 'Large_Vehicle', 'Small_Vehicle', 'Helicopter', + 'Swimming_pool', 'Roundabout', 'Soccer_ball_field', 'plane', + 'Harbor'), + palette=[[0, 0, 0], [0, 0, 63], [0, 63, 63], [0, 63, 0], [0, 63, 127], + [0, 63, 191], [0, 63, 255], [0, 127, 63], [0, 127, 127], + [0, 0, 127], [0, 0, 191], [0, 0, 255], [0, 191, 127], + [0, 127, 191], [0, 127, 255], [0, 100, 155]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='_instance_color_RGB.png', + ignore_index=255, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + ignore_index=ignore_index, + **kwargs) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/head_extractor/build/lib/mmseg/datasets/isprs.py b/head_extractor/build/lib/mmseg/datasets/isprs.py new file mode 100644 index 0000000000000000000000000000000000000000..30af53c569b05c9be1218e9a58655c36c8aa9931 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/isprs.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class ISPRSDataset(BaseSegDataset): + """ISPRS dataset. + + In segmentation map annotation for ISPRS, 0 is the ignore index. + ``reduce_zero_label`` should be set to True. The ``img_suffix`` and + ``seg_map_suffix`` are both fixed to '.png'. + """ + METAINFO = dict( + classes=('impervious_surface', 'building', 'low_vegetation', 'tree', + 'car', 'clutter'), + palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], + [255, 255, 0], [255, 0, 0]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/levir.py b/head_extractor/build/lib/mmseg/datasets/levir.py new file mode 100644 index 0000000000000000000000000000000000000000..f467481bad70a426381842dba61d85576c196eaf --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/levir.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmseg.registry import DATASETS +from .basesegdataset import BaseCDDataset + + +@DATASETS.register_module() +class LEVIRCDDataset(BaseCDDataset): + """ISPRS dataset. + + In segmentation map annotation for ISPRS, 0 is to ignore index. + ``reduce_zero_label`` should be set to True. The ``img_suffix`` and + ``seg_map_suffix`` are both fixed to '.png'. + """ + + METAINFO = dict( + classes=('background', 'changed'), + palette=[[0, 0, 0], [255, 255, 255]]) + + def __init__(self, + img_suffix='.png', + img_suffix2='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + img_suffix2=img_suffix2, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/lip.py b/head_extractor/build/lib/mmseg/datasets/lip.py new file mode 100644 index 0000000000000000000000000000000000000000..3a32a193aff990ae9f819d4a0a1be82df1d049cb --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/lip.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class LIPDataset(BaseSegDataset): + """LIP dataset. + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to + '.png'. + """ + METAINFO = dict( + classes=('Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', + 'UpperClothes', 'Dress', 'Coat', 'Socks', 'Pants', + 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm', + 'Right-arm', 'Left-leg', 'Right-leg', 'Left-shoe', + 'Right-shoe'), + palette=( + [0, 0, 0], + [128, 0, 0], + [255, 0, 0], + [0, 85, 0], + [170, 0, 51], + [255, 85, 0], + [0, 0, 85], + [0, 119, 221], + [85, 85, 0], + [0, 85, 85], + [85, 51, 0], + [52, 86, 128], + [0, 128, 0], + [0, 0, 255], + [51, 170, 221], + [0, 255, 255], + [85, 255, 170], + [170, 255, 85], + [255, 255, 0], + [255, 170, 0], + )) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/loveda.py b/head_extractor/build/lib/mmseg/datasets/loveda.py new file mode 100644 index 0000000000000000000000000000000000000000..5c16db503adee6f1a1cac67e1dc72ff873ccd5ea --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/loveda.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class LoveDADataset(BaseSegDataset): + """LoveDA dataset. + + In segmentation map annotation for LoveDA, 0 is the ignore index. + ``reduce_zero_label`` should be set to True. The ``img_suffix`` and + ``seg_map_suffix`` are both fixed to '.png'. + """ + METAINFO = dict( + classes=('background', 'building', 'road', 'water', 'barren', 'forest', + 'agricultural'), + palette=[[255, 255, 255], [255, 0, 0], [255, 255, 0], [0, 0, 255], + [159, 129, 183], [0, 255, 0], [255, 195, 128]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/mapillary.py b/head_extractor/build/lib/mmseg/datasets/mapillary.py new file mode 100644 index 0000000000000000000000000000000000000000..6c2947338ec79b3d8558cee0387a2a84e41f0421 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/mapillary.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class MapillaryDataset_v1(BaseSegDataset): + """Mapillary Vistas Dataset. + + Dataset paper link: + http://ieeexplore.ieee.org/document/8237796/ + + v1.2 contain 66 object classes. + (37 instance-specific) + + v2.0 contain 124 object classes. + (70 instance-specific, 46 stuff, 8 void or crowd). + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png' for Mapillary Vistas Dataset. 
+ """ + METAINFO = dict( + classes=('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', + 'Barrier', 'Wall', 'Bike Lane', 'Crosswalk - Plain', + 'Curb Cut', 'Parking', 'Pedestrian Area', 'Rail Track', + 'Road', 'Service Lane', 'Sidewalk', 'Bridge', 'Building', + 'Tunnel', 'Person', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Crosswalk', + 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', + 'Bike Rack', 'Billboard', 'Catch Basin', 'CCTV Camera', + 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', + 'Phone Booth', 'Pothole', 'Street Light', 'Pole', + 'Traffic Sign Frame', 'Utility Pole', 'Traffic Light', + 'Traffic Sign (Back)', 'Traffic Sign (Front)', 'Trash Can', + 'Bicycle', 'Boat', 'Bus', 'Car', 'Caravan', 'Motorcycle', + 'On Rails', 'Other Vehicle', 'Trailer', 'Truck', + 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled'), + palette=[[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], + [180, 165, 180], [90, 120, 150], [102, 102, 156], + [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 160], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], + [244, 35, 232], [150, 100, 100], [70, 70, 70], [150, 120, 90], + [220, 20, 60], [255, 0, 0], [255, 0, 100], [255, 0, 200], + [200, 128, 128], [255, 255, 255], [64, 170, + 64], [230, 160, 50], + [70, 130, 180], [190, 255, 255], [152, 251, 152], + [107, 142, 35], [0, 170, 30], [255, 255, 128], [250, 0, 30], + [100, 140, 180], [220, 220, 220], [220, 128, 128], + [222, 40, 40], [100, 170, 30], [40, 40, 40], [33, 33, 33], + [100, 128, 160], [142, 0, 0], [70, 100, 150], [210, 170, 100], + [153, 153, 153], [128, 128, 128], [0, 0, 80], [250, 170, 30], + [192, 192, 192], [220, 220, 0], [140, 140, 20], [119, 11, 32], + [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90], + [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, + 10], [0, 0, 0]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) + + +@DATASETS.register_module() +class MapillaryDataset_v2(BaseSegDataset): + """Mapillary Vistas Dataset. + + Dataset paper link: + http://ieeexplore.ieee.org/document/8237796/ + + v1.2 contain 66 object classes. + (37 instance-specific) + + v2.0 contain 124 object classes. + (70 instance-specific, 46 stuff, 8 void or crowd). + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png' for Mapillary Vistas Dataset. 
+ """ + METAINFO = dict( + classes=( + 'Bird', 'Ground Animal', 'Ambiguous Barrier', 'Concrete Block', + 'Curb', 'Fence', 'Guard Rail', 'Barrier', 'Road Median', + 'Road Side', 'Lane Separator', 'Temporary Barrier', 'Wall', + 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Driveway', + 'Parking', 'Parking Aisle', 'Pedestrian Area', 'Rail Track', + 'Road', 'Road Shoulder', 'Service Lane', 'Sidewalk', + 'Traffic Island', 'Bridge', 'Building', 'Garage', 'Tunnel', + 'Person', 'Person Group', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Dashed Line', + 'Lane Marking - Straight Line', 'Lane Marking - Zigzag Line', + 'Lane Marking - Ambiguous', 'Lane Marking - Arrow (Left)', + 'Lane Marking - Arrow (Other)', 'Lane Marking - Arrow (Right)', + 'Lane Marking - Arrow (Split Left or Straight)', + 'Lane Marking - Arrow (Split Right or Straight)', + 'Lane Marking - Arrow (Straight)', 'Lane Marking - Crosswalk', + 'Lane Marking - Give Way (Row)', + 'Lane Marking - Give Way (Single)', + 'Lane Marking - Hatched (Chevron)', + 'Lane Marking - Hatched (Diagonal)', 'Lane Marking - Other', + 'Lane Marking - Stop Line', 'Lane Marking - Symbol (Bicycle)', + 'Lane Marking - Symbol (Other)', 'Lane Marking - Text', + 'Lane Marking (only) - Dashed Line', + 'Lane Marking (only) - Crosswalk', 'Lane Marking (only) - Other', + 'Lane Marking (only) - Test', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', 'Bike Rack', + 'Catch Basin', 'CCTV Camera', 'Fire Hydrant', 'Junction Box', + 'Mailbox', 'Manhole', 'Parking Meter', 'Phone Booth', 'Pothole', + 'Signage - Advertisement', 'Signage - Ambiguous', 'Signage - Back', + 'Signage - Information', 'Signage - Other', 'Signage - Store', + 'Street Light', 'Pole', 'Pole Group', 'Traffic Sign Frame', + 'Utility Pole', 'Traffic Cone', 'Traffic Light - General (Single)', + 'Traffic Light - Pedestrians', 'Traffic Light - General (Upright)', + 'Traffic Light - General (Horizontal)', 'Traffic Light - Cyclists', + 'Traffic Light - Other', 'Traffic Sign - Ambiguous', + 'Traffic Sign (Back)', 'Traffic Sign - Direction (Back)', + 'Traffic Sign - Direction (Front)', 'Traffic Sign (Front)', + 'Traffic Sign - Parking', 'Traffic Sign - Temporary (Back)', + 'Traffic Sign - Temporary (Front)', 'Trash Can', 'Bicycle', 'Boat', + 'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', + 'Trailer', 'Truck', 'Vehicle Group', 'Wheeled Slow', 'Water Valve', + 'Car Mount', 'Dynamic', 'Ego Vehicle', 'Ground', 'Static', + 'Unlabeled'), + palette=[[165, 42, 42], [0, 192, 0], [250, 170, 31], [250, 170, 32], + [196, 196, 196], [190, 153, 153], [180, 165, 180], + [90, 120, 150], [250, 170, 33], [250, 170, 34], + [128, 128, 128], [250, 170, 35], [102, 102, 156], + [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 36], [250, 170, 160], [250, 170, 37], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], + [110, 110, 110], [244, 35, 232], [128, 196, + 128], [150, 100, 100], + [70, 70, 70], [150, 150, 150], [150, 120, 90], [220, 20, 60], + [220, 20, 60], [255, 0, 0], [255, 0, 100], [255, 0, 200], + [255, 255, 255], [255, 255, 255], [250, 170, 29], + [250, 170, 28], [250, 170, 26], [250, 170, + 25], [250, 170, 24], + [250, 170, 22], [250, 170, 21], [250, 170, + 20], [255, 255, 255], + [250, 170, 19], [250, 170, 18], [250, 170, + 12], [250, 170, 11], + [255, 255, 255], [255, 255, 255], [250, 170, 16], + [250, 170, 15], [250, 170, 15], [255, 255, 255], + [255, 255, 255], [255, 255, 255], [255, 255, 255], + [64, 170, 64], 
[230, 160, 50], + [70, 130, 180], [190, 255, 255], [152, 251, 152], + [107, 142, 35], [0, 170, 30], [255, 255, 128], [250, 0, 30], + [100, 140, 180], [220, 128, 128], [222, 40, + 40], [100, 170, 30], + [40, 40, 40], [33, 33, 33], [100, 128, 160], [20, 20, 255], + [142, 0, 0], [70, 100, 150], [250, 171, 30], [250, 172, 30], + [250, 173, 30], [250, 174, 30], [250, 175, + 30], [250, 176, 30], + [210, 170, 100], [153, 153, 153], [153, 153, 153], + [128, 128, 128], [0, 0, 80], [210, 60, 60], [250, 170, 30], + [250, 170, 30], [250, 170, 30], [250, 170, + 30], [250, 170, 30], + [250, 170, 30], [192, 192, 192], [192, 192, 192], + [192, 192, 192], [220, 220, 0], [220, 220, 0], [0, 0, 196], + [192, 192, 192], [220, 220, 0], [140, 140, 20], [119, 11, 32], + [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90], + [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 142], [0, 0, 192], [170, 170, 170], + [32, 32, 32], [111, 74, 0], [120, 10, 10], [81, 0, 81], + [111, 111, 0], [0, 0, 0]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/night_driving.py b/head_extractor/build/lib/mmseg/datasets/night_driving.py new file mode 100644 index 0000000000000000000000000000000000000000..3ead91ec77cbd8e3f0a870dee3462549183e9c9b --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/night_driving.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .cityscapes import CityscapesDataset + + +@DATASETS.register_module() +class NightDrivingDataset(CityscapesDataset): + """NightDrivingDataset dataset.""" + + def __init__(self, + img_suffix='_leftImg8bit.png', + seg_map_suffix='_gtCoarse_labelTrainIds.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/nyu.py b/head_extractor/build/lib/mmseg/datasets/nyu.py new file mode 100644 index 0000000000000000000000000000000000000000..fcfda46647d25b5d16425af97a06ffb8c1f81bca --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/nyu.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class NYUDataset(BaseSegDataset): + """NYU depth estimation dataset. The file structure should be. + + .. code-block:: none + + ├── data + │ ├── nyu + │ │ ├── images + │ │ │ ├── train + │ │ │ │ ├── scene_xxx.jpg + │ │ │ │ ├── ... + │ │ │ ├── test + │ │ ├── annotations + │ │ │ ├── train + │ │ │ │ ├── scene_xxx.png + │ │ │ │ ├── ... + │ │ │ ├── test + + Args: + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as + specify classes to load. Defaults to None. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to None. + data_prefix (dict, optional): Prefix for training data. Defaults to + dict(img_path='images', depth_map_path='annotations'). + img_suffix (str): Suffix of images. Default: '.jpg' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + filter_cfg (dict, optional): Config for filter data. Defaults to None. 
+ indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=True``. Defaults to False. + max_refetch (int, optional): If ``Basedataset.prepare_data`` get a + None img. The maximum extra number of cycles to get a valid + image. Defaults to 1000. + ignore_index (int): The label index to be ignored. Default: 255 + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + METAINFO = dict( + classes=('printer_room', 'bathroom', 'living_room', 'study', + 'conference_room', 'study_room', 'kitchen', 'home_office', + 'bedroom', 'dinette', 'playroom', 'indoor_balcony', + 'laundry_room', 'basement', 'excercise_room', 'foyer', + 'home_storage', 'cafe', 'furniture_store', 'office_kitchen', + 'student_lounge', 'dining_room', 'reception_room', + 'computer_lab', 'classroom', 'office', 'bookstore')) + + def __init__(self, + data_prefix=dict( + img_path='images', depth_map_path='annotations'), + img_suffix='.jpg', + depth_map_suffix='.png', + **kwargs) -> None: + super().__init__( + data_prefix=data_prefix, + img_suffix=img_suffix, + seg_map_suffix=depth_map_suffix, + **kwargs) + + def _get_category_id_from_filename(self, image_fname: str) -> int: + """Retrieve the category ID from the given image filename.""" + image_fname = osp.basename(image_fname) + position = image_fname.find(next(filter(str.isdigit, image_fname)), 0) + categoty_name = image_fname[:position - 1] + if categoty_name not in self._metainfo['classes']: + return -1 + else: + return self._metainfo['classes'].index(categoty_name) + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + list[dict]: All data info of dataset. 
+ """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + ann_dir = self.data_prefix.get('depth_map_path', None) + + _suffix_len = len(self.img_suffix) + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + data_info = dict(img_path=osp.join(img_dir, img)) + if ann_dir is not None: + depth_map = img[:-_suffix_len] + self.seg_map_suffix + data_info['depth_map_path'] = osp.join(ann_dir, depth_map) + data_info['seg_fields'] = [] + data_info['category_id'] = self._get_category_id_from_filename(img) + data_list.append(data_info) + data_list = sorted(data_list, key=lambda x: x['img_path']) + return data_list diff --git a/head_extractor/build/lib/mmseg/datasets/pascal_context.py b/head_extractor/build/lib/mmseg/datasets/pascal_context.py new file mode 100644 index 0000000000000000000000000000000000000000..82d00a9b3086a0db81457ab9b2f79c79de4ffaa8 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/pascal_context.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class PascalContextDataset(BaseSegDataset): + """PascalContext dataset. + + In segmentation map annotation for PascalContext, 0 stands for background, + which is included in 60 categories. ``reduce_zero_label`` is fixed to + False. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png'. + + Args: + ann_file (str): Annotation file path. + """ + + METAINFO = dict( + classes=('background', 'aeroplane', 'bag', 'bed', 'bedclothes', + 'bench', 'bicycle', 'bird', 'boat', 'book', 'bottle', + 'building', 'bus', 'cabinet', 'car', 'cat', 'ceiling', + 'chair', 'cloth', 'computer', 'cow', 'cup', 'curtain', 'dog', + 'door', 'fence', 'floor', 'flower', 'food', 'grass', 'ground', + 'horse', 'keyboard', 'light', 'motorbike', 'mountain', + 'mouse', 'person', 'plate', 'platform', 'pottedplant', 'road', + 'rock', 'sheep', 'shelves', 'sidewalk', 'sign', 'sky', 'snow', + 'sofa', 'table', 'track', 'train', 'tree', 'truck', + 'tvmonitor', 'wall', 'water', 'window', 'wood'), + palette=[[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]]) + + def __init__(self, + ann_file='', + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + ann_file=ann_file, + reduce_zero_label=reduce_zero_label, + **kwargs) + assert fileio.exists(self.data_prefix['img_path'], self.backend_args) + + 
+@DATASETS.register_module() +class PascalContextDataset59(BaseSegDataset): + """PascalContext dataset. + + In segmentation map annotation for PascalContext, 0 stands for background, + which is included in 60 categories. ``reduce_zero_label`` is fixed to + True. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png'. + Noted: If the background is 255 and the ids of categories are from 0 to 58, + ``reduce_zero_label`` needs to be set to False. + + Args: + ann_file (str): Annotation file path. + """ + METAINFO = dict( + classes=('aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', + 'bird', 'boat', 'book', 'bottle', 'building', 'bus', + 'cabinet', 'car', 'cat', 'ceiling', 'chair', 'cloth', + 'computer', 'cow', 'cup', 'curtain', 'dog', 'door', 'fence', + 'floor', 'flower', 'food', 'grass', 'ground', 'horse', + 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', + 'person', 'plate', 'platform', 'pottedplant', 'road', 'rock', + 'sheep', 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', + 'table', 'track', 'train', 'tree', 'truck', 'tvmonitor', + 'wall', 'water', 'window', 'wood'), + palette=[[180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], + [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]]) + + def __init__(self, + ann_file='', + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs): + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + ann_file=ann_file, + reduce_zero_label=reduce_zero_label, + **kwargs) + assert fileio.exists(self.data_prefix['img_path'], self.backend_args) diff --git a/head_extractor/build/lib/mmseg/datasets/potsdam.py b/head_extractor/build/lib/mmseg/datasets/potsdam.py new file mode 100644 index 0000000000000000000000000000000000000000..6892de3dd29fda569527342377c6e83ce0d972bf --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/potsdam.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class PotsdamDataset(BaseSegDataset): + """ISPRS Potsdam dataset. + + In segmentation map annotation for Potsdam dataset, 0 is the ignore index. + ``reduce_zero_label`` should be set to True. The ``img_suffix`` and + ``seg_map_suffix`` are both fixed to '.png'. 
+ """ + METAINFO = dict( + classes=('impervious_surface', 'building', 'low_vegetation', 'tree', + 'car', 'clutter'), + palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], + [255, 255, 0], [255, 0, 0]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/refuge.py b/head_extractor/build/lib/mmseg/datasets/refuge.py new file mode 100644 index 0000000000000000000000000000000000000000..4016a825a37cdd0162f9c3e72df2fcabc6984991 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/refuge.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class REFUGEDataset(BaseSegDataset): + """REFUGE dataset. + + In segmentation map annotation for REFUGE, 0 stands for background, which + is not included in 2 categories. ``reduce_zero_label`` is fixed to True. + The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '.png'. + """ + METAINFO = dict( + classes=('background', ' Optic Cup', 'Optic Disc'), + palette=[[120, 120, 120], [6, 230, 230], [56, 59, 120]]) + + def __init__(self, **kwargs) -> None: + super().__init__( + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/head_extractor/build/lib/mmseg/datasets/stare.py b/head_extractor/build/lib/mmseg/datasets/stare.py new file mode 100644 index 0000000000000000000000000000000000000000..1b997bb785f20a9225c8b7e3f9b0522bc5e5ed99 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/stare.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class STAREDataset(BaseSegDataset): + """STARE dataset. + + In segmentation map annotation for STARE, 0 stands for background, which is + included in 2 categories. ``reduce_zero_label`` is fixed to False. The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '.ah.png'. + """ + METAINFO = dict( + classes=('background', 'vessel'), + palette=[[120, 120, 120], [6, 230, 230]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.ah.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/head_extractor/build/lib/mmseg/datasets/synapse.py b/head_extractor/build/lib/mmseg/datasets/synapse.py new file mode 100644 index 0000000000000000000000000000000000000000..6f83b6415046667fb24086083c43083040f4487c --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/synapse.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class SynapseDataset(BaseSegDataset): + """Synapse dataset. + + Before dataset preprocess of Synapse, there are total 13 categories of + foreground which does not include background. 
After preprocessing, 8 + foreground categories are kept while the other 5 foreground categories are + handled as background. The ``img_suffix`` is fixed to '.jpg' and + ``seg_map_suffix`` is fixed to '.png'. + """ + METAINFO = dict( + classes=('background', 'aorta', 'gallbladder', 'left_kidney', + 'right_kidney', 'liver', 'pancreas', 'spleen', 'stomach'), + palette=[[0, 0, 0], [0, 0, 255], [0, 255, 0], [255, 0, 0], + [0, 255, 255], [255, 0, 255], [255, 255, 0], [60, 255, 255], + [240, 240, 240]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/transforms/__init__.py b/head_extractor/build/lib/mmseg/datasets/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..125f07081810c980ebc6ded077bcf5dfd955cfcf --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/transforms/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .formatting import PackSegInputs +from .loading import (LoadAnnotations, LoadBiomedicalAnnotation, + LoadBiomedicalData, LoadBiomedicalImageFromFile, + LoadDepthAnnotation, LoadImageFromNDArray, + LoadMultipleRSImageFromFile, LoadSingleRSImageFromFile) +# yapf: disable +from .transforms import (CLAHE, AdjustGamma, Albu, BioMedical3DPad, + BioMedical3DRandomCrop, BioMedical3DRandomFlip, + BioMedicalGaussianBlur, BioMedicalGaussianNoise, + BioMedicalRandomGamma, ConcatCDInput, GenerateEdge, + PhotoMetricDistortion, RandomCrop, RandomCutOut, + RandomDepthMix, RandomFlip, RandomMosaic, + RandomRotate, RandomRotFlip, Rerange, Resize, + ResizeShortestEdge, ResizeToMultiple, RGB2Gray, + SegRescale) + +# yapf: enable +__all__ = [ + 'LoadAnnotations', 'RandomCrop', 'BioMedical3DRandomCrop', 'SegRescale', + 'PhotoMetricDistortion', 'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', + 'RGB2Gray', 'RandomCutOut', 'RandomMosaic', 'PackSegInputs', + 'ResizeToMultiple', 'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile', + 'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge', + 'ResizeShortestEdge', 'BioMedicalGaussianNoise', 'BioMedicalGaussianBlur', + 'BioMedical3DRandomFlip', 'BioMedicalRandomGamma', 'BioMedical3DPad', + 'RandomRotFlip', 'Albu', 'LoadSingleRSImageFromFile', 'ConcatCDInput', + 'LoadMultipleRSImageFromFile', 'LoadDepthAnnotation', 'RandomDepthMix', + 'RandomFlip', 'Resize' +] diff --git a/head_extractor/build/lib/mmseg/datasets/transforms/formatting.py b/head_extractor/build/lib/mmseg/datasets/transforms/formatting.py new file mode 100644 index 0000000000000000000000000000000000000000..bd250551e98ffc9decaa2e168943821501844c1f --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/transforms/formatting.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +from mmcv.transforms import to_tensor +from mmcv.transforms.base import BaseTransform +from mmengine.structures import PixelData + +from mmseg.registry import TRANSFORMS +from mmseg.structures import SegDataSample + + +@TRANSFORMS.register_module() +class PackSegInputs(BaseTransform): + """Pack the inputs data for the semantic segmentation. + + The ``img_meta`` item is always populated. The contents of the + ``img_meta`` dictionary depends on ``meta_keys``. 
By default this includes: + + - ``img_path``: filename of the image + + - ``ori_shape``: original shape of the image as a tuple (h, w, c) + + - ``img_shape``: shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + + - ``pad_shape``: shape of padded images + + - ``scale_factor``: a float indicating the preprocessing scale + + - ``flip``: a boolean indicating if image flip transform was used + + - ``flip_direction``: the flipping direction + + Args: + meta_keys (Sequence[str], optional): Meta keys to be packed from + ``SegDataSample`` and collected in ``data[img_metas]``. + Default: ``('img_path', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction')`` + """ + + def __init__(self, + meta_keys=('img_path', 'seg_map_path', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'reduce_zero_label')): + self.meta_keys = meta_keys + + def transform(self, results: dict) -> dict: + """Method to pack the input data. + + Args: + results (dict): Result dict from the data pipeline. + + Returns: + dict: + + - 'inputs' (obj:`torch.Tensor`): The forward data of models. + - 'data_sample' (obj:`SegDataSample`): The annotation info of the + sample. + """ + packed_results = dict() + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + if not img.flags.c_contiguous: + img = to_tensor(np.ascontiguousarray(img.transpose(2, 0, 1))) + else: + img = img.transpose(2, 0, 1) + img = to_tensor(img).contiguous() + packed_results['inputs'] = img + + data_sample = SegDataSample() + if 'gt_seg_map' in results: + if len(results['gt_seg_map'].shape) == 2: + data = to_tensor(results['gt_seg_map'][None, + ...].astype(np.int64)) + else: + warnings.warn('Please pay attention your ground truth ' + 'segmentation map, usually the segmentation ' + 'map is 2D, but got ' + f'{results["gt_seg_map"].shape}') + data = to_tensor(results['gt_seg_map'].astype(np.int64)) + gt_sem_seg_data = dict(data=data) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + + if 'gt_edge_map' in results: + gt_edge_data = dict( + data=to_tensor(results['gt_edge_map'][None, + ...].astype(np.int64))) + data_sample.set_data(dict(gt_edge_map=PixelData(**gt_edge_data))) + + if 'gt_depth_map' in results: + gt_depth_data = dict( + data=to_tensor(results['gt_depth_map'][None, ...])) + data_sample.set_data(dict(gt_depth_map=PixelData(**gt_depth_data))) + + img_meta = {} + for key in self.meta_keys: + if key in results: + img_meta[key] = results[key] + data_sample.set_metainfo(img_meta) + packed_results['data_samples'] = data_sample + + return packed_results + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(meta_keys={self.meta_keys})' + return repr_str diff --git a/head_extractor/build/lib/mmseg/datasets/transforms/loading.py b/head_extractor/build/lib/mmseg/datasets/transforms/loading.py new file mode 100644 index 0000000000000000000000000000000000000000..c28937e55eed0aeb9c9cf2f3cd367541a7e81d07 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/transforms/loading.py @@ -0,0 +1,771 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
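+# [Editor's example] The transforms in this module are normally chained in a
+# config-defined pipeline rather than called by hand. A minimal sketch of a
+# training pipeline exercising LoadAnnotations below and the PackSegInputs
+# transform defined in formatting.py above (the crop size is a hypothetical
+# value):
+#
+#   train_pipeline = [
+#       dict(type='LoadImageFromFile'),
+#       dict(type='LoadAnnotations', reduce_zero_label=True),
+#       dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
+#       dict(type='PackSegInputs'),
+#   ]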
+import warnings +from pathlib import Path +from typing import Dict, Optional, Union + +import mmcv +import mmengine.fileio as fileio +import numpy as np +from mmcv.transforms import BaseTransform +from mmcv.transforms import LoadAnnotations as MMCV_LoadAnnotations +from mmcv.transforms import LoadImageFromFile + +from mmseg.registry import TRANSFORMS +from mmseg.utils import datafrombytes + +try: + from osgeo import gdal +except ImportError: + gdal = None + + +@TRANSFORMS.register_module() +class LoadAnnotations(MMCV_LoadAnnotations): + """Load annotations for semantic segmentation provided by dataset. + + The annotation format is as the following: + + .. code-block:: python + + { + # Filename of semantic segmentation ground truth file. + 'seg_map_path': 'a/b/c' + } + + After this module, the annotation has been changed to the format below: + + .. code-block:: python + + { + # in str + 'seg_fields': List + # In uint8 type. + 'gt_seg_map': np.ndarray (H, W) + } + + Required Keys: + + - seg_map_path (str): Path of semantic segmentation ground truth file. + + Added Keys: + + - seg_fields (List) + - gt_seg_map (np.uint8) + + Args: + reduce_zero_label (bool, optional): Whether reduce all label value + by 1. Usually used for datasets where 0 is background label. + Defaults to None. + imdecode_backend (str): The image decoding backend type. The backend + argument for :func:``mmcv.imfrombytes``. + See :fun:``mmcv.imfrombytes`` for details. + Defaults to 'pillow'. + backend_args (dict): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__( + self, + reduce_zero_label=None, + backend_args=None, + imdecode_backend='pillow', + ) -> None: + super().__init__( + with_bbox=False, + with_label=False, + with_seg=True, + with_keypoints=False, + imdecode_backend=imdecode_backend, + backend_args=backend_args) + self.reduce_zero_label = reduce_zero_label + if self.reduce_zero_label is not None: + warnings.warn('`reduce_zero_label` will be deprecated, ' + 'if you would like to ignore the zero label, please ' + 'set `reduce_zero_label=True` when dataset ' + 'initialized') + self.imdecode_backend = imdecode_backend + + def _load_seg_map(self, results: dict) -> None: + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. 
+ """ + + img_bytes = fileio.get( + results['seg_map_path'], backend_args=self.backend_args) + gt_semantic_seg = mmcv.imfrombytes( + img_bytes, flag='unchanged', + backend=self.imdecode_backend).squeeze().astype(np.uint8) + + # reduce zero_label + if self.reduce_zero_label is None: + self.reduce_zero_label = results['reduce_zero_label'] + assert self.reduce_zero_label == results['reduce_zero_label'], \ + 'Initialize dataset with `reduce_zero_label` as ' \ + f'{results["reduce_zero_label"]} but when load annotation ' \ + f'the `reduce_zero_label` is {self.reduce_zero_label}' + if self.reduce_zero_label: + # avoid using underflow conversion + gt_semantic_seg[gt_semantic_seg == 0] = 255 + gt_semantic_seg = gt_semantic_seg - 1 + gt_semantic_seg[gt_semantic_seg == 254] = 255 + # modify if custom classes + if results.get('label_map', None) is not None: + # Add deep copy to solve bug of repeatedly + # replace `gt_semantic_seg`, which is reported in + # https://github.com/open-mmlab/mmsegmentation/pull/1445/ + gt_semantic_seg_copy = gt_semantic_seg.copy() + for old_id, new_id in results['label_map'].items(): + gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id + results['gt_seg_map'] = gt_semantic_seg + results['seg_fields'].append('gt_seg_map') + + def __repr__(self) -> str: + repr_str = self.__class__.__name__ + repr_str += f'(reduce_zero_label={self.reduce_zero_label}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'backend_args={self.backend_args})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadImageFromNDArray(LoadImageFromFile): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. + + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def transform(self, results: dict) -> dict: + """Transform function to add image meta information. + + Args: + results (dict): Result dict with Webcam read image in + ``results['img']``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + img = results['img'] + if self.to_float32: + img = img.astype(np.float32) + + results['img_path'] = None + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + +@TRANSFORMS.register_module() +class LoadBiomedicalImageFromFile(BaseTransform): + """Load an biomedical mage from file. + + Required Keys: + + - img_path + + Added Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default, + N is the number of modalities, and data type is float32 + if set to_float32 = True, or float64 if decode_backend is 'nifti' and + to_float32 is False. + - img_shape + - ori_shape + + Args: + decode_backend (str): The data decoding backend type. Options are + 'numpy'and 'nifti', and there is a convention that when backend is + 'nifti' the axis of data loaded is XYZ, and when backend is + 'numpy', the the axis is ZYX. The data will be transposed if the + backend is 'nifti'. Defaults to 'nifti'. + to_xyz (bool): Whether transpose data from Z, Y, X to X, Y, Z. + Defaults to False. + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. 
If set to False, the loaded image is a float64 array. + Defaults to True. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + decode_backend: str = 'nifti', + to_xyz: bool = False, + to_float32: bool = True, + backend_args: Optional[dict] = None) -> None: + self.decode_backend = decode_backend + self.to_xyz = to_xyz + self.to_float32 = to_float32 + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + filename = results['img_path'] + + data_bytes = fileio.get(filename, self.backend_args) + img = datafrombytes(data_bytes, backend=self.decode_backend) + + if self.to_float32: + img = img.astype(np.float32) + + if len(img.shape) == 3: + img = img[None, ...] + + if self.decode_backend == 'nifti': + img = img.transpose(0, 3, 2, 1) + + if self.to_xyz: + img = img.transpose(0, 3, 2, 1) + + results['img'] = img + results['img_shape'] = img.shape[1:] + results['ori_shape'] = img.shape[1:] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f"decode_backend='{self.decode_backend}', " + f'to_xyz={self.to_xyz}, ' + f'to_float32={self.to_float32}, ' + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadBiomedicalAnnotation(BaseTransform): + """Load ``seg_map`` annotation provided by biomedical dataset. + + The annotation format is as follows: + + .. code-block:: python + + { + 'gt_seg_map': np.ndarray (X, Y, Z) or (Z, Y, X) + } + + Required Keys: + + - seg_map_path + + Added Keys: + + - gt_seg_map (np.ndarray): Biomedical seg map with shape (Z, Y, X) by + default, and data type is float32 if set to_float32 = True, or + float64 if decode_backend is 'nifti' and to_float32 is False. + + Args: + decode_backend (str): The data decoding backend type. Options are + 'numpy' and 'nifti', and there is a convention that when backend is + 'nifti' the axis of data loaded is XYZ, and when backend is + 'numpy', the axis is ZYX. The data will be transposed if the + backend is 'nifti'. Defaults to 'nifti'. + to_xyz (bool): Whether to transpose data from Z, Y, X to X, Y, Z. + Defaults to False. + to_float32 (bool): Whether to convert the loaded seg map to a float32 + numpy array. If set to False, the loaded image is a float64 array. + Defaults to True. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See :class:`mmengine.fileio` for details. + Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + decode_backend: str = 'nifti', + to_xyz: bool = False, + to_float32: bool = True, + backend_args: Optional[dict] = None) -> None: + super().__init__() + self.decode_backend = decode_backend + self.to_xyz = to_xyz + self.to_float32 = to_float32 + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information.
+ """ + data_bytes = fileio.get(results['seg_map_path'], self.backend_args) + gt_seg_map = datafrombytes(data_bytes, backend=self.decode_backend) + + if self.to_float32: + gt_seg_map = gt_seg_map.astype(np.float32) + + if self.decode_backend == 'nifti': + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + + if self.to_xyz: + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + + results['gt_seg_map'] = gt_seg_map + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f"decode_backend='{self.decode_backend}', " + f'to_xyz={self.to_xyz}, ' + f'to_float32={self.to_float32}, ' + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadBiomedicalData(BaseTransform): + """Load an biomedical image and annotation from file. + + The loading data format is as the following: + + .. code-block:: python + + { + 'img': np.ndarray data[:-1, X, Y, Z] + 'seg_map': np.ndarray data[-1, X, Y, Z] + } + + + Required Keys: + + - img_path + + Added Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default, + N is the number of modalities. + - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape + (Z, Y, X) by default. + - img_shape + - ori_shape + + Args: + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Defaults to False. + decode_backend (str): The data decoding backend type. Options are + 'numpy'and 'nifti', and there is a convention that when backend is + 'nifti' the axis of data loaded is XYZ, and when backend is + 'numpy', the the axis is ZYX. The data will be transposed if the + backend is 'nifti'. Defaults to 'nifti'. + to_xyz (bool): Whether transpose data from Z, Y, X to X, Y, Z. + Defaults to False. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + with_seg=False, + decode_backend: str = 'numpy', + to_xyz: bool = False, + backend_args: Optional[dict] = None) -> None: # noqa + self.with_seg = with_seg + self.decode_backend = decode_backend + self.to_xyz = to_xyz + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. 
+ """ + data_bytes = fileio.get(results['img_path'], self.backend_args) + data = datafrombytes(data_bytes, backend=self.decode_backend) + # img is 4D data (N, X, Y, Z), N is the number of protocol + img = data[:-1, :] + + if self.decode_backend == 'nifti': + img = img.transpose(0, 3, 2, 1) + + if self.to_xyz: + img = img.transpose(0, 3, 2, 1) + + results['img'] = img + results['img_shape'] = img.shape[1:] + results['ori_shape'] = img.shape[1:] + + if self.with_seg: + gt_seg_map = data[-1, :] + if self.decode_backend == 'nifti': + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + + if self.to_xyz: + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + results['gt_seg_map'] = gt_seg_map + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'with_seg={self.with_seg}, ' + f"decode_backend='{self.decode_backend}', " + f'to_xyz={self.to_xyz}, ' + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class InferencerLoader(BaseTransform): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. + + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def __init__(self, **kwargs) -> None: + super().__init__() + self.from_file = TRANSFORMS.build( + dict(type='LoadImageFromFile', **kwargs)) + self.from_ndarray = TRANSFORMS.build( + dict(type='LoadImageFromNDArray', **kwargs)) + + def transform(self, single_input: Union[str, np.ndarray, dict]) -> dict: + """Transform function to add image meta information. + + Args: + results (dict): Result dict with Webcam read image in + ``results['img']``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + if isinstance(single_input, str): + inputs = dict(img_path=single_input) + elif isinstance(single_input, np.ndarray): + inputs = dict(img=single_input) + elif isinstance(single_input, dict): + inputs = single_input + else: + raise NotImplementedError + + if 'img' in inputs: + return self.from_ndarray(inputs) + return self.from_file(inputs) + + +@TRANSFORMS.register_module() +class LoadSingleRSImageFromFile(BaseTransform): + """Load a Remote Sensing mage from file. + + Required Keys: + + - img_path + + Modified Keys: + + - img + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is a float64 array. + Defaults to True. + """ + + def __init__(self, to_float32: bool = True): + self.to_float32 = to_float32 + + if gdal is None: + raise RuntimeError('gdal is not installed') + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. 
+ """ + + filename = results['img_path'] + ds = gdal.Open(filename) + if ds is None: + raise Exception(f'Unable to open file: {filename}') + img = np.einsum('ijk->jki', ds.ReadAsArray()) + + if self.to_float32: + img = img.astype(np.float32) + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadMultipleRSImageFromFile(BaseTransform): + """Load two Remote Sensing mage from file. + + Required Keys: + + - img_path + - img_path2 + + Modified Keys: + + - img + - img2 + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is a float64 array. + Defaults to True. + """ + + def __init__(self, to_float32: bool = True): + if gdal is None: + raise RuntimeError('gdal is not installed') + self.to_float32 = to_float32 + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + filename = results['img_path'] + filename2 = results['img_path2'] + + ds = gdal.Open(filename) + ds2 = gdal.Open(filename2) + + if ds is None: + raise Exception(f'Unable to open file: {filename}') + if ds2 is None: + raise Exception(f'Unable to open file: {filename2}') + + img = np.einsum('ijk->jki', ds.ReadAsArray()) + img2 = np.einsum('ijk->jki', ds2.ReadAsArray()) + + if self.to_float32: + img = img.astype(np.float32) + img2 = img2.astype(np.float32) + + if img.shape != img2.shape: + raise Exception(f'Image shapes do not match:' + f' {img.shape} vs {img2.shape}') + + results['img'] = img + results['img2'] = img2 + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadDepthAnnotation(BaseTransform): + """Load ``depth_map`` annotation provided by depth estimation dataset. + + The annotation format is as the following: + + .. code-block:: python + + { + 'gt_depth_map': np.ndarray [Y, X] + } + + Required Keys: + + - seg_depth_path + + Added Keys: + + - gt_depth_map (np.ndarray): Depth map with shape (Y, X) by + default, and data type is float32 if set to_float32 = True. + - depth_rescale_factor (float): The rescale factor of depth map, which + can be used to recover the original value of depth map. + + Args: + decode_backend (str): The data decoding backend type. Options are + 'numpy', 'nifti', and 'cv2'. Defaults to 'cv2'. + to_float32 (bool): Whether to convert the loaded depth map to a float32 + numpy array. If set to False, the loaded image is an uint16 array. + Defaults to True. + depth_rescale_factor (float): Factor to rescale the depth value to + limit the range. Defaults to 1.0. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See :class:`mmengine.fileio` for details. + Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. 
+ """ + + def __init__(self, + decode_backend: str = 'cv2', + to_float32: bool = True, + depth_rescale_factor: float = 1.0, + backend_args: Optional[dict] = None) -> None: + super().__init__() + self.decode_backend = decode_backend + self.to_float32 = to_float32 + self.depth_rescale_factor = depth_rescale_factor + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load depth map. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded depth map. + """ + data_bytes = fileio.get(results['depth_map_path'], self.backend_args) + gt_depth_map = datafrombytes(data_bytes, backend=self.decode_backend) + + if self.to_float32: + gt_depth_map = gt_depth_map.astype(np.float32) + + gt_depth_map *= self.depth_rescale_factor + results['gt_depth_map'] = gt_depth_map + results['seg_fields'].append('gt_depth_map') + results['depth_rescale_factor'] = self.depth_rescale_factor + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f"decode_backend='{self.decode_backend}', " + f'to_float32={self.to_float32}, ' + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadImageFromNpyFile(LoadImageFromFile): + """Load an image from ``results['img_path']``. + + Required Keys: + + - img_path + + Modified Keys: + + - img + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def transform(self, results: dict) -> Optional[dict]: + """Functions to load image. + + Args: + results (dict): Result dict from + :class:`mmengine.dataset.BaseDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + filename = results['img_path'] + + try: + if Path(filename).suffix in ['.npy', '.npz']: + img = np.load(filename) + else: + if self.file_client_args is not None: + file_client = fileio.FileClient.infer_client( + self.file_client_args, filename) + img_bytes = file_client.get(filename) + else: + img_bytes = fileio.get( + filename, backend_args=self.backend_args) + img = mmcv.imfrombytes( + img_bytes, + flag=self.color_type, + backend=self.imdecode_backend) + except Exception as e: + if self.ignore_empty: + return None + else: + raise e + + # in some cases, images are not read successfully, the img would be + # `None`, refer to https://github.com/open-mmlab/mmpretrain/issues/1427 + assert img is not None, f'failed to load image: {filename}' + if self.to_float32: + img = img.astype(np.float32) + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results diff --git a/head_extractor/build/lib/mmseg/datasets/transforms/transforms.py b/head_extractor/build/lib/mmseg/datasets/transforms/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..64e23230c66e2cbe13370770646c83e936443398 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/transforms/transforms.py @@ -0,0 +1,2537 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import copy +import inspect +import warnings +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import cv2 +import mmcv +import mmengine +import numpy as np +from mmcv.transforms import RandomFlip as MMCV_RandomFlip +from mmcv.transforms import Resize as MMCV_Resize +from mmcv.transforms.base import BaseTransform +from mmcv.transforms.utils import cache_randomness +from mmengine.utils import is_tuple_of +from numpy import random +from scipy.ndimage import gaussian_filter + +from mmseg.datasets.dataset_wrappers import MultiImageMixDataset +from mmseg.registry import TRANSFORMS + +try: + import albumentations + from albumentations import Compose + ALBU_INSTALLED = True +except ImportError: + albumentations = None + Compose = None + ALBU_INSTALLED = False + + +@TRANSFORMS.register_module() +class ResizeToMultiple(BaseTransform): + """Resize images & seg to multiple of divisor. + + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - img_shape + - pad_shape + + Args: + size_divisor (int): images and gt seg maps need to resize to multiple + of size_divisor. Default: 32. + interpolation (str, optional): The interpolation mode of image resize. + Default: None + """ + + def __init__(self, size_divisor=32, interpolation=None): + self.size_divisor = size_divisor + self.interpolation = interpolation + + def transform(self, results: dict) -> dict: + """Call function to resize images, semantic segmentation map to + multiple of size divisor. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape' keys are updated. + """ + # Align image to multiple of size divisor. + img = results['img'] + img = mmcv.imresize_to_multiple( + img, + self.size_divisor, + scale_factor=1, + interpolation=self.interpolation + if self.interpolation else 'bilinear') + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['pad_shape'] = img.shape[:2] + + # Align segmentation map to multiple of size divisor. + for key in results.get('seg_fields', []): + gt_seg = results[key] + gt_seg = mmcv.imresize_to_multiple( + gt_seg, + self.size_divisor, + scale_factor=1, + interpolation='nearest') + results[key] = gt_seg + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(size_divisor={self.size_divisor}, ' + f'interpolation={self.interpolation})') + return repr_str + + +@TRANSFORMS.register_module() +class Rerange(BaseTransform): + """Rerange the image pixel value. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + min_value (float or int): Minimum value of the reranged image. + Default: 0. + max_value (float or int): Maximum value of the reranged image. + Default: 255. + """ + + def __init__(self, min_value=0, max_value=255): + assert isinstance(min_value, float) or isinstance(min_value, int) + assert isinstance(max_value, float) or isinstance(max_value, int) + assert min_value < max_value + self.min_value = min_value + self.max_value = max_value + + def transform(self, results: dict) -> dict: + """Call function to rerange images. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Reranged results. 
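+
+        Worked example (illustrative numbers): with the default target range
+        ``[0, 255]``, an image whose values span ``[10, 60]`` maps a pixel of
+        ``35`` to ``(35 - 10) / (60 - 10) * (255 - 0) + 0 = 127.5``.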
+ """ + + img = results['img'] + img_min_value = np.min(img) + img_max_value = np.max(img) + + assert img_min_value < img_max_value + # rerange to [0, 1] + img = (img - img_min_value) / (img_max_value - img_min_value) + # rerange to [min_value, max_value] + img = img * (self.max_value - self.min_value) + self.min_value + results['img'] = img + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(min_value={self.min_value}, max_value={self.max_value})' + return repr_str + + +@TRANSFORMS.register_module() +class CLAHE(BaseTransform): + """Use CLAHE method to process the image. + + See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. + Graphics Gems, 1994:474-485.` for more information. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + clip_limit (float): Threshold for contrast limiting. Default: 40.0. + tile_grid_size (tuple[int]): Size of grid for histogram equalization. + Input image will be divided into equally sized rectangular tiles. + It defines the number of tiles in row and column. Default: (8, 8). + """ + + def __init__(self, clip_limit=40.0, tile_grid_size=(8, 8)): + assert isinstance(clip_limit, (float, int)) + self.clip_limit = clip_limit + assert is_tuple_of(tile_grid_size, int) + assert len(tile_grid_size) == 2 + self.tile_grid_size = tile_grid_size + + def transform(self, results: dict) -> dict: + """Call function to Use CLAHE method process images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Processed results. + """ + + for i in range(results['img'].shape[2]): + results['img'][:, :, i] = mmcv.clahe( + np.array(results['img'][:, :, i], dtype=np.uint8), + self.clip_limit, self.tile_grid_size) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(clip_limit={self.clip_limit}, ' \ + f'tile_grid_size={self.tile_grid_size})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomCrop(BaseTransform): + """Random crop the image & seg. + + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - img_shape + - gt_seg_map + + + Args: + crop_size (Union[int, Tuple[int, int]]): Expected size after cropping + with the format of (h, w). If set to an integer, then cropping + width and height are equal to this integer. + cat_max_ratio (float): The maximum ratio that single category could + occupy. + ignore_index (int): The label index to be ignored. Default: 255 + """ + + def __init__(self, + crop_size: Union[int, Tuple[int, int]], + cat_max_ratio: float = 1., + ignore_index: int = 255): + super().__init__() + assert isinstance(crop_size, int) or ( + isinstance(crop_size, tuple) and len(crop_size) == 2 + ), 'The expected crop_size is an integer, or a tuple containing two ' + 'intergers' + + if isinstance(crop_size, int): + crop_size = (crop_size, crop_size) + assert crop_size[0] > 0 and crop_size[1] > 0 + self.crop_size = crop_size + self.cat_max_ratio = cat_max_ratio + self.ignore_index = ignore_index + + @cache_randomness + def crop_bbox(self, results: dict) -> tuple: + """get a crop bounding box. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + tuple: Coordinates of the cropped image. + """ + + def generate_crop_bbox(img: np.ndarray) -> tuple: + """Randomly get a crop bounding box. + + Args: + img (np.ndarray): Original input image. + + Returns: + tuple: Coordinates of the cropped image. 
+ """ + + margin_h = max(img.shape[0] - self.crop_size[0], 0) + margin_w = max(img.shape[1] - self.crop_size[1], 0) + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] + + return crop_y1, crop_y2, crop_x1, crop_x2 + + img = results['img'] + crop_bbox = generate_crop_bbox(img) + if self.cat_max_ratio < 1.: + # Repeat 10 times + for _ in range(10): + seg_temp = self.crop(results['gt_seg_map'], crop_bbox) + labels, cnt = np.unique(seg_temp, return_counts=True) + cnt = cnt[labels != self.ignore_index] + if len(cnt) > 1 and np.max(cnt) / np.sum( + cnt) < self.cat_max_ratio: + break + crop_bbox = generate_crop_bbox(img) + + return crop_bbox + + def crop(self, img: np.ndarray, crop_bbox: tuple) -> np.ndarray: + """Crop from ``img`` + + Args: + img (np.ndarray): Original input image. + crop_bbox (tuple): Coordinates of the cropped image. + + Returns: + np.ndarray: The cropped image. + """ + + crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + return img + + def transform(self, results: dict) -> dict: + """Transform function to randomly crop images, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. + """ + + img = results['img'] + crop_bbox = self.crop_bbox(results) + + # crop the image + img = self.crop(img, crop_bbox) + + # crop semantic seg + for key in results.get('seg_fields', []): + results[key] = self.crop(results[key], crop_bbox) + + results['img'] = img + results['img_shape'] = img.shape[:2] + return results + + def __repr__(self): + return self.__class__.__name__ + f'(crop_size={self.crop_size})' + + +@TRANSFORMS.register_module() +class RandomRotate(BaseTransform): + """Rotate the image & seg. + + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - gt_seg_map + + Args: + prob (float): The rotation probability. + degree (float, tuple[float]): Range of degrees to select from. If + degree is a number instead of tuple like (min, max), + the range of degree will be (``-degree``, ``+degree``) + pad_val (float, optional): Padding value of image. Default: 0. + seg_pad_val (float, optional): Padding value of segmentation map. + Default: 255. + center (tuple[float], optional): Center point (w, h) of the rotation in + the source image. If not specified, the center of the image will be + used. Default: None. + auto_bound (bool): Whether to adjust the image size to cover the whole + rotated image. Default: False + """ + + def __init__(self, + prob, + degree, + pad_val=0, + seg_pad_val=255, + center=None, + auto_bound=False): + self.prob = prob + assert prob >= 0 and prob <= 1 + if isinstance(degree, (float, int)): + assert degree > 0, f'degree {degree} should be positive' + self.degree = (-degree, degree) + else: + self.degree = degree + assert len(self.degree) == 2, f'degree {self.degree} should be a ' \ + f'tuple of (min, max)' + self.pal_val = pad_val + self.seg_pad_val = seg_pad_val + self.center = center + self.auto_bound = auto_bound + + @cache_randomness + def generate_degree(self): + return np.random.rand() < self.prob, np.random.uniform( + min(*self.degree), max(*self.degree)) + + def transform(self, results: dict) -> dict: + """Call function to rotate image, semantic segmentation maps. 
+ + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Rotated results. + """ + + rotate, degree = self.generate_degree() + if rotate: + # rotate image + results['img'] = mmcv.imrotate( + results['img'], + angle=degree, + border_value=self.pal_val, + center=self.center, + auto_bound=self.auto_bound) + + # rotate segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imrotate( + results[key], + angle=degree, + border_value=self.seg_pad_val, + center=self.center, + auto_bound=self.auto_bound, + interpolation='nearest') + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' \ + f'degree={self.degree}, ' \ + f'pad_val={self.pal_val}, ' \ + f'seg_pad_val={self.seg_pad_val}, ' \ + f'center={self.center}, ' \ + f'auto_bound={self.auto_bound})' + return repr_str + + +@TRANSFORMS.register_module() +class RGB2Gray(BaseTransform): + """Convert RGB image to grayscale image. + + Required Keys: + + - img + + Modified Keys: + + - img + - img_shape + + This transform calculate the weighted mean of input image channels with + ``weights`` and then expand the channels to ``out_channels``. When + ``out_channels`` is None, the number of output channels is the same as + input channels. + + Args: + out_channels (int): Expected number of output channels after + transforming. Default: None. + weights (tuple[float]): The weights to calculate the weighted mean. + Default: (0.299, 0.587, 0.114). + """ + + def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)): + assert out_channels is None or out_channels > 0 + self.out_channels = out_channels + assert isinstance(weights, tuple) + for item in weights: + assert isinstance(item, (float, int)) + self.weights = weights + + def transform(self, results: dict) -> dict: + """Call function to convert RGB image to grayscale image. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with grayscale image. + """ + img = results['img'] + assert len(img.shape) == 3 + assert img.shape[2] == len(self.weights) + weights = np.array(self.weights).reshape((1, 1, -1)) + img = (img * weights).sum(2, keepdims=True) + if self.out_channels is None: + img = img.repeat(weights.shape[2], axis=2) + else: + img = img.repeat(self.out_channels, axis=2) + + results['img'] = img + results['img_shape'] = img.shape + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(out_channels={self.out_channels}, ' \ + f'weights={self.weights})' + return repr_str + + +@TRANSFORMS.register_module() +class AdjustGamma(BaseTransform): + """Using gamma correction to process the image. + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + gamma (float or int): Gamma value used in gamma correction. + Default: 1.0. + """ + + def __init__(self, gamma=1.0): + assert isinstance(gamma, float) or isinstance(gamma, int) + assert gamma > 0 + self.gamma = gamma + inv_gamma = 1.0 / gamma + self.table = np.array([(i / 255.0)**inv_gamma * 255 + for i in np.arange(256)]).astype('uint8') + + def transform(self, results: dict) -> dict: + """Call function to process the image with gamma correction. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Processed results. 
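+
+        Worked example (illustrative numbers): with ``gamma=2.0`` the lookup
+        table stores ``(i / 255) ** 0.5 * 255``, so an input value of ``64``
+        maps to ``int((64 / 255) ** 0.5 * 255) = 127``, brightening dark
+        regions.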
+ """ + + results['img'] = mmcv.lut_transform( + np.array(results['img'], dtype=np.uint8), self.table) + + return results + + def __repr__(self): + return self.__class__.__name__ + f'(gamma={self.gamma})' + + +@TRANSFORMS.register_module() +class SegRescale(BaseTransform): + """Rescale semantic segmentation maps. + + Required Keys: + + - gt_seg_map + + Modified Keys: + + - gt_seg_map + + Args: + scale_factor (float): The scale factor of the final output. + """ + + def __init__(self, scale_factor=1): + self.scale_factor = scale_factor + + def transform(self, results: dict) -> dict: + """Call function to scale the semantic segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with semantic segmentation map scaled. + """ + for key in results.get('seg_fields', []): + if self.scale_factor != 1: + results[key] = mmcv.imrescale( + results[key], self.scale_factor, interpolation='nearest') + return results + + def __repr__(self): + return self.__class__.__name__ + f'(scale_factor={self.scale_factor})' + + +@TRANSFORMS.register_module() +class PhotoMetricDistortion(BaseTransform): + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + + Required Keys: + + - img + + Modified Keys: + + - img + + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta: int = 32, + contrast_range: Sequence[float] = (0.5, 1.5), + saturation_range: Sequence[float] = (0.5, 1.5), + hue_delta: int = 18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def convert(self, + img: np.ndarray, + alpha: int = 1, + beta: int = 0) -> np.ndarray: + """Multiple with alpha and add beat with clip. + + Args: + img (np.ndarray): The input image. + alpha (int): Image weights, change the contrast/saturation + of the image. Default: 1 + beta (int): Image bias, change the brightness of the + image. Default: 0 + + Returns: + np.ndarray: The transformed image. + """ + + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img: np.ndarray) -> np.ndarray: + """Brightness distortion. + + Args: + img (np.ndarray): The input image. + Returns: + np.ndarray: Image after brightness change. + """ + + if random.randint(2): + return self.convert( + img, + beta=random.uniform(-self.brightness_delta, + self.brightness_delta)) + return img + + def contrast(self, img: np.ndarray) -> np.ndarray: + """Contrast distortion. + + Args: + img (np.ndarray): The input image. + Returns: + np.ndarray: Image after contrast change. + """ + + if random.randint(2): + return self.convert( + img, + alpha=random.uniform(self.contrast_lower, self.contrast_upper)) + return img + + def saturation(self, img: np.ndarray) -> np.ndarray: + """Saturation distortion. + + Args: + img (np.ndarray): The input image. + Returns: + np.ndarray: Image after saturation change. 
+ """ + + if random.randint(2): + img = mmcv.bgr2hsv(img) + img[:, :, 1] = self.convert( + img[:, :, 1], + alpha=random.uniform(self.saturation_lower, + self.saturation_upper)) + img = mmcv.hsv2bgr(img) + return img + + def hue(self, img: np.ndarray) -> np.ndarray: + """Hue distortion. + + Args: + img (np.ndarray): The input image. + Returns: + np.ndarray: Image after hue change. + """ + + if random.randint(2): + img = mmcv.bgr2hsv(img) + img[:, :, + 0] = (img[:, :, 0].astype(int) + + random.randint(-self.hue_delta, self.hue_delta)) % 180 + img = mmcv.hsv2bgr(img) + return img + + def transform(self, results: dict) -> dict: + """Transform function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. + """ + + img = results['img'] + # random brightness + img = self.brightness(img) + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + img = self.contrast(img) + + # random saturation + img = self.saturation(img) + + # random hue + img = self.hue(img) + + # random contrast + if mode == 0: + img = self.contrast(img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(brightness_delta={self.brightness_delta}, ' + f'contrast_range=({self.contrast_lower}, ' + f'{self.contrast_upper}), ' + f'saturation_range=({self.saturation_lower}, ' + f'{self.saturation_upper}), ' + f'hue_delta={self.hue_delta})') + return repr_str + + +@TRANSFORMS.register_module() +class RandomCutOut(BaseTransform): + """CutOut operation. + + Randomly drop some regions of image used in + `Cutout `_. + + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - gt_seg_map + + Args: + prob (float): cutout probability. + n_holes (int | tuple[int, int]): Number of regions to be dropped. + If it is given as a list, number of holes will be randomly + selected from the closed interval [`n_holes[0]`, `n_holes[1]`]. + cutout_shape (tuple[int, int] | list[tuple[int, int]]): The candidate + shape of dropped regions. It can be `tuple[int, int]` to use a + fixed cutout shape, or `list[tuple[int, int]]` to randomly choose + shape from the list. + cutout_ratio (tuple[float, float] | list[tuple[float, float]]): The + candidate ratio of dropped regions. It can be `tuple[float, float]` + to use a fixed ratio or `list[tuple[float, float]]` to randomly + choose ratio from the list. Please note that `cutout_shape` + and `cutout_ratio` cannot be both given at the same time. + fill_in (tuple[float, float, float] | tuple[int, int, int]): The value + of pixel to fill in the dropped regions. Default: (0, 0, 0). + seg_fill_in (int): The labels of pixel to fill in the dropped regions. + If seg_fill_in is None, skip. Default: None. + """ + + def __init__(self, + prob, + n_holes, + cutout_shape=None, + cutout_ratio=None, + fill_in=(0, 0, 0), + seg_fill_in=None): + + assert 0 <= prob and prob <= 1 + assert (cutout_shape is None) ^ (cutout_ratio is None), \ + 'Either cutout_shape or cutout_ratio should be specified.' 
+ assert (isinstance(cutout_shape, (list, tuple)) + or isinstance(cutout_ratio, (list, tuple))) + if isinstance(n_holes, tuple): + assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1] + else: + n_holes = (n_holes, n_holes) + if seg_fill_in is not None: + assert (isinstance(seg_fill_in, int) and 0 <= seg_fill_in + and seg_fill_in <= 255) + self.prob = prob + self.n_holes = n_holes + self.fill_in = fill_in + self.seg_fill_in = seg_fill_in + self.with_ratio = cutout_ratio is not None + self.candidates = cutout_ratio if self.with_ratio else cutout_shape + if not isinstance(self.candidates, list): + self.candidates = [self.candidates] + + @cache_randomness + def do_cutout(self): + return np.random.rand() < self.prob + + @cache_randomness + def generate_patches(self, results): + cutout = self.do_cutout() + + h, w, _ = results['img'].shape + if cutout: + n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1) + else: + n_holes = 0 + x1_lst = [] + y1_lst = [] + index_lst = [] + for _ in range(n_holes): + x1_lst.append(np.random.randint(0, w)) + y1_lst.append(np.random.randint(0, h)) + index_lst.append(np.random.randint(0, len(self.candidates))) + return cutout, n_holes, x1_lst, y1_lst, index_lst + + def transform(self, results: dict) -> dict: + """Call function to drop some regions of image.""" + cutout, n_holes, x1_lst, y1_lst, index_lst = self.generate_patches( + results) + if cutout: + h, w, c = results['img'].shape + for i in range(n_holes): + x1 = x1_lst[i] + y1 = y1_lst[i] + index = index_lst[i] + if not self.with_ratio: + cutout_w, cutout_h = self.candidates[index] + else: + cutout_w = int(self.candidates[index][0] * w) + cutout_h = int(self.candidates[index][1] * h) + + x2 = np.clip(x1 + cutout_w, 0, w) + y2 = np.clip(y1 + cutout_h, 0, h) + results['img'][y1:y2, x1:x2, :] = self.fill_in + + if self.seg_fill_in is not None: + for key in results.get('seg_fields', []): + results[key][y1:y2, x1:x2] = self.seg_fill_in + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'n_holes={self.n_holes}, ' + repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio + else f'cutout_shape={self.candidates}, ') + repr_str += f'fill_in={self.fill_in}, ' + repr_str += f'seg_fill_in={self.seg_fill_in})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomRotFlip(BaseTransform): + """Rotate and flip the image & seg or just rotate the image & seg. + + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - gt_seg_map + + Args: + rotate_prob (float): The probability of rotate image. + flip_prob (float): The probability of rotate&flip image. + degree (float, tuple[float]): Range of degrees to select from. 
If + degree is a number instead of tuple like (min, max), + the range of degree will be (``-degree``, ``+degree``) + """ + + def __init__(self, rotate_prob=0.5, flip_prob=0.5, degree=(-20, 20)): + self.rotate_prob = rotate_prob + self.flip_prob = flip_prob + assert 0 <= rotate_prob <= 1 and 0 <= flip_prob <= 1 + if isinstance(degree, (float, int)): + assert degree > 0, f'degree {degree} should be positive' + self.degree = (-degree, degree) + else: + self.degree = degree + assert len(self.degree) == 2, f'degree {self.degree} should be a ' \ + f'tuple of (min, max)' + + def random_rot_flip(self, results: dict) -> dict: + k = np.random.randint(0, 4) + results['img'] = np.rot90(results['img'], k) + for key in results.get('seg_fields', []): + results[key] = np.rot90(results[key], k) + axis = np.random.randint(0, 2) + results['img'] = np.flip(results['img'], axis=axis).copy() + for key in results.get('seg_fields', []): + results[key] = np.flip(results[key], axis=axis).copy() + return results + + def random_rotate(self, results: dict) -> dict: + angle = np.random.uniform(min(*self.degree), max(*self.degree)) + results['img'] = mmcv.imrotate(results['img'], angle=angle) + for key in results.get('seg_fields', []): + results[key] = mmcv.imrotate(results[key], angle=angle) + return results + + def transform(self, results: dict) -> dict: + """Call function to rotate or rotate & flip image, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Rotated or rotated & flipped results. + """ + rotate_flag = 0 + if random.random() < self.rotate_prob: + results = self.random_rotate(results) + rotate_flag = 1 + if random.random() < self.flip_prob and rotate_flag == 0: + results = self.random_rot_flip(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(rotate_prob={self.rotate_prob}, ' \ + f'flip_prob={self.flip_prob}, ' \ + f'degree={self.degree})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomFlip(MMCV_RandomFlip): + """Flip the image & bbox & segmentation map. Added or Updated + keys: flip, flip_direction, img, gt_bboxes, gt_seg_map, and gt_depth_map. + There are 3 flip modes: + + - ``prob`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``prob`` . + E.g., ``prob=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + + - ``prob`` is float, ``direction`` is list of string: the image will + be ``direction[i]``ly flipped with probability of + ``prob/len(direction)``. + E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + + - ``prob`` is list of float, ``direction`` is list of string: + given ``len(prob) == len(direction)``, the image will + be ``direction[i]``ly flipped with probability of ``prob[i]``. + E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with + probability of 0.3, vertically with probability of 0.5. + + Required Keys: + + - img + - gt_bboxes (optional) + - gt_seg_map (optional) + - gt_depth_map (optional) + + Modified Keys: + + - img + - gt_bboxes (optional) + - gt_seg_map (optional) + - gt_depth_map (optional) + + Added Keys: + + - flip + - flip_direction + - swap_seg_labels (optional) + + Args: + prob (float | list[float], optional): The flipping probability. + Defaults to None. 
+        direction(str | list[str]): The flipping direction. Options are
+            'horizontal' and 'vertical'. If input is a list, the length must
+            equal ``prob``. Each element in ``prob`` indicates the flip
+            probability of the corresponding direction. Defaults to
+            'horizontal'.
+        swap_seg_labels (list, optional): The label pairs that need to be
+            swapped in the ground truth, like 'left arm' and 'right arm'
+            which need to be swapped after horizontal flipping. For example,
+            ``[(1, 5)]``, where 1/5 is the label of the left/right arm.
+            Defaults to None.
+    """
+
+    def _flip(self, results: dict) -> None:
+        """Flip images, bounding boxes and semantic segmentation map."""
+        # flip image
+        results['img'] = mmcv.imflip(
+            results['img'], direction=results['flip_direction'])
+
+        img_shape = results['img'].shape[:2]
+
+        # flip bboxes
+        if results.get('gt_bboxes', None) is not None:
+            results['gt_bboxes'] = self._flip_bbox(results['gt_bboxes'],
+                                                   img_shape,
+                                                   results['flip_direction'])
+
+        # flip seg map
+        for key in results.get('seg_fields', []):
+            if results.get(key, None) is not None:
+                results[key] = self._flip_seg_map(
+                    results[key], direction=results['flip_direction']).copy()
+                results['swap_seg_labels'] = self.swap_seg_labels
+
+
+@TRANSFORMS.register_module()
+class Resize(MMCV_Resize):
+    """Resize images & seg & depth map.
+
+    This transform resizes the input image according to ``scale`` or
+    ``scale_factor``. The seg map, depth map and other relative annotations
+    are then resized with the same scale factor. If ``scale`` and
+    ``scale_factor`` are both set, ``scale`` will be used to resize.
+
+    Required Keys:
+
+    - img
+    - gt_seg_map (optional)
+    - gt_depth_map (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_seg_map
+    - gt_depth_map
+
+    Added Keys:
+
+    - scale
+    - scale_factor
+    - keep_ratio
+
+    Args:
+        scale (int or tuple): Image scales for resizing. Defaults to None.
+        scale_factor (float or tuple[float]): Scale factors for resizing.
+            Defaults to None.
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image. Defaults to False.
+        clip_object_border (bool): Whether to clip the objects
+            outside the border of the image. In some datasets like MOT17, the
+            gt bboxes are allowed to cross the border of images. Therefore,
+            we don't need to clip the gt bboxes in these cases. Defaults to
+            True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generate slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def _resize_seg(self, results: dict) -> None:
+        """Resize semantic segmentation map with ``results['scale']``."""
+        for seg_key in results.get('seg_fields', []):
+            if results.get(seg_key, None) is not None:
+                if self.keep_ratio:
+                    gt_seg = mmcv.imrescale(
+                        results[seg_key],
+                        results['scale'],
+                        interpolation='nearest',
+                        backend=self.backend)
+                else:
+                    gt_seg = mmcv.imresize(
+                        results[seg_key],
+                        results['scale'],
+                        interpolation='nearest',
+                        backend=self.backend)
+                results[seg_key] = gt_seg
+
+
+@TRANSFORMS.register_module()
+class RandomMosaic(BaseTransform):
+    """Mosaic augmentation. Given 4 images, the mosaic transform combines
+    them into one output image. The output image is composed of the parts
+    from each sub-image.
+
+    ..
code:: text + + mosaic transform + center_x + +------------------------------+ + | pad | pad | + | +-----------+ | + | | | | + | | image1 |--------+ | + | | | | | + | | | image2 | | + center_y |----+-------------+-----------| + | | cropped | | + |pad | image3 | image4 | + | | | | + +----|-------------+-----------+ + | | + +-------------+ + + The mosaic transform steps are as follows: + 1. Choose the mosaic center as the intersections of 4 images + 2. Get the left top image according to the index, and randomly + sample another 3 images from the custom dataset. + 3. Sub image will be cropped if image is larger than mosaic patch + + Required Keys: + + - img + - gt_seg_map + - mix_results + + Modified Keys: + + - img + - img_shape + - ori_shape + - gt_seg_map + + Args: + prob (float): mosaic probability. + img_scale (Sequence[int]): Image size after mosaic pipeline of + a single image. The size of the output image is four times + that of a single image. The output image comprises 4 single images. + Default: (640, 640). + center_ratio_range (Sequence[float]): Center ratio range of mosaic + output. Default: (0.5, 1.5). + pad_val (int): Pad value. Default: 0. + seg_pad_val (int): Pad value of segmentation map. Default: 255. + """ + + def __init__(self, + prob, + img_scale=(640, 640), + center_ratio_range=(0.5, 1.5), + pad_val=0, + seg_pad_val=255): + assert 0 <= prob and prob <= 1 + assert isinstance(img_scale, tuple) + self.prob = prob + self.img_scale = img_scale + self.center_ratio_range = center_ratio_range + self.pad_val = pad_val + self.seg_pad_val = seg_pad_val + + @cache_randomness + def do_mosaic(self): + return np.random.rand() < self.prob + + def transform(self, results: dict) -> dict: + """Call function to make a mosaic of image. + + Args: + results (dict): Result dict. + + Returns: + dict: Result dict with mosaic transformed. + """ + mosaic = self.do_mosaic() + if mosaic: + results = self._mosaic_transform_img(results) + results = self._mosaic_transform_seg(results) + return results + + def get_indices(self, dataset: MultiImageMixDataset) -> list: + """Call function to collect indices. + + Args: + dataset (:obj:`MultiImageMixDataset`): The dataset. + + Returns: + list: indices. + """ + + indices = [random.randint(0, len(dataset)) for _ in range(3)] + return indices + + @cache_randomness + def generate_mosaic_center(self): + # mosaic center x, y + center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) + center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) + return center_x, center_y + + def _mosaic_transform_img(self, results: dict) -> dict: + """Mosaic transform function. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. 
+ """ + + assert 'mix_results' in results + if len(results['img'].shape) == 3: + c = results['img'].shape[2] + mosaic_img = np.full( + (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2), c), + self.pad_val, + dtype=results['img'].dtype) + else: + mosaic_img = np.full( + (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)), + self.pad_val, + dtype=results['img'].dtype) + + # mosaic center x, y + self.center_x, self.center_y = self.generate_mosaic_center() + center_position = (self.center_x, self.center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + result_patch = copy.deepcopy(results) + else: + result_patch = copy.deepcopy(results['mix_results'][i - 1]) + + img_i = result_patch['img'] + h_i, w_i = img_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(self.img_scale[0] / h_i, + self.img_scale[1] / w_i) + img_i = mmcv.imresize( + img_i, (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i))) + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, img_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_img[y1_p:y2_p, x1_p:x2_p] = img_i[y1_c:y2_c, x1_c:x2_c] + + results['img'] = mosaic_img + results['img_shape'] = mosaic_img.shape + results['ori_shape'] = mosaic_img.shape + + return results + + def _mosaic_transform_seg(self, results: dict) -> dict: + """Mosaic transform function for label annotations. + + Args: + results (dict): Result dict. + + Returns: + dict: Updated result dict. + """ + + assert 'mix_results' in results + for key in results.get('seg_fields', []): + mosaic_seg = np.full( + (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2)), + self.seg_pad_val, + dtype=results[key].dtype) + + # mosaic center x, y + center_position = (self.center_x, self.center_y) + + loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') + for i, loc in enumerate(loc_strs): + if loc == 'top_left': + result_patch = copy.deepcopy(results) + else: + result_patch = copy.deepcopy(results['mix_results'][i - 1]) + + gt_seg_i = result_patch[key] + h_i, w_i = gt_seg_i.shape[:2] + # keep_ratio resize + scale_ratio_i = min(self.img_scale[0] / h_i, + self.img_scale[1] / w_i) + gt_seg_i = mmcv.imresize( + gt_seg_i, + (int(w_i * scale_ratio_i), int(h_i * scale_ratio_i)), + interpolation='nearest') + + # compute the combine parameters + paste_coord, crop_coord = self._mosaic_combine( + loc, center_position, gt_seg_i.shape[:2][::-1]) + x1_p, y1_p, x2_p, y2_p = paste_coord + x1_c, y1_c, x2_c, y2_c = crop_coord + + # crop and paste image + mosaic_seg[y1_p:y2_p, x1_p:x2_p] = \ + gt_seg_i[y1_c:y2_c, x1_c:x2_c] + + results[key] = mosaic_seg + + return results + + def _mosaic_combine(self, loc: str, center_position_xy: Sequence[float], + img_shape_wh: Sequence[int]) -> tuple: + """Calculate global coordinate of mosaic image and local coordinate of + cropped sub-image. + + Args: + loc (str): Index for the sub-image, loc in ('top_left', + 'top_right', 'bottom_left', 'bottom_right'). + center_position_xy (Sequence[float]): Mixing center for 4 images, + (x, y). + img_shape_wh (Sequence[int]): Width and height of sub-image + + Returns: + tuple[tuple[float]]: Corresponding coordinate of pasting and + cropping + - paste_coord (tuple): paste corner coordinate in mosaic image. + - crop_coord (tuple): crop corner coordinate in mosaic image. 
+ """ + + assert loc in ('top_left', 'top_right', 'bottom_left', 'bottom_right') + if loc == 'top_left': + # index0 to top left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + center_position_xy[0], \ + center_position_xy[1] + crop_coord = img_shape_wh[0] - (x2 - x1), img_shape_wh[1] - ( + y2 - y1), img_shape_wh[0], img_shape_wh[1] + + elif loc == 'top_right': + # index1 to top right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + max(center_position_xy[1] - img_shape_wh[1], 0), \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[1] * 2), \ + center_position_xy[1] + crop_coord = 0, img_shape_wh[1] - (y2 - y1), min( + img_shape_wh[0], x2 - x1), img_shape_wh[1] + + elif loc == 'bottom_left': + # index2 to bottom left part of image + x1, y1, x2, y2 = max(center_position_xy[0] - img_shape_wh[0], 0), \ + center_position_xy[1], \ + center_position_xy[0], \ + min(self.img_scale[0] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = img_shape_wh[0] - (x2 - x1), 0, img_shape_wh[0], min( + y2 - y1, img_shape_wh[1]) + + else: + # index3 to bottom right part of image + x1, y1, x2, y2 = center_position_xy[0], \ + center_position_xy[1], \ + min(center_position_xy[0] + img_shape_wh[0], + self.img_scale[1] * 2), \ + min(self.img_scale[0] * 2, center_position_xy[1] + + img_shape_wh[1]) + crop_coord = 0, 0, min(img_shape_wh[0], + x2 - x1), min(y2 - y1, img_shape_wh[1]) + + paste_coord = x1, y1, x2, y2 + return paste_coord, crop_coord + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'img_scale={self.img_scale}, ' + repr_str += f'center_ratio_range={self.center_ratio_range}, ' + repr_str += f'pad_val={self.pad_val}, ' + repr_str += f'seg_pad_val={self.pad_val})' + return repr_str + + +@TRANSFORMS.register_module() +class GenerateEdge(BaseTransform): + """Generate Edge for CE2P approach. + + Edge will be used to calculate loss of + `CE2P `_. + + Modified from https://github.com/liutinglt/CE2P/blob/master/dataset/target_generation.py # noqa:E501 + + Required Keys: + + - img_shape + - gt_seg_map + + Added Keys: + - gt_edge_map (np.ndarray, uint8): The edge annotation generated from the + seg map by extracting border between different semantics. + + Args: + edge_width (int): The width of edge. Default to 3. + ignore_index (int): Index that will be ignored. Default to 255. + """ + + def __init__(self, edge_width: int = 3, ignore_index: int = 255) -> None: + super().__init__() + self.edge_width = edge_width + self.ignore_index = ignore_index + + def transform(self, results: Dict) -> Dict: + """Call function to generate edge from segmentation map. + + Args: + results (dict): Result dict. + + Returns: + dict: Result dict with edge mask. 
+ """ + h, w = results['img_shape'] + edge = np.zeros((h, w), dtype=np.uint8) + seg_map = results['gt_seg_map'] + + # down + edge_down = edge[1:h, :] + edge_down[(seg_map[1:h, :] != seg_map[:h - 1, :]) + & (seg_map[1:h, :] != self.ignore_index) & + (seg_map[:h - 1, :] != self.ignore_index)] = 1 + # left + edge_left = edge[:, :w - 1] + edge_left[(seg_map[:, :w - 1] != seg_map[:, 1:w]) + & (seg_map[:, :w - 1] != self.ignore_index) & + (seg_map[:, 1:w] != self.ignore_index)] = 1 + # up_left + edge_upleft = edge[:h - 1, :w - 1] + edge_upleft[(seg_map[:h - 1, :w - 1] != seg_map[1:h, 1:w]) + & (seg_map[:h - 1, :w - 1] != self.ignore_index) & + (seg_map[1:h, 1:w] != self.ignore_index)] = 1 + # up_right + edge_upright = edge[:h - 1, 1:w] + edge_upright[(seg_map[:h - 1, 1:w] != seg_map[1:h, :w - 1]) + & (seg_map[:h - 1, 1:w] != self.ignore_index) & + (seg_map[1:h, :w - 1] != self.ignore_index)] = 1 + + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, + (self.edge_width, self.edge_width)) + edge = cv2.dilate(edge, kernel) + + results['gt_edge_map'] = edge + results['edge_width'] = self.edge_width + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'edge_width={self.edge_width}, ' + repr_str += f'ignore_index={self.ignore_index})' + return repr_str + + +@TRANSFORMS.register_module() +class ResizeShortestEdge(BaseTransform): + """Resize the image and mask while keeping the aspect ratio unchanged. + + Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130 # noqa:E501 + Copyright (c) Facebook, Inc. and its affiliates. + Licensed under the Apache-2.0 License + + This transform attempts to scale the shorter edge to the given + `scale`, as long as the longer edge does not exceed `max_size`. + If `max_size` is reached, then downscale so that the longer + edge does not exceed `max_size`. + + Required Keys: + + - img + - gt_seg_map (optional) + + Modified Keys: + + - img + - img_shape + - gt_seg_map (optional)) + + Added Keys: + + - scale + - scale_factor + - keep_ratio + + + Args: + scale (Union[int, Tuple[int, int]]): The target short edge length. + If it's tuple, will select the min value as the short edge length. + max_size (int): The maximum allowed longest edge length. + """ + + def __init__(self, scale: Union[int, Tuple[int, int]], + max_size: int) -> None: + super().__init__() + self.scale = scale + self.max_size = max_size + + # Create a empty Resize object + self.resize = TRANSFORMS.build({ + 'type': 'Resize', + 'scale': 0, + 'keep_ratio': True + }) + + def _get_output_shape(self, img, short_edge_length) -> Tuple[int, int]: + """Compute the target image shape with the given `short_edge_length`. + + Args: + img (np.ndarray): The input image. + short_edge_length (Union[int, Tuple[int, int]]): The target short + edge length. If it's tuple, will select the min value as the + short edge length. 
+ """ + h, w = img.shape[:2] + if isinstance(short_edge_length, int): + size = short_edge_length * 1.0 + elif isinstance(short_edge_length, tuple): + size = min(short_edge_length) * 1.0 + scale = size / min(h, w) + if h < w: + new_h, new_w = size, scale * w + else: + new_h, new_w = scale * h, size + + if max(new_h, new_w) > self.max_size: + scale = self.max_size * 1.0 / max(new_h, new_w) + new_h *= scale + new_w *= scale + + new_h = int(new_h + 0.5) + new_w = int(new_w + 0.5) + return (new_w, new_h) + + def transform(self, results: Dict) -> Dict: + self.resize.scale = self._get_output_shape(results['img'], self.scale) + return self.resize(results) + + +@TRANSFORMS.register_module() +class BioMedical3DRandomCrop(BaseTransform): + """Crop the input patch for medical image & segmentation mask. + + Required Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X), + N is the number of modalities, and data type is float32. + - gt_seg_map (np.ndarray, optional): Biomedical semantic segmentation mask + with shape (Z, Y, X). + + Modified Keys: + + - img + - img_shape + - gt_seg_map (optional) + + Args: + crop_shape (Union[int, Tuple[int, int, int]]): Expected size after + cropping with the format of (z, y, x). If set to an integer, + then cropping width and height are equal to this integer. + keep_foreground (bool): If keep_foreground is True, it will sample a + voxel of foreground classes randomly, and will take it as the + center of the crop bounding-box. Default to True. + """ + + def __init__(self, + crop_shape: Union[int, Tuple[int, int, int]], + keep_foreground: bool = True): + super().__init__() + assert isinstance(crop_shape, int) or ( + isinstance(crop_shape, tuple) and len(crop_shape) == 3 + ), 'The expected crop_shape is an integer, or a tuple containing ' + 'three integers' + + if isinstance(crop_shape, int): + crop_shape = (crop_shape, crop_shape, crop_shape) + assert crop_shape[0] > 0 and crop_shape[1] > 0 and crop_shape[2] > 0 + self.crop_shape = crop_shape + self.keep_foreground = keep_foreground + + def random_sample_location(self, seg_map: np.ndarray) -> dict: + """sample foreground voxel when keep_foreground is True. + + Args: + seg_map (np.ndarray): gt seg map. + + Returns: + dict: Coordinates of selected foreground voxel. + """ + num_samples = 10000 + # at least 1% of the class voxels need to be selected, + # otherwise it may be too sparse + min_percent_coverage = 0.01 + class_locs = {} + foreground_classes = [] + all_classes = np.unique(seg_map) + for c in all_classes: + if c == 0: + # to avoid the segmentation mask full of background 0 + # and the class_locs is just void dictionary {} when it return + # there add a void list for background 0. + class_locs[c] = [] + else: + all_locs = np.argwhere(seg_map == c) + target_num_samples = min(num_samples, len(all_locs)) + target_num_samples = max( + target_num_samples, + int(np.ceil(len(all_locs) * min_percent_coverage))) + + selected = all_locs[np.random.choice( + len(all_locs), target_num_samples, replace=False)] + class_locs[c] = selected + foreground_classes.append(c) + + selected_voxel = None + if len(foreground_classes) > 0: + selected_class = np.random.choice(foreground_classes) + voxels_of_that_class = class_locs[selected_class] + selected_voxel = voxels_of_that_class[np.random.choice( + len(voxels_of_that_class))] + + return selected_voxel + + def random_generate_crop_bbox(self, margin_z: int, margin_y: int, + margin_x: int) -> tuple: + """Randomly get a crop bounding box. 
+
+        Args:
+            margin_z (int): Margin of the crop box along the z axis.
+            margin_y (int): Margin of the crop box along the y axis.
+            margin_x (int): Margin of the crop box along the x axis.
+
+        Returns:
+            tuple: Coordinates of the cropped image.
+        """
+        offset_z = np.random.randint(0, margin_z + 1)
+        offset_y = np.random.randint(0, margin_y + 1)
+        offset_x = np.random.randint(0, margin_x + 1)
+        crop_z1, crop_z2 = offset_z, offset_z + self.crop_shape[0]
+        crop_y1, crop_y2 = offset_y, offset_y + self.crop_shape[1]
+        crop_x1, crop_x2 = offset_x, offset_x + self.crop_shape[2]
+
+        return crop_z1, crop_z2, crop_y1, crop_y2, crop_x1, crop_x2
+
+    def generate_margin(self, results: dict) -> tuple:
+        """Generate margin of crop bounding-box.
+
+        If keep_foreground is True, it will sample a voxel of foreground
+        classes randomly, and will take it as the center of the bounding-box,
+        and return the margin between the bounding-box and the image.
+        If keep_foreground is False, it will return the difference between
+        the image shape and the crop shape.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            tuple: The margin for 3 dimensions of crop bounding-box and image.
+        """
+
+        seg_map = results['gt_seg_map']
+        if self.keep_foreground:
+            selected_voxel = self.random_sample_location(seg_map)
+            if selected_voxel is None:
+                # this only happens if some image does not contain
+                # foreground voxels at all
+                warnings.warn(f'case does not contain any foreground classes'
+                              f': {results["img_path"]}')
+                margin_z = max(seg_map.shape[0] - self.crop_shape[0], 0)
+                margin_y = max(seg_map.shape[1] - self.crop_shape[1], 0)
+                margin_x = max(seg_map.shape[2] - self.crop_shape[2], 0)
+            else:
+                margin_z = max(0, selected_voxel[0] - self.crop_shape[0] // 2)
+                margin_y = max(0, selected_voxel[1] - self.crop_shape[1] // 2)
+                margin_x = max(0, selected_voxel[2] - self.crop_shape[2] // 2)
+                margin_z = max(
+                    0, min(seg_map.shape[0] - self.crop_shape[0], margin_z))
+                margin_y = max(
+                    0, min(seg_map.shape[1] - self.crop_shape[1], margin_y))
+                margin_x = max(
+                    0, min(seg_map.shape[2] - self.crop_shape[2], margin_x))
+        else:
+            margin_z = max(seg_map.shape[0] - self.crop_shape[0], 0)
+            margin_y = max(seg_map.shape[1] - self.crop_shape[1], 0)
+            margin_x = max(seg_map.shape[2] - self.crop_shape[2], 0)
+
+        return margin_z, margin_y, margin_x
+
+    def crop(self, img: np.ndarray, crop_bbox: tuple) -> np.ndarray:
+        """Crop from ``img``.
+
+        Args:
+            img (np.ndarray): Original input image.
+            crop_bbox (tuple): Coordinates of the cropped image.
+
+        Returns:
+            np.ndarray: The cropped image.
+        """
+        crop_z1, crop_z2, crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox
+        if len(img.shape) == 3:
+            # crop seg map
+            img = img[crop_z1:crop_z2, crop_y1:crop_y2, crop_x1:crop_x2]
+        else:
+            # crop image
+            assert len(img.shape) == 4
+            img = img[:, crop_z1:crop_z2, crop_y1:crop_y2, crop_x1:crop_x2]
+        return img
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to randomly crop images and semantic
+        segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Randomly cropped results, 'img_shape' key in result dict is
+                updated according to crop size.
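+
+        Example (hypothetical shapes, assuming a single-modality volume;
+        the all-zero mask only triggers the no-foreground warning):
+
+        >>> import numpy as np
+        >>> crop = BioMedical3DRandomCrop(crop_shape=(64, 128, 128))
+        >>> results = dict(
+        ...     img=np.zeros((1, 100, 200, 200), dtype=np.float32),
+        ...     gt_seg_map=np.zeros((100, 200, 200), dtype=np.uint8),
+        ...     img_path='demo.nii.gz')
+        >>> out = crop.transform(results)
+        >>> out['img'].shape
+        (1, 64, 128, 128)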
+ """ + margin = self.generate_margin(results) + crop_bbox = self.random_generate_crop_bbox(*margin) + + # crop the image + img = results['img'] + results['img'] = self.crop(img, crop_bbox) + results['img_shape'] = results['img'].shape[1:] + + # crop semantic seg + seg_map = results['gt_seg_map'] + results['gt_seg_map'] = self.crop(seg_map, crop_bbox) + + return results + + def __repr__(self): + return self.__class__.__name__ + f'(crop_shape={self.crop_shape})' + + +@TRANSFORMS.register_module() +class BioMedicalGaussianNoise(BaseTransform): + """Add random Gaussian noise to image. + + Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/7651ece69faf55263dd582a9f5cbd149ed9c3ad0/batchgenerators/transforms/noise_transforms.py#L53 # noqa:E501 + + Copyright (c) German Cancer Research Center (DKFZ) + Licensed under the Apache License, Version 2.0 + + Required Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X), + N is the number of modalities, and data type is float32. + + Modified Keys: + + - img + + Args: + prob (float): Probability to add Gaussian noise for + each sample. Default to 0.1. + mean (float): Mean or “centre” of the distribution. Default to 0.0. + std (float): Standard deviation of distribution. Default to 0.1. + """ + + def __init__(self, + prob: float = 0.1, + mean: float = 0.0, + std: float = 0.1) -> None: + super().__init__() + assert 0.0 <= prob <= 1.0 and std >= 0.0 + self.prob = prob + self.mean = mean + self.std = std + + def transform(self, results: Dict) -> Dict: + """Call function to add random Gaussian noise to image. + + Args: + results (dict): Result dict. + + Returns: + dict: Result dict with random Gaussian noise. + """ + if np.random.rand() < self.prob: + rand_std = np.random.uniform(0, self.std) + noise = np.random.normal( + self.mean, rand_std, size=results['img'].shape) + # noise is float64 array, convert to the results['img'].dtype + noise = noise.astype(results['img'].dtype) + results['img'] = results['img'] + noise + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'mean={self.mean}, ' + repr_str += f'std={self.std})' + return repr_str + + +@TRANSFORMS.register_module() +class BioMedicalGaussianBlur(BaseTransform): + """Add Gaussian blur with random sigma to image. + + Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/7651ece69faf55263dd582a9f5cbd149ed9c3ad0/batchgenerators/transforms/noise_transforms.py#L81 # noqa:E501 + + Copyright (c) German Cancer Research Center (DKFZ) + Licensed under the Apache License, Version 2.0 + + Required Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X), + N is the number of modalities, and data type is float32. + + Modified Keys: + + - img + + Args: + sigma_range (Tuple[float, float]|float): range to randomly + select sigma value. Default to (0.5, 1.0). + prob (float): Probability to apply Gaussian blur + for each sample. Default to 0.2. + prob_per_channel (float): Probability to apply Gaussian blur + for each channel (axis N of the image). Default to 0.5. + different_sigma_per_channel (bool): whether to use different + sigma for each channel (axis N of the image). Default to True. + different_sigma_per_axis (bool): whether to use different + sigma for axis Z, X and Y of the image. Default to True. 
+ """ + + def __init__(self, + sigma_range: Tuple[float, float] = (0.5, 1.0), + prob: float = 0.2, + prob_per_channel: float = 0.5, + different_sigma_per_channel: bool = True, + different_sigma_per_axis: bool = True) -> None: + super().__init__() + assert 0.0 <= prob <= 1.0 + assert 0.0 <= prob_per_channel <= 1.0 + assert isinstance(sigma_range, Sequence) and len(sigma_range) == 2 + self.sigma_range = sigma_range + self.prob = prob + self.prob_per_channel = prob_per_channel + self.different_sigma_per_channel = different_sigma_per_channel + self.different_sigma_per_axis = different_sigma_per_axis + + def _get_valid_sigma(self, value_range) -> Tuple[float, ...]: + """Ensure the `value_range` to be either a single value or a sequence + of two values. If the `value_range` is a sequence, generate a random + value with `[value_range[0], value_range[1]]` based on uniform + sampling. + + Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/7651ece69faf55263dd582a9f5cbd149ed9c3ad0/batchgenerators/augmentations/utils.py#L625 # noqa:E501 + + Args: + value_range (tuple|list|float|int): the input value range + """ + if (isinstance(value_range, (list, tuple))): + if (value_range[0] == value_range[1]): + value = value_range[0] + else: + orig_type = type(value_range[0]) + value = np.random.uniform(value_range[0], value_range[1]) + value = orig_type(value) + return value + + def _gaussian_blur(self, data_sample: np.ndarray) -> np.ndarray: + """Random generate sigma and apply Gaussian Blur to the data + Args: + data_sample (np.ndarray): data sample with multiple modalities, + the data shape is (N, Z, Y, X) + """ + sigma = None + for c in range(data_sample.shape[0]): + if np.random.rand() < self.prob_per_channel: + # if no `sigma` is generated, generate one + # if `self.different_sigma_per_channel` is True, + # re-generate random sigma for each channel + if (sigma is None or self.different_sigma_per_channel): + if (not self.different_sigma_per_axis): + sigma = self._get_valid_sigma(self.sigma_range) + else: + sigma = [ + self._get_valid_sigma(self.sigma_range) + for _ in data_sample.shape[1:] + ] + # apply gaussian filter with `sigma` + data_sample[c] = gaussian_filter( + data_sample[c], sigma, order=0) + return data_sample + + def transform(self, results: Dict) -> Dict: + """Call function to add random Gaussian blur to image. + + Args: + results (dict): Result dict. + + Returns: + dict: Result dict with random Gaussian noise. + """ + if np.random.rand() < self.prob: + results['img'] = self._gaussian_blur(results['img']) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'prob_per_channel={self.prob_per_channel}, ' + repr_str += f'sigma_range={self.sigma_range}, ' + repr_str += 'different_sigma_per_channel=' \ + f'{self.different_sigma_per_channel}, ' + repr_str += 'different_sigma_per_axis=' \ + f'{self.different_sigma_per_axis})' + return repr_str + + +@TRANSFORMS.register_module() +class BioMedicalRandomGamma(BaseTransform): + """Using random gamma correction to process the biomedical image. + + Modified from + https://github.com/MIC-DKFZ/batchgenerators/blob/master/batchgenerators/transforms/color_transforms.py#L132 # noqa:E501 + With licence: Apache 2.0 + + Required Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X), + N is the number of modalities, and data type is float32. + + Modified Keys: + - img + + Args: + prob (float): The probability to perform this transform. Default: 0.5. 
+        gamma_range (Tuple[float]): Range of gamma values. Default: (0.5, 2).
+        invert_image (bool): Whether to invert the image before applying
+            gamma augmentation. Default: False.
+        per_channel (bool): Whether to perform the transform on each channel
+            individually. Default: False.
+        retain_stats (bool): Gamma transformation will alter the mean and std
+            of the data in the patch. If retain_stats=True, the data will be
+            transformed to match the mean and standard deviation before gamma
+            augmentation. Default: False.
+    """
+
+    def __init__(self,
+                 prob: float = 0.5,
+                 gamma_range: Tuple[float] = (0.5, 2),
+                 invert_image: bool = False,
+                 per_channel: bool = False,
+                 retain_stats: bool = False):
+        assert 0 <= prob and prob <= 1
+        assert isinstance(gamma_range, tuple) and len(gamma_range) == 2
+        assert isinstance(invert_image, bool)
+        assert isinstance(per_channel, bool)
+        assert isinstance(retain_stats, bool)
+        self.prob = prob
+        self.gamma_range = gamma_range
+        self.invert_image = invert_image
+        self.per_channel = per_channel
+        self.retain_stats = retain_stats
+
+    @cache_randomness
+    def _do_gamma(self):
+        """Whether to adjust the gamma of the image."""
+        return np.random.rand() < self.prob
+
+    def _adjust_gamma(self, img: np.array):
+        """Gamma adjustment for the image.
+
+        Args:
+            img (np.array): Input image before gamma adjustment.
+
+        Returns:
+            np.array: Image after gamma adjustment.
+        """
+
+        if self.invert_image:
+            img = -img
+
+        def _do_adjust(img):
+            if retain_stats_here:
+                img_mean = img.mean()
+                img_std = img.std()
+            if np.random.random() < 0.5 and self.gamma_range[0] < 1:
+                gamma = np.random.uniform(self.gamma_range[0], 1)
+            else:
+                gamma = np.random.uniform(
+                    max(self.gamma_range[0], 1), self.gamma_range[1])
+            img_min = img.min()
+            img_range = img.max() - img_min  # range
+            img = np.power(((img - img_min) / float(img_range + 1e-7)),
+                           gamma) * img_range + img_min
+            if retain_stats_here:
+                img = img - img.mean()
+                img = img / (img.std() + 1e-8) * img_std
+                img = img + img_mean
+            return img
+
+        # `retain_stats_here` is a closure variable read by `_do_adjust`,
+        # so it must be assigned before either branch calls it; otherwise
+        # the per-channel path raises a NameError.
+        retain_stats_here = self.retain_stats
+        if not self.per_channel:
+            img = _do_adjust(img)
+        else:
+            for c in range(img.shape[0]):
+                img[c] = _do_adjust(img[c])
+        if self.invert_image:
+            img = -img
+        return img
+
+    def transform(self, results: dict) -> dict:
+        """Call function to perform random gamma correction.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with random gamma correction performed.
+        """
+        if self._do_gamma():
+            results['img'] = self._adjust_gamma(results['img'])
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'gamma_range={self.gamma_range}, '
+        repr_str += f'invert_image={self.invert_image}, '
+        repr_str += f'per_channel={self.per_channel}, '
+        repr_str += f'retain_stats={self.retain_stats})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class BioMedical3DPad(BaseTransform):
+    """Pad the biomedical 3d image & biomedical 3d semantic segmentation maps.
+
+    Required Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Modified Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Added Keys:
+
+    - pad_shape (Tuple[int, int, int]): The padded shape.
+
+    Args:
+        pad_shape (Tuple[int, int, int]): Fixed padding size.
+            Expected padding shape (Z, Y, X).
+        pad_val (float): Padding value for biomedical image.
+            The padding mode is set to "constant". The value
+            to be filled in padding area. Default: 0.
+        seg_pad_val (int): Padding value for biomedical 3d semantic
+            segmentation maps. The padding mode is set to "constant".
+            The value to be filled in padding area. Default: 0.
+    """
+
+    def __init__(self,
+                 pad_shape: Tuple[int, int, int],
+                 pad_val: float = 0.,
+                 seg_pad_val: int = 0) -> None:
+
+        # check pad_shape
+        assert pad_shape is not None
+        assert isinstance(pad_shape, tuple) and len(pad_shape) == 3
+
+        self.pad_shape = pad_shape
+        self.pad_val = pad_val
+        self.seg_pad_val = seg_pad_val
+
+    def _pad_img(self, results: dict) -> None:
+        """Pad images according to ``self.pad_shape``
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: The dict contains the padded image and shape
+                information.
+        """
+        padded_img = self._to_pad(
+            results['img'], pad_shape=self.pad_shape, pad_val=self.pad_val)
+
+        results['img'] = padded_img
+        results['pad_shape'] = padded_img.shape[1:]
+
+    def _pad_seg(self, results: dict) -> None:
+        """Pad semantic segmentation map according to ``self.pad_shape`` if
+        ``gt_seg_map`` is not None in results dict.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Update the padded gt seg map in dict.
+        """
+        if results.get('gt_seg_map', None) is not None:
+            pad_gt_seg = self._to_pad(
+                results['gt_seg_map'][None, ...],
+                pad_shape=results['pad_shape'],
+                pad_val=self.seg_pad_val)
+            # drop the modality axis added above to restore (Z, Y, X)
+            results['gt_seg_map'] = pad_gt_seg[0]
+
+    @staticmethod
+    def _to_pad(img: np.ndarray,
+                pad_shape: Tuple[int, int, int],
+                pad_val: Union[int, float] = 0) -> np.ndarray:
+        """Pad the given 3d image to a certain shape with specified padding
+        value.
+
+        Args:
+            img (ndarray): Biomedical image with shape (N, Z, Y, X)
+                to be padded. N is the number of modalities.
+            pad_shape (Tuple[int,int,int]): Expected padding shape (Z, Y, X).
+            pad_val (float, int): Values to be filled in padding areas
+                and the padding_mode is set to 'constant'. Default: 0.
+
+        Returns:
+            ndarray: The padded image.
+        """
+        # compute pad width
+        d = max(pad_shape[0] - img.shape[1], 0)
+        pad_d = (d // 2, d - d // 2)
+        h = max(pad_shape[1] - img.shape[2], 0)
+        pad_h = (h // 2, h - h // 2)
+        w = max(pad_shape[2] - img.shape[3], 0)
+        pad_w = (w // 2, w - w // 2)
+
+        pad_list = [(0, 0), pad_d, pad_h, pad_w]
+
+        img = np.pad(img, pad_list, mode='constant', constant_values=pad_val)
+        return img
+
+    def transform(self, results: dict) -> dict:
+        """Call function to pad images and semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        self._pad_img(results)
+        self._pad_seg(results)
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(pad_shape={self.pad_shape}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'seg_pad_val={self.seg_pad_val})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class BioMedical3DRandomFlip(BaseTransform):
+    """Flip biomedical 3D images and segmentations.
+
+    Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/master/batchgenerators/transforms/spatial_transforms.py  # noqa:E501
+
+    Copyright 2021 Division of
+    Medical Image Computing, German Cancer Research Center (DKFZ) and Applied
+    Computer Vision Lab, Helmholtz Imaging Platform.
+    Licensed under the Apache-2.0 License.
+
+    Required Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Modified Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Added Keys:
+
+    - do_flip
+    - flip_axes
+
+    Args:
+        prob (float): Flipping probability.
+        axes (Tuple[int, ...]): Flipping axes; 0, 1 and 2 correspond to the
+            Z, Y and X axes of the (N, Z, Y, X) volume, respectively.
+        swap_label_pairs (Optional[List[Tuple[int, int]]]):
+            The segmentation label pairs that are swapped when flipping.
+    """
+
+    def __init__(self,
+                 prob: float,
+                 axes: Tuple[int, ...],
+                 swap_label_pairs: Optional[List[Tuple[int, int]]] = None):
+        self.prob = prob
+        self.axes = axes
+        self.swap_label_pairs = swap_label_pairs
+        assert prob >= 0 and prob <= 1
+        if axes is not None:
+            assert max(axes) <= 2
+
+    @staticmethod
+    def _flip(img, direction: Tuple[bool, bool, bool]) -> np.ndarray:
+        # axis 0 is the modality axis; axes 1, 2 and 3 are Z, Y and X
+        if direction[0]:
+            img[:, :] = img[:, ::-1]
+        if direction[1]:
+            img[:, :, :] = img[:, :, ::-1]
+        if direction[2]:
+            img[:, :, :, :] = img[:, :, :, ::-1]
+        return img
+
+    def _do_flip(self, img: np.ndarray) -> Tuple[bool, bool, bool]:
+        """Call function to determine which axes to flip.
+
+        Args:
+            img (np.ndarray): Image or segmentation map array.
+        Returns:
+            tuple: Flip action, whether to flip on the Z, Y and X axes.
+        """
+        flip_z, flip_y, flip_x = False, False, False
+        if self.axes is not None:
+            flip_z = 0 in self.axes and np.random.rand() < self.prob
+            flip_y = 1 in self.axes and np.random.rand() < self.prob
+            if len(img.shape) == 4:
+                flip_x = 2 in self.axes and np.random.rand() < self.prob
+        return flip_z, flip_y, flip_x
+
+    def _swap_label(self, seg: np.ndarray) -> np.ndarray:
+        out = seg.copy()
+        for first, second in self.swap_label_pairs:
+            first_area = (seg == first)
+            second_area = (seg == second)
+            out[first_area] = second
+            out[second_area] = first
+        return out
+
+    def transform(self, results: Dict) -> Dict:
+        """Call function to flip and swap pair labels.
+
+        Args:
+            results (dict): Result dict.
+        Returns:
+            dict: Flipped results, 'do_flip', 'flip_axes' keys are added into
+                result dict.
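+
+        Example (illustrative, with ``prob=1.0`` so every axis is flipped):
+
+        >>> import numpy as np
+        >>> flip = BioMedical3DRandomFlip(prob=1.0, axes=(0, 1, 2))
+        >>> results = dict(
+        ...     img=np.arange(8, dtype=np.float32).reshape(1, 2, 2, 2),
+        ...     gt_seg_map=np.arange(8).reshape(2, 2, 2))
+        >>> out = flip.transform(results)
+        >>> out['do_flip']
+        (True, True, True)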
+ """ + # get actual flipped axis + if 'do_flip' not in results: + results['do_flip'] = self._do_flip(results['img']) + if 'flip_axes' not in results: + results['flip_axes'] = self.axes + # flip image + results['img'] = self._flip( + results['img'], direction=results['do_flip']) + # flip seg + if results['gt_seg_map'] is not None: + if results['gt_seg_map'].shape != results['img'].shape: + results['gt_seg_map'] = results['gt_seg_map'][None, :] + results['gt_seg_map'] = self._flip( + results['gt_seg_map'], direction=results['do_flip']) + results['gt_seg_map'] = results['gt_seg_map'].squeeze() + # swap label pairs + if self.swap_label_pairs is not None: + results['gt_seg_map'] = self._swap_label(results['gt_seg_map']) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, axes={self.axes}, ' \ + f'swap_label_pairs={self.swap_label_pairs})' + return repr_str + + +@TRANSFORMS.register_module() +class Albu(BaseTransform): + """Albumentation augmentation. Adds custom transformations from + Albumentations library. Please, visit + `https://albumentations.readthedocs.io` to get more information. An example + of ``transforms`` is as followed: + + .. code-block:: + [ + dict( + type='ShiftScaleRotate', + shift_limit=0.0625, + scale_limit=0.0, + rotate_limit=0, + interpolation=1, + p=0.5), + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), + ] + Args: + transforms (list[dict]): A list of albu transformations + keymap (dict): Contains {'input key':'albumentation-style key'} + additional_targets(dict): Allows applying same augmentations to \ + multiple objects of same type. + update_pad_shape (bool): Whether to update padding shape according to \ + the output shape of the last transform + bgr_to_rgb (bool): Whether to convert the band order to RGB + """ + + def __init__(self, + transforms: List[dict], + keymap: Optional[dict] = None, + additional_targets: Optional[dict] = None, + update_pad_shape: bool = False, + bgr_to_rgb: bool = True): + if not ALBU_INSTALLED: + raise ImportError( + 'albumentations is not installed, ' + 'we suggest install albumentation by ' + '"pip install albumentations>=0.3.2 --no-binary qudida,albumentations"' # noqa + ) + + # Args will be modified later, copying it will be safer + transforms = copy.deepcopy(transforms) + + self.transforms = transforms + self.keymap = keymap + self.additional_targets = additional_targets + self.update_pad_shape = update_pad_shape + self.bgr_to_rgb = bgr_to_rgb + + self.aug = Compose([self.albu_builder(t) for t in self.transforms], + additional_targets=self.additional_targets) + + if not keymap: + self.keymap_to_albu = {'img': 'image', 'gt_seg_map': 'mask'} + else: + self.keymap_to_albu = copy.deepcopy(keymap) + self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} + + def albu_builder(self, cfg: dict) -> object: + """Build a callable object from a dict containing albu arguments. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + Callable: A callable object. 
+ """ + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + + obj_type = args.pop('type') + if mmengine.is_str(obj_type): + if not ALBU_INSTALLED: + raise ImportError( + 'albumentations is not installed, ' + 'we suggest install albumentation by ' + '"pip install albumentations>=0.3.2 --no-binary qudida,albumentations"' # noqa + ) + obj_cls = getattr(albumentations, obj_type) + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a valid type or str, but got {type(obj_type)}') + + if 'transforms' in args: + args['transforms'] = [ + self.albu_builder(t) for t in args['transforms'] + ] + + return obj_cls(**args) + + @staticmethod + def mapper(d: dict, keymap: dict): + """Dictionary mapper. + + Renames keys according to keymap provided. + Args: + d (dict): old dict + keymap (dict): {'old_key':'new_key'} + Returns: + dict: new dict. + """ + + updated_dict = {} + for k, _ in zip(d.keys(), d.values()): + new_k = keymap.get(k, k) + updated_dict[new_k] = d[k] + return updated_dict + + def transform(self, results): + # dict to albumentations format + results = self.mapper(results, self.keymap_to_albu) + + # Convert to RGB since Albumentations works with RGB images + if self.bgr_to_rgb: + results['image'] = cv2.cvtColor(results['image'], + cv2.COLOR_BGR2RGB) + if self.additional_targets: + for key, value in self.additional_targets.items(): + if value == 'image': + results[key] = cv2.cvtColor(results[key], + cv2.COLOR_BGR2RGB) + + # Apply Transform + results = self.aug(**results) + + # Convert back to BGR + if self.bgr_to_rgb: + results['image'] = cv2.cvtColor(results['image'], + cv2.COLOR_RGB2BGR) + if self.additional_targets: + for key, value in self.additional_targets.items(): + if value == 'image': + results[key] = cv2.cvtColor(results['image2'], + cv2.COLOR_RGB2BGR) + + # back to the original format + results = self.mapper(results, self.keymap_back) + + # update final shape + if self.update_pad_shape: + results['pad_shape'] = results['img'].shape + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' + return repr_str + + +@TRANSFORMS.register_module() +class ConcatCDInput(BaseTransform): + """Concat images for change detection. + + Required Keys: + + - img + - img2 + + Args: + input_keys (tuple): Input image keys for change detection. + Default: ('img', 'img2'). + """ + + def __init__(self, input_keys=('img', 'img2')): + self.input_keys = input_keys + + def transform(self, results: dict) -> dict: + img = [] + for input_key in self.input_keys: + img.append(results.pop(input_key)) + results['img'] = np.concatenate(img, axis=2) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(input_keys={self.input_keys}, ' + return repr_str + + +@TRANSFORMS.register_module() +class RandomDepthMix(BaseTransform): + """This class implements the RandomDepthMix transform. + + Args: + prob (float): Probability of applying the transformation. + Defaults to 0.25. + mix_scale_ratio (float): Ratio to scale the mix width. + Defaults to 0.75. 
+ """ + + def __init__( + self, + prob: float = 0.25, + mix_scale_ratio: float = 0.75, + ): + super().__init__() + + self.prob = prob + self.mix_scale_ratio = mix_scale_ratio + + def transform(self, results: dict) -> dict: + if random.random() > self.prob: + return results + + h, w = results['img_shape'][:2] + left = int(w * random.random()) + width_ratio = self.mix_scale_ratio * random.random() + width = int(max(1, (w - left) * width_ratio)) + + img = results['img'] + depth_rescale_factor = results.get('depth_rescale_factor', 1) + depth_map = results['gt_depth_map'] / depth_rescale_factor + + if img.ndim == 3: + for c in range(img.shape[-1]): + img[:, left:left + width, c] = depth_map[:, left:left + width] + elif img.ndim == 2: + img[:, left:left + width] = depth_map[:, left:left + width] + else: + raise ValueError(f'Invalid image shape ({img.shape})') + + results['img'] = img + return results diff --git a/head_extractor/build/lib/mmseg/datasets/union_new.py b/head_extractor/build/lib/mmseg/datasets/union_new.py new file mode 100644 index 0000000000000000000000000000000000000000..ff47c530a6e2334789cf9664591dae97971a1892 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/union_new.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class UnionNewKDataset(BaseSegDataset): + """ + union dataset. + """ + METAINFO = dict( + classes=( + 'background', + 'top', + 'outer', + 'skirt', + 'dress', + 'pants', + 'leggings', + 'accessories', + 'belt', + 'footwear', + 'bag', + 'hair', + 'skin', + 'rompers', + ), + + palette=[ + [0, 0, 0], # background + [255, 0, 0], # top + [4, 63, 120], # outer + [127, 127, 127], # skirt + [80, 205, 207], # dress + [0, 255, 0], # pants + [230, 83, 223], # leggings + [207, 135, 41], # accessories, including wrist wear,ring,tie,etc... + [0, 51, 51], # belt + [0, 153, 255], # footwear + [167,78,103], # bag + [0, 0, 255], # hair + [142, 124, 195], # skin + [74, 28, 28], # rompers + ],) + + + def __init__(self, + # ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/union_new_add_mask.py b/head_extractor/build/lib/mmseg/datasets/union_new_add_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..ea7a613172e6ef916eb9c15232c8cddfd1a67987 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/union_new_add_mask.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class UnionNewAddMaskDataset(BaseSegDataset): + """ + union dataset. + """ + METAINFO = dict( + classes=( + 'background', + 'top', + 'outer', + 'skirt', + 'dress', + 'pants', + 'leggings', + 'accessories', + 'belt', + 'footwear', + 'bag', + 'hair', + 'skin', + 'rompers', + 'face_mask' + ), + + palette=[ + [0, 0, 0], # background + [0, 0, 255], # top + [120, 63, 3], # outer + [127, 127, 127], # skirt + [207, 205, 80], # dress + [0, 255, 0], # pants + [223, 83, 230], # leggings + [41, 135, 207], # accessories, including wrist wear,ring,tie,etc... 
+ [51, 51, 0], # belt + [255, 153, 0], # footwear + [103, 78, 167], # bag + [255, 0, 0], # hair + [195, 124, 142], # skin + [28, 28, 74], # rompers + [147, 196, 125], # face mask + ],) + + + def __init__(self, + # ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/datasets/voc.py b/head_extractor/build/lib/mmseg/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..5e5d6025c03760953a82f80e337185afc51f1386 --- /dev/null +++ b/head_extractor/build/lib/mmseg/datasets/voc.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class PascalVOCDataset(BaseSegDataset): + """Pascal VOC dataset. + + Args: + split (str): Split txt file for Pascal VOC. + """ + METAINFO = dict( + classes=('background', 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', + 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', + 'sofa', 'train', 'tvmonitor'), + palette=[[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], + [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], + [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], + [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128], + [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], + [0, 64, 128]]) + + def __init__(self, + ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + ann_file=ann_file, + **kwargs) + assert fileio.exists(self.data_prefix['img_path'], + self.backend_args) and osp.isfile(self.ann_file) diff --git a/head_extractor/build/lib/mmseg/engine/__init__.py b/head_extractor/build/lib/mmseg/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..98139a0047fd2f076d659ba5aed2cd3452dbd235 --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import SegVisualizationHook +from .optimizers import (ForceDefaultOptimWrapperConstructor, + LayerDecayOptimizerConstructor, + LearningRateDecayOptimizerConstructor) +from .schedulers import PolyLRRatio + +__all__ = [ + 'LearningRateDecayOptimizerConstructor', 'LayerDecayOptimizerConstructor', + 'SegVisualizationHook', 'PolyLRRatio', + 'ForceDefaultOptimWrapperConstructor' +] diff --git a/head_extractor/build/lib/mmseg/engine/hooks/__init__.py b/head_extractor/build/lib/mmseg/engine/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c6048088a7fd322890ced17569e855acee826eca --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/hooks/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .visualization_hook import SegVisualizationHook + +__all__ = ['SegVisualizationHook'] diff --git a/head_extractor/build/lib/mmseg/engine/hooks/visualization_hook.py b/head_extractor/build/lib/mmseg/engine/hooks/visualization_hook.py new file mode 100644 index 0000000000000000000000000000000000000000..21cddde89d96c77ffe5b711a5efe524ced668b5c --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/hooks/visualization_hook.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp +import warnings +from typing import Optional, Sequence + +import mmcv +from mmengine.fileio import get +from mmengine.hooks import Hook +from mmengine.runner import Runner +from mmengine.visualization import Visualizer + +from mmseg.registry import HOOKS +from mmseg.structures import SegDataSample + + +@HOOKS.register_module() +class SegVisualizationHook(Hook): + """Segmentation Visualization Hook. Used to visualize validation and + testing process prediction results. + + In the testing phase: + + 1. If ``show`` is True, it means that only the prediction results are + visualized without storing data, so ``vis_backends`` needs to + be excluded. + + Args: + draw (bool): whether to draw prediction results. If it is False, + it means that no drawing will be done. Defaults to False. + interval (int): The interval of visualization. Defaults to 50. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + draw: bool = False, + interval: int = 50, + show: bool = False, + wait_time: float = 0., + backend_args: Optional[dict] = None): + self._visualizer: Visualizer = Visualizer.get_current_instance() + self.interval = interval + self.show = show + if self.show: + # No need to think about vis backends. + self._visualizer._vis_backends = {} + warnings.warn('The show is True, it means that only ' + 'the prediction results are visualized ' + 'without storing data, so vis_backends ' + 'needs to be excluded.') + + self.wait_time = wait_time + self.backend_args = backend_args.copy() if backend_args else None + self.draw = draw + if not self.draw: + warnings.warn('The draw is False, it means that the ' + 'hook for visualization will not take ' + 'effect. The results will NOT be ' + 'visualized or stored.') + self._test_index = 0 + + def after_val_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[SegDataSample]) -> None: + """Run after every ``self.interval`` validation iterations. + + Args: + runner (:obj:`Runner`): The runner of the validation process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. + outputs (Sequence[:obj:`SegDataSample`]]): A batch of data samples + that contain annotations and predictions. + """ + if self.draw is False: + return + + # There is no guarantee that the same batch of images + # is visualized for each evaluation. + total_curr_iter = runner.iter + batch_idx + + # Visualize only the first data + img_path = outputs[0].img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + window_name = f'val_{osp.basename(img_path)}' + + if total_curr_iter % self.interval == 0: + self._visualizer.add_datasample( + window_name, + img, + data_sample=outputs[0], + show=self.show, + wait_time=self.wait_time, + step=total_curr_iter) + + def after_test_iter(self, runner: Runner, batch_idx: int, data_batch: dict, + outputs: Sequence[SegDataSample]) -> None: + """Run after every testing iterations. + + Args: + runner (:obj:`Runner`): The runner of the testing process. + batch_idx (int): The index of the current batch in the val loop. + data_batch (dict): Data from dataloader. 
+ outputs (Sequence[:obj:`SegDataSample`]): A batch of data samples + that contain annotations and predictions. + """ + if self.draw is False: + return + + for data_sample in outputs: + self._test_index += 1 + + img_path = data_sample.img_path + window_name = f'test_{osp.basename(img_path)}' + + img_path = data_sample.img_path + img_bytes = get(img_path, backend_args=self.backend_args) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + + self._visualizer.add_datasample( + window_name, + img, + data_sample=data_sample, + show=self.show, + wait_time=self.wait_time, + step=self._test_index) diff --git a/head_extractor/build/lib/mmseg/engine/optimizers/__init__.py b/head_extractor/build/lib/mmseg/engine/optimizers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e4cf58741febfc20ea33664ea8e1b1ac68bbb327 --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/optimizers/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .force_default_constructor import ForceDefaultOptimWrapperConstructor +from .layer_decay_optimizer_constructor import ( + LayerDecayOptimizerConstructor, LearningRateDecayOptimizerConstructor) + +__all__ = [ + 'LearningRateDecayOptimizerConstructor', 'LayerDecayOptimizerConstructor', + 'ForceDefaultOptimWrapperConstructor' +] diff --git a/head_extractor/build/lib/mmseg/engine/optimizers/force_default_constructor.py b/head_extractor/build/lib/mmseg/engine/optimizers/force_default_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..12c642ad411bfd547d63c894c84636e2f1896128 --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/optimizers/force_default_constructor.py @@ -0,0 +1,255 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from typing import List, Optional, Union + +import torch +import torch.nn as nn +from mmengine.logging import print_log +from mmengine.optim import DefaultOptimWrapperConstructor +from mmengine.utils.dl_utils import mmcv_full_available +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm +from torch.nn import GroupNorm, LayerNorm + +from mmseg.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class ForceDefaultOptimWrapperConstructor(DefaultOptimWrapperConstructor): + """Default constructor with forced optimizer settings. + + This constructor extends the default constructor to add an option for + forcing default optimizer settings. This is useful for ensuring that + certain parameters or layers strictly adhere to pre-defined default + settings, regardless of any custom settings specified. + + By default, each parameter share the same optimizer settings, and we + provide an argument ``paramwise_cfg`` to specify parameter-wise settings. + It is a dict and may contain various fields like 'custom_keys', + 'bias_lr_mult', etc., as well as the additional field + `force_default_settings` which allows for enforcing default settings on + optimizer parameters. + + - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If + one of the keys in ``custom_keys`` is a substring of the name of one + parameter, then the setting of the parameter will be specified by + ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will + be ignored. It should be noted that the aforementioned ``key`` is the + longest key that is a substring of the name of the parameter. 
If there + are multiple matched keys with the same length, then the key with lower + alphabet order will be chosen. + ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` + and ``decay_mult``. See Example 2 below. + - ``bias_lr_mult`` (float): It will be multiplied to the learning + rate for all bias parameters (except for those in normalization + layers and offset layers of DCN). + - ``bias_decay_mult`` (float): It will be multiplied to the weight + decay for all bias parameters (except for those in + normalization layers, depthwise conv layers, offset layers of DCN). + - ``norm_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of normalization + layers. + - ``flat_decay_mult`` (float): It will be multiplied to the weight + decay for all one-dimensional parameters + - ``dwconv_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of depthwise conv + layers. + - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning + rate for parameters of offset layer in the deformable convs + of a model. + - ``bypass_duplicate`` (bool): If true, the duplicate parameters + would not be added into optimizer. Defaults to False. + - ``force_default_settings`` (bool): If true, this will override any + custom settings defined by ``custom_keys`` and enforce the use of + default settings for optimizer parameters like ``bias_lr_mult``. + This is particularly useful when you want to ensure that certain layers + or parameters adhere strictly to the pre-defined default settings. + + Note: + + 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will + override the effect of ``bias_lr_mult`` in the bias of offset layer. + So be careful when using both ``bias_lr_mult`` and + ``dcn_offset_lr_mult``. If you wish to apply both of them to the offset + layer in deformable convs, set ``dcn_offset_lr_mult`` to the original + ``dcn_offset_lr_mult`` * ``bias_lr_mult``. + + 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will + apply it to all the DCN layers in the model. So be careful when the + model contains multiple DCN layers in places other than backbone. + + 3. When the option ``force_default_settings`` is true, it will override + any custom settings provided in ``custom_keys``. This ensures that the + default settings for the optimizer parameters are used. + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + + Required fields of ``optim_wrapper_cfg`` are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields of ``optim_wrapper_cfg`` are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + Required fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields of ``optimizer`` are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. + + Example 1: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optim_wrapper_cfg = dict( + >>> dict(type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01, + >>> momentum=0.9, weight_decay=0.0001)) + >>> paramwise_cfg = dict(norm_decay_mult=0.) 
+ >>> optim_wrapper_builder = DefaultOptimWrapperConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + + Example 2: + >>> # assume model have attribute model.backbone and model.cls_head + >>> optim_wrapper_cfg = dict(type='OptimWrapper', optimizer=dict( + >>> type='SGD', lr=0.01, weight_decay=0.95)) + >>> paramwise_cfg = dict(custom_keys={ + >>> 'backbone': dict(lr_mult=0.1, decay_mult=0.9)}) + >>> optim_wrapper_builder = DefaultOptimWrapperConstructor( + >>> optim_wrapper_cfg, paramwise_cfg) + >>> optim_wrapper = optim_wrapper_builder(model) + >>> # Then the `lr` and `weight_decay` for model.backbone is + >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for + >>> # model.cls_head is (0.01, 0.95). + """ + + def add_params(self, + params: List[dict], + module: nn.Module, + prefix: str = '', + is_dcn_module: Optional[Union[int, float]] = None) -> None: + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. + """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', None) + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None) + dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', None) + flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None) + bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) + dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', None) + force_default_settings = self.paramwise_cfg.get( + 'force_default_settings', False) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) + and module.in_channels == module.groups) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if bypass_duplicate and self._is_in(param_group, params): + print_log( + f'{prefix} is duplicate. It is skipped since ' + f'bypass_duplicate={bypass_duplicate}', + logger='current', + level=logging.WARNING) + continue + if not param.requires_grad: + params.append(param_group) + continue + + # if the parameter match one of the custom keys, ignore other rules + is_custom = False + for key in sorted_keys: + if key in f'{prefix}.{name}': + is_custom = True + lr_mult = custom_keys[key].get('lr_mult', 1.) + param_group['lr'] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get('decay_mult', 1.) 
+ param_group['weight_decay'] = self.base_wd * decay_mult + # add custom settings to param_group + for k, v in custom_keys[key].items(): + param_group[k] = v + break + + if not is_custom or force_default_settings: + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if name == 'bias' and not ( + is_norm or is_dcn_module) and bias_lr_mult is not None: + param_group['lr'] = self.base_lr * bias_lr_mult + + if (prefix.find('conv_offset') != -1 and is_dcn_module + and dcn_offset_lr_mult is not None + and isinstance(module, torch.nn.Conv2d)): + # deal with both dcn_offset's bias & weight + param_group['lr'] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm and norm_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * norm_decay_mult + # bias lr and decay + elif (name == 'bias' and not is_dcn_module + and bias_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * bias_decay_mult + # depth-wise conv + elif is_dwconv and dwconv_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * dwconv_decay_mult + # flatten parameters except dcn offset + elif (param.ndim == 1 and not is_dcn_module + and flat_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * flat_decay_mult + params.append(param_group) + for key, value in param_group.items(): + if key == 'params': + continue + full_name = f'{prefix}.{name}' if prefix else name + print_log( + f'paramwise_options -- {full_name}:{key}={value}', + logger='current') + + if mmcv_full_available(): + from mmcv.ops import DeformConv2d, ModulatedDeformConv2d + is_dcn_module = isinstance(module, + (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params( + params, + child_mod, + prefix=child_prefix, + is_dcn_module=is_dcn_module) diff --git a/head_extractor/build/lib/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py b/head_extractor/build/lib/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..fdae3ca698c65879056b969f04185f80452ff8d0 --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py @@ -0,0 +1,207 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import warnings + +from mmengine.dist import get_dist_info +from mmengine.logging import print_log +from mmengine.optim import DefaultOptimWrapperConstructor + +from mmseg.registry import OPTIM_WRAPPER_CONSTRUCTORS + + +def get_layer_id_for_convnext(var_name, max_layer_id): + """Get the layer id to set the different learning rates in ``layer_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_layer_id (int): Maximum number of backbone layers. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. 
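+
+    Example (illustrative, assuming ``max_layer_id=12``):
+
+    >>> get_layer_id_for_convnext(
+    ...     'backbone.downsample_layers.0.0.weight', 12)
+    0
+    >>> get_layer_id_for_convnext('backbone.stages.2.3.gamma', 12)
+    4
+    >>> get_layer_id_for_convnext('decode_head.conv_seg.weight', 12)
+    13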
+ """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + stage_id = int(var_name.split('.')[2]) + if stage_id == 0: + layer_id = 0 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + block_id = int(var_name.split('.')[3]) + if stage_id == 0: + layer_id = 1 + elif stage_id == 1: + layer_id = 2 + elif stage_id == 2: + layer_id = 3 + block_id // 3 + elif stage_id == 3: + layer_id = max_layer_id + return layer_id + else: + return max_layer_id + 1 + + +def get_stage_id_for_convnext(var_name, max_stage_id): + """Get the stage id to set the different learning rates in ``stage_wise`` + decay_type. + + Args: + var_name (str): The key of the model. + max_stage_id (int): Maximum number of backbone layers. + + Returns: + int: The id number corresponding to different learning rate in + ``LearningRateDecayOptimizerConstructor``. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.downsample_layers'): + return 0 + elif var_name.startswith('backbone.stages'): + stage_id = int(var_name.split('.')[2]) + return stage_id + 1 + else: + return max_stage_id - 1 + + +def get_layer_id_for_vit(var_name, max_layer_id): + """Get the layer id to set the different learning rates. + + Args: + var_name (str): The key of the model. + num_max_layer (int): Maximum number of backbone layers. + + Returns: + int: Returns the layer id of the key. + """ + + if var_name in ('backbone.cls_token', 'backbone.mask_token', + 'backbone.pos_embed'): + return 0 + elif var_name.startswith('backbone.patch_embed'): + return 0 + elif var_name.startswith('backbone.layers'): + layer_id = int(var_name.split('.')[2]) + return layer_id + 1 + else: + return max_layer_id - 1 + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor): + """Different learning rates are set for different layers of backbone. + + Note: Currently, this optimizer constructor is built for ConvNeXt, + BEiT and MAE. + """ + + def add_params(self, params, module, **kwargs): + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + """ + + parameter_groups = {} + print_log(f'self.paramwise_cfg is {self.paramwise_cfg}') + num_layers = self.paramwise_cfg.get('num_layers') + 2 + decay_rate = self.paramwise_cfg.get('decay_rate') + decay_type = self.paramwise_cfg.get('decay_type', 'layer_wise') + print_log('Build LearningRateDecayOptimizerConstructor ' + f'{decay_type} {decay_rate} - {num_layers}') + weight_decay = self.base_wd + for name, param in module.named_parameters(): + if not param.requires_grad: + continue # frozen weights + if len(param.shape) == 1 or name.endswith('.bias') or name in ( + 'pos_embed', 'cls_token'): + group_name = 'no_decay' + this_weight_decay = 0. 
+ else: + group_name = 'decay' + this_weight_decay = weight_decay + if 'layer_wise' in decay_type: + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_convnext( + name, self.paramwise_cfg.get('num_layers')) + print_log(f'set param {name} as id {layer_id}') + elif 'BEiT' in module.backbone.__class__.__name__ or \ + 'MAE' in module.backbone.__class__.__name__: + layer_id = get_layer_id_for_vit(name, num_layers) + print_log(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + elif decay_type == 'stage_wise': + if 'ConvNeXt' in module.backbone.__class__.__name__: + layer_id = get_stage_id_for_convnext(name, num_layers) + print_log(f'set param {name} as id {layer_id}') + else: + raise NotImplementedError() + group_name = f'layer_{layer_id}_{group_name}' + + if group_name not in parameter_groups: + scale = decay_rate**(num_layers - layer_id - 1) + + parameter_groups[group_name] = { + 'weight_decay': this_weight_decay, + 'params': [], + 'param_names': [], + 'lr_scale': scale, + 'group_name': group_name, + 'lr': scale * self.base_lr, + } + + parameter_groups[group_name]['params'].append(param) + parameter_groups[group_name]['param_names'].append(name) + rank, _ = get_dist_info() + if rank == 0: + to_display = {} + for key in parameter_groups: + to_display[key] = { + 'param_names': parameter_groups[key]['param_names'], + 'lr_scale': parameter_groups[key]['lr_scale'], + 'lr': parameter_groups[key]['lr'], + 'weight_decay': parameter_groups[key]['weight_decay'], + } + print_log(f'Param groups = {json.dumps(to_display, indent=2)}') + params.extend(parameter_groups.values()) + + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class LayerDecayOptimizerConstructor(LearningRateDecayOptimizerConstructor): + """Different learning rates are set for different layers of backbone. + + Note: Currently, this optimizer constructor is built for BEiT, + and it will be deprecated. + Please use ``LearningRateDecayOptimizerConstructor`` instead. + """ + + def __init__(self, optim_wrapper_cfg, paramwise_cfg): + warnings.warn('DeprecationWarning: Original ' + 'LayerDecayOptimizerConstructor of BEiT ' + 'will be deprecated. Please use ' + 'LearningRateDecayOptimizerConstructor instead, ' + 'and set decay_type = layer_wise_vit in paramwise_cfg.') + paramwise_cfg.update({'decay_type': 'layer_wise_vit'}) + warnings.warn('DeprecationWarning: Layer_decay_rate will ' + 'be deleted, please use decay_rate instead.') + paramwise_cfg['decay_rate'] = paramwise_cfg.pop('layer_decay_rate') + super().__init__(optim_wrapper_cfg, paramwise_cfg) diff --git a/head_extractor/build/lib/mmseg/engine/schedulers/__init__.py b/head_extractor/build/lib/mmseg/engine/schedulers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3cd3f6211345bb3627b76d683291f48efd934a77 --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/schedulers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .poly_ratio_scheduler import PolyLRRatio + +__all__ = ['PolyLRRatio'] diff --git a/head_extractor/build/lib/mmseg/engine/schedulers/poly_ratio_scheduler.py b/head_extractor/build/lib/mmseg/engine/schedulers/poly_ratio_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..057203acc9cc9fc72306d2039669b90f35704436 --- /dev/null +++ b/head_extractor/build/lib/mmseg/engine/schedulers/poly_ratio_scheduler.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional + +from mmengine.optim.scheduler import PolyLR + +from mmseg.registry import PARAM_SCHEDULERS + + +@PARAM_SCHEDULERS.register_module() +class PolyLRRatio(PolyLR): + """Implements polynomial learning rate decay with ratio. + + This scheduler adjusts the learning rate of each parameter group + following a polynomial decay equation. The decay can occur in + conjunction with external parameter adjustments made outside this + scheduler. + + Args: + optimizer (Optimizer or OptimWrapper): Wrapped optimizer. + eta_min (float): Minimum learning rate at the end of scheduling. + Defaults to 0. + eta_min_ratio (float, optional): The ratio of the minimum parameter + value to the base parameter value. Either `eta_min` or + `eta_min_ratio` should be specified. Defaults to None. + power (float): The power of the polynomial. Defaults to 1.0. + begin (int): Step at which to start updating the parameters. + Defaults to 0. + end (int): Step at which to stop updating the parameters. + Defaults to INF. + last_step (int): The index of last step. Used for resume without + state dict. Defaults to -1. + by_epoch (bool): Whether the scheduled parameters are updated by + epochs. Defaults to True. + verbose (bool): Whether to print the value for each update. + Defaults to False. + """ + + def __init__(self, eta_min_ratio: Optional[int] = None, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.eta_min_ratio = eta_min_ratio + + def _get_value(self): + """Compute value using chainable form of the scheduler.""" + + if self.last_step == 0: + return [ + group[self.param_name] for group in self.optimizer.param_groups + ] + + param_groups_value = [] + for base_value, param_group in zip(self.base_values, + self.optimizer.param_groups): + eta_min = self.eta_min if self.eta_min_ratio is None else \ + base_value * self.eta_min_ratio + step_ratio = (1 - 1 / + (self.total_iters - self.last_step + 1))**self.power + step_value = (param_group[self.param_name] - + eta_min) * step_ratio + eta_min + param_groups_value.append(step_value) + + return param_groups_value diff --git a/head_extractor/build/lib/mmseg/evaluation/__init__.py b/head_extractor/build/lib/mmseg/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82b3a8d68d3aefcc23542fc1006eaddde05ca2ab --- /dev/null +++ b/head_extractor/build/lib/mmseg/evaluation/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .metrics import CityscapesMetric, DepthMetric, IoUMetric + +__all__ = ['IoUMetric', 'CityscapesMetric', 'DepthMetric'] diff --git a/head_extractor/build/lib/mmseg/evaluation/metrics/__init__.py b/head_extractor/build/lib/mmseg/evaluation/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..848d4713dc8c0b6a08569d536bb72bd04ca1b1cc --- /dev/null +++ b/head_extractor/build/lib/mmseg/evaluation/metrics/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
diff --git a/head_extractor/build/lib/mmseg/evaluation/__init__.py b/head_extractor/build/lib/mmseg/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..82b3a8d68d3aefcc23542fc1006eaddde05ca2ab
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/evaluation/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .metrics import CityscapesMetric, DepthMetric, IoUMetric
+
+__all__ = ['IoUMetric', 'CityscapesMetric', 'DepthMetric']
diff --git a/head_extractor/build/lib/mmseg/evaluation/metrics/__init__.py b/head_extractor/build/lib/mmseg/evaluation/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..848d4713dc8c0b6a08569d536bb72bd04ca1b1cc
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/evaluation/metrics/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .citys_metric import CityscapesMetric
+from .depth_metric import DepthMetric
+from .iou_metric import IoUMetric
+
+__all__ = ['IoUMetric', 'CityscapesMetric', 'DepthMetric']
diff --git a/head_extractor/build/lib/mmseg/evaluation/metrics/citys_metric.py b/head_extractor/build/lib/mmseg/evaluation/metrics/citys_metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..32984653c3fa9c13d8c6a7402033001012b5031f
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/evaluation/metrics/citys_metric.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import shutil
+from collections import OrderedDict
+from typing import Dict, Optional, Sequence
+
+try:
+    import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as CSEval  # noqa
+    import cityscapesscripts.helpers.labels as CSLabels
+except ImportError:
+    CSLabels = None
+    CSEval = None
+
+import numpy as np
+from mmengine.dist import is_main_process, master_only
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger, print_log
+from mmengine.utils import mkdir_or_exist
+from PIL import Image
+
+from mmseg.registry import METRICS
+
+
+@METRICS.register_module()
+class CityscapesMetric(BaseMetric):
+    """Cityscapes evaluation metric.
+
+    Args:
+        output_dir (str): The directory for output predictions.
+        ignore_index (int): Index that will be ignored in evaluation.
+            Default: 255.
+        format_only (bool): Only format the results without performing
+            evaluation. It is useful when you want to format the results
+            into a specific format and submit them to the test server.
+            Defaults to False.
+        keep_results (bool): Whether to keep the results. When ``format_only``
+            is True, ``keep_results`` must be True. Defaults to False.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    def __init__(self,
+                 output_dir: str,
+                 ignore_index: int = 255,
+                 format_only: bool = False,
+                 keep_results: bool = False,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 **kwargs) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        if CSEval is None:
+            raise ImportError('Please run "pip install cityscapesscripts" to '
+                              'install cityscapesscripts first.')
+        self.output_dir = output_dir
+        self.ignore_index = ignore_index
+
+        self.format_only = format_only
+        if format_only:
+            assert keep_results, (
+                'When format_only is True, results must be kept, please '
+                f'set keep_results to True, but got {keep_results}')
+        self.keep_results = keep_results
+        self.prefix = prefix
+        if is_main_process():
+            mkdir_or_exist(self.output_dir)
+
+    @master_only
+    def __del__(self) -> None:
+        """Clean up."""
+        if not self.keep_results:
+            shutil.rmtree(self.output_dir)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+ """ + mkdir_or_exist(self.output_dir) + + for data_sample in data_samples: + pred_label = data_sample['pred_sem_seg']['data'][0].cpu().numpy() + # when evaluating with official cityscapesscripts, + # labelIds should be used + pred_label = self._convert_to_label_id(pred_label) + basename = osp.splitext(osp.basename(data_sample['img_path']))[0] + png_filename = osp.abspath( + osp.join(self.output_dir, f'{basename}.png')) + output = Image.fromarray(pred_label.astype(np.uint8)).convert('P') + output.save(png_filename) + if self.format_only: + # format_only always for test dataset without ground truth + gt_filename = '' + else: + # when evaluating with official cityscapesscripts, + # **_gtFine_labelIds.png is used + gt_filename = data_sample['seg_map_path'].replace( + 'labelTrainIds.png', 'labelIds.png') + self.results.append((png_filename, gt_filename)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): Testing results of the dataset. + + Returns: + dict[str: float]: Cityscapes evaluation results. + """ + logger: MMLogger = MMLogger.get_current_instance() + if self.format_only: + logger.info(f'results are saved to {osp.dirname(self.output_dir)}') + return OrderedDict() + + msg = 'Evaluating in Cityscapes style' + if logger is None: + msg = '\n' + msg + print_log(msg, logger=logger) + + eval_results = dict() + print_log( + f'Evaluating results under {self.output_dir} ...', logger=logger) + + CSEval.args.evalInstLevelScore = True + CSEval.args.predictionPath = osp.abspath(self.output_dir) + CSEval.args.evalPixelAccuracy = True + CSEval.args.JSONOutput = False + + pred_list, gt_list = zip(*results) + metric = dict() + eval_results.update( + CSEval.evaluateImgLists(pred_list, gt_list, CSEval.args)) + metric['averageScoreCategories'] = eval_results[ + 'averageScoreCategories'] + metric['averageScoreInstCategories'] = eval_results[ + 'averageScoreInstCategories'] + return metric + + @staticmethod + def _convert_to_label_id(result): + """Convert trainId to id for cityscapes.""" + if isinstance(result, str): + result = np.load(result) + result_copy = result.copy() + for trainId, label in CSLabels.trainId2label.items(): + result_copy[result == trainId] = label.id + + return result_copy diff --git a/head_extractor/build/lib/mmseg/evaluation/metrics/depth_metric.py b/head_extractor/build/lib/mmseg/evaluation/metrics/depth_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..621d4a31c9fe69cdbf83790e8f320218f755557a --- /dev/null +++ b/head_extractor/build/lib/mmseg/evaluation/metrics/depth_metric.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import OrderedDict, defaultdict +from typing import Dict, List, Optional, Sequence + +import cv2 +import numpy as np +import torch +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger, print_log +from mmengine.utils import mkdir_or_exist +from prettytable import PrettyTable +from torch import Tensor + +from mmseg.registry import METRICS + + +@METRICS.register_module() +class DepthMetric(BaseMetric): + """Depth estimation evaluation metric. + + Args: + depth_metrics (List[str], optional): List of metrics to compute. If + not specified, defaults to all metrics in self.METRICS. + min_depth_eval (float): Minimum depth value for evaluation. + Defaults to 0.0. + max_depth_eval (float): Maximum depth value for evaluation. 
+            Defaults to infinity.
+        crop_type (str, optional): Specifies the type of cropping to be used
+            during evaluation. This option can affect how the evaluation mask
+            is generated. Currently, 'nyu_crop' is supported, but other
+            types can be added in future. Defaults to None if no cropping
+            should be applied.
+        depth_scale_factor (float): Factor to scale the depth values.
+            Defaults to 1.0.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        output_dir (str): The directory for output prediction. Defaults to
+            None.
+        format_only (bool): Only format the results without performing
+            evaluation. It is useful when you want to save the results in
+            a specific format and submit them to the test server.
+            Defaults to False.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+    METRICS = ('d1', 'd2', 'd3', 'abs_rel', 'sq_rel', 'rmse', 'rmse_log',
+               'log10', 'silog')
+
+    def __init__(self,
+                 depth_metrics: Optional[List[str]] = None,
+                 min_depth_eval: float = 0.0,
+                 max_depth_eval: float = float('inf'),
+                 crop_type: Optional[str] = None,
+                 depth_scale_factor: float = 1.0,
+                 collect_device: str = 'cpu',
+                 output_dir: Optional[str] = None,
+                 format_only: bool = False,
+                 prefix: Optional[str] = None,
+                 **kwargs) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        if depth_metrics is None:
+            self.metrics = self.METRICS
+        elif isinstance(depth_metrics, (tuple, list)):
+            for metric in depth_metrics:
+                assert metric in self.METRICS, f'the metric {metric} is not ' \
+                    f'supported. Please use metrics in {self.METRICS}'
+            self.metrics = depth_metrics
+        else:
+            raise TypeError('depth_metrics should be None, a tuple or a '
+                            f'list, but got {type(depth_metrics)}')
+
+        # Validate crop_type, if provided
+        assert crop_type in [
+            None, 'nyu_crop'
+        ], (f'Invalid value for crop_type: {crop_type}. Supported values are '
+            'None or \'nyu_crop\'.')
+        self.crop_type = crop_type
+        self.min_depth_eval = min_depth_eval
+        self.max_depth_eval = max_depth_eval
+        self.output_dir = output_dir
+        if self.output_dir and is_main_process():
+            mkdir_or_exist(self.output_dir)
+        self.format_only = format_only
+        self.depth_scale_factor = depth_scale_factor
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+ """ + for data_sample in data_samples: + pred_label = data_sample['pred_depth_map']['data'].squeeze() + # format_only always for test dataset without ground truth + if not self.format_only: + gt_depth = data_sample['gt_depth_map']['data'].squeeze().to( + pred_label) + + eval_mask = self._get_eval_mask(gt_depth) + self.results.append( + (gt_depth[eval_mask], pred_label[eval_mask])) + # format_result + if self.output_dir is not None: + basename = osp.splitext(osp.basename( + data_sample['img_path']))[0] + png_filename = osp.abspath( + osp.join(self.output_dir, f'{basename}.png')) + output_mask = pred_label.cpu().numpy( + ) * self.depth_scale_factor + + cv2.imwrite(png_filename, output_mask.astype(np.uint16), + [cv2.IMWRITE_PNG_COMPRESSION, 0]) + + def _get_eval_mask(self, gt_depth: Tensor): + """Generates an evaluation mask based on ground truth depth and + cropping. + + Args: + gt_depth (Tensor): Ground truth depth map. + + Returns: + Tensor: Boolean mask where evaluation should be performed. + """ + valid_mask = torch.logical_and(gt_depth > self.min_depth_eval, + gt_depth < self.max_depth_eval) + + if self.crop_type == 'nyu_crop': + # this implementation is adapted from + # https://github.com/zhyever/Monocular-Depth-Estimation-Toolbox/blob/main/depth/datasets/nyu.py # noqa + crop_mask = torch.zeros_like(valid_mask) + crop_mask[45:471, 41:601] = 1 + else: + crop_mask = torch.ones_like(valid_mask) + + eval_mask = torch.logical_and(valid_mask, crop_mask) + return eval_mask + + @staticmethod + def _calc_all_metrics(gt_depth, pred_depth): + """Computes final evaluation metrics based on accumulated results.""" + assert gt_depth.shape == pred_depth.shape + + thresh = torch.max((gt_depth / pred_depth), (pred_depth / gt_depth)) + diff = pred_depth - gt_depth + diff_log = torch.log(pred_depth) - torch.log(gt_depth) + + d1 = torch.sum(thresh < 1.25).float() / len(thresh) + d2 = torch.sum(thresh < 1.25**2).float() / len(thresh) + d3 = torch.sum(thresh < 1.25**3).float() / len(thresh) + + abs_rel = torch.mean(torch.abs(diff) / gt_depth) + sq_rel = torch.mean(torch.pow(diff, 2) / gt_depth) + + rmse = torch.sqrt(torch.mean(torch.pow(diff, 2))) + rmse_log = torch.sqrt(torch.mean(torch.pow(diff_log, 2))) + + log10 = torch.mean( + torch.abs(torch.log10(pred_depth) - torch.log10(gt_depth))) + silog = torch.sqrt( + torch.pow(diff_log, 2).mean() - + 0.5 * torch.pow(diff_log.mean(), 2)) + + return { + 'd1': d1.item(), + 'd2': d2.item(), + 'd3': d3.item(), + 'abs_rel': abs_rel.item(), + 'sq_rel': sq_rel.item(), + 'rmse': rmse.item(), + 'rmse_log': rmse_log.item(), + 'log10': log10.item(), + 'silog': silog.item() + } + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. The keys + are identical with self.metrics. 
+ """ + logger: MMLogger = MMLogger.get_current_instance() + if self.format_only: + logger.info(f'results are saved to {osp.dirname(self.output_dir)}') + return OrderedDict() + + metrics = defaultdict(list) + for gt_depth, pred_depth in results: + for key, value in self._calc_all_metrics(gt_depth, + pred_depth).items(): + metrics[key].append(value) + metrics = {k: sum(metrics[k]) / len(metrics[k]) for k in self.metrics} + + table_data = PrettyTable() + for key, val in metrics.items(): + table_data.add_column(key, [round(val, 5)]) + + print_log('results:', logger) + print_log('\n' + table_data.get_string(), logger=logger) + + return metrics diff --git a/head_extractor/build/lib/mmseg/evaluation/metrics/iou_metric.py b/head_extractor/build/lib/mmseg/evaluation/metrics/iou_metric.py new file mode 100644 index 0000000000000000000000000000000000000000..16014c74001d7295f9fff8f03ef185077e3f613b --- /dev/null +++ b/head_extractor/build/lib/mmseg/evaluation/metrics/iou_metric.py @@ -0,0 +1,286 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence + +import numpy as np +import torch +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger, print_log +from mmengine.utils import mkdir_or_exist +from PIL import Image +from prettytable import PrettyTable + +from mmseg.registry import METRICS + + +@METRICS.register_module() +class IoUMetric(BaseMetric): + """IoU evaluation metric. + + Args: + ignore_index (int): Index that will be ignored in evaluation. + Default: 255. + iou_metrics (list[str] | str): Metrics to be calculated, the options + includes 'mIoU', 'mDice' and 'mFscore'. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + beta (int): Determines the weight of recall in the combined score. + Default: 1. + collect_device (str): Device name used for collecting results from + different ranks during distributed training. Must be 'cpu' or + 'gpu'. Defaults to 'cpu'. + output_dir (str): The directory for output prediction. Defaults to + None. + format_only (bool): Only format result for results commit without + perform evaluation. It is useful when you want to save the result + to a specific format and submit it to the test server. + Defaults to False. + prefix (str, optional): The prefix that will be added in the metric + names to disambiguate homonymous metrics of different evaluators. + If prefix is not provided in the argument, self.default_prefix + will be used instead. Defaults to None. + """ + + def __init__(self, + ignore_index: int = 255, + iou_metrics: List[str] = ['mIoU'], + nan_to_num: Optional[int] = None, + beta: int = 1, + collect_device: str = 'cpu', + output_dir: Optional[str] = None, + format_only: bool = False, + prefix: Optional[str] = None, + **kwargs) -> None: + super().__init__(collect_device=collect_device, prefix=prefix) + + self.ignore_index = ignore_index + self.metrics = iou_metrics + self.nan_to_num = nan_to_num + self.beta = beta + self.output_dir = output_dir + if self.output_dir and is_main_process(): + mkdir_or_exist(self.output_dir) + self.format_only = format_only + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + """Process one batch of data and data_samples. 
+ + The processed results should be stored in ``self.results``, which will + be used to compute the metrics when all batches have been processed. + + Args: + data_batch (dict): A batch of data from the dataloader. + data_samples (Sequence[dict]): A batch of outputs from the model. + """ + num_classes = len(self.dataset_meta['classes']) + for data_sample in data_samples: + pred_label = data_sample['pred_sem_seg']['data'].squeeze() + # format_only always for test dataset without ground truth + if not self.format_only: + label = data_sample['gt_sem_seg']['data'].squeeze().to( + pred_label) + self.results.append( + self.intersect_and_union(pred_label, label, num_classes, + self.ignore_index)) + # format_result + if self.output_dir is not None: + basename = osp.splitext(osp.basename( + data_sample['img_path']))[0] + png_filename = osp.abspath( + osp.join(self.output_dir, f'{basename}.png')) + output_mask = pred_label.cpu().numpy() + # The index range of official ADE20k dataset is from 0 to 150. + # But the index range of output is from 0 to 149. + # That is because we set reduce_zero_label=True. + if data_sample.get('reduce_zero_label', False): + output_mask = output_mask + 1 + output = Image.fromarray(output_mask.astype(np.uint8)) + output.save(png_filename) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): The processed results of each batch. + + Returns: + Dict[str, float]: The computed metrics. The keys are the names of + the metrics, and the values are corresponding results. The key + mainly includes aAcc, mIoU, mAcc, mDice, mFscore, mPrecision, + mRecall. + """ + logger: MMLogger = MMLogger.get_current_instance() + if self.format_only: + logger.info(f'results are saved to {osp.dirname(self.output_dir)}') + return OrderedDict() + # convert list of tuples to tuple of lists, e.g. + # [(A_1, B_1, C_1, D_1), ..., (A_n, B_n, C_n, D_n)] to + # ([A_1, ..., A_n], ..., [D_1, ..., D_n]) + results = tuple(zip(*results)) + assert len(results) == 4 + + total_area_intersect = sum(results[0]) + total_area_union = sum(results[1]) + total_area_pred_label = sum(results[2]) + total_area_label = sum(results[3]) + ret_metrics = self.total_area_to_metrics( + total_area_intersect, total_area_union, total_area_pred_label, + total_area_label, self.metrics, self.nan_to_num, self.beta) + + class_names = self.dataset_meta['classes'] + + # summary table + ret_metrics_summary = OrderedDict({ + ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) + for ret_metric, ret_metric_value in ret_metrics.items() + }) + metrics = dict() + for key, val in ret_metrics_summary.items(): + if key == 'aAcc': + metrics[key] = val + else: + metrics['m' + key] = val + + # each class table + ret_metrics.pop('aAcc', None) + ret_metrics_class = OrderedDict({ + ret_metric: np.round(ret_metric_value * 100, 2) + for ret_metric, ret_metric_value in ret_metrics.items() + }) + ret_metrics_class.update({'Class': class_names}) + ret_metrics_class.move_to_end('Class', last=False) + class_table_data = PrettyTable() + for key, val in ret_metrics_class.items(): + class_table_data.add_column(key, val) + + print_log('per class results:', logger) + print_log('\n' + class_table_data.get_string(), logger=logger) + + return metrics + + @staticmethod + def intersect_and_union(pred_label: torch.tensor, label: torch.tensor, + num_classes: int, ignore_index: int): + """Calculate Intersection and Union. 
+ + Args: + pred_label (torch.tensor): Prediction segmentation map + or predict result filename. The shape is (H, W). + label (torch.tensor): Ground truth segmentation map + or label filename. The shape is (H, W). + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + + Returns: + torch.Tensor: The intersection of prediction and ground truth + histogram on all classes. + torch.Tensor: The union of prediction and ground truth histogram on + all classes. + torch.Tensor: The prediction histogram on all classes. + torch.Tensor: The ground truth histogram on all classes. + """ + + mask = (label != ignore_index) + pred_label = pred_label[mask] + label = label[mask] + + intersect = pred_label[pred_label == label] + area_intersect = torch.histc( + intersect.float(), bins=(num_classes), min=0, + max=num_classes - 1).cpu() + area_pred_label = torch.histc( + pred_label.float(), bins=(num_classes), min=0, + max=num_classes - 1).cpu() + area_label = torch.histc( + label.float(), bins=(num_classes), min=0, + max=num_classes - 1).cpu() + area_union = area_pred_label + area_label - area_intersect + return area_intersect, area_union, area_pred_label, area_label + + @staticmethod + def total_area_to_metrics(total_area_intersect: np.ndarray, + total_area_union: np.ndarray, + total_area_pred_label: np.ndarray, + total_area_label: np.ndarray, + metrics: List[str] = ['mIoU'], + nan_to_num: Optional[int] = None, + beta: int = 1): + """Calculate evaluation metrics + Args: + total_area_intersect (np.ndarray): The intersection of prediction + and ground truth histogram on all classes. + total_area_union (np.ndarray): The union of prediction and ground + truth histogram on all classes. + total_area_pred_label (np.ndarray): The prediction histogram on + all classes. + total_area_label (np.ndarray): The ground truth histogram on + all classes. + metrics (List[str] | str): Metrics to be evaluated, 'mIoU' and + 'mDice'. + nan_to_num (int, optional): If specified, NaN values will be + replaced by the numbers defined by the user. Default: None. + beta (int): Determines the weight of recall in the combined score. + Default: 1. + Returns: + Dict[str, np.ndarray]: per category evaluation metrics, + shape (num_classes, ). + """ + + def f_score(precision, recall, beta=1): + """calculate the f-score value. + + Args: + precision (float | torch.Tensor): The precision value. + recall (float | torch.Tensor): The recall value. + beta (int): Determines the weight of recall in the combined + score. Default: 1. + + Returns: + [torch.tensor]: The f-score value. 
+ """ + score = (1 + beta**2) * (precision * recall) / ( + (beta**2 * precision) + recall) + return score + + if isinstance(metrics, str): + metrics = [metrics] + allowed_metrics = ['mIoU', 'mDice', 'mFscore'] + if not set(metrics).issubset(set(allowed_metrics)): + raise KeyError(f'metrics {metrics} is not supported') + + all_acc = total_area_intersect.sum() / total_area_label.sum() + ret_metrics = OrderedDict({'aAcc': all_acc}) + for metric in metrics: + if metric == 'mIoU': + iou = total_area_intersect / total_area_union + acc = total_area_intersect / total_area_label + ret_metrics['IoU'] = iou + ret_metrics['Acc'] = acc + elif metric == 'mDice': + dice = 2 * total_area_intersect / ( + total_area_pred_label + total_area_label) + acc = total_area_intersect / total_area_label + ret_metrics['Dice'] = dice + ret_metrics['Acc'] = acc + elif metric == 'mFscore': + precision = total_area_intersect / total_area_pred_label + recall = total_area_intersect / total_area_label + f_value = torch.tensor([ + f_score(x[0], x[1], beta) for x in zip(precision, recall) + ]) + ret_metrics['Fscore'] = f_value + ret_metrics['Precision'] = precision + ret_metrics['Recall'] = recall + + ret_metrics = { + metric: value.numpy() + for metric, value in ret_metrics.items() + } + if nan_to_num is not None: + ret_metrics = OrderedDict({ + metric: np.nan_to_num(metric_value, nan=nan_to_num) + for metric, metric_value in ret_metrics.items() + }) + return ret_metrics diff --git a/head_extractor/build/lib/mmseg/models/__init__.py b/head_extractor/build/lib/mmseg/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a98951283c1ac4047c5f5ca3cdc827a43c42cf60 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .assigners import * # noqa: F401,F403 +from .backbones import * # noqa: F401,F403 +from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, + build_head, build_loss, build_segmentor) +from .data_preprocessor import SegDataPreProcessor +from .decode_heads import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .segmentors import * # noqa: F401,F403 +from .text_encoder import * # noqa: F401,F403 + +__all__ = [ + 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', + 'build_head', 'build_loss', 'build_segmentor', 'SegDataPreProcessor' +] diff --git a/head_extractor/build/lib/mmseg/models/assigners/__init__.py b/head_extractor/build/lib/mmseg/models/assigners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d49b1b18b9e3e6d4e3b19c48eb1c80cbb1205f69 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/assigners/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_assigner import BaseAssigner +from .hungarian_assigner import HungarianAssigner +from .match_cost import ClassificationCost, CrossEntropyLossCost, DiceCost + +__all__ = [ + 'BaseAssigner', + 'HungarianAssigner', + 'ClassificationCost', + 'CrossEntropyLossCost', + 'DiceCost', +] diff --git a/head_extractor/build/lib/mmseg/models/assigners/base_assigner.py b/head_extractor/build/lib/mmseg/models/assigners/base_assigner.py new file mode 100644 index 0000000000000000000000000000000000000000..97895cdac2789a62c3e8a381caaf944679f1e5a4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/assigners/base_assigner.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod
+from typing import Optional
+
+from mmengine.structures import InstanceData
+
+
+class BaseAssigner(metaclass=ABCMeta):
+    """Base assigner that assigns masks to ground truth class labels."""
+
+    @abstractmethod
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs):
+        """Assign masks to either a ground truth class label or a negative
+        label."""
diff --git a/head_extractor/build/lib/mmseg/models/assigners/hungarian_assigner.py b/head_extractor/build/lib/mmseg/models/assigners/hungarian_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..28868f0a04e7feaf3de20e39fac5059d789047d3
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/assigners/hungarian_assigner.py
@@ -0,0 +1,86 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Union
+
+import torch
+from mmengine import ConfigDict
+from mmengine.structures import InstanceData
+from torch.cuda.amp import autocast
+
+try:
+    from scipy.optimize import linear_sum_assignment
+except ImportError:
+    linear_sum_assignment = None
+
+from mmseg.registry import TASK_UTILS
+from .base_assigner import BaseAssigner
+
+
+@TASK_UTILS.register_module()
+class HungarianAssigner(BaseAssigner):
+    """Computes one-to-one matching between prediction masks and ground truth.
+
+    This class uses bipartite matching-based assignment to compute an
+    assignment between the prediction masks and the ground truth. The
+    assignment result is based on the weighted sum of match costs. The
+    Hungarian algorithm is used to calculate the best matching with the
+    minimum cost. The prediction masks that are not matched are classified
+    as background.
+
+    Args:
+        match_costs (ConfigDict|List[ConfigDict]): Match cost configs.
+    """
+
+    def __init__(
+        self, match_costs: Union[List[Union[dict, ConfigDict]], dict,
+                                 ConfigDict]
+    ) -> None:
+
+        if isinstance(match_costs, dict):
+            match_costs = [match_costs]
+        elif isinstance(match_costs, list):
+            assert len(match_costs) > 0, \
+                'match_costs must not be an empty list.'
+
+        self.match_costs = [
+            TASK_UTILS.build(match_cost) for match_cost in match_costs
+        ]
+
+    def assign(self, pred_instances: InstanceData, gt_instances: InstanceData,
+               **kwargs):
+        """Computes one-to-one matching based on the weighted costs.
+
+        This method assigns each query prediction to a ground truth or
+        background. The assignment first calculates the cost for each
+        category assigned to each query mask, and then uses the
+        Hungarian algorithm to calculate the minimum cost as the best
+        match.
+
+        Args:
+            pred_instances (InstanceData): Instances of model
+                predictions. It includes "masks", with shape
+                (n, h, w) or (n, l), and "cls", with shape (n, num_classes+1)
+            gt_instances (InstanceData): Ground truth of instance
+                annotations. It includes "labels", with shape (k, ),
+                and "masks", with shape (k, h, w) or (k, l).
+
+        Returns:
+            matched_query_inds (Tensor): The indices of matched queries.
+            matched_label_inds (Tensor): The indices of matched labels.
+        """
+        # compute weighted cost
+        cost_list = []
+        with autocast(enabled=False):
+            for match_cost in self.match_costs:
+                cost = match_cost(
+                    pred_instances=pred_instances, gt_instances=gt_instances)
+                cost_list.append(cost)
+        cost = torch.stack(cost_list).sum(dim=0)
+
+        device = cost.device
+        # do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+
+        matched_query_inds, matched_label_inds = linear_sum_assignment(cost)
+        matched_query_inds = torch.from_numpy(matched_query_inds).to(device)
+        matched_label_inds = torch.from_numpy(matched_label_inds).to(device)
+
+        return matched_query_inds, matched_label_inds
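The `assign` method above hands the summed cost matrix to SciPy. A minimal standalone illustration (with made-up numbers) of what `linear_sum_assignment` returns for 3 query predictions and 2 ground-truth instances:

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# Rows are query predictions, columns are ground-truth instances.
cost = np.array([[0.9, 0.1],
                 [0.4, 0.8],
                 [0.2, 0.7]])
rows, cols = linear_sum_assignment(cost)
# rows -> matched query indices, cols -> matched gt indices.
# Queries 0 and 2 are matched (to gt 1 and gt 0, total cost 0.3);
# query 1 is left unmatched and would be classified as background.
print(rows, cols)  # [0 2] [1 0]
```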
+ """ + assert hasattr(pred_instances, 'scores'), \ + "pred_instances must contain 'scores'" + assert hasattr(gt_instances, 'labels'), \ + "gt_instances must contain 'labels'" + pred_scores = pred_instances.scores + gt_labels = gt_instances.labels + + pred_scores = pred_scores.softmax(-1) + cls_cost = -pred_scores[:, gt_labels] + + return cls_cost * self.weight + + +@TASK_UTILS.register_module() +class DiceCost(BaseMatchCost): + """Cost of mask assignments based on dice losses. + + Args: + pred_act (bool): Whether to apply sigmoid to mask_pred. + Defaults to False. + eps (float): Defaults to 1e-3. + naive_dice (bool): If True, use the naive dice loss + in which the power of the number in the denominator is + the first power. If False, use the second power that + is adopted by K-Net and SOLO. Defaults to True. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + pred_act: bool = False, + eps: float = 1e-3, + naive_dice: bool = True, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.pred_act = pred_act + self.eps = eps + self.naive_dice = naive_dice + + def _binary_mask_dice_loss(self, mask_preds: Tensor, + gt_masks: Tensor) -> Tensor: + """ + Args: + mask_preds (Tensor): Mask prediction in shape (num_queries, *). + gt_masks (Tensor): Ground truth in shape (num_gt, *) + store 0 or 1, 0 for negative class and 1 for + positive class. + + Returns: + Tensor: Dice cost matrix in shape (num_queries, num_gt). + """ + mask_preds = mask_preds.flatten(1) + gt_masks = gt_masks.flatten(1).float() + numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks) + if self.naive_dice: + denominator = mask_preds.sum(-1)[:, None] + \ + gt_masks.sum(-1)[None, :] + else: + denominator = mask_preds.pow(2).sum(1)[:, None] + \ + gt_masks.pow(2).sum(1)[None, :] + loss = 1 - (numerator + self.eps) / (denominator + self.eps) + return loss + + def __call__(self, pred_instances: InstanceData, + gt_instances: InstanceData, **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (InstanceData): Predicted instances which + must contain "masks". + gt_instances (InstanceData): Ground truth which must contain + "mask". + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + assert hasattr(pred_instances, 'masks'), \ + "pred_instances must contain 'masks'" + assert hasattr(gt_instances, 'masks'), \ + "gt_instances must contain 'masks'" + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + + if self.pred_act: + pred_masks = pred_masks.sigmoid() + dice_cost = self._binary_mask_dice_loss(pred_masks, gt_masks) + return dice_cost * self.weight + + +@TASK_UTILS.register_module() +class CrossEntropyLossCost(BaseMatchCost): + """CrossEntropyLossCost. + + Args: + use_sigmoid (bool): Whether the prediction uses sigmoid + of softmax. Defaults to True. + weight (Union[float, int]): Cost weight. Defaults to 1. + """ + + def __init__(self, + use_sigmoid: bool = True, + weight: Union[float, int] = 1.) -> None: + super().__init__(weight=weight) + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred: Tensor, + gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): The prediction with shape (num_queries, 1, *) or + (num_queries, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + + Returns: + Tensor: Cross entropy cost matrix in shape (num_queries, num_gt). 
+ """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits( + cls_pred, torch.ones_like(cls_pred), reduction='none') + neg = F.binary_cross_entropy_with_logits( + cls_pred, torch.zeros_like(cls_pred), reduction='none') + cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \ + torch.einsum('nc,mc->nm', neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, pred_instances: InstanceData, + gt_instances: InstanceData, **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``masks``. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + assert hasattr(pred_instances, 'masks'), \ + "pred_instances must contain 'masks'" + assert hasattr(gt_instances, 'masks'), \ + "gt_instances must contain 'masks'" + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(pred_masks, gt_masks) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/head_extractor/build/lib/mmseg/models/backbones/__init__.py b/head_extractor/build/lib/mmseg/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..697668124d7ce8935cb0c910f713a8bf13d0662d --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .beit import BEiT +from .bisenetv1 import BiSeNetV1 +from .bisenetv2 import BiSeNetV2 +from .cgnet import CGNet +from .ddrnet import DDRNet +from .erfnet import ERFNet +from .fast_scnn import FastSCNN +from .hrnet import HRNet +from .icnet import ICNet +from .mae import MAE +from .mit import MixVisionTransformer +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v3 import MobileNetV3 +from .mscan import MSCAN +from .pidnet import PIDNet +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1c, ResNetV1d +from .resnext import ResNeXt +from .stdc import STDCContextPathNet, STDCNet +from .swin import SwinTransformer +from .timm_backbone import TIMMBackbone +from .twins import PCPVT, SVT +from .unet import UNet +from .vit import VisionTransformer +from .vpd import VPD +from .dinov2 import DINOv2 + +__all__ = [ + 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN', + 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', + 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', + 'BiSeNetV1', 'BiSeNetV2', 'ICNet', 'TIMMBackbone', 'ERFNet', 'PCPVT', + 'SVT', 'STDCNet', 'STDCContextPathNet', 'BEiT', 'MAE', 'PIDNet', 'MSCAN', + 'DDRNet', 'VPD', 'DINOv2' +] diff --git a/head_extractor/build/lib/mmseg/models/backbones/beit.py b/head_extractor/build/lib/mmseg/models/backbones/beit.py new file mode 100644 index 0000000000000000000000000000000000000000..e5da71e729256a9dd12b70d32886c9db27d9fa3c --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/beit.py @@ -0,0 +1,554 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, kaiming_init, + trunc_normal_) +from mmengine.runner.checkpoint import _load_checkpoint +from scipy import interpolate +from torch.nn.modules.batchnorm import _BatchNorm +from torch.nn.modules.utils import _pair as to_2tuple + +from mmseg.registry import MODELS +from ..utils import PatchEmbed +from .vit import TransformerEncoderLayer as VisionTransformerEncoderLayer + + +class BEiTAttention(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + bias (bool): The option to add leanable bias for q, k, v. If bias is + True, it will add leanable bias. If bias is 'qv_bias', it will only + add leanable bias for q, v. If bias is False, it will not add bias + for q, k, v. Default to 'qv_bias'. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float): Dropout ratio of output. Default: 0. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + bias='qv_bias', + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None, + **kwargs): + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.bias = bias + self.scale = qk_scale or head_embed_dims**-0.5 + + qkv_bias = bias + if bias == 'qv_bias': + self._init_qv_bias() + qkv_bias = False + + self.window_size = window_size + self._init_rel_pos_embedding() + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + def _init_qv_bias(self): + self.q_bias = nn.Parameter(torch.zeros(self.embed_dims)) + self.v_bias = nn.Parameter(torch.zeros(self.embed_dims)) + + def _init_rel_pos_embedding(self): + Wh, Ww = self.window_size + # cls to token & token 2 cls & cls to cls + self.num_relative_distance = (2 * Wh - 1) * (2 * Ww - 1) + 3 + # relative_position_bias_table shape is (2*Wh-1 * 2*Ww-1 + 3, nH) + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, self.num_heads)) + + # get pair-wise relative position index for + # each token inside the window + coords_h = torch.arange(Wh) + coords_w = torch.arange(Ww) + # coords shape is (2, Wh, Ww) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) + # coords_flatten shape is (2, Wh*Ww) + coords_flatten = torch.flatten(coords, 1) + relative_coords = ( + coords_flatten[:, :, None] - coords_flatten[:, None, :]) + # relative_coords shape is (Wh*Ww, Wh*Ww, 2) + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + # shift to start from 0 + relative_coords[:, :, 0] += Wh - 1 + relative_coords[:, :, 1] += Ww - 1 + relative_coords[:, :, 0] *= 2 * Ww - 1 + relative_position_index = torch.zeros( + size=(Wh * Ww + 1, ) * 2, dtype=relative_coords.dtype) + # 
relative_position_index shape is (Wh*Ww, Wh*Ww) + relative_position_index[1:, 1:] = relative_coords.sum(-1) + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer('relative_position_index', + relative_position_index) + + def init_weights(self): + trunc_normal_(self.relative_position_bias_table, std=0.02) + + def forward(self, x): + """ + Args: + x (tensor): input features with shape of (num_windows*B, N, C). + """ + B, N, C = x.shape + + if self.bias == 'qv_bias': + k_bias = torch.zeros_like(self.v_bias, requires_grad=False) + qkv_bias = torch.cat((self.q_bias, k_bias, self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + else: + qkv = self.qkv(x) + + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + if self.relative_position_bias_table is not None: + Wh = self.window_size[0] + Ww = self.window_size[1] + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + Wh * Ww + 1, Wh * Ww + 1, -1) + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class BEiTTransformerEncoderLayer(VisionTransformerEncoderLayer): + """Implements one encoder layer in Vision Transformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): Stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + bias (bool): The option to add leanable bias for q, k, v. If bias is + True, it will add leanable bias. If bias is 'qv_bias', it will only + add leanable bias for q, v. If bias is False, it will not add bias + for q, k, v. Default to 'qv_bias'. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + window_size (tuple[int], optional): The height and width of the window. + Default: None. + init_values (float, optional): Initialize the values of BEiTAttention + and FFN with learnable scaling. Default: None. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + attn_drop_rate=0., + drop_path_rate=0., + num_fcs=2, + bias='qv_bias', + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + window_size=None, + attn_cfg=dict(), + ffn_cfg=dict(add_identity=False), + init_values=None): + attn_cfg.update(dict(window_size=window_size, qk_scale=None)) + + super().__init__( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=feedforward_channels, + attn_drop_rate=attn_drop_rate, + drop_path_rate=0., + drop_rate=0., + num_fcs=num_fcs, + qkv_bias=bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + attn_cfg=attn_cfg, + ffn_cfg=ffn_cfg) + + # NOTE: drop path for stochastic depth, we shall see if + # this is better than dropout here + dropout_layer = dict(type='DropPath', drop_prob=drop_path_rate) + self.drop_path = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + self.gamma_1 = nn.Parameter( + init_values * torch.ones(embed_dims), requires_grad=True) + self.gamma_2 = nn.Parameter( + init_values * torch.ones(embed_dims), requires_grad=True) + + def build_attn(self, attn_cfg): + self.attn = BEiTAttention(**attn_cfg) + + def forward(self, x): + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.ffn(self.norm2(x))) + return x + + +@MODELS.register_module() +class BEiT(BaseModule): + """BERT Pre-Training of Image Transformers. + + Args: + img_size (int | tuple): Input image size. Default: 224. + patch_size (int): The patch size. Default: 16. + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): Embedding dimension. Default: 768. + num_layers (int): Depth of transformer. Default: 12. + num_heads (int): Number of attention heads. Default: 12. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qv_bias (bool): Enable bias for qv if True. Default: True. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): Stochastic depth rate. Default 0.0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + patch_norm (bool): Whether to add a norm in PatchEmbed Block. + Default: False. + final_norm (bool): Whether to add a additional layer to normalize + final feature map. Default: False. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + pretrained (str, optional): Model pretrained path. Default: None. + init_values (float): Initialize the values of BEiTAttention and FFN + with learnable scaling. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dims=768, + num_layers=12, + num_heads=12, + mlp_ratio=4, + out_indices=-1, + qv_bias=True, + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN'), + act_cfg=dict(type='GELU'), + patch_norm=False, + final_norm=False, + num_fcs=2, + norm_eval=False, + pretrained=None, + init_values=0.1, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + if isinstance(img_size, int): + img_size = to_2tuple(img_size) + elif isinstance(img_size, tuple): + if len(img_size) == 1: + img_size = to_2tuple(img_size[0]) + assert len(img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(img_size)}' + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be set at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + self.in_channels = in_channels + self.img_size = img_size + self.patch_size = patch_size + self.norm_eval = norm_eval + self.pretrained = pretrained + self.num_layers = num_layers + self.embed_dims = embed_dims + self.num_heads = num_heads + self.mlp_ratio = mlp_ratio + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.num_fcs = num_fcs + self.qv_bias = qv_bias + self.act_cfg = act_cfg + self.norm_cfg = norm_cfg + self.patch_norm = patch_norm + self.init_values = init_values + self.window_size = (img_size[0] // patch_size, + img_size[1] // patch_size) + self.patch_shape = self.window_size + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) + + self._build_patch_embedding() + self._build_layers() + + if isinstance(out_indices, int): + if out_indices == -1: + out_indices = num_layers - 1 + self.out_indices = [out_indices] + elif isinstance(out_indices, list) or isinstance(out_indices, tuple): + self.out_indices = out_indices + else: + raise TypeError('out_indices must be type of int, list or tuple') + + self.final_norm = final_norm + if final_norm: + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, embed_dims, postfix=1) + self.add_module(self.norm1_name, norm1) + + def _build_patch_embedding(self): + """Build patch embedding layer.""" + self.patch_embed = PatchEmbed( + in_channels=self.in_channels, + embed_dims=self.embed_dims, + conv_type='Conv2d', + kernel_size=self.patch_size, + stride=self.patch_size, + padding=0, + norm_cfg=self.norm_cfg if self.patch_norm else None, + init_cfg=None) + + def _build_layers(self): + """Build transformer encoding layers.""" + + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, self.num_layers) + ] + self.layers = ModuleList() + for i in range(self.num_layers): + self.layers.append( + BEiTTransformerEncoderLayer( + embed_dims=self.embed_dims, + num_heads=self.num_heads, + feedforward_channels=self.mlp_ratio * self.embed_dims, + attn_drop_rate=self.attn_drop_rate, + drop_path_rate=dpr[i], + num_fcs=self.num_fcs, + bias='qv_bias' if self.qv_bias else False, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg, + window_size=self.window_size, + init_values=self.init_values)) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + def _geometric_sequence_interpolation(self, src_size, dst_size, sequence, + num): + """Get new sequence via geometric sequence interpolation. 
+ + Args: + src_size (int): Pos_embedding size in pre-trained model. + dst_size (int): Pos_embedding size in the current model. + sequence (tensor): The relative position bias of the pretrain + model after removing the extra tokens. + num (int): Number of attention heads. + Returns: + new_sequence (tensor): Geometric sequence interpolate the + pre-trained relative position bias to the size of + the current model. + """ + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + # Here is a binary function. + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + # The position of each interpolated point is determined + # by the ratio obtained by dichotomy. + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q**(i + 1) + r_ids = [-_ for _ in reversed(dis)] + x = r_ids + [0] + dis + y = r_ids + [0] + dis + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + # Interpolation functions are being executed and called. + new_sequence = [] + for i in range(num): + z = sequence[:, i].view(src_size, src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + new_sequence.append( + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(sequence)) + new_sequence = torch.cat(new_sequence, dim=-1) + return new_sequence + + def resize_rel_pos_embed(self, checkpoint): + """Resize relative pos_embed weights. + + This function is modified from + https://github.com/microsoft/unilm/blob/master/beit/semantic_segmentation/mmcv_custom/checkpoint.py. # noqa: E501 + Copyright (c) Microsoft Corporation + Licensed under the MIT License + Args: + checkpoint (dict): Key and value of the pretrain model. + Returns: + state_dict (dict): Interpolate the relative pos_embed weights + in the pre-train model to the current model size. + """ + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + all_keys = list(state_dict.keys()) + for key in all_keys: + if 'relative_position_index' in key: + state_dict.pop(key) + # In order to keep the center of pos_bias as consistent as + # possible after interpolation, and vice versa in the edge + # area, the geometric sequence interpolation method is adopted. + if 'relative_position_bias_table' in key: + rel_pos_bias = state_dict[key] + src_num_pos, num_attn_heads = rel_pos_bias.size() + dst_num_pos, _ = self.state_dict()[key].size() + dst_patch_shape = self.patch_shape + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + # Count the number of extra tokens. 
+ num_extra_tokens = dst_num_pos - ( + dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens)**0.5) + dst_size = int((dst_num_pos - num_extra_tokens)**0.5) + if src_size != dst_size: + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + new_rel_pos_bias = self._geometric_sequence_interpolation( + src_size, dst_size, rel_pos_bias, num_attn_heads) + new_rel_pos_bias = torch.cat( + (new_rel_pos_bias, extra_tokens), dim=0) + state_dict[key] = new_rel_pos_bias + + return state_dict + + def init_weights(self): + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + + if (isinstance(self.init_cfg, dict) + and self.init_cfg.get('type') == 'Pretrained'): + checkpoint = _load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + state_dict = self.resize_rel_pos_embed(checkpoint) + self.load_state_dict(state_dict, False) + elif self.init_cfg is not None: + super().init_weights() + else: + # We only implement the 'jax_impl' initialization implemented at + # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + # Copyright 2019 Ross Wightman + # Licensed under the Apache License, Version 2.0 (the "License") + trunc_normal_(self.cls_token, std=.02) + for n, m in self.named_modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + if 'ffn' in n: + nn.init.normal_(m.bias, mean=0., std=1e-6) + else: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + kaiming_init(m, mode='fan_in', bias=0.) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m, val=1.0, bias=0.) + + def forward(self, inputs): + B = inputs.shape[0] + + x, hw_shape = self.patch_embed(inputs) + + # stole cls_tokens impl from Phil Wang, thanks + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i == len(self.layers) - 1: + if self.final_norm: + x = self.norm1(x) + if i in self.out_indices: + # Remove class token and reshape token for decoder head + out = x[:, 1:] + B, _, C = out.shape + out = out.reshape(B, hw_shape[0], hw_shape[1], + C).permute(0, 3, 1, 2).contiguous() + outs.append(out) + + return tuple(outs) + + def train(self, mode=True): + super().train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.LayerNorm): + m.eval() diff --git a/head_extractor/build/lib/mmseg/models/backbones/bisenetv1.py b/head_extractor/build/lib/mmseg/models/backbones/bisenetv1.py new file mode 100644 index 0000000000000000000000000000000000000000..ca58bf9c597836937bc384739ff77001b5402942 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/bisenetv1.py @@ -0,0 +1,332 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmseg.registry import MODELS +from ..utils import resize + + +class SpatialPath(BaseModule): + """Spatial Path to preserve the spatial size of the original input image + and encode affluent spatial information. 
+ + Args: + in_channels(int): The number of channels of input + image. Default: 3. + num_channels (Tuple[int]): The number of channels of + each layers in Spatial Path. + Default: (64, 64, 64, 128). + Returns: + x (torch.Tensor): Feature map for Feature Fusion Module. + """ + + def __init__(self, + in_channels=3, + num_channels=(64, 64, 64, 128), + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + assert len(num_channels) == 4, 'Length of input channels \ + of Spatial Path must be 4!' + + self.layers = [] + for i in range(len(num_channels)): + layer_name = f'layer{i + 1}' + self.layers.append(layer_name) + if i == 0: + self.add_module( + layer_name, + ConvModule( + in_channels=in_channels, + out_channels=num_channels[i], + kernel_size=7, + stride=2, + padding=3, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + elif i == len(num_channels) - 1: + self.add_module( + layer_name, + ConvModule( + in_channels=num_channels[i - 1], + out_channels=num_channels[i], + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + self.add_module( + layer_name, + ConvModule( + in_channels=num_channels[i - 1], + out_channels=num_channels[i], + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + for i, layer_name in enumerate(self.layers): + layer_stage = getattr(self, layer_name) + x = layer_stage(x) + return x + + +class AttentionRefinementModule(BaseModule): + """Attention Refinement Module (ARM) to refine the features of each stage. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + Returns: + x_out (torch.Tensor): Feature map of Attention Refinement Module. + """ + + def __init__(self, + in_channels, + out_channel, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.conv_layer = ConvModule( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.atten_conv_layer = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), nn.Sigmoid()) + + def forward(self, x): + x = self.conv_layer(x) + x_atten = self.atten_conv_layer(x) + x_out = x * x_atten + return x_out + + +class ContextPath(BaseModule): + """Context Path to provide sufficient receptive field. + + Args: + backbone_cfg:(dict): Config of backbone of + Context Path. + context_channels (Tuple[int]): The number of channel numbers + of various modules in Context Path. + Default: (128, 256, 512). + align_corners (bool, optional): The align_corners argument of + resize operation. Default: False. + Returns: + x_16_up, x_32_up (torch.Tensor, torch.Tensor): Two feature maps + undergoing upsampling from 1/16 and 1/32 downsampling + feature maps. These two feature maps are used for Feature + Fusion Module and Auxiliary Head. 
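+    Example:
+        >>> # An illustrative shape check; the ResNet-18 backbone config
+        >>> # below is an assumption matching the upstream BiSeNetV1
+        >>> # configs, and eval() avoids BN on the 1x1 pooled feature.
+        >>> import torch
+        >>> cp = ContextPath(
+        ...     backbone_cfg=dict(type='ResNet', depth=18, num_stages=4,
+        ...                       out_indices=(0, 1, 2, 3)),
+        ...     context_channels=(128, 256, 512)).eval()
+        >>> x_16_up, x_32_up = cp(torch.rand(1, 3, 512, 1024))
+        >>> x_16_up.shape, x_32_up.shape
+        (torch.Size([1, 128, 64, 128]), torch.Size([1, 128, 32, 64]))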
+ """ + + def __init__(self, + backbone_cfg, + context_channels=(128, 256, 512), + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + assert len(context_channels) == 3, 'Length of input channels \ + of Context Path must be 3!' + + self.backbone = MODELS.build(backbone_cfg) + + self.align_corners = align_corners + self.arm16 = AttentionRefinementModule(context_channels[1], + context_channels[0]) + self.arm32 = AttentionRefinementModule(context_channels[2], + context_channels[0]) + self.conv_head32 = ConvModule( + in_channels=context_channels[0], + out_channels=context_channels[0], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv_head16 = ConvModule( + in_channels=context_channels[0], + out_channels=context_channels[0], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.gap_conv = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels=context_channels[2], + out_channels=context_channels[0], + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + x_4, x_8, x_16, x_32 = self.backbone(x) + x_gap = self.gap_conv(x_32) + + x_32_arm = self.arm32(x_32) + x_32_sum = x_32_arm + x_gap + x_32_up = resize(input=x_32_sum, size=x_16.shape[2:], mode='nearest') + x_32_up = self.conv_head32(x_32_up) + + x_16_arm = self.arm16(x_16) + x_16_sum = x_16_arm + x_32_up + x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest') + x_16_up = self.conv_head16(x_16_up) + + return x_16_up, x_32_up + + +class FeatureFusionModule(BaseModule): + """Feature Fusion Module to fuse low level output feature of Spatial Path + and high level output feature of Context Path. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + Returns: + x_out (torch.Tensor): Feature map of Feature Fusion Module. + """ + + def __init__(self, + in_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + self.conv_atten = nn.Sequential( + ConvModule( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), nn.Sigmoid()) + + def forward(self, x_sp, x_cp): + x_concat = torch.cat([x_sp, x_cp], dim=1) + x_fuse = self.conv1(x_concat) + x_atten = self.gap(x_fuse) + # Note: No BN and more 1x1 conv in paper. + x_atten = self.conv_atten(x_atten) + x_atten = x_fuse * x_atten + x_out = x_atten + x_fuse + return x_out + + +@MODELS.register_module() +class BiSeNetV1(BaseModule): + """BiSeNetV1 backbone. + + This backbone is the implementation of `BiSeNet: Bilateral + Segmentation Network for Real-time Semantic + Segmentation `_. + + Args: + backbone_cfg:(dict): Config of backbone of + Context Path. + in_channels (int): The number of channels of input + image. Default: 3. + spatial_channels (Tuple[int]): Size of channel numbers of + various layers in Spatial Path. + Default: (64, 64, 64, 128). 
+ context_channels (Tuple[int]): Size of channel numbers of + various modules in Context Path. + Default: (128, 256, 512). + out_indices (Tuple[int] | int, optional): Output from which stages. + Default: (0, 1, 2). + align_corners (bool, optional): The align_corners argument of + resize operation in Bilateral Guided Aggregation Layer. + Default: False. + out_channels(int): The number of channels of output. + It must be the same with `in_channels` of decode_head. + Default: 256. + """ + + def __init__(self, + backbone_cfg, + in_channels=3, + spatial_channels=(64, 64, 64, 128), + context_channels=(128, 256, 512), + out_indices=(0, 1, 2), + align_corners=False, + out_channels=256, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU'), + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + assert len(spatial_channels) == 4, 'Length of input channels \ + of Spatial Path must be 4!' + + assert len(context_channels) == 3, 'Length of input channels \ + of Context Path must be 3!' + + self.out_indices = out_indices + self.align_corners = align_corners + self.context_path = ContextPath(backbone_cfg, context_channels, + self.align_corners) + self.spatial_path = SpatialPath(in_channels, spatial_channels) + self.ffm = FeatureFusionModule(context_channels[1], out_channels) + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + def forward(self, x): + # stole refactoring code from Coin Cheung, thanks + x_context8, x_context16 = self.context_path(x) + x_spatial = self.spatial_path(x) + x_fuse = self.ffm(x_spatial, x_context8) + + outs = [x_fuse, x_context8, x_context16] + outs = [outs[i] for i in self.out_indices] + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/bisenetv2.py b/head_extractor/build/lib/mmseg/models/backbones/bisenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..32aa49822f7d0c3bd4839b3796a15689e1f4cbc0 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/bisenetv2.py @@ -0,0 +1,622 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, + build_activation_layer, build_norm_layer) +from mmengine.model import BaseModule + +from mmseg.registry import MODELS +from ..utils import resize + + +class DetailBranch(BaseModule): + """Detail Branch with wide channels and shallow layers to capture low-level + details and generate high-resolution feature representation. + + Args: + detail_channels (Tuple[int]): Size of channel numbers of each stage + in Detail Branch, in paper it has 3 stages. + Default: (64, 64, 128). + in_channels (int): Number of channels of input image. Default: 3. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Returns: + x (torch.Tensor): Feature map of Detail Branch. 
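+    Example:
+        >>> # Illustrative sanity sketch: three stages, each opening with
+        >>> # a stride-2 conv, give a 1/8-resolution, 128-channel map.
+        >>> import torch
+        >>> db = DetailBranch(detail_channels=(64, 64, 128), in_channels=3)
+        >>> db(torch.rand(1, 3, 512, 1024)).shape
+        torch.Size([1, 128, 64, 128])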
+ """ + + def __init__(self, + detail_channels=(64, 64, 128), + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + detail_branch = [] + for i in range(len(detail_channels)): + if i == 0: + detail_branch.append( + nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=detail_channels[i], + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=detail_channels[i], + out_channels=detail_channels[i], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg))) + else: + detail_branch.append( + nn.Sequential( + ConvModule( + in_channels=detail_channels[i - 1], + out_channels=detail_channels[i], + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=detail_channels[i], + out_channels=detail_channels[i], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=detail_channels[i], + out_channels=detail_channels[i], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg))) + self.detail_branch = nn.ModuleList(detail_branch) + + def forward(self, x): + for stage in self.detail_branch: + x = stage(x) + return x + + +class StemBlock(BaseModule): + """Stem Block at the beginning of Semantic Branch. + + Args: + in_channels (int): Number of input channels. + Default: 3. + out_channels (int): Number of output channels. + Default: 16. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Returns: + x (torch.Tensor): First feature map in Semantic Branch. + """ + + def __init__(self, + in_channels=3, + out_channels=16, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.conv_first = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.convs = nn.Sequential( + ConvModule( + in_channels=out_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=out_channels // 2, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.pool = nn.MaxPool2d( + kernel_size=3, stride=2, padding=1, ceil_mode=False) + self.fuse_last = ConvModule( + in_channels=out_channels * 2, + out_channels=out_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + x = self.conv_first(x) + x_left = self.convs(x) + x_right = self.pool(x) + x = self.fuse_last(torch.cat([x_left, x_right], dim=1)) + return x + + +class GELayer(BaseModule): + """Gather-and-Expansion Layer. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + exp_ratio (int): Expansion ratio for middle channels. + Default: 6. + stride (int): Stride of GELayer. 
Default: 1 + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Returns: + x (torch.Tensor): Intermediate feature map in + Semantic Branch. + """ + + def __init__(self, + in_channels, + out_channels, + exp_ratio=6, + stride=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + mid_channel = in_channels * exp_ratio + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if stride == 1: + self.dwconv = nn.Sequential( + # ReLU in ConvModule not shown in paper + ConvModule( + in_channels=in_channels, + out_channels=mid_channel, + kernel_size=3, + stride=stride, + padding=1, + groups=in_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.shortcut = None + else: + self.dwconv = nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=mid_channel, + kernel_size=3, + stride=stride, + padding=1, + groups=in_channels, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + # ReLU in ConvModule not shown in paper + ConvModule( + in_channels=mid_channel, + out_channels=mid_channel, + kernel_size=3, + stride=1, + padding=1, + groups=mid_channel, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ) + self.shortcut = nn.Sequential( + DepthwiseSeparableConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + padding=1, + dw_norm_cfg=norm_cfg, + dw_act_cfg=None, + pw_norm_cfg=norm_cfg, + pw_act_cfg=None, + )) + + self.conv2 = nn.Sequential( + ConvModule( + in_channels=mid_channel, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + )) + + self.act = build_activation_layer(act_cfg) + + def forward(self, x): + identity = x + x = self.conv1(x) + x = self.dwconv(x) + x = self.conv2(x) + if self.shortcut is not None: + shortcut = self.shortcut(identity) + x = x + shortcut + else: + x = x + identity + x = self.act(x) + return x + + +class CEBlock(BaseModule): + """Context Embedding Block for large receptive filed in Semantic Branch. + + Args: + in_channels (int): Number of input channels. + Default: 3. + out_channels (int): Number of output channels. + Default: 16. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Returns: + x (torch.Tensor): Last feature map in Semantic Branch. 
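+    Example:
+        >>> # Illustrative shape check. The broadcast add of the pooled
+        >>> # context onto the identity implicitly requires
+        >>> # in_channels == out_channels; eval() keeps BN happy on the
+        >>> # 1x1 pooled tensor.
+        >>> import torch
+        >>> ce = CEBlock(in_channels=128, out_channels=128).eval()
+        >>> ce(torch.rand(2, 128, 16, 32)).shape
+        torch.Size([2, 128, 16, 32])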
+ """ + + def __init__(self, + in_channels=3, + out_channels=16, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + self.gap = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + build_norm_layer(norm_cfg, self.in_channels)[1]) + self.conv_gap = ConvModule( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + # Note: in paper here is naive conv2d, no bn-relu + self.conv_last = ConvModule( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + identity = x + x = self.gap(x) + x = self.conv_gap(x) + x = identity + x + x = self.conv_last(x) + return x + + +class SemanticBranch(BaseModule): + """Semantic Branch which is lightweight with narrow channels and deep + layers to obtain high-level semantic context. + + Args: + semantic_channels(Tuple[int]): Size of channel numbers of + various stages in Semantic Branch. + Default: (16, 32, 64, 128). + in_channels (int): Number of channels of input image. Default: 3. + exp_ratio (int): Expansion ratio for middle channels. + Default: 6. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Returns: + semantic_outs (List[torch.Tensor]): List of several feature maps + for auxiliary heads (Booster) and Bilateral + Guided Aggregation Layer. + """ + + def __init__(self, + semantic_channels=(16, 32, 64, 128), + in_channels=3, + exp_ratio=6, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.semantic_channels = semantic_channels + self.semantic_stages = [] + for i in range(len(semantic_channels)): + stage_name = f'stage{i + 1}' + self.semantic_stages.append(stage_name) + if i == 0: + self.add_module( + stage_name, + StemBlock(self.in_channels, semantic_channels[i])) + elif i == (len(semantic_channels) - 1): + self.add_module( + stage_name, + nn.Sequential( + GELayer(semantic_channels[i - 1], semantic_channels[i], + exp_ratio, 2), + GELayer(semantic_channels[i], semantic_channels[i], + exp_ratio, 1), + GELayer(semantic_channels[i], semantic_channels[i], + exp_ratio, 1), + GELayer(semantic_channels[i], semantic_channels[i], + exp_ratio, 1))) + else: + self.add_module( + stage_name, + nn.Sequential( + GELayer(semantic_channels[i - 1], semantic_channels[i], + exp_ratio, 2), + GELayer(semantic_channels[i], semantic_channels[i], + exp_ratio, 1))) + + self.add_module(f'stage{len(semantic_channels)}_CEBlock', + CEBlock(semantic_channels[-1], semantic_channels[-1])) + self.semantic_stages.append(f'stage{len(semantic_channels)}_CEBlock') + + def forward(self, x): + semantic_outs = [] + for stage_name in self.semantic_stages: + semantic_stage = getattr(self, stage_name) + x = semantic_stage(x) + semantic_outs.append(x) + return semantic_outs + + +class BGALayer(BaseModule): + """Bilateral Guided Aggregation Layer to fuse the complementary information + from both Detail Branch and Semantic Branch. + + Args: + out_channels (int): Number of output channels. + Default: 128. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). 
+ act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + Returns: + output (torch.Tensor): Output feature map for Segment heads. + """ + + def __init__(self, + out_channels=128, + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.out_channels = out_channels + self.align_corners = align_corners + self.detail_dwconv = nn.Sequential( + DepthwiseSeparableConvModule( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + dw_norm_cfg=norm_cfg, + dw_act_cfg=None, + pw_norm_cfg=None, + pw_act_cfg=None, + )) + self.detail_down = nn.Sequential( + ConvModule( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), + nn.AvgPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=False)) + self.semantic_conv = nn.Sequential( + ConvModule( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None)) + self.semantic_dwconv = nn.Sequential( + DepthwiseSeparableConvModule( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + dw_norm_cfg=norm_cfg, + dw_act_cfg=None, + pw_norm_cfg=None, + pw_act_cfg=None, + )) + self.conv = ConvModule( + in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + inplace=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + + def forward(self, x_d, x_s): + detail_dwconv = self.detail_dwconv(x_d) + detail_down = self.detail_down(x_d) + semantic_conv = self.semantic_conv(x_s) + semantic_dwconv = self.semantic_dwconv(x_s) + semantic_conv = resize( + input=semantic_conv, + size=detail_dwconv.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + fuse_1 = detail_dwconv * torch.sigmoid(semantic_conv) + fuse_2 = detail_down * torch.sigmoid(semantic_dwconv) + fuse_2 = resize( + input=fuse_2, + size=fuse_1.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + output = self.conv(fuse_1 + fuse_2) + return output + + +@MODELS.register_module() +class BiSeNetV2(BaseModule): + """BiSeNetV2: Bilateral Network with Guided Aggregation for + Real-time Semantic Segmentation. + + This backbone is the implementation of + `BiSeNetV2 `_. + + Args: + in_channels (int): Number of channel of input image. Default: 3. + detail_channels (Tuple[int], optional): Channels of each stage + in Detail Branch. Default: (64, 64, 128). + semantic_channels (Tuple[int], optional): Channels of each stage + in Semantic Branch. Default: (16, 32, 64, 128). + See Table 1 and Figure 3 of paper for more details. + semantic_expansion_ratio (int, optional): The expansion factor + expanding channel number of middle channels in Semantic Branch. + Default: 6. + bga_channels (int, optional): Number of middle channels in + Bilateral Guided Aggregation Layer. Default: 128. + out_indices (Tuple[int] | int, optional): Output from which stages. + Default: (0, 1, 2, 3, 4). + align_corners (bool, optional): The align_corners argument of + resize operation in Bilateral Guided Aggregation Layer. + Default: False. + conv_cfg (dict | None): Config of conv layers. + Default: None. 
+ norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels=3, + detail_channels=(64, 64, 128), + semantic_channels=(16, 32, 64, 128), + semantic_expansion_ratio=6, + bga_channels=128, + out_indices=(0, 1, 2, 3, 4), + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + if init_cfg is None: + init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', val=1, layer=['_BatchNorm', 'GroupNorm']) + ] + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_indices = out_indices + self.detail_channels = detail_channels + self.semantic_channels = semantic_channels + self.semantic_expansion_ratio = semantic_expansion_ratio + self.bga_channels = bga_channels + self.align_corners = align_corners + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.detail = DetailBranch(self.detail_channels, self.in_channels) + self.semantic = SemanticBranch(self.semantic_channels, + self.in_channels, + self.semantic_expansion_ratio) + self.bga = BGALayer(self.bga_channels, self.align_corners) + + def forward(self, x): + # stole refactoring code from Coin Cheung, thanks + x_detail = self.detail(x) + x_semantic_lst = self.semantic(x) + x_head = self.bga(x_detail, x_semantic_lst[-1]) + outs = [x_head] + x_semantic_lst[:-1] + outs = [outs[i] for i in self.out_indices] + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/cgnet.py b/head_extractor/build/lib/mmseg/models/backbones/cgnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b74b494f53466d1c608e50d088632aa952a5e534 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/cgnet.py @@ -0,0 +1,372 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import ConvModule, build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmseg.registry import MODELS + + +class GlobalContextExtractor(nn.Module): + """Global Context Extractor for CGNet. + + This class is employed to refine the joint feature of both local feature + and surrounding context. + + Args: + channel (int): Number of input feature channels. + reduction (int): Reductions for global context extractor. Default: 16. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
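+    Example:
+        >>> # Illustrative: this is a squeeze-and-excitation style gate
+        >>> # that rescales channels, so output shape equals input shape.
+        >>> import torch
+        >>> gce = GlobalContextExtractor(channel=64, reduction=16)
+        >>> gce(torch.rand(2, 64, 32, 32)).shape
+        torch.Size([2, 64, 32, 32])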
+ """ + + def __init__(self, channel, reduction=16, with_cp=False): + super().__init__() + self.channel = channel + self.reduction = reduction + assert reduction >= 1 and channel >= reduction + self.with_cp = with_cp + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), nn.Sigmoid()) + + def forward(self, x): + + def _inner_forward(x): + num_batch, num_channel = x.size()[:2] + y = self.avg_pool(x).view(num_batch, num_channel) + y = self.fc(y).view(num_batch, num_channel, 1, 1) + return x * y + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class ContextGuidedBlock(nn.Module): + """Context Guided Block for CGNet. + + This class consists of four components: local feature extractor, + surrounding feature extractor, joint feature extractor and global + context extractor. + + Args: + in_channels (int): Number of input feature channels. + out_channels (int): Number of output feature channels. + dilation (int): Dilation rate for surrounding context extractor. + Default: 2. + reduction (int): Reduction for global context extractor. Default: 16. + skip_connect (bool): Add input to output or not. Default: True. + downsample (bool): Downsample the input to 1/2 or not. Default: False. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
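+    Example:
+        >>> # Illustrative: with downsample=False the block is residual
+        >>> # (skip_connect=True), so in_channels must equal out_channels
+        >>> # for the skip addition to broadcast.
+        >>> import torch
+        >>> cgb = ContextGuidedBlock(64, 64, dilation=2, reduction=16)
+        >>> cgb(torch.rand(2, 64, 32, 32)).shape
+        torch.Size([2, 64, 32, 32])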
+ """ + + def __init__(self, + in_channels, + out_channels, + dilation=2, + reduction=16, + skip_connect=True, + downsample=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'), + with_cp=False): + super().__init__() + self.with_cp = with_cp + self.downsample = downsample + + channels = out_channels if downsample else out_channels // 2 + if 'type' in act_cfg and act_cfg['type'] == 'PReLU': + act_cfg['num_parameters'] = channels + kernel_size = 3 if downsample else 1 + stride = 2 if downsample else 1 + padding = (kernel_size - 1) // 2 + + self.conv1x1 = ConvModule( + in_channels, + channels, + kernel_size, + stride, + padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.f_loc = build_conv_layer( + conv_cfg, + channels, + channels, + kernel_size=3, + padding=1, + groups=channels, + bias=False) + self.f_sur = build_conv_layer( + conv_cfg, + channels, + channels, + kernel_size=3, + padding=dilation, + groups=channels, + dilation=dilation, + bias=False) + + self.bn = build_norm_layer(norm_cfg, 2 * channels)[1] + self.activate = nn.PReLU(2 * channels) + + if downsample: + self.bottleneck = build_conv_layer( + conv_cfg, + 2 * channels, + out_channels, + kernel_size=1, + bias=False) + + self.skip_connect = skip_connect and not downsample + self.f_glo = GlobalContextExtractor(out_channels, reduction, with_cp) + + def forward(self, x): + + def _inner_forward(x): + out = self.conv1x1(x) + loc = self.f_loc(out) + sur = self.f_sur(out) + + joi_feat = torch.cat([loc, sur], 1) # the joint feature + joi_feat = self.bn(joi_feat) + joi_feat = self.activate(joi_feat) + if self.downsample: + joi_feat = self.bottleneck(joi_feat) # channel = out_channels + # f_glo is employed to refine the joint feature + out = self.f_glo(joi_feat) + + if self.skip_connect: + return x + out + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class InputInjection(nn.Module): + """Downsampling module for CGNet.""" + + def __init__(self, num_downsampling): + super().__init__() + self.pool = nn.ModuleList() + for i in range(num_downsampling): + self.pool.append(nn.AvgPool2d(3, stride=2, padding=1)) + + def forward(self, x): + for pool in self.pool: + x = pool(x) + return x + + +@MODELS.register_module() +class CGNet(BaseModule): + """CGNet backbone. + + This backbone is the implementation of `A Light-weight Context Guided + Network for Semantic Segmentation `_. + + Args: + in_channels (int): Number of input image channels. Normally 3. + num_channels (tuple[int]): Numbers of feature channels at each stages. + Default: (32, 64, 128). + num_blocks (tuple[int]): Numbers of CG blocks at stage 1 and stage 2. + Default: (3, 21). + dilations (tuple[int]): Dilation rate for surrounding context + extractors at stage 1 and stage 2. Default: (2, 4). + reductions (tuple[int]): Reductions for global context extractors at + stage 1 and stage 2. Default: (8, 16). + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some + memory while slowing down the training speed. Default: False. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + in_channels=3, + num_channels=(32, 64, 128), + num_blocks=(3, 21), + dilations=(2, 4), + reductions=(8, 16), + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'), + norm_eval=False, + with_cp=False, + pretrained=None, + init_cfg=None): + + super().__init__(init_cfg) + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer=['Conv2d', 'Linear']), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']), + dict(type='Constant', val=0, layer='PReLU') + ] + else: + raise TypeError('pretrained must be a str or None') + + self.in_channels = in_channels + self.num_channels = num_channels + assert isinstance(self.num_channels, tuple) and len( + self.num_channels) == 3 + self.num_blocks = num_blocks + assert isinstance(self.num_blocks, tuple) and len(self.num_blocks) == 2 + self.dilations = dilations + assert isinstance(self.dilations, tuple) and len(self.dilations) == 2 + self.reductions = reductions + assert isinstance(self.reductions, tuple) and len(self.reductions) == 2 + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + if 'type' in self.act_cfg and self.act_cfg['type'] == 'PReLU': + self.act_cfg['num_parameters'] = num_channels[0] + self.norm_eval = norm_eval + self.with_cp = with_cp + + cur_channels = in_channels + self.stem = nn.ModuleList() + for i in range(3): + self.stem.append( + ConvModule( + cur_channels, + num_channels[0], + 3, + 2 if i == 0 else 1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + cur_channels = num_channels[0] + + self.inject_2x = InputInjection(1) # down-sample for Input, factor=2 + self.inject_4x = InputInjection(2) # down-sample for Input, factor=4 + + cur_channels += in_channels + self.norm_prelu_0 = nn.Sequential( + build_norm_layer(norm_cfg, cur_channels)[1], + nn.PReLU(cur_channels)) + + # stage 1 + self.level1 = nn.ModuleList() + for i in range(num_blocks[0]): + self.level1.append( + ContextGuidedBlock( + cur_channels if i == 0 else num_channels[1], + num_channels[1], + dilations[0], + reductions[0], + downsample=(i == 0), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) # CG block + + cur_channels = 2 * num_channels[1] + in_channels + self.norm_prelu_1 = nn.Sequential( + build_norm_layer(norm_cfg, cur_channels)[1], + nn.PReLU(cur_channels)) + + # stage 2 + self.level2 = nn.ModuleList() + for i in range(num_blocks[1]): + self.level2.append( + ContextGuidedBlock( + cur_channels if i == 0 else num_channels[2], + num_channels[2], + dilations[1], + reductions[1], + downsample=(i == 0), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) # CG block + + cur_channels = 2 * num_channels[2] + self.norm_prelu_2 = nn.Sequential( + build_norm_layer(norm_cfg, cur_channels)[1], + nn.PReLU(cur_channels)) + + def forward(self, x): + output = [] + + # stage 0 + inp_2x = 
self.inject_2x(x) + inp_4x = self.inject_4x(x) + for layer in self.stem: + x = layer(x) + x = self.norm_prelu_0(torch.cat([x, inp_2x], 1)) + output.append(x) + + # stage 1 + for i, layer in enumerate(self.level1): + x = layer(x) + if i == 0: + down1 = x + x = self.norm_prelu_1(torch.cat([x, down1, inp_4x], 1)) + output.append(x) + + # stage 2 + for i, layer in enumerate(self.level2): + x = layer(x) + if i == 0: + down2 = x + x = self.norm_prelu_2(torch.cat([down2, x], 1)) + output.append(x) + + return output + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super().train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/head_extractor/build/lib/mmseg/models/backbones/ddrnet.py b/head_extractor/build/lib/mmseg/models/backbones/ddrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..4508aade82b484abfcca593825649031db7cbdd0 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/ddrnet.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, build_norm_layer +from mmengine.model import BaseModule + +from mmseg.models.utils import DAPPM, BasicBlock, Bottleneck, resize +from mmseg.registry import MODELS +from mmseg.utils import OptConfigType + + +@MODELS.register_module() +class DDRNet(BaseModule): + """DDRNet backbone. + + This backbone is the implementation of `Deep Dual-resolution Networks for + Real-time and Accurate Semantic Segmentation of Road Scenes + `_. + Modified from https://github.com/ydhongHIT/DDRNet. + + Args: + in_channels (int): Number of input image channels. Default: 3. + channels: (int): The base channels of DDRNet. Default: 32. + ppm_channels (int): The channels of PPM module. Default: 128. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + norm_cfg (dict): Config dict to build norm layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). + init_cfg (dict, optional): Initialization config dict. + Default: None. 
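+    Example:
+        >>> # An illustrative eval-mode shape check (in training mode the
+        >>> # forward also returns an intermediate spatial-branch map for
+        >>> # the auxiliary head). The fused output sits at 1/8 resolution
+        >>> # with channels * 4 channels.
+        >>> import torch
+        >>> net = DDRNet(in_channels=3, channels=32, ppm_channels=128).eval()
+        >>> net(torch.rand(1, 3, 256, 256)).shape
+        torch.Size([1, 128, 32, 32])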
+ """ + + def __init__(self, + in_channels: int = 3, + channels: int = 32, + ppm_channels: int = 128, + align_corners: bool = False, + norm_cfg: OptConfigType = dict(type='BN', requires_grad=True), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + + self.in_channels = in_channels + self.ppm_channels = ppm_channels + + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + + # stage 0-2 + self.stem = self._make_stem_layer(in_channels, channels, num_blocks=2) + self.relu = nn.ReLU() + + # low resolution(context) branch + self.context_branch_layers = nn.ModuleList() + for i in range(3): + self.context_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + inplanes=channels * 2**(i + 1), + planes=channels * 8 if i > 0 else channels * 4, + num_blocks=2 if i < 2 else 1, + stride=2)) + + # bilateral fusion + self.compression_1 = ConvModule( + channels * 4, + channels * 2, + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.down_1 = ConvModule( + channels * 2, + channels * 4, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=None) + + self.compression_2 = ConvModule( + channels * 8, + channels * 2, + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.down_2 = nn.Sequential( + ConvModule( + channels * 2, + channels * 4, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + channels * 4, + channels * 8, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=None)) + + # high resolution(spatial) branch + self.spatial_branch_layers = nn.ModuleList() + for i in range(3): + self.spatial_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + inplanes=channels * 2, + planes=channels * 2, + num_blocks=2 if i < 2 else 1, + )) + + self.spp = DAPPM( + channels * 16, ppm_channels, channels * 4, num_scales=5) + + def _make_stem_layer(self, in_channels, channels, num_blocks): + layers = [ + ConvModule( + in_channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + ] + + layers.extend([ + self._make_layer(BasicBlock, channels, channels, num_blocks), + nn.ReLU(), + self._make_layer( + BasicBlock, channels, channels * 2, num_blocks, stride=2), + nn.ReLU(), + ]) + + return nn.Sequential(*layers) + + def _make_layer(self, block, inplanes, planes, num_blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [ + block( + in_channels=inplanes, + channels=planes, + stride=stride, + downsample=downsample) + ] + inplanes = planes * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + in_channels=inplanes, + channels=planes, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg_out=None if i == num_blocks - 1 else self.act_cfg)) + + return nn.Sequential(*layers) + + def forward(self, x): + """Forward function.""" + out_size = (x.shape[-2] // 8, x.shape[-1] // 8) + + # stage 0-2 + x = self.stem(x) + + # stage3 + x_c = self.context_branch_layers[0](x) + x_s = 
self.spatial_branch_layers[0](x) + comp_c = self.compression_1(self.relu(x_c)) + x_c += self.down_1(self.relu(x_s)) + x_s += resize( + comp_c, + size=out_size, + mode='bilinear', + align_corners=self.align_corners) + if self.training: + temp_context = x_s.clone() + + # stage4 + x_c = self.context_branch_layers[1](self.relu(x_c)) + x_s = self.spatial_branch_layers[1](self.relu(x_s)) + comp_c = self.compression_2(self.relu(x_c)) + x_c += self.down_2(self.relu(x_s)) + x_s += resize( + comp_c, + size=out_size, + mode='bilinear', + align_corners=self.align_corners) + + # stage5 + x_s = self.spatial_branch_layers[2](self.relu(x_s)) + x_c = self.context_branch_layers[2](self.relu(x_c)) + x_c = self.spp(x_c) + x_c = resize( + x_c, + size=out_size, + mode='bilinear', + align_corners=self.align_corners) + + return (temp_context, x_s + x_c) if self.training else x_s + x_c diff --git a/head_extractor/build/lib/mmseg/models/backbones/dinov2.py b/head_extractor/build/lib/mmseg/models/backbones/dinov2.py new file mode 100644 index 0000000000000000000000000000000000000000..95d39521ee1c4d5dfd80df48059f5bb41056f1f5 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/dinov2.py @@ -0,0 +1,60 @@ +import torch +from mmengine.model import BaseModule +from torch import nn + +from mmseg.registry import MODELS +import os +_DINOV2_MMSEG_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) +_DINOV2_TORCHHUB_DIR = os.path.join(_DINOV2_MMSEG_ROOT, 'torchhub', 'facebookresearch_dinov2_main') + + +@MODELS.register_module() +class DINOv2(nn.Module): + """Use DINOv2 pre-trained models + """ + + def __init__(self, version='large', freeze=False, load_from=None): + super().__init__() + + if version == 'large': + self.dinov2 = torch.hub.load(_DINOV2_TORCHHUB_DIR, 'dinov2_vitl14', source='local', pretrained=False) + else: + raise NotImplementedError + + if load_from is not None: + if load_from.split('/')[-1] == 'depth_anything_vitl14.pth': + print(load_from) + d = torch.load(load_from, map_location='cpu') + new_d = {} + for key, value in d.items(): + if 'pretrained' in key: + new_d[key.replace('pretrained.', '')] = value + self.dinov2.load_state_dict(new_d) + else: + print(load_from) + all_d = torch.load(load_from, map_location='cpu') + d = all_d['state_dict'] + new_d = {} + for key, value in d.items(): + if 'backbone.dinov2' in key: + new_d[key.replace('backbone.dinov2.', '')] = value + self.dinov2.load_state_dict(new_d) + + self.freeze = freeze + + def forward(self, inputs): + B, _, h, w = inputs.shape + + if self.freeze: + with torch.no_grad(): + features = self.dinov2.get_intermediate_layers(inputs, 4) + else: + features = self.dinov2.get_intermediate_layers(inputs, 4) + + outs = [] + for feature in features: + C = feature.shape[-1] + feature = feature.permute(0, 2, 1).reshape(B, C, h // 14, w // 14).contiguous() + outs.append(feature) + + return outs diff --git a/head_extractor/build/lib/mmseg/models/backbones/erfnet.py b/head_extractor/build/lib/mmseg/models/backbones/erfnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2c5ec672a086b5d67568514140023ce402eef92f --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/erfnet.py @@ -0,0 +1,329 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
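The DINOv2 wrapper defined above returns ViT patch tokens reshaped into CNN-style feature maps, one per sampled layer. A minimal standalone sketch of the token-to-map reshape it performs (illustrative; the 518x518 input and the ViT-L/14 embedding width of 1024 are assumed example values, and input sides must be multiples of the patch size 14):

import torch

# get_intermediate_layers yields (B, N, C) token tensors with
# N = (h // 14) * (w // 14) for a ViT-L/14 model.
B, h, w, C = 2, 518, 518, 1024
tokens = torch.rand(B, (h // 14) * (w // 14), C)
# Same permute/reshape as DINOv2.forward above: tokens -> (B, C, h/14, w/14).
fmap = tokens.permute(0, 2, 1).reshape(B, C, h // 14, w // 14).contiguous()
print(fmap.shape)  # torch.Size([2, 1024, 37, 37])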
+import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_conv_layer, build_norm_layer +from mmengine.model import BaseModule + +from mmseg.registry import MODELS +from ..utils import resize + + +class DownsamplerBlock(BaseModule): + """Downsampler block of ERFNet. + + This module is a little different from basical ConvModule. + The features from Conv and MaxPool layers are + concatenated before BatchNorm. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.conv = build_conv_layer( + self.conv_cfg, + in_channels, + out_channels - in_channels, + kernel_size=3, + stride=2, + padding=1) + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + self.bn = build_norm_layer(self.norm_cfg, out_channels)[1] + self.act = build_activation_layer(self.act_cfg) + + def forward(self, input): + conv_out = self.conv(input) + pool_out = self.pool(input) + pool_out = resize( + input=pool_out, + size=conv_out.size()[2:], + mode='bilinear', + align_corners=False) + output = torch.cat([conv_out, pool_out], 1) + output = self.bn(output) + output = self.act(output) + return output + + +class NonBottleneck1d(BaseModule): + """Non-bottleneck block of ERFNet. + + Args: + channels (int): Number of channels in Non-bottleneck block. + drop_rate (float): Probability of an element to be zeroed. + Default 0. + dilation (int): Dilation rate for last two conv layers. + Default 1. + num_conv_layer (int): Number of 3x1 and 1x3 convolution layers. + Default 2. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
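+    Example:
+        >>> # Illustrative: the factorized 3x1 / 1x3 pairs preserve the
+        >>> # spatial size (padding matches the dilation), and the
+        >>> # residual add preserves the channel count.
+        >>> import torch
+        >>> blk = NonBottleneck1d(channels=64, drop_rate=0.1, dilation=2)
+        >>> blk(torch.rand(2, 64, 64, 128)).shape
+        torch.Size([2, 64, 64, 128])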
+ """ + + def __init__(self, + channels, + drop_rate=0, + dilation=1, + num_conv_layer=2, + conv_cfg=None, + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.act = build_activation_layer(self.act_cfg) + + self.convs_layers = nn.ModuleList() + for conv_layer in range(num_conv_layer): + first_conv_padding = (1, 0) if conv_layer == 0 else (dilation, 0) + first_conv_dilation = 1 if conv_layer == 0 else (dilation, 1) + second_conv_padding = (0, 1) if conv_layer == 0 else (0, dilation) + second_conv_dilation = 1 if conv_layer == 0 else (1, dilation) + + self.convs_layers.append( + build_conv_layer( + self.conv_cfg, + channels, + channels, + kernel_size=(3, 1), + stride=1, + padding=first_conv_padding, + bias=True, + dilation=first_conv_dilation)) + self.convs_layers.append(self.act) + self.convs_layers.append( + build_conv_layer( + self.conv_cfg, + channels, + channels, + kernel_size=(1, 3), + stride=1, + padding=second_conv_padding, + bias=True, + dilation=second_conv_dilation)) + self.convs_layers.append( + build_norm_layer(self.norm_cfg, channels)[1]) + if conv_layer == 0: + self.convs_layers.append(self.act) + else: + self.convs_layers.append(nn.Dropout(p=drop_rate)) + + def forward(self, input): + output = input + for conv in self.convs_layers: + output = conv(output) + output = self.act(output + input) + return output + + +class UpsamplerBlock(BaseModule): + """Upsampler block of ERFNet. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN', eps=1e-3), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.conv = nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + stride=2, + padding=1, + output_padding=1, + bias=True) + self.bn = build_norm_layer(self.norm_cfg, out_channels)[1] + self.act = build_activation_layer(self.act_cfg) + + def forward(self, input): + output = self.conv(input) + output = self.bn(output) + output = self.act(output) + return output + + +@MODELS.register_module() +class ERFNet(BaseModule): + """ERFNet backbone. + + This backbone is the implementation of `ERFNet: Efficient Residual + Factorized ConvNet for Real-time SemanticSegmentation + `_. + + Args: + in_channels (int): The number of channels of input + image. Default: 3. + enc_downsample_channels (Tuple[int]): Size of channel + numbers of various Downsampler block in encoder. + Default: (16, 64, 128). + enc_stage_non_bottlenecks (Tuple[int]): Number of stages of + Non-bottleneck block in encoder. + Default: (5, 8). + enc_non_bottleneck_dilations (Tuple[int]): Dilation rate of each + stage of Non-bottleneck block of encoder. + Default: (2, 4, 8, 16). + enc_non_bottleneck_channels (Tuple[int]): Size of channel + numbers of various Non-bottleneck block in encoder. + Default: (64, 128). 
+ dec_upsample_channels (Tuple[int]): Size of channel numbers of + various Deconvolution block in decoder. + Default: (64, 16). + dec_stages_non_bottleneck (Tuple[int]): Number of stages of + Non-bottleneck block in decoder. + Default: (2, 2). + dec_non_bottleneck_channels (Tuple[int]): Size of channel + numbers of various Non-bottleneck block in decoder. + Default: (64, 16). + drop_rate (float): Probability of an element to be zeroed. + Default 0.1. + """ + + def __init__(self, + in_channels=3, + enc_downsample_channels=(16, 64, 128), + enc_stage_non_bottlenecks=(5, 8), + enc_non_bottleneck_dilations=(2, 4, 8, 16), + enc_non_bottleneck_channels=(64, 128), + dec_upsample_channels=(64, 16), + dec_stages_non_bottleneck=(2, 2), + dec_non_bottleneck_channels=(64, 16), + dropout_ratio=0.1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU'), + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + assert len(enc_downsample_channels) \ + == len(dec_upsample_channels)+1, 'Number of downsample\ + block of encoder does not \ + match number of upsample block of decoder!' + assert len(enc_downsample_channels) \ + == len(enc_stage_non_bottlenecks)+1, 'Number of \ + downsample block of encoder does not match \ + number of Non-bottleneck block of encoder!' + assert len(enc_downsample_channels) \ + == len(enc_non_bottleneck_channels)+1, 'Number of \ + downsample block of encoder does not match \ + number of channels of Non-bottleneck block of encoder!' + assert enc_stage_non_bottlenecks[-1] \ + % len(enc_non_bottleneck_dilations) == 0, 'Number of \ + Non-bottleneck block of encoder does not match \ + number of Non-bottleneck block of encoder!' + assert len(dec_upsample_channels) \ + == len(dec_stages_non_bottleneck), 'Number of \ + upsample block of decoder does not match \ + number of Non-bottleneck block of decoder!' + assert len(dec_stages_non_bottleneck) \ + == len(dec_non_bottleneck_channels), 'Number of \ + Non-bottleneck block of decoder does not match \ + number of channels of Non-bottleneck block of decoder!' + + self.in_channels = in_channels + self.enc_downsample_channels = enc_downsample_channels + self.enc_stage_non_bottlenecks = enc_stage_non_bottlenecks + self.enc_non_bottleneck_dilations = enc_non_bottleneck_dilations + self.enc_non_bottleneck_channels = enc_non_bottleneck_channels + self.dec_upsample_channels = dec_upsample_channels + self.dec_stages_non_bottleneck = dec_stages_non_bottleneck + self.dec_non_bottleneck_channels = dec_non_bottleneck_channels + self.dropout_ratio = dropout_ratio + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + self.encoder.append( + DownsamplerBlock(self.in_channels, enc_downsample_channels[0])) + + for i in range(len(enc_downsample_channels) - 1): + self.encoder.append( + DownsamplerBlock(enc_downsample_channels[i], + enc_downsample_channels[i + 1])) + # Last part of encoder is some dilated NonBottleneck1d blocks. 
+ if i == len(enc_downsample_channels) - 2: + iteration_times = int(enc_stage_non_bottlenecks[-1] / + len(enc_non_bottleneck_dilations)) + for j in range(iteration_times): + for k in range(len(enc_non_bottleneck_dilations)): + self.encoder.append( + NonBottleneck1d(enc_downsample_channels[-1], + self.dropout_ratio, + enc_non_bottleneck_dilations[k])) + else: + for j in range(enc_stage_non_bottlenecks[i]): + self.encoder.append( + NonBottleneck1d(enc_downsample_channels[i + 1], + self.dropout_ratio)) + + for i in range(len(dec_upsample_channels)): + if i == 0: + self.decoder.append( + UpsamplerBlock(enc_downsample_channels[-1], + dec_non_bottleneck_channels[i])) + else: + self.decoder.append( + UpsamplerBlock(dec_non_bottleneck_channels[i - 1], + dec_non_bottleneck_channels[i])) + for j in range(dec_stages_non_bottleneck[i]): + self.decoder.append( + NonBottleneck1d(dec_non_bottleneck_channels[i])) + + def forward(self, x): + for enc in self.encoder: + x = enc(x) + for dec in self.decoder: + x = dec(x) + return [x] diff --git a/head_extractor/build/lib/mmseg/models/backbones/fast_scnn.py b/head_extractor/build/lib/mmseg/models/backbones/fast_scnn.py new file mode 100644 index 0000000000000000000000000000000000000000..6ff7a3191d2fee904c5200e0a526214a65f58b32 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/fast_scnn.py @@ -0,0 +1,408 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule + +from mmseg.models.decode_heads.psp_head import PPM +from mmseg.registry import MODELS +from ..utils import InvertedResidual, resize + + +class LearningToDownsample(nn.Module): + """Learning to downsample module. + + Args: + in_channels (int): Number of input channels. + dw_channels (tuple[int]): Number of output channels of the first and + the second depthwise conv (dwconv) layers. + out_channels (int): Number of output channels of the whole + 'learning to downsample' module. + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + dw_act_cfg (dict): In DepthwiseSeparableConvModule, activation config + of depthwise ConvModule. If it is 'default', it will be the same + as `act_cfg`. Default: None. + """ + + def __init__(self, + in_channels, + dw_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + dw_act_cfg=None): + super().__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.dw_act_cfg = dw_act_cfg + dw_channels1 = dw_channels[0] + dw_channels2 = dw_channels[1] + + self.conv = ConvModule( + in_channels, + dw_channels1, + 3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.dsconv1 = DepthwiseSeparableConvModule( + dw_channels1, + dw_channels2, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + dw_act_cfg=self.dw_act_cfg) + + self.dsconv2 = DepthwiseSeparableConvModule( + dw_channels2, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + dw_act_cfg=self.dw_act_cfg) + + def forward(self, x): + x = self.conv(x) + x = self.dsconv1(x) + x = self.dsconv2(x) + return x + + +class GlobalFeatureExtractor(nn.Module): + """Global feature extractor module. 
+ + Args: + in_channels (int): Number of input channels of the GFE module. + Default: 64 + block_channels (tuple[int]): Tuple of ints. Each int specifies the + number of output channels of each Inverted Residual module. + Default: (64, 96, 128) + out_channels(int): Number of output channels of the GFE module. + Default: 128 + expand_ratio (int): Adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + Default: 6 + num_blocks (tuple[int]): Tuple of ints. Each int specifies the + number of times each Inverted Residual module is repeated. + The repeated Inverted Residual modules are called a 'group'. + Default: (3, 3, 3) + strides (tuple[int]): Tuple of ints. Each int specifies + the downsampling factor of each 'group'. + Default: (2, 2, 1) + pool_scales (tuple[int]): Tuple of ints. Each int specifies + the parameter required in 'global average pooling' within PPM. + Default: (1, 2, 3, 6) + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + align_corners (bool): align_corners argument of F.interpolate. + Default: False + """ + + def __init__(self, + in_channels=64, + block_channels=(64, 96, 128), + out_channels=128, + expand_ratio=6, + num_blocks=(3, 3, 3), + strides=(2, 2, 1), + pool_scales=(1, 2, 3, 6), + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False): + super().__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + assert len(block_channels) == len(num_blocks) == 3 + self.bottleneck1 = self._make_layer(in_channels, block_channels[0], + num_blocks[0], strides[0], + expand_ratio) + self.bottleneck2 = self._make_layer(block_channels[0], + block_channels[1], num_blocks[1], + strides[1], expand_ratio) + self.bottleneck3 = self._make_layer(block_channels[1], + block_channels[2], num_blocks[2], + strides[2], expand_ratio) + self.ppm = PPM( + pool_scales, + block_channels[2], + block_channels[2] // 4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=align_corners) + + self.out = ConvModule( + block_channels[2] * 2, + out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _make_layer(self, + in_channels, + out_channels, + blocks, + stride=1, + expand_ratio=6): + layers = [ + InvertedResidual( + in_channels, + out_channels, + stride, + expand_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + ] + for i in range(1, blocks): + layers.append( + InvertedResidual( + out_channels, + out_channels, + 1, + expand_ratio, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + return nn.Sequential(*layers) + + def forward(self, x): + x = self.bottleneck1(x) + x = self.bottleneck2(x) + x = self.bottleneck3(x) + x = torch.cat([x, *self.ppm(x)], dim=1) + x = self.out(x) + return x + + +class FeatureFusionModule(nn.Module): + """Feature fusion module. + + Args: + higher_in_channels (int): Number of input channels of the + higher-resolution branch. + lower_in_channels (int): Number of input channels of the + lower-resolution branch. + out_channels (int): Number of output channels. + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + dwconv_act_cfg (dict): Config of activation layers in 3x3 conv. + Default: dict(type='ReLU'). 
+ conv_act_cfg (dict): Config of activation layers in the two 1x1 conv. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + """ + + def __init__(self, + higher_in_channels, + lower_in_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dwconv_act_cfg=dict(type='ReLU'), + conv_act_cfg=None, + align_corners=False): + super().__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.dwconv_act_cfg = dwconv_act_cfg + self.conv_act_cfg = conv_act_cfg + self.align_corners = align_corners + self.dwconv = ConvModule( + lower_in_channels, + out_channels, + 3, + padding=1, + groups=out_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.dwconv_act_cfg) + self.conv_lower_res = ConvModule( + out_channels, + out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.conv_act_cfg) + + self.conv_higher_res = ConvModule( + higher_in_channels, + out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.conv_act_cfg) + + self.relu = nn.ReLU(True) + + def forward(self, higher_res_feature, lower_res_feature): + lower_res_feature = resize( + lower_res_feature, + size=higher_res_feature.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + lower_res_feature = self.dwconv(lower_res_feature) + lower_res_feature = self.conv_lower_res(lower_res_feature) + + higher_res_feature = self.conv_higher_res(higher_res_feature) + out = higher_res_feature + lower_res_feature + return self.relu(out) + + +@MODELS.register_module() +class FastSCNN(BaseModule): + """Fast-SCNN Backbone. + + This backbone is the implementation of `Fast-SCNN: Fast Semantic + Segmentation Network `_. + + Args: + in_channels (int): Number of input image channels. Default: 3. + downsample_dw_channels (tuple[int]): Number of output channels after + the first conv layer & the second conv layer in + Learning-To-Downsample (LTD) module. + Default: (32, 48). + global_in_channels (int): Number of input channels of + Global Feature Extractor(GFE). + Equal to number of output channels of LTD. + Default: 64. + global_block_channels (tuple[int]): Tuple of integers that describe + the output channels for each of the MobileNet-v2 bottleneck + residual blocks in GFE. + Default: (64, 96, 128). + global_block_strides (tuple[int]): Tuple of integers + that describe the strides (downsampling factors) for each of the + MobileNet-v2 bottleneck residual blocks in GFE. + Default: (2, 2, 1). + global_out_channels (int): Number of output channels of GFE. + Default: 128. + higher_in_channels (int): Number of input channels of the higher + resolution branch in FFM. + Equal to global_in_channels. + Default: 64. + lower_in_channels (int): Number of input channels of the lower + resolution branch in FFM. + Equal to global_out_channels. + Default: 128. + fusion_out_channels (int): Number of output channels of FFM. + Default: 128. + out_indices (tuple): Tuple of indices of list + [higher_res_features, lower_res_features, fusion_output]. + Often set to (0,1,2) to enable aux. heads. + Default: (0, 1, 2). + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + align_corners (bool): align_corners argument of F.interpolate. + Default: False + dw_act_cfg (dict): In DepthwiseSeparableConvModule, activation config + of depthwise ConvModule. 
If it is 'default', it will be the same + as `act_cfg`. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + in_channels=3, + downsample_dw_channels=(32, 48), + global_in_channels=64, + global_block_channels=(64, 96, 128), + global_block_strides=(2, 2, 1), + global_out_channels=128, + higher_in_channels=64, + lower_in_channels=128, + fusion_out_channels=128, + out_indices=(0, 1, 2), + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False, + dw_act_cfg=None, + init_cfg=None): + + super().__init__(init_cfg) + + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', val=1, layer=['_BatchNorm', 'GroupNorm']) + ] + + if global_in_channels != higher_in_channels: + raise AssertionError('Global Input Channels must be the same \ + with Higher Input Channels!') + elif global_out_channels != lower_in_channels: + raise AssertionError('Global Output Channels must be the same \ + with Lower Input Channels!') + + self.in_channels = in_channels + self.downsample_dw_channels1 = downsample_dw_channels[0] + self.downsample_dw_channels2 = downsample_dw_channels[1] + self.global_in_channels = global_in_channels + self.global_block_channels = global_block_channels + self.global_block_strides = global_block_strides + self.global_out_channels = global_out_channels + self.higher_in_channels = higher_in_channels + self.lower_in_channels = lower_in_channels + self.fusion_out_channels = fusion_out_channels + self.out_indices = out_indices + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + self.learning_to_downsample = LearningToDownsample( + in_channels, + downsample_dw_channels, + global_in_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + dw_act_cfg=dw_act_cfg) + self.global_feature_extractor = GlobalFeatureExtractor( + global_in_channels, + global_block_channels, + global_out_channels, + strides=self.global_block_strides, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.feature_fusion = FeatureFusionModule( + higher_in_channels, + lower_in_channels, + fusion_out_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dwconv_act_cfg=self.act_cfg, + align_corners=self.align_corners) + + def forward(self, x): + higher_res_features = self.learning_to_downsample(x) + lower_res_features = self.global_feature_extractor(higher_res_features) + fusion_output = self.feature_fusion(higher_res_features, + lower_res_features) + + outs = [higher_res_features, lower_res_features, fusion_output] + outs = [outs[i] for i in self.out_indices] + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/hrnet.py b/head_extractor/build/lib/mmseg/models/backbones/hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2da755e731cfea911d47729f455c54c3d38a68e4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/hrnet.py @@ -0,0 +1,642 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
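+# (Illustrative aside on the FastSCNN backbone defined in the previous file;
+# the snippet below is an assumed smoke test in the style of the other
+# docstring examples in this package, not part of the original sources.)
+# >>> import torch
+# >>> from mmseg.models.backbones import FastSCNN
+# >>> self = FastSCNN()
+# >>> inputs = torch.rand(1, 3, 512, 1024)
+# >>> outs = self.forward(inputs)
+# With out_indices=(0, 1, 2) this should yield the higher-res (1/8),
+# lower-res (1/32) and fused (1/8) feature maps, i.e. shapes of roughly
+# (1, 64, 64, 128), (1, 128, 16, 32) and (1, 128, 64, 128).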
+import warnings + +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule, ModuleList, Sequential +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmseg.registry import MODELS +from ..utils import Upsample, resize +from .resnet import BasicBlock, Bottleneck + + +class HRModule(BaseModule): + """High-Resolution Module for HRNet. + + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + block_init_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + self.block_init_cfg = block_init_cfg + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + """Check branches configuration.""" + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_BLOCKS(' \ + f'{len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_CHANNELS(' \ + f'{len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_INCHANNELS(' \ + f'{len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Build one branch.""" + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, num_channels[branch_index] * + block.expansion)[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=self.block_init_cfg)) + + return Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + """Build multiple branch.""" + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return ModuleList(branches) + + def _make_fuse_layers(self): + """Build fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output 
else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + # we set align_corners=False for HRNet + Upsample( + scale_factor=2**(j - i), + mode='bilinear', + align_corners=False))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): + if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + elif j > i: + y = y + resize( + self.fuse_layers[i][j](x[j]), + size=x[i].shape[2:], + mode='bilinear', + align_corners=False) + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@MODELS.register_module() +class HRNet(BaseModule): + """HRNet backbone. + + This backbone is the implementation of `High-Resolution Representations + for Labeling Pixels and Regions `_. + + Args: + extra (dict): Detailed configuration for each stage of HRNet. + There must be 4 stages, the configuration for each stage must have + 5 keys: + + - num_modules (int): The number of HRModule in this stage. + - num_branches (int): The number of branches in the HRModule. + - block (str): The type of convolution block. + - num_blocks (tuple): The number of blocks in each branch. + The length must be equal to num_branches. + - num_channels (tuple): The number of channels in each branch. + The length must be equal to num_branches. + in_channels (int): Number of input image channels. Normally 3. + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Use `BN` by default. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: False. + multiscale_output (bool): Whether to output multi-level features + produced by multiple branches. If False, only the first level + feature will be output. Default: True. + pretrained (str, optional): Model pretrained path. Default: None. 
+ init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + + Example: + >>> from mmseg.models import HRNet + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + frozen_stages=-1, + zero_init_residual=False, + multiscale_output=True, + pretrained=None, + init_cfg=None): + super().__init__(init_cfg) + + self.pretrained = pretrained + self.zero_init_residual = zero_init_residual + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + # Assert configurations of 4 stages are in extra + assert 'stage1' in extra and 'stage2' in extra \ + and 'stage3' in extra and 'stage4' in extra + # Assert whether the length of `num_blocks` and `num_channels` are + # equal to `num_branches` + for i in range(4): + cfg = extra[f'stage{i + 1}'] + assert len(cfg['num_blocks']) == cfg['num_branches'] and \ + len(cfg['num_channels']) == cfg['num_branches'] + + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.frozen_stages = frozen_stages + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * block.expansion + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + + # stage 2 + self.stage2_cfg = 
self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels, multiscale_output=multiscale_output) + + self._freeze_stages() + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + """Make each layer.""" + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', 
val=0, override=dict(name='norm3')) + + layers.append( + block( + inplanes, + planes, + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + init_cfg=block_init_cfg)) + + return Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + """Make each stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + block_init_cfg = None + if self.pretrained is None and not hasattr( + self, 'init_cfg') and self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', val=0, override=dict(name='norm3')) + + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg, + block_init_cfg=block_init_cfg)) + + return Sequential(*hr_modules), in_channels + + def _freeze_stages(self): + """Freeze stages param and norm stats.""" + if self.frozen_stages >= 0: + + self.norm1.eval() + self.norm2.eval() + for m in [self.conv1, self.norm1, self.conv2, self.norm2]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + if i == 1: + m = getattr(self, f'layer{i}') + t = getattr(self, f'transition{i}') + elif i == 4: + m = getattr(self, f'stage{i}') + else: + m = getattr(self, f'stage{i}') + t = getattr(self, f'transition{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + t.eval() + for param in t.parameters(): + param.requires_grad = False + + def forward(self, x): + """Forward function.""" + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git 
a/head_extractor/build/lib/mmseg/models/backbones/icnet.py b/head_extractor/build/lib/mmseg/models/backbones/icnet.py new file mode 100644 index 0000000000000000000000000000000000000000..8ff3448569c5a3ec82a12726767fcbb48b3870d2 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/icnet.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmseg.registry import MODELS +from ..decode_heads.psp_head import PPM +from ..utils import resize + + +@MODELS.register_module() +class ICNet(BaseModule): + """ICNet for Real-Time Semantic Segmentation on High-Resolution Images. + + This backbone is the implementation of + `ICNet `_. + + Args: + backbone_cfg (dict): Config dict to build backbone. Usually it is + ResNet but it can also be other backbones. + in_channels (int): The number of input image channels. Default: 3. + layer_channels (Sequence[int]): The numbers of feature channels at + layer 2 and layer 4 in ResNet. It can also be other backbones. + Default: (512, 2048). + light_branch_middle_channels (int): The number of channels of the + middle layer in light branch. Default: 32. + psp_out_channels (int): The number of channels of the output of PSP + module. Default: 512. + out_channels (Sequence[int]): The numbers of output feature channels + at each branches. Default: (64, 256, 256). + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. Default: (1, 2, 3, 6). + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN'). + act_cfg (dict): Dictionary to construct and config act layer. + Default: dict(type='ReLU'). + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + backbone_cfg, + in_channels=3, + layer_channels=(512, 2048), + light_branch_middle_channels=32, + psp_out_channels=512, + out_channels=(64, 256, 256), + pool_scales=(1, 2, 3, 6), + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU'), + align_corners=False, + init_cfg=None): + if backbone_cfg is None: + raise TypeError('backbone_cfg must be passed from config file!') + if init_cfg is None: + init_cfg = [ + dict(type='Kaiming', mode='fan_out', layer='Conv2d'), + dict(type='Constant', val=1, layer='_BatchNorm'), + dict(type='Normal', mean=0.01, layer='Linear') + ] + super().__init__(init_cfg=init_cfg) + self.align_corners = align_corners + self.backbone = MODELS.build(backbone_cfg) + + # Note: Default `ceil_mode` is false in nn.MaxPool2d, set + # `ceil_mode=True` to keep information in the corner of feature map. 
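+        # (Illustrative arithmetic for the note above, assuming this 3x3/s2/p1
+        # pooling: a 1024-wide map gives floor((1024 + 2 - 3) / 2) + 1 = 512
+        # columns in floor mode, but ceil((1024 + 2 - 3) / 2) + 1 = 513 in
+        # ceil mode, so the right/bottom border is not discarded.)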
+ self.backbone.maxpool = nn.MaxPool2d( + kernel_size=3, stride=2, padding=1, ceil_mode=True) + + self.psp_modules = PPM( + pool_scales=pool_scales, + in_channels=layer_channels[1], + channels=psp_out_channels, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + align_corners=align_corners) + + self.psp_bottleneck = ConvModule( + layer_channels[1] + len(pool_scales) * psp_out_channels, + psp_out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.conv_sub1 = nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=light_branch_middle_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg), + ConvModule( + in_channels=light_branch_middle_channels, + out_channels=light_branch_middle_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg), + ConvModule( + in_channels=light_branch_middle_channels, + out_channels=out_channels[0], + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + + self.conv_sub2 = ConvModule( + layer_channels[0], + out_channels[1], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + self.conv_sub4 = ConvModule( + psp_out_channels, + out_channels[2], + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x): + output = [] + + # sub 1 + output.append(self.conv_sub1(x)) + + # sub 2 + x = resize( + x, + scale_factor=0.5, + mode='bilinear', + align_corners=self.align_corners) + x = self.backbone.stem(x) + x = self.backbone.maxpool(x) + x = self.backbone.layer1(x) + x = self.backbone.layer2(x) + output.append(self.conv_sub2(x)) + + # sub 4 + x = resize( + x, + scale_factor=0.5, + mode='bilinear', + align_corners=self.align_corners) + x = self.backbone.layer3(x) + x = self.backbone.layer4(x) + psp_outs = self.psp_modules(x) + [x] + psp_outs = torch.cat(psp_outs, dim=1) + x = self.psp_bottleneck(psp_outs) + + output.append(self.conv_sub4(x)) + + return output diff --git a/head_extractor/build/lib/mmseg/models/backbones/mae.py b/head_extractor/build/lib/mmseg/models/backbones/mae.py new file mode 100644 index 0000000000000000000000000000000000000000..a1f243f0857b9aca5454e8c1410075bff9281285 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/mae.py @@ -0,0 +1,260 @@ +# Copyright (c) OpenMMLab. All rights reserved.import math +import math + +import torch +import torch.nn as nn +from mmengine.model import ModuleList +from mmengine.model.weight_init import (constant_init, kaiming_init, + trunc_normal_) +from mmengine.runner.checkpoint import _load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from mmseg.registry import MODELS +from .beit import BEiT, BEiTAttention, BEiTTransformerEncoderLayer + + +class MAEAttention(BEiTAttention): + """Multi-head self-attention with relative position bias used in MAE. + + This module is different from ``BEiTAttention`` by initializing the + relative bias table with zeros. + """ + + def init_weights(self): + """Initialize relative position bias with zeros.""" + + # As MAE initializes relative position bias as zeros and this class + # inherited from BEiT which initializes relative position bias + # with `trunc_normal`, `init_weights` here does + # nothing and just passes directly + + pass + + +class MAETransformerEncoderLayer(BEiTTransformerEncoderLayer): + """Implements one encoder layer in Vision Transformer. 
+ + This module is different from ``BEiTTransformerEncoderLayer`` by replacing + ``BEiTAttention`` with ``MAEAttention``. + """ + + def build_attn(self, attn_cfg): + self.attn = MAEAttention(**attn_cfg) + + +@MODELS.register_module() +class MAE(BEiT): + """VisionTransformer with support for patch. + + Args: + img_size (int | tuple): Input image size. Default: 224. + patch_size (int): The patch size. Default: 16. + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): embedding dimension. Default: 768. + num_layers (int): depth of transformer. Default: 12. + num_heads (int): number of attention heads. Default: 12. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): stochastic depth rate. Default 0.0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + patch_norm (bool): Whether to add a norm in PatchEmbed Block. + Default: False. + final_norm (bool): Whether to add a additional layer to normalize + final feature map. Default: False. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + pretrained (str, optional): model pretrained path. Default: None. + init_values (float): Initialize the values of Attention and FFN + with learnable scaling. Defaults to 0.1. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
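+    Example (an illustrative sketch under the default settings; the exact
+        output shape is an assumption, not a verified result):
+        >>> import torch
+        >>> self = MAE(img_size=224, patch_size=16, out_indices=[11])
+        >>> inputs = torch.rand(1, 3, 224, 224)
+        >>> outs = self.forward(inputs)
+        >>> # each selected output is reshaped back to a feature map of
+        >>> # shape (B, embed_dims, H/16, W/16), here (1, 768, 14, 14)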
+ """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dims=768, + num_layers=12, + num_heads=12, + mlp_ratio=4, + out_indices=-1, + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN'), + act_cfg=dict(type='GELU'), + patch_norm=False, + final_norm=False, + num_fcs=2, + norm_eval=False, + pretrained=None, + init_values=0.1, + init_cfg=None): + super().__init__( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dims=embed_dims, + num_layers=num_layers, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + out_indices=out_indices, + qv_bias=False, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rate, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + patch_norm=patch_norm, + final_norm=final_norm, + num_fcs=num_fcs, + norm_eval=norm_eval, + pretrained=pretrained, + init_values=init_values, + init_cfg=init_cfg) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) + + self.num_patches = self.patch_shape[0] * self.patch_shape[1] + self.pos_embed = nn.Parameter( + torch.zeros(1, self.num_patches + 1, embed_dims)) + + def _build_layers(self): + dpr = [ + x.item() + for x in torch.linspace(0, self.drop_path_rate, self.num_layers) + ] + self.layers = ModuleList() + for i in range(self.num_layers): + self.layers.append( + MAETransformerEncoderLayer( + embed_dims=self.embed_dims, + num_heads=self.num_heads, + feedforward_channels=self.mlp_ratio * self.embed_dims, + attn_drop_rate=self.attn_drop_rate, + drop_path_rate=dpr[i], + num_fcs=self.num_fcs, + bias=True, + act_cfg=self.act_cfg, + norm_cfg=self.norm_cfg, + window_size=self.patch_shape, + init_values=self.init_values)) + + def fix_init_weight(self): + """Rescale the initialization according to layer id. + + This function is copied from https://github.com/microsoft/unilm/blob/master/beit/modeling_pretrain.py. 
# noqa: E501 + Copyright (c) Microsoft Corporation + Licensed under the MIT License + """ + + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.layers): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.ffn.layers[1].weight.data, layer_id + 1) + + def init_weights(self): + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + self.apply(_init_weights) + self.fix_init_weight() + + if (isinstance(self.init_cfg, dict) + and self.init_cfg.get('type') == 'Pretrained'): + checkpoint = _load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + state_dict = self.resize_rel_pos_embed(checkpoint) + state_dict = self.resize_abs_pos_embed(state_dict) + self.load_state_dict(state_dict, False) + elif self.init_cfg is not None: + super().init_weights() + else: + # We only implement the 'jax_impl' initialization implemented at + # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + # Copyright 2019 Ross Wightman + # Licensed under the Apache License, Version 2.0 (the "License") + trunc_normal_(self.cls_token, std=.02) + for n, m in self.named_modules(): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + if 'ffn' in n: + nn.init.normal_(m.bias, mean=0., std=1e-6) + else: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + kaiming_init(m, mode='fan_in', bias=0.) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m, val=1.0, bias=0.) 
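+    # (Added explanatory note: ``resize_abs_pos_embed`` below assumes a
+    # square patch grid. For example, a checkpoint pretrained at 224x224 with
+    # 16x16 patches stores 14 * 14 + 1 position tokens; loading it into a
+    # 512x512 model (32 * 32 patches) bicubically interpolates the 196 grid
+    # tokens from 14x14 up to 32x32 and keeps the extra cls token unchanged.)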
+ + def resize_abs_pos_embed(self, state_dict): + if 'pos_embed' in state_dict: + pos_embed_checkpoint = state_dict['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_extra_tokens = self.pos_embed.shape[-2] - self.num_patches + # height (== width) for the checkpoint position embedding + orig_size = int( + (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5) + # height (== width) for the new position embedding + new_size = int(self.num_patches**0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, + embedding_size).permute( + 0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, + size=(new_size, new_size), + mode='bicubic', + align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + state_dict['pos_embed'] = new_pos_embed + return state_dict + + def forward(self, inputs): + B = inputs.shape[0] + + x, hw_shape = self.patch_embed(inputs) + + # stole cls_tokens impl from Phil Wang, thanks + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = x + self.pos_embed + + outs = [] + for i, layer in enumerate(self.layers): + x = layer(x) + if i == len(self.layers) - 1: + if self.final_norm: + x = self.norm1(x) + if i in self.out_indices: + out = x[:, 1:] + B, _, C = out.shape + out = out.reshape(B, hw_shape[0], hw_shape[1], + C).permute(0, 3, 1, 2).contiguous() + outs.append(out) + + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/mit.py b/head_extractor/build/lib/mmseg/models/backbones/mit.py new file mode 100644 index 0000000000000000000000000000000000000000..66556bdfca2b0bcb180afd23c2923c68b9ff3a69 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/mit.py @@ -0,0 +1,450 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import MultiheadAttention +from mmengine.model import BaseModule, ModuleList, Sequential +from mmengine.model.weight_init import (constant_init, normal_init, + trunc_normal_init) + +from mmseg.registry import MODELS +from ..utils import PatchEmbed, nchw_to_nlc, nlc_to_nchw + + +class MixFFN(BaseModule): + """An implementation of MixFFN of Segformer. + + The differences between MixFFN & FFN: + 1. Use 1X1 Conv to replace Linear layer. + 2. Introduce 3X3 Conv to encode positional information. + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
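+    Example (an illustrative sketch; the 64x64 token grid is an assumption):
+        >>> import torch
+        >>> ffn = MixFFN(embed_dims=64, feedforward_channels=256)
+        >>> x = torch.rand(1, 64 * 64, 64)  # (batch, num_tokens, channels)
+        >>> out = ffn(x, hw_shape=(64, 64))  # residual added; same shape as x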
+ """ + + def __init__(self, + embed_dims, + feedforward_channels, + act_cfg=dict(type='GELU'), + ffn_drop=0., + dropout_layer=None, + init_cfg=None): + super().__init__(init_cfg) + + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.act_cfg = act_cfg + self.activate = build_activation_layer(act_cfg) + + in_channels = embed_dims + fc1 = Conv2d( + in_channels=in_channels, + out_channels=feedforward_channels, + kernel_size=1, + stride=1, + bias=True) + # 3x3 depth wise conv to provide positional encode information + pe_conv = Conv2d( + in_channels=feedforward_channels, + out_channels=feedforward_channels, + kernel_size=3, + stride=1, + padding=(3 - 1) // 2, + bias=True, + groups=feedforward_channels) + fc2 = Conv2d( + in_channels=feedforward_channels, + out_channels=in_channels, + kernel_size=1, + stride=1, + bias=True) + drop = nn.Dropout(ffn_drop) + layers = [fc1, pe_conv, self.activate, drop, fc2, drop] + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + + def forward(self, x, hw_shape, identity=None): + out = nlc_to_nchw(x, hw_shape) + out = self.layers(out) + out = nchw_to_nlc(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +class EfficientMultiheadAttention(MultiheadAttention): + """An implementation of Efficient Multi-head Attention of Segformer. + + This module is modified from MultiheadAttention which is a module from + mmcv.cnn.bricks.transformer. + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: False. + qkv_bias (bool): enable bias for qkv if True. Default True. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head + Attention of Segformer. Default: 1. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=None, + init_cfg=None, + batch_first=True, + qkv_bias=False, + norm_cfg=dict(type='LN'), + sr_ratio=1): + super().__init__( + embed_dims, + num_heads, + attn_drop, + proj_drop, + dropout_layer=dropout_layer, + init_cfg=init_cfg, + batch_first=batch_first, + bias=qkv_bias) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = Conv2d( + in_channels=embed_dims, + out_channels=embed_dims, + kernel_size=sr_ratio, + stride=sr_ratio) + # The ret[0] of build_norm_layer is norm name. + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + + # handle the BC-breaking from https://github.com/open-mmlab/mmcv/pull/1418 # noqa + from mmseg import digit_version, mmcv_version + if mmcv_version < digit_version('1.3.17'): + warnings.warn('The legacy version of forward function in' + 'EfficientMultiheadAttention is deprecated in' + 'mmcv>=1.3.17 and will no longer support in the' + 'future. 
Please upgrade your mmcv.') + self.forward = self.legacy_forward + + def forward(self, x, hw_shape, identity=None): + + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + x_q = x_q.transpose(0, 1) + x_kv = x_kv.transpose(0, 1) + + out = self.attn(query=x_q, key=x_kv, value=x_kv)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + def legacy_forward(self, x, hw_shape, identity=None): + """multi head attention forward in mmcv version < 1.3.17.""" + + x_q = x + if self.sr_ratio > 1: + x_kv = nlc_to_nchw(x, hw_shape) + x_kv = self.sr(x_kv) + x_kv = nchw_to_nlc(x_kv) + x_kv = self.norm(x_kv) + else: + x_kv = x + + if identity is None: + identity = x_q + + # `need_weights=True` will let nn.MultiHeadAttention + # `return attn_output, attn_output_weights.sum(dim=1) / num_heads` + # The `attn_output_weights.sum(dim=1)` may cause cuda error. So, we set + # `need_weights=False` to ignore `attn_output_weights.sum(dim=1)`. + # This issue - `https://github.com/pytorch/pytorch/issues/37583` report + # the error that large scale tensor sum operation may cause cuda error. + out = self.attn(query=x_q, key=x_kv, value=x_kv, need_weights=False)[0] + + return identity + self.dropout_layer(self.proj_drop(out)) + + +class TransformerEncoderLayer(BaseModule): + """Implements one encoder layer in Segformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed. + after the feed forward layer. Default 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.0. + qkv_bias (bool): enable bias for qkv if True. + Default: True. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: False. + init_cfg (dict, optional): Initialization config dict. + Default:None. + sr_ratio (int): The ratio of spatial reduction of Efficient Multi-head + Attention of Segformer. Default: 1. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + batch_first=True, + sr_ratio=1, + with_cp=False): + super().__init__() + + # The ret[0] of build_norm_layer is norm name. 
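+        # (Added note: ``build_norm_layer`` returns a ``(name, layer)`` tuple,
+        # e.g. ``build_norm_layer(dict(type='LN'), 64)`` is roughly
+        # ``('ln', nn.LayerNorm(64))``, so ``[1]`` keeps only the module.)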
+ self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.attn = EfficientMultiheadAttention( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + batch_first=batch_first, + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + sr_ratio=sr_ratio) + + # The ret[0] of build_norm_layer is norm name. + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + + self.ffn = MixFFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg) + + self.with_cp = with_cp + + def forward(self, x, hw_shape): + + def _inner_forward(x): + x = self.attn(self.norm1(x), hw_shape, identity=x) + x = self.ffn(self.norm2(x), hw_shape, identity=x) + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + return x + + +@MODELS.register_module() +class MixVisionTransformer(BaseModule): + """The backbone of Segformer. + + This backbone is the implementation of `SegFormer: Simple and + Efficient Design for Semantic Segmentation with + Transformers `_. + Args: + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): Embedding dimension. Default: 768. + num_stags (int): The num of stages. Default: 4. + num_layers (Sequence[int]): The layer number of each transformer encode + layer. Default: [3, 4, 6, 3]. + num_heads (Sequence[int]): The attention heads of each transformer + encode layer. Default: [1, 2, 4, 8]. + patch_sizes (Sequence[int]): The patch_size of each overlapped patch + embedding. Default: [7, 3, 3, 3]. + strides (Sequence[int]): The stride of each overlapped patch embedding. + Default: [4, 2, 2, 2]. + sr_ratios (Sequence[int]): The spatial reduction rate of each + transformer encode layer. Default: [8, 4, 2, 1]. + out_indices (Sequence[int] | int): Output from which stages. + Default: (0, 1, 2, 3). + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0 + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): stochastic depth rate. Default 0.0 + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + pretrained (str, optional): model pretrained path. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. Default: False. 
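+    Example (an illustrative sketch; per-stage shapes are assumptions based
+        on the defaults above, not verified outputs):
+        >>> import torch
+        >>> self = MixVisionTransformer()
+        >>> inputs = torch.rand(1, 3, 512, 512)
+        >>> outs = self.forward(inputs)
+        >>> # four stages with strides (4, 2, 2, 2) give features at 1/4,
+        >>> # 1/8, 1/16 and 1/32 resolution, with embed_dims * num_heads[i]
+        >>> # channels per stage: 64, 128, 256 and 512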
+ """ + + def __init__(self, + in_channels=3, + embed_dims=64, + num_stages=4, + num_layers=[3, 4, 6, 3], + num_heads=[1, 2, 4, 8], + patch_sizes=[7, 3, 3, 3], + strides=[4, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + out_indices=(0, 1, 2, 3), + mlp_ratio=4, + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + pretrained=None, + init_cfg=None, + with_cp=False): + super().__init__(init_cfg=init_cfg) + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be set at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + self.embed_dims = embed_dims + self.num_stages = num_stages + self.num_layers = num_layers + self.num_heads = num_heads + self.patch_sizes = patch_sizes + self.strides = strides + self.sr_ratios = sr_ratios + self.with_cp = with_cp + assert num_stages == len(num_layers) == len(num_heads) \ + == len(patch_sizes) == len(strides) == len(sr_ratios) + + self.out_indices = out_indices + assert max(out_indices) < self.num_stages + + # transformer encoder + dpr = [ + x.item() + for x in torch.linspace(0, drop_path_rate, sum(num_layers)) + ] # stochastic num_layer decay rule + + cur = 0 + self.layers = ModuleList() + for i, num_layer in enumerate(num_layers): + embed_dims_i = embed_dims * num_heads[i] + patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims_i, + kernel_size=patch_sizes[i], + stride=strides[i], + padding=patch_sizes[i] // 2, + norm_cfg=norm_cfg) + layer = ModuleList([ + TransformerEncoderLayer( + embed_dims=embed_dims_i, + num_heads=num_heads[i], + feedforward_channels=mlp_ratio * embed_dims_i, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[cur + idx], + qkv_bias=qkv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + sr_ratio=sr_ratios[i]) for idx in range(num_layer) + ]) + in_channels = embed_dims_i + # The ret[0] of build_norm_layer is norm name. + norm = build_norm_layer(norm_cfg, embed_dims_i)[1] + self.layers.append(ModuleList([patch_embed, layer, norm])) + cur += num_layer + + def init_weights(self): + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, val=1.0, bias=0.) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init( + m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0) + else: + super().init_weights() + + def forward(self, x): + outs = [] + + for i, layer in enumerate(self.layers): + x, hw_shape = layer[0](x) + for block in layer[1]: + x = block(x, hw_shape) + x = layer[2](x) + x = nlc_to_nchw(x, hw_shape) + if i in self.out_indices: + outs.append(x) + + return outs diff --git a/head_extractor/build/lib/mmseg/models/backbones/mobilenet_v2.py b/head_extractor/build/lib/mmseg/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..1c21b5df97dade148136e8b0e6b039512f9e03f9 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/mobilenet_v2.py @@ -0,0 +1,197 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from mmseg.registry import MODELS +from ..utils import InvertedResidual, make_divisible + + +@MODELS.register_module() +class MobileNetV2(BaseModule): + """MobileNetV2 backbone. + + This backbone is the implementation of + `MobileNetV2: Inverted Residuals and Linear Bottlenecks + `_. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + strides (Sequence[int], optional): Strides of the first block of each + layer. If not specified, default config in ``arch_setting`` will + be used. + dilations (Sequence[int]): Dilation of each layer. + out_indices (None or Sequence[int]): Output from which stages. + Default: (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + # Parameters to build layers. 3 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks. + arch_settings = [[1, 16, 1], [6, 24, 2], [6, 32, 3], [6, 64, 4], + [6, 96, 3], [6, 160, 3], [6, 320, 1]] + + def __init__(self, + widen_factor=1., + strides=(1, 2, 2, 2, 1, 2, 1), + dilations=(1, 1, 1, 1, 1, 1, 1), + out_indices=(1, 2, 4, 6), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False, + pretrained=None, + init_cfg=None): + super().__init__(init_cfg) + + self.pretrained = pretrained + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + else: + raise TypeError('pretrained must be a str or None') + + self.widen_factor = widen_factor + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == len(self.arch_settings) + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 7): + raise ValueError('the item in out_indices must in ' + f'range(0, 7). But received {index}') + + if frozen_stages not in range(-1, 7): + raise ValueError('frozen_stages must be in range(-1, 7). 
' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks = layer_cfg + stride = self.strides[i] + dilation = self.dilations[i] + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + def make_layer(self, out_channels, num_blocks, stride, dilation, + expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): Number of blocks. + stride (int): Stride of the first block. + dilation (int): Dilation of the first block. + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. + """ + layers = [] + for i in range(num_blocks): + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride if i == 0 else 1, + expand_ratio=expand_ratio, + dilation=dilation if i == 0 else 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/head_extractor/build/lib/mmseg/models/backbones/mobilenet_v3.py b/head_extractor/build/lib/mmseg/models/backbones/mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..1efb6e097472d53a5269e52a39ff2cae48e834db --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/mobilenet_v3.py @@ -0,0 +1,267 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import Conv2dAdaptivePadding +from mmengine.model import BaseModule +from mmengine.utils import is_tuple_of +from torch.nn.modules.batchnorm import _BatchNorm + +from mmseg.registry import MODELS +from ..utils import InvertedResidualV3 as InvertedResidual + + +@MODELS.register_module() +class MobileNetV3(BaseModule): + """MobileNetV3 backbone. + + This backbone is the improved implementation of `Searching for MobileNetV3 + `_. 
+
+    Args:
+        arch (str): Architecture of MobileNetV3, from {'small', 'large'}.
+            Default: 'small'.
+        conv_cfg (dict): Config dict for convolution layer.
+            Default: None, which means using conv2d.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        out_indices (tuple[int]): Output from which layer.
+            Default: (0, 1, 12).
+        frozen_stages (int): Stages to be frozen (all param fixed).
+            Default: -1, which means not freezing any parameters.
+        reduction_factor (int): Reduce the number of mid and output channels
+            of the last several blocks by this factor. Default: 1.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save
+            some memory while slowing down the training speed.
+            Default: False.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+    """
+    # Parameters to build each block:
+    #     [kernel size, mid channels, out channels, with_se, act type, stride]
+    arch_settings = {
+        'small': [[3, 16, 16, True, 'ReLU', 2],  # block0 layer1 os=4
+                  [3, 72, 24, False, 'ReLU', 2],  # block1 layer2 os=8
+                  [3, 88, 24, False, 'ReLU', 1],
+                  [5, 96, 40, True, 'HSwish', 2],  # block2 layer4 os=16
+                  [5, 240, 40, True, 'HSwish', 1],
+                  [5, 240, 40, True, 'HSwish', 1],
+                  [5, 120, 48, True, 'HSwish', 1],  # block3 layer7 os=16
+                  [5, 144, 48, True, 'HSwish', 1],
+                  [5, 288, 96, True, 'HSwish', 2],  # block4 layer9 os=32
+                  [5, 576, 96, True, 'HSwish', 1],
+                  [5, 576, 96, True, 'HSwish', 1]],
+        'large': [[3, 16, 16, False, 'ReLU', 1],  # block0 layer1 os=2
+                  [3, 64, 24, False, 'ReLU', 2],  # block1 layer2 os=4
+                  [3, 72, 24, False, 'ReLU', 1],
+                  [5, 72, 40, True, 'ReLU', 2],  # block2 layer4 os=8
+                  [5, 120, 40, True, 'ReLU', 1],
+                  [5, 120, 40, True, 'ReLU', 1],
+                  [3, 240, 80, False, 'HSwish', 2],  # block3 layer7 os=16
+                  [3, 200, 80, False, 'HSwish', 1],
+                  [3, 184, 80, False, 'HSwish', 1],
+                  [3, 184, 80, False, 'HSwish', 1],
+                  [3, 480, 112, True, 'HSwish', 1],  # block4 layer11 os=16
+                  [3, 672, 112, True, 'HSwish', 1],
+                  [5, 672, 160, True, 'HSwish', 2],  # block5 layer13 os=32
+                  [5, 960, 160, True, 'HSwish', 1],
+                  [5, 960, 160, True, 'HSwish', 1]]
+    }  # yapf: disable
+
+    def __init__(self,
+                 arch='small',
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 out_indices=(0, 1, 12),
+                 frozen_stages=-1,
+                 reduction_factor=1,
+                 norm_eval=False,
+                 with_cp=False,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        self.pretrained = pretrained
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be set at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        assert arch in self.arch_settings
+        assert isinstance(reduction_factor, int) and reduction_factor > 0
+        assert is_tuple_of(out_indices, int)
+        for index in out_indices:
+            if index not in range(0, len(self.arch_settings[arch]) + 2):
+                raise ValueError(
+                    'the item in out_indices must be in '
+                    f'range(0, {len(self.arch_settings[arch])+2}).
' + f'But received {index}') + + if frozen_stages not in range(-1, len(self.arch_settings[arch]) + 2): + raise ValueError('frozen_stages must be in range(-1, ' + f'{len(self.arch_settings[arch])+2}). ' + f'But received {frozen_stages}') + self.arch = arch + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.reduction_factor = reduction_factor + self.norm_eval = norm_eval + self.with_cp = with_cp + self.layers = self._make_layer() + + def _make_layer(self): + layers = [] + + # build the first layer (layer0) + in_channels = 16 + layer = ConvModule( + in_channels=3, + out_channels=in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=self.norm_cfg, + act_cfg=dict(type='HSwish')) + self.add_module('layer0', layer) + layers.append('layer0') + + layer_setting = self.arch_settings[self.arch] + for i, params in enumerate(layer_setting): + (kernel_size, mid_channels, out_channels, with_se, act, + stride) = params + + if self.arch == 'large' and i >= 12 or self.arch == 'small' and \ + i >= 8: + mid_channels = mid_channels // self.reduction_factor + out_channels = out_channels // self.reduction_factor + + if with_se: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), + dict(type='HSigmoid', bias=3.0, divisor=6.0))) + else: + se_cfg = None + + layer = InvertedResidual( + in_channels=in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + with_expand_conv=(in_channels != mid_channels), + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=act), + with_cp=self.with_cp) + in_channels = out_channels + layer_name = f'layer{i + 1}' + self.add_module(layer_name, layer) + layers.append(layer_name) + + # build the last layer + # block5 layer12 os=32 for small model + # block6 layer16 os=32 for large model + layer = ConvModule( + in_channels=in_channels, + out_channels=576 if self.arch == 'small' else 960, + kernel_size=1, + stride=1, + dilation=4, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='HSwish')) + layer_name = f'layer{len(layer_setting) + 1}' + self.add_module(layer_name, layer) + layers.append(layer_name) + + # next, convert backbone MobileNetV3 to a semantic segmentation version + if self.arch == 'small': + self.layer4.depthwise_conv.conv.stride = (1, 1) + self.layer9.depthwise_conv.conv.stride = (1, 1) + for i in range(4, len(layers)): + layer = getattr(self, layers[i]) + if isinstance(layer, InvertedResidual): + modified_module = layer.depthwise_conv.conv + else: + modified_module = layer.conv + + if i < 9: + modified_module.dilation = (2, 2) + pad = 2 + else: + modified_module.dilation = (4, 4) + pad = 4 + + if not isinstance(modified_module, Conv2dAdaptivePadding): + # Adjust padding + pad *= (modified_module.kernel_size[0] - 1) // 2 + modified_module.padding = (pad, pad) + else: + self.layer7.depthwise_conv.conv.stride = (1, 1) + self.layer13.depthwise_conv.conv.stride = (1, 1) + for i in range(7, len(layers)): + layer = getattr(self, layers[i]) + if isinstance(layer, InvertedResidual): + modified_module = layer.depthwise_conv.conv + else: + modified_module = layer.conv + + if i < 13: + modified_module.dilation = (2, 2) + pad = 2 + else: + modified_module.dilation = (4, 4) + pad = 4 + + if not isinstance(modified_module, Conv2dAdaptivePadding): + # Adjust padding + pad *= 
(modified_module.kernel_size[0] - 1) // 2 + modified_module.padding = (pad, pad) + + return layers + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return outs + + def _freeze_stages(self): + for i in range(self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/head_extractor/build/lib/mmseg/models/backbones/mscan.py b/head_extractor/build/lib/mmseg/models/backbones/mscan.py new file mode 100644 index 0000000000000000000000000000000000000000..7150cb7a1c13d11dcdcc6fbbc72931154853929e --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/mscan.py @@ -0,0 +1,467 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Originally from https://github.com/visual-attention-network/segnext +# Licensed under the Apache License, Version 2.0 (the "License") +import math +import warnings + +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule +from mmengine.model.weight_init import (constant_init, normal_init, + trunc_normal_init) + +from mmseg.registry import MODELS + + +class Mlp(BaseModule): + """Multi Layer Perceptron (MLP) Module. + + Args: + in_features (int): The dimension of input features. + hidden_features (int): The dimension of hidden features. + Defaults: None. + out_features (int): The dimension of output features. + Defaults: None. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). + drop (float): The number of dropout rate in MLP block. + Defaults: 0.0. + """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.dwconv = nn.Conv2d( + hidden_features, + hidden_features, + 3, + 1, + 1, + bias=True, + groups=hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + """Forward function.""" + + x = self.fc1(x) + + x = self.dwconv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + + return x + + +class StemConv(BaseModule): + """Stem Block at the beginning of Semantic Branch. + + Args: + in_channels (int): The dimension of input channels. + out_channels (int): The dimension of output channels. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Defaults: dict(type='SyncBN', requires_grad=True). 
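+
+    Example (a minimal shape sketch; ``BN`` is substituted for the default
+    ``SyncBN`` so the snippet runs on a single device):
+        >>> import torch
+        >>> stem = StemConv(3, 64, norm_cfg=dict(type='BN'))
+        >>> x, H, W = stem(torch.rand(1, 3, 64, 64))
+        >>> # two stride-2 convs downsample by 4 before flattening to tokens
+        >>> x.shape, H, W
+        (torch.Size([1, 256, 64]), 16, 16)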
+ """ + + def __init__(self, + in_channels, + out_channels, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN', requires_grad=True)): + super().__init__() + + self.proj = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels // 2, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + build_norm_layer(norm_cfg, out_channels // 2)[1], + build_activation_layer(act_cfg), + nn.Conv2d( + out_channels // 2, + out_channels, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + build_norm_layer(norm_cfg, out_channels)[1], + ) + + def forward(self, x): + """Forward function.""" + + x = self.proj(x) + _, _, H, W = x.size() + x = x.flatten(2).transpose(1, 2) + return x, H, W + + +class MSCAAttention(BaseModule): + """Attention Module in Multi-Scale Convolutional Attention Module (MSCA). + + Args: + channels (int): The dimension of channels. + kernel_sizes (list): The size of attention + kernel. Defaults: [5, [1, 7], [1, 11], [1, 21]]. + paddings (list): The number of + corresponding padding value in attention module. + Defaults: [2, [0, 3], [0, 5], [0, 10]]. + """ + + def __init__(self, + channels, + kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + paddings=[2, [0, 3], [0, 5], [0, 10]]): + super().__init__() + self.conv0 = nn.Conv2d( + channels, + channels, + kernel_size=kernel_sizes[0], + padding=paddings[0], + groups=channels) + for i, (kernel_size, + padding) in enumerate(zip(kernel_sizes[1:], paddings[1:])): + kernel_size_ = [kernel_size, kernel_size[::-1]] + padding_ = [padding, padding[::-1]] + conv_name = [f'conv{i}_1', f'conv{i}_2'] + for i_kernel, i_pad, i_conv in zip(kernel_size_, padding_, + conv_name): + self.add_module( + i_conv, + nn.Conv2d( + channels, + channels, + tuple(i_kernel), + padding=i_pad, + groups=channels)) + self.conv3 = nn.Conv2d(channels, channels, 1) + + def forward(self, x): + """Forward function.""" + + u = x.clone() + + attn = self.conv0(x) + + # Multi-Scale Feature extraction + attn_0 = self.conv0_1(attn) + attn_0 = self.conv0_2(attn_0) + + attn_1 = self.conv1_1(attn) + attn_1 = self.conv1_2(attn_1) + + attn_2 = self.conv2_1(attn) + attn_2 = self.conv2_2(attn_2) + + attn = attn + attn_0 + attn_1 + attn_2 + # Channel Mixing + attn = self.conv3(attn) + + # Convolutional Attention + x = attn * u + + return x + + +class MSCASpatialAttention(BaseModule): + """Spatial Attention Module in Multi-Scale Convolutional Attention Module + (MSCA). + + Args: + in_channels (int): The dimension of channels. + attention_kernel_sizes (list): The size of attention + kernel. Defaults: [5, [1, 7], [1, 11], [1, 21]]. + attention_kernel_paddings (list): The number of + corresponding padding value in attention module. + Defaults: [2, [0, 3], [0, 5], [0, 10]]. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). 
+ """ + + def __init__(self, + in_channels, + attention_kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + attention_kernel_paddings=[2, [0, 3], [0, 5], [0, 10]], + act_cfg=dict(type='GELU')): + super().__init__() + self.proj_1 = nn.Conv2d(in_channels, in_channels, 1) + self.activation = build_activation_layer(act_cfg) + self.spatial_gating_unit = MSCAAttention(in_channels, + attention_kernel_sizes, + attention_kernel_paddings) + self.proj_2 = nn.Conv2d(in_channels, in_channels, 1) + + def forward(self, x): + """Forward function.""" + + shorcut = x.clone() + x = self.proj_1(x) + x = self.activation(x) + x = self.spatial_gating_unit(x) + x = self.proj_2(x) + x = x + shorcut + return x + + +class MSCABlock(BaseModule): + """Basic Multi-Scale Convolutional Attention Block. It leverage the large- + kernel attention (LKA) mechanism to build both channel and spatial + attention. In each branch, it uses two depth-wise strip convolutions to + approximate standard depth-wise convolutions with large kernels. The kernel + size for each branch is set to 7, 11, and 21, respectively. + + Args: + channels (int): The dimension of channels. + attention_kernel_sizes (list): The size of attention + kernel. Defaults: [5, [1, 7], [1, 11], [1, 21]]. + attention_kernel_paddings (list): The number of + corresponding padding value in attention module. + Defaults: [2, [0, 3], [0, 5], [0, 10]]. + mlp_ratio (float): The ratio of multiple input dimension to + calculate hidden feature in MLP layer. Defaults: 4.0. + drop (float): The number of dropout rate in MLP block. + Defaults: 0.0. + drop_path (float): The ratio of drop paths. + Defaults: 0.0. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Defaults: dict(type='SyncBN', requires_grad=True). + """ + + def __init__(self, + channels, + attention_kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + attention_kernel_paddings=[2, [0, 3], [0, 5], [0, 10]], + mlp_ratio=4., + drop=0., + drop_path=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN', requires_grad=True)): + super().__init__() + self.norm1 = build_norm_layer(norm_cfg, channels)[1] + self.attn = MSCASpatialAttention(channels, attention_kernel_sizes, + attention_kernel_paddings, act_cfg) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = build_norm_layer(norm_cfg, channels)[1] + mlp_hidden_channels = int(channels * mlp_ratio) + self.mlp = Mlp( + in_features=channels, + hidden_features=mlp_hidden_channels, + act_cfg=act_cfg, + drop=drop) + layer_scale_init_value = 1e-2 + self.layer_scale_1 = nn.Parameter( + layer_scale_init_value * torch.ones(channels), requires_grad=True) + self.layer_scale_2 = nn.Parameter( + layer_scale_init_value * torch.ones(channels), requires_grad=True) + + def forward(self, x, H, W): + """Forward function.""" + + B, N, C = x.shape + x = x.permute(0, 2, 1).view(B, C, H, W) + x = x + self.drop_path( + self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * + self.attn(self.norm1(x))) + x = x + self.drop_path( + self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * + self.mlp(self.norm2(x))) + x = x.view(B, C, N).permute(0, 2, 1) + return x + + +class OverlapPatchEmbed(BaseModule): + """Image to Patch Embedding. + + Args: + patch_size (int): The patch size. + Defaults: 7. + stride (int): Stride of the convolutional layer. + Default: 4. + in_channels (int): The number of input channels. + Defaults: 3. + embed_dims (int): The dimensions of embedding. 
+            Defaults: 768.
+        norm_cfg (dict): Config dict for normalization layer.
+            Defaults: dict(type='SyncBN', requires_grad=True).
+    """
+
+    def __init__(self,
+                 patch_size=7,
+                 stride=4,
+                 in_channels=3,
+                 embed_dim=768,
+                 norm_cfg=dict(type='SyncBN', requires_grad=True)):
+        super().__init__()
+
+        self.proj = nn.Conv2d(
+            in_channels,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=stride,
+            padding=patch_size // 2)
+        self.norm = build_norm_layer(norm_cfg, embed_dim)[1]
+
+    def forward(self, x):
+        """Forward function."""
+
+        x = self.proj(x)
+        _, _, H, W = x.shape
+        x = self.norm(x)
+
+        x = x.flatten(2).transpose(1, 2)
+
+        return x, H, W
+
+
+@MODELS.register_module()
+class MSCAN(BaseModule):
+    """SegNeXt Multi-Scale Convolutional Attention Network (MSCAN) backbone.
+
+    This backbone is the implementation of `SegNeXt: Rethinking
+    Convolutional Attention Design for Semantic
+    Segmentation <https://arxiv.org/abs/2209.08575>`_.
+    Inspiration from https://github.com/visual-attention-network/segnext.
+
+    Args:
+        in_channels (int): The number of input channels. Defaults: 3.
+        embed_dims (list[int]): Embedding dimension.
+            Defaults: [64, 128, 256, 512].
+        mlp_ratios (list[int]): Ratio of mlp hidden dim to embedding dim.
+            Defaults: [4, 4, 4, 4].
+        drop_rate (float): Dropout rate. Defaults: 0.
+        drop_path_rate (float): Stochastic depth rate. Defaults: 0.
+        depths (list[int]): Depths of each MSCAN stage.
+            Default: [3, 4, 6, 3].
+        num_stages (int): MSCAN stages. Default: 4.
+        attention_kernel_sizes (list): Size of attention kernel in
+            Attention Module (Figure 2(b) of original paper).
+            Defaults: [5, [1, 7], [1, 11], [1, 21]].
+        attention_kernel_paddings (list): Size of attention paddings
+            in Attention Module (Figure 2(b) of original paper).
+            Defaults: [2, [0, 3], [0, 5], [0, 10]].
+        norm_cfg (dict): Config of norm layers.
+            Defaults: dict(type='SyncBN', requires_grad=True).
+        pretrained (str, optional): model pretrained path.
+            Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
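+
+    Example (a minimal shape sketch; ``BN`` is substituted for the default
+    ``SyncBN`` so the snippet runs on a single device):
+        >>> import torch
+        >>> self = MSCAN(norm_cfg=dict(type='BN', requires_grad=True))
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 64, 64)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 64, 16, 16)
+        (1, 128, 8, 8)
+        (1, 256, 4, 4)
+        (1, 512, 2, 2)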
+ """ + + def __init__(self, + in_channels=3, + embed_dims=[64, 128, 256, 512], + mlp_ratios=[4, 4, 4, 4], + drop_rate=0., + drop_path_rate=0., + depths=[3, 4, 6, 3], + num_stages=4, + attention_kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + attention_kernel_paddings=[2, [0, 3], [0, 5], [0, 10]], + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN', requires_grad=True), + pretrained=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be set at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + self.depths = depths + self.num_stages = num_stages + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + if i == 0: + patch_embed = StemConv(3, embed_dims[0], norm_cfg=norm_cfg) + else: + patch_embed = OverlapPatchEmbed( + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_channels=in_channels if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i], + norm_cfg=norm_cfg) + + block = nn.ModuleList([ + MSCABlock( + channels=embed_dims[i], + attention_kernel_sizes=attention_kernel_sizes, + attention_kernel_paddings=attention_kernel_paddings, + mlp_ratio=mlp_ratios[i], + drop=drop_rate, + drop_path=dpr[cur + j], + act_cfg=act_cfg, + norm_cfg=norm_cfg) for j in range(depths[i]) + ]) + norm = nn.LayerNorm(embed_dims[i]) + cur += depths[i] + + setattr(self, f'patch_embed{i + 1}', patch_embed) + setattr(self, f'block{i + 1}', block) + setattr(self, f'norm{i + 1}', norm) + + def init_weights(self): + """Initialize modules of MSCAN.""" + + print('init cfg', self.init_cfg) + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, val=1.0, bias=0.) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init( + m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0) + else: + super().init_weights() + + def forward(self, x): + """Forward function.""" + + B = x.shape[0] + outs = [] + + for i in range(self.num_stages): + patch_embed = getattr(self, f'patch_embed{i + 1}') + block = getattr(self, f'block{i + 1}') + norm = getattr(self, f'norm{i + 1}') + x, H, W = patch_embed(x) + for blk in block: + x = blk(x, H, W) + x = norm(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + return outs diff --git a/head_extractor/build/lib/mmseg/models/backbones/pidnet.py b/head_extractor/build/lib/mmseg/models/backbones/pidnet.py new file mode 100644 index 0000000000000000000000000000000000000000..0b711a373701c0771c5c5997bbb8e5b345d70924 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/pidnet.py @@ -0,0 +1,522 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from mmengine.runner import CheckpointLoader
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.utils import OptConfigType
+from ..utils import DAPPM, PAPPM, BasicBlock, Bottleneck
+
+
+class PagFM(BaseModule):
+    """Pixel-attention-guided fusion module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        channels (int): The number of channels.
+        after_relu (bool): Whether to use ReLU before attention.
+            Default: False.
+        with_channel (bool): Whether to use channel attention.
+            Default: False.
+        upsample_mode (str): The mode of upsample. Default: 'bilinear'.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU', inplace=True).
+        init_cfg (dict): Config dict for initialization. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 channels: int,
+                 after_relu: bool = False,
+                 with_channel: bool = False,
+                 upsample_mode: str = 'bilinear',
+                 norm_cfg: OptConfigType = dict(type='BN'),
+                 act_cfg: OptConfigType = dict(type='ReLU', inplace=True),
+                 init_cfg: OptConfigType = None):
+        super().__init__(init_cfg)
+        self.after_relu = after_relu
+        self.with_channel = with_channel
+        self.upsample_mode = upsample_mode
+        self.f_i = ConvModule(
+            in_channels, channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        self.f_p = ConvModule(
+            in_channels, channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        if with_channel:
+            self.up = ConvModule(
+                channels, in_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        if after_relu:
+            self.relu = MODELS.build(act_cfg)
+
+    def forward(self, x_p: Tensor, x_i: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            x_p (Tensor): The feature map from P branch.
+            x_i (Tensor): The feature map from I branch.
+
+        Returns:
+            Tensor: The feature map with pixel-attention-guided fusion.
+        """
+        if self.after_relu:
+            x_p = self.relu(x_p)
+            x_i = self.relu(x_i)
+
+        f_i = self.f_i(x_i)
+        f_i = F.interpolate(
+            f_i,
+            size=x_p.shape[2:],
+            mode=self.upsample_mode,
+            align_corners=False)
+
+        f_p = self.f_p(x_p)
+
+        if self.with_channel:
+            sigma = torch.sigmoid(self.up(f_p * f_i))
+        else:
+            sigma = torch.sigmoid(torch.sum(f_p * f_i, dim=1).unsqueeze(1))
+
+        x_i = F.interpolate(
+            x_i,
+            size=x_p.shape[2:],
+            mode=self.upsample_mode,
+            align_corners=False)
+
+        out = sigma * x_i + (1 - sigma) * x_p
+        return out
+
+
+class Bag(BaseModule):
+    """Boundary-attention-guided fusion module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (int): The kernel size of the convolution. Default: 3.
+        padding (int): The padding of the convolution. Default: 1.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU', inplace=True).
+        conv_cfg (dict): Config dict for convolution layer.
+            Default: dict(order=('norm', 'act', 'conv')).
+        init_cfg (dict): Config dict for initialization. Default: None.
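+
+    Example (a minimal shape sketch of the sigmoid-gated fusion):
+        >>> import torch
+        >>> bag = Bag(64, 64)
+        >>> x_p = torch.rand(1, 64, 16, 16)
+        >>> bag(x_p, torch.rand(1, 64, 16, 16),
+        ...     torch.rand(1, 64, 16, 16)).shape
+        torch.Size([1, 64, 16, 16])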
+ """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + padding: int = 1, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + conv_cfg: OptConfigType = dict(order=('norm', 'act', 'conv')), + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + + self.conv = ConvModule( + in_channels, + out_channels, + kernel_size, + padding=padding, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + + def forward(self, x_p: Tensor, x_i: Tensor, x_d: Tensor) -> Tensor: + """Forward function. + + Args: + x_p (Tensor): The featrue map from P branch. + x_i (Tensor): The featrue map from I branch. + x_d (Tensor): The featrue map from D branch. + + Returns: + Tensor: The feature map with boundary-attention-guided fusion. + """ + sigma = torch.sigmoid(x_d) + return self.conv(sigma * x_p + (1 - sigma) * x_i) + + +class LightBag(BaseModule): + """Light Boundary-attention-guided fusion module. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. Default: None. + init_cfg (dict): Config dict for initialization. Default: None. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + self.f_p = ConvModule( + in_channels, + out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.f_i = ConvModule( + in_channels, + out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x_p: Tensor, x_i: Tensor, x_d: Tensor) -> Tensor: + """Forward function. + Args: + x_p (Tensor): The featrue map from P branch. + x_i (Tensor): The featrue map from I branch. + x_d (Tensor): The featrue map from D branch. + + Returns: + Tensor: The feature map with light boundary-attention-guided + fusion. + """ + sigma = torch.sigmoid(x_d) + + f_p = self.f_p((1 - sigma) * x_i + x_p) + f_i = self.f_i(x_i + sigma * x_p) + + return f_p + f_i + + +@MODELS.register_module() +class PIDNet(BaseModule): + """PIDNet backbone. + + This backbone is the implementation of `PIDNet: A Real-time Semantic + Segmentation Network Inspired from PID Controller + `_. + Modified from https://github.com/XuJiacong/PIDNet. + + Licensed under the MIT License. + + Args: + in_channels (int): The number of input channels. Default: 3. + channels (int): The number of channels in the stem layer. Default: 64. + ppm_channels (int): The number of channels in the PPM layer. + Default: 96. + num_stem_blocks (int): The number of blocks in the stem layer. + Default: 2. + num_branch_blocks (int): The number of blocks in the branch layer. + Default: 3. + align_corners (bool): The align_corners argument of F.interpolate. + Default: False. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). + init_cfg (dict): Config dict for initialization. Default: None. 
+ """ + + def __init__(self, + in_channels: int = 3, + channels: int = 64, + ppm_channels: int = 96, + num_stem_blocks: int = 2, + num_branch_blocks: int = 3, + align_corners: bool = False, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None, + **kwargs): + super().__init__(init_cfg) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + + # stem layer + self.stem = self._make_stem_layer(in_channels, channels, + num_stem_blocks) + self.relu = nn.ReLU() + + # I Branch + self.i_branch_layers = nn.ModuleList() + for i in range(3): + self.i_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + in_channels=channels * 2**(i + 1), + channels=channels * 8 if i > 0 else channels * 4, + num_blocks=num_branch_blocks if i < 2 else 2, + stride=2)) + + # P Branch + self.p_branch_layers = nn.ModuleList() + for i in range(3): + self.p_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + in_channels=channels * 2, + channels=channels * 2, + num_blocks=num_stem_blocks if i < 2 else 1)) + self.compression_1 = ConvModule( + channels * 4, + channels * 2, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.compression_2 = ConvModule( + channels * 8, + channels * 2, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.pag_1 = PagFM(channels * 2, channels) + self.pag_2 = PagFM(channels * 2, channels) + + # D Branch + if num_stem_blocks == 2: + self.d_branch_layers = nn.ModuleList([ + self._make_single_layer(BasicBlock, channels * 2, channels), + self._make_layer(Bottleneck, channels, channels, 1) + ]) + channel_expand = 1 + spp_module = PAPPM + dfm_module = LightBag + act_cfg_dfm = None + else: + self.d_branch_layers = nn.ModuleList([ + self._make_single_layer(BasicBlock, channels * 2, + channels * 2), + self._make_single_layer(BasicBlock, channels * 2, channels * 2) + ]) + channel_expand = 2 + spp_module = DAPPM + dfm_module = Bag + act_cfg_dfm = act_cfg + + self.diff_1 = ConvModule( + channels * 4, + channels * channel_expand, + kernel_size=3, + padding=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.diff_2 = ConvModule( + channels * 8, + channels * 2, + kernel_size=3, + padding=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + self.spp = spp_module( + channels * 16, ppm_channels, channels * 4, num_scales=5) + self.dfm = dfm_module( + channels * 4, channels * 4, norm_cfg=norm_cfg, act_cfg=act_cfg_dfm) + + self.d_branch_layers.append( + self._make_layer(Bottleneck, channels * 2, channels * 2, 1)) + + def _make_stem_layer(self, in_channels: int, channels: int, + num_blocks: int) -> nn.Sequential: + """Make stem layer. + + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_blocks (int): Number of blocks. + + Returns: + nn.Sequential: The stem layer. 
+ """ + + layers = [ + ConvModule( + in_channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + ] + + layers.append( + self._make_layer(BasicBlock, channels, channels, num_blocks)) + layers.append(nn.ReLU()) + layers.append( + self._make_layer( + BasicBlock, channels, channels * 2, num_blocks, stride=2)) + layers.append(nn.ReLU()) + + return nn.Sequential(*layers) + + def _make_layer(self, + block: BasicBlock, + in_channels: int, + channels: int, + num_blocks: int, + stride: int = 1) -> nn.Sequential: + """Make layer for PIDNet backbone. + Args: + block (BasicBlock): Basic block. + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_blocks (int): Number of blocks. + stride (int): Stride of the first block. Default: 1. + + Returns: + nn.Sequential: The Branch Layer. + """ + downsample = None + if stride != 1 or in_channels != channels * block.expansion: + downsample = ConvModule( + in_channels, + channels * block.expansion, + kernel_size=1, + stride=stride, + norm_cfg=self.norm_cfg, + act_cfg=None) + + layers = [block(in_channels, channels, stride, downsample)] + in_channels = channels * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + in_channels, + channels, + stride=1, + act_cfg_out=None if i == num_blocks - 1 else self.act_cfg)) + return nn.Sequential(*layers) + + def _make_single_layer(self, + block: Union[BasicBlock, Bottleneck], + in_channels: int, + channels: int, + stride: int = 1) -> nn.Module: + """Make single layer for PIDNet backbone. + Args: + block (BasicBlock or Bottleneck): Basic block or Bottleneck. + in_channels (int): Number of input channels. + channels (int): Number of output channels. + stride (int): Stride of the first block. Default: 1. + + Returns: + nn.Module + """ + + downsample = None + if stride != 1 or in_channels != channels * block.expansion: + downsample = ConvModule( + in_channels, + channels * block.expansion, + kernel_size=1, + stride=stride, + norm_cfg=self.norm_cfg, + act_cfg=None) + return block( + in_channels, channels, stride, downsample, act_cfg_out=None) + + def init_weights(self): + """Initialize the weights in backbone. + + Since the D branch is not initialized by the pre-trained model, we + initialize it with the same method as the ResNet. + """ + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + if self.init_cfg is not None: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + ckpt = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], map_location='cpu') + self.load_state_dict(ckpt, strict=False) + + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: + """Forward function. + + Args: + x (Tensor): Input tensor with shape (B, C, H, W). + + Returns: + Tensor or tuple[Tensor]: If self.training is True, return + tuple[Tensor], else return Tensor. 
+ """ + w_out = x.shape[-1] // 8 + h_out = x.shape[-2] // 8 + + # stage 0-2 + x = self.stem(x) + + # stage 3 + x_i = self.relu(self.i_branch_layers[0](x)) + x_p = self.p_branch_layers[0](x) + x_d = self.d_branch_layers[0](x) + + comp_i = self.compression_1(x_i) + x_p = self.pag_1(x_p, comp_i) + diff_i = self.diff_1(x_i) + x_d += F.interpolate( + diff_i, + size=[h_out, w_out], + mode='bilinear', + align_corners=self.align_corners) + if self.training: + temp_p = x_p.clone() + + # stage 4 + x_i = self.relu(self.i_branch_layers[1](x_i)) + x_p = self.p_branch_layers[1](self.relu(x_p)) + x_d = self.d_branch_layers[1](self.relu(x_d)) + + comp_i = self.compression_2(x_i) + x_p = self.pag_2(x_p, comp_i) + diff_i = self.diff_2(x_i) + x_d += F.interpolate( + diff_i, + size=[h_out, w_out], + mode='bilinear', + align_corners=self.align_corners) + if self.training: + temp_d = x_d.clone() + + # stage 5 + x_i = self.i_branch_layers[2](x_i) + x_p = self.p_branch_layers[2](self.relu(x_p)) + x_d = self.d_branch_layers[2](self.relu(x_d)) + + x_i = self.spp(x_i) + x_i = F.interpolate( + x_i, + size=[h_out, w_out], + mode='bilinear', + align_corners=self.align_corners) + out = self.dfm(x_p, x_i, x_d) + return (temp_p, out, temp_d) if self.training else out diff --git a/head_extractor/build/lib/mmseg/models/backbones/resnest.py b/head_extractor/build/lib/mmseg/models/backbones/resnest.py new file mode 100644 index 0000000000000000000000000000000000000000..3cc380b4460915f476ffc1febcfc145a94fc7c7a --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/resnest.py @@ -0,0 +1,318 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmseg.registry import MODELS +from ..utils import ResLayer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNetV1d + + +class RSoftmax(nn.Module): + """Radix Softmax module in ``SplitAttentionConv2d``. + + Args: + radix (int): Radix of input. + groups (int): Groups of input. + """ + + def __init__(self, radix, groups): + super().__init__() + self.radix = radix + self.groups = groups + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttentionConv2d(nn.Module): + """Split-Attention Conv2d in ResNeSt. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int | tuple[int]): Same as nn.Conv2d. + stride (int | tuple[int]): Same as nn.Conv2d. + padding (int | tuple[int]): Same as nn.Conv2d. + dilation (int | tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels. Default: 4. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + dcn (dict): Config dict for DCN. Default: None. 
+ """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None): + super().__init__() + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.with_dcn = dcn is not None + self.dcn = dcn + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_dcn and not fallback_on_stride: + assert conv_cfg is None, 'conv_cfg must be None for DCN' + conv_cfg = dcn + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + """nn.Module: the normalization layer named "norm0" """ + return getattr(self, self.norm0_name) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + batch = x.size(0) + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + inplane (int): Input planes of this block. + planes (int): Middle planes of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Key word arguments for base class. 
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + """Bottleneck block for ResNeSt.""" + super().__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.with_modulated_dcn = False + self.conv2 = SplitAttentionConv2d( + width, + width, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=self.dcn) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNeSt(ResNetV1d): + """ResNeSt backbone. + + This backbone is the implementation of `ResNeSt: + Split-Attention Networks `_. + + Args: + groups (int): Number of groups of Bottleneck. Default: 1 + base_width (int): Base width of Bottleneck. Default: 4 + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Keyword arguments for ResNet. 
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)), + 200: (Bottleneck, (3, 24, 36, 3)) + } + + def __init__(self, + groups=1, + base_width=4, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + self.groups = groups + self.base_width = base_width + self.radix = radix + self.reduction_factor = reduction_factor + self.avg_down_stride = avg_down_stride + super().__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + radix=self.radix, + reduction_factor=self.reduction_factor, + avg_down_stride=self.avg_down_stride, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/resnet.py b/head_extractor/build/lib/mmseg/models/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..9226c90d85c938e76f322e58643ee9d7b17ba27b --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/resnet.py @@ -0,0 +1,712 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer +from mmengine.model import BaseModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm + +from mmseg.registry import MODELS +from ..utils import ResLayer + + +class BasicBlock(BaseModule): + """Basic block for ResNet.""" + + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + super().__init__(init_cfg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(BaseModule): + """Bottleneck block for ResNet. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is + "caffe", the stride-two layer is the first 1x1 conv layer. 
+ """ + + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + super().__init__(init_cfg) + assert style in ['pytorch', 'caffe'] + assert dcn is None or isinstance(dcn, dict) + assert plugins is None or isinstance(plugins, list) + if plugins is not None: + allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] + assert all(p['position'] in allowed_position for p in plugins) + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.dcn = dcn + self.with_dcn = dcn is not None + self.plugins = plugins + self.with_plugins = plugins is not None + + if self.with_plugins: + # collect plugins for conv1/conv2/conv3 + self.after_conv1_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv1' + ] + self.after_conv2_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv2' + ] + self.after_conv3_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv3' + ] + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + dcn, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + if self.with_plugins: + self.after_conv1_plugin_names = self.make_block_plugins( + planes, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + planes, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + planes * self.expansion, self.after_conv3_plugins) + + def make_block_plugins(self, in_channels, plugins): + """make plugins for block. + + Args: + in_channels (int): Input channels of plugin. + plugins (list[dict]): List of plugins cfg to build. + + Returns: + list[str]: List of the names of plugin. 
+ """ + assert isinstance(plugins, list) + plugin_names = [] + for plugin in plugins: + plugin = plugin.copy() + name, layer = build_plugin_layer( + plugin, + in_channels=in_channels, + postfix=plugin.pop('postfix', '')) + assert not hasattr(self, name), f'duplicate plugin {name}' + self.add_module(name, layer) + plugin_names.append(name) + return plugin_names + + def forward_plugin(self, x, plugin_names): + """Forward function for plugins.""" + out = x + for name in plugin_names: + out = getattr(self, name)(x) + return out + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: normalization layer after the third convolution layer""" + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@MODELS.register_module() +class ResNet(BaseModule): + """ResNet backbone. + + This backbone is the improved implementation of `Deep Residual Learning + for Image Recognition `_. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default: 3. + stem_channels (int): Number of stem channels. Default: 64. + base_channels (int): Number of base channels of res layer. Default: 64. + num_stages (int): Resnet stages, normally 4. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + Default: (1, 2, 2, 2). + dilations (Sequence[int]): Dilation of each stage. + Default: (1, 1, 1, 1). + out_indices (Sequence[int]): Output from which stages. + Default: (0, 1, 2, 3). + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. Default: 'pytorch'. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv. + Default: False. + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. Default: -1. + conv_cfg (dict | None): Dictionary to construct and config conv layer. + When conv_cfg is None, cfg will be set to dict(type='Conv2d'). + Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN', requires_grad=True). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. 
+ dcn (dict | None): Dictionary to construct and config DCN conv layer. + When dcn is not None, conv_cfg must be None. Default: None. + stage_with_dcn (Sequence[bool]): Whether to set DCN conv for each + stage. The length of stage_with_dcn is equal to num_stages. + Default: (False, False, False, False). + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + + - position (str, required): Position inside block to insert plugin, + options: 'after_conv1', 'after_conv2', 'after_conv3'. + + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + Default: None. + multi_grid (Sequence[int]|None): Multi grid dilation rates of last + stage. Default: None. + contract_dilation (bool): Whether contract first dilation of each layer + Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. Default: True. + pretrained (str, optional): model pretrained path. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + + Example: + >>> from mmseg.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=64, + base_channels=64, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + multi_grid=None, + contract_dilation=False, + with_cp=False, + zero_init_residual=True, + pretrained=None, + init_cfg=None): + super().__init__(init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + + self.pretrained = pretrained + self.zero_init_residual = zero_init_residual + block_init_cfg = None + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + block = self.arch_settings[depth][0] + if self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm3')) + else: + raise TypeError('pretrained must be a str or None') + + self.depth = depth + self.stem_channels = stem_channels + self.base_channels = base_channels + 
self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.multi_grid = multi_grid + self.contract_dilation = contract_dilation + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = stem_channels + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + if plugins is not None: + stage_plugins = self.make_stage_plugins(plugins, i) + else: + stage_plugins = None + # multi grid is applied to last layer only + stage_multi_grid = multi_grid if i == len( + self.stage_blocks) - 1 else None + planes = base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=stage_plugins, + multi_grid=stage_multi_grid, + contract_dilation=contract_dilation, + init_cfg=block_init_cfg) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i+1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = self.block.expansion * base_channels * 2**( + len(self.stage_blocks) - 1) + + def make_stage_plugins(self, plugins, stage_idx): + """make plugins for ResNet 'stage_idx'th stage . + + Currently we support to insert 'context_block', + 'empirical_attention_block', 'nonlocal_block' into the backbone like + ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of + Bottleneck. + + An example of plugins format could be : + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True), + ... position='after_conv2'), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='1'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='2'), + ... stages=(True, True, True, True), + ... position='after_conv3') + ... ] + >>> self = ResNet(depth=18) + >>> stage_plugins = self.make_stage_plugins(plugins, 0) + >>> assert len(stage_plugins) == 3 + + Suppose 'stage_idx=0', the structure of blocks in the stage would be: + conv1-> conv2->conv3->yyy->zzz1->zzz2 + Suppose 'stage_idx=1', the structure of blocks in the stage would be: + conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 + + If stages is missing, the plugin would be applied to all stages. + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. 
+ stage_idx (int): Index of stage to build. + + Returns: + list[dict]: Plugins for current stage. + """ + stage_plugins = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + # whether to insert plugin into current stage + if stages is None or stages[stage_idx]: + stage_plugins.append(plugin) + + return stage_plugins + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + """Make stem layer for ResNet.""" + if self.deep_stem: + self.stem = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels)[1], + nn.ReLU(inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze stage parameters and norm statistics.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keeping the + normalization layers frozen.""" + super().train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval has an effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@MODELS.register_module() +class ResNetV1c(ResNet): + """ResNetV1c variant described in [1]_. + + Compared with default ResNet (ResNetV1b), ResNetV1c replaces the 7x7 conv + in the input stem with three 3x3 convs. For more details, please refer to + `Bag of Tricks for Image Classification with Convolutional Neural Networks + <https://arxiv.org/abs/1812.01187>`_. + """ + + def __init__(self, **kwargs): + super().__init__(deep_stem=True, avg_down=False, **kwargs) + + +@MODELS.register_module() +class ResNetV1d(ResNet): + """ResNetV1d variant described in [1]_.
+ + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. + """ + + def __init__(self, **kwargs): + super().__init__(deep_stem=True, avg_down=True, **kwargs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/resnext.py b/head_extractor/build/lib/mmseg/models/backbones/resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..67a244a12f61b78ee12e89e8b45868781208614c --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/resnext.py @@ -0,0 +1,150 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +from mmcv.cnn import build_conv_layer, build_norm_layer + +from mmseg.registry import MODELS +from ..utils import ResLayer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeXt. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is + "caffe", the stride-two layer is the first 1x1 conv layer. + """ + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + **kwargs): + super().__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@MODELS.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + This backbone is the implementation of `Aggregated + Residual Transformations for Deep Neural + Networks `_. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Normally 3. + num_stages (int): Resnet stages, normally 4. + groups (int): Group of resnext. + base_width (int): Base width of resnext. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. 
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from mmseg.models import ResNeXt + >>> import torch + >>> self = ResNeXt(depth=50) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 256, 8, 8) + (1, 512, 4, 4) + (1, 1024, 2, 2) + (1, 2048, 1, 1) + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, groups=1, base_width=4, **kwargs): + self.groups = groups + self.base_width = base_width + super().__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + **kwargs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/stdc.py b/head_extractor/build/lib/mmseg/models/backbones/stdc.py new file mode 100644 index 0000000000000000000000000000000000000000..758a3c92e07dc8d2051f670adf00d163019d758c --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/stdc.py @@ -0,0 +1,422 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Modified from https://github.com/MichaelFan01/STDC-Seg.""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList, Sequential + +from mmseg.registry import MODELS +from ..utils import resize +from .bisenetv1 import AttentionRefinementModule + + +class STDCModule(BaseModule): + """STDCModule. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels before scaling. + stride (int): The number of stride for the first conv layer. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): The activation config for conv layers. + num_convs (int): Numbers of conv layers. + fusion_type (str): Type of fusion operation. Default: 'add'. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
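+ + Example: + >>> # a minimal sketch: with stride=1 and fusion_type='add' the module + >>> # is residual, so in_channels must equal out_channels + >>> import torch + >>> from mmseg.models.backbones.stdc import STDCModule + >>> self = STDCModule(64, 64, stride=1, norm_cfg=dict(type='BN')) + >>> self.eval() + >>> inputs = torch.rand(1, 64, 32, 32) + >>> outputs = self.forward(inputs) + >>> outputs.shape + torch.Size([1, 64, 32, 32])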
+ """ + + def __init__(self, + in_channels, + out_channels, + stride, + norm_cfg=None, + act_cfg=None, + num_convs=4, + fusion_type='add', + init_cfg=None): + super().__init__(init_cfg=init_cfg) + assert num_convs > 1 + assert fusion_type in ['add', 'cat'] + self.stride = stride + self.with_downsample = True if self.stride == 2 else False + self.fusion_type = fusion_type + + self.layers = ModuleList() + conv_0 = ConvModule( + in_channels, out_channels // 2, kernel_size=1, norm_cfg=norm_cfg) + + if self.with_downsample: + self.downsample = ConvModule( + out_channels // 2, + out_channels // 2, + kernel_size=3, + stride=2, + padding=1, + groups=out_channels // 2, + norm_cfg=norm_cfg, + act_cfg=None) + + if self.fusion_type == 'add': + self.layers.append(nn.Sequential(conv_0, self.downsample)) + self.skip = Sequential( + ConvModule( + in_channels, + in_channels, + kernel_size=3, + stride=2, + padding=1, + groups=in_channels, + norm_cfg=norm_cfg, + act_cfg=None), + ConvModule( + in_channels, + out_channels, + 1, + norm_cfg=norm_cfg, + act_cfg=None)) + else: + self.layers.append(conv_0) + self.skip = nn.AvgPool2d(kernel_size=3, stride=2, padding=1) + else: + self.layers.append(conv_0) + + for i in range(1, num_convs): + out_factor = 2**(i + 1) if i != num_convs - 1 else 2**i + self.layers.append( + ConvModule( + out_channels // 2**i, + out_channels // out_factor, + kernel_size=3, + stride=1, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs): + if self.fusion_type == 'add': + out = self.forward_add(inputs) + else: + out = self.forward_cat(inputs) + return out + + def forward_add(self, inputs): + layer_outputs = [] + x = inputs.clone() + for layer in self.layers: + x = layer(x) + layer_outputs.append(x) + if self.with_downsample: + inputs = self.skip(inputs) + + return torch.cat(layer_outputs, dim=1) + inputs + + def forward_cat(self, inputs): + x0 = self.layers[0](inputs) + layer_outputs = [x0] + for i, layer in enumerate(self.layers[1:]): + if i == 0: + if self.with_downsample: + x = layer(self.downsample(x0)) + else: + x = layer(x0) + else: + x = layer(x) + layer_outputs.append(x) + if self.with_downsample: + layer_outputs[0] = self.skip(x0) + return torch.cat(layer_outputs, dim=1) + + +class FeatureFusionModule(BaseModule): + """Feature Fusion Module. This module is different from FeatureFusionModule + in BiSeNetV1. It uses two ConvModules in `self.attention` whose inter + channel number is calculated by given `scale_factor`, while + FeatureFusionModule in BiSeNetV1 only uses one ConvModule in + `self.conv_atten`. + + Args: + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + scale_factor (int): The number of channel scale factor. + Default: 4. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): The activation config for conv layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + in_channels, + out_channels, + scale_factor=4, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + channels = out_channels // scale_factor + self.conv0 = ConvModule( + in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.attention = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + out_channels, + channels, + 1, + norm_cfg=None, + bias=False, + act_cfg=act_cfg), + ConvModule( + channels, + out_channels, + 1, + norm_cfg=None, + bias=False, + act_cfg=None), nn.Sigmoid()) + + def forward(self, spatial_inputs, context_inputs): + inputs = torch.cat([spatial_inputs, context_inputs], dim=1) + x = self.conv0(inputs) + attn = self.attention(x) + x_attn = x * attn + return x_attn + x + + +@MODELS.register_module() +class STDCNet(BaseModule): + """This backbone is the implementation of `Rethinking BiSeNet For Real-time + Semantic Segmentation `_. + + Args: + stdc_type (int): The type of backbone structure, + `STDCNet1` and`STDCNet2` denotes two main backbones in paper, + whose FLOPs is 813M and 1446M, respectively. + in_channels (int): The num of input_channels. + channels (tuple[int]): The output channels for each stage. + bottleneck_type (str): The type of STDC Module type, the value must + be 'add' or 'cat'. + norm_cfg (dict): Config dict for normalization layer. + act_cfg (dict): The activation config for conv layers. + num_convs (int): Numbers of conv layer at each STDC Module. + Default: 4. + with_final_conv (bool): Whether add a conv layer at the Module output. + Default: True. + pretrained (str, optional): Model pretrained path. Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + + Example: + >>> import torch + >>> stdc_type = 'STDCNet1' + >>> in_channels = 3 + >>> channels = (32, 64, 256, 512, 1024) + >>> bottleneck_type = 'cat' + >>> inputs = torch.rand(1, 3, 1024, 2048) + >>> self = STDCNet(stdc_type, in_channels, + ... channels, bottleneck_type).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 256, 128, 256]) + outputs[1].shape = torch.Size([1, 512, 64, 128]) + outputs[2].shape = torch.Size([1, 1024, 32, 64]) + """ + + arch_settings = { + 'STDCNet1': [(2, 1), (2, 1), (2, 1)], + 'STDCNet2': [(2, 1, 1, 1), (2, 1, 1, 1, 1), (2, 1, 1)] + } + + def __init__(self, + stdc_type, + in_channels, + channels, + bottleneck_type, + norm_cfg, + act_cfg, + num_convs=4, + with_final_conv=False, + pretrained=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + assert stdc_type in self.arch_settings, \ + f'invalid structure {stdc_type} for STDCNet.' + assert bottleneck_type in ['add', 'cat'],\ + f'bottleneck_type must be `add` or `cat`, got {bottleneck_type}' + + assert len(channels) == 5,\ + f'invalid channels length {len(channels)} for STDCNet.' 
+ + self.in_channels = in_channels + self.channels = channels + self.stage_strides = self.arch_settings[stdc_type] + self.pretrained = pretrained + self.num_convs = num_convs + self.with_final_conv = with_final_conv + + self.stages = ModuleList([ + ConvModule( + self.in_channels, + self.channels[0], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + self.channels[0], + self.channels[1], + kernel_size=3, + stride=2, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + ]) + # `self.num_shallow_features` is the number of shallow modules in + # `STDCNet`, which are noted as `Stage1` and `Stage2` in the original + # paper. Neither is used by the following modules (the Attention + # Refinement Module and the Feature Fusion Module), so they are cut + # from `outs`. Please refer to Figure 4 of the original paper for + # more details. + self.num_shallow_features = len(self.stages) + + for strides in self.stage_strides: + idx = len(self.stages) - 1 + self.stages.append( + self._make_stage(self.channels[idx], self.channels[idx + 1], + strides, norm_cfg, act_cfg, bottleneck_type)) + # After appending, `self.stages` is a ModuleList including several + # shallow modules and STDCModules. + # (len(self.stages) == + # self.num_shallow_features + len(self.stage_strides)) + if self.with_final_conv: + self.final_conv = ConvModule( + self.channels[-1], + max(1024, self.channels[-1]), + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def _make_stage(self, in_channels, out_channels, strides, norm_cfg, + act_cfg, bottleneck_type): + layers = [] + for i, stride in enumerate(strides): + layers.append( + STDCModule( + in_channels if i == 0 else out_channels, + out_channels, + stride, + norm_cfg, + act_cfg, + num_convs=self.num_convs, + fusion_type=bottleneck_type)) + return Sequential(*layers) + + def forward(self, x): + outs = [] + for stage in self.stages: + x = stage(x) + outs.append(x) + if self.with_final_conv: + outs[-1] = self.final_conv(outs[-1]) + outs = outs[self.num_shallow_features:] + return tuple(outs) + + +@MODELS.register_module() +class STDCContextPathNet(BaseModule): + """STDCNet with Context Path. The `outs` below is a list of three feature + maps from deep to shallow, whose heights and widths go from small to big, + respectively. The biggest feature map of `outs` is output to `STDCHead`, + where the Detail Loss is calculated against the detail ground truth. + The other two feature maps are fed to the Attention Refinement Modules, + respectively. Besides, the biggest feature map of `outs` and the last + output of the Attention Refinement Module are concatenated for the Feature + Fusion Module. Then, this fused feature map `feat_fuse` is output to + `decode_head`. For more details, please refer to Figure 4 of the original + paper. + + Args: + backbone_cfg (dict): Config dict for stdc backbone. + last_in_channels (tuple(int)): The number of channels of the last + two feature maps from the stdc backbone. Default: (1024, 512). + out_channels (int): The channels of output feature maps. + Default: 128. + ffm_cfg (dict): Config dict for Feature Fusion Module. Default: + `dict(in_channels=512, out_channels=256, scale_factor=4)`. + upsample_mode (str): Algorithm used for upsampling: + ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` | + ``'trilinear'``. Default: ``'nearest'``. + align_corners (bool, optional): align_corners argument of + F.interpolate. It must be `None` if upsample_mode is + ``'nearest'``. Default: None. + norm_cfg (dict): Config dict for normalization layer
+ Default: dict(type='BN'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + + Return: + outputs (tuple): The tuple of list of output feature map for + auxiliary heads and decoder head. + """ + + def __init__(self, + backbone_cfg, + last_in_channels=(1024, 512), + out_channels=128, + ffm_cfg=dict( + in_channels=512, out_channels=256, scale_factor=4), + upsample_mode='nearest', + align_corners=None, + norm_cfg=dict(type='BN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.backbone = MODELS.build(backbone_cfg) + self.arms = ModuleList() + self.convs = ModuleList() + for channels in last_in_channels: + self.arms.append(AttentionRefinementModule(channels, out_channels)) + self.convs.append( + ConvModule( + out_channels, + out_channels, + 3, + padding=1, + norm_cfg=norm_cfg)) + self.conv_avg = ConvModule( + last_in_channels[0], out_channels, 1, norm_cfg=norm_cfg) + + self.ffm = FeatureFusionModule(**ffm_cfg) + + self.upsample_mode = upsample_mode + self.align_corners = align_corners + + def forward(self, x): + outs = list(self.backbone(x)) + avg = F.adaptive_avg_pool2d(outs[-1], 1) + avg_feat = self.conv_avg(avg) + + feature_up = resize( + avg_feat, + size=outs[-1].shape[2:], + mode=self.upsample_mode, + align_corners=self.align_corners) + arms_out = [] + for i in range(len(self.arms)): + x_arm = self.arms[i](outs[len(outs) - 1 - i]) + feature_up + feature_up = resize( + x_arm, + size=outs[len(outs) - 1 - i - 1].shape[2:], + mode=self.upsample_mode, + align_corners=self.align_corners) + feature_up = self.convs[i](feature_up) + arms_out.append(feature_up) + + feat_fuse = self.ffm(outs[0], arms_out[1]) + + # The `outputs` has four feature maps. + # `outs[0]` is outputted for `STDCHead` auxiliary head. + # Two feature maps of `arms_out` are outputted for auxiliary head. + # `feat_fuse` is outputted for decoder head. + outputs = [outs[0]] + list(arms_out) + [feat_fuse] + return tuple(outputs) diff --git a/head_extractor/build/lib/mmseg/models/backbones/swin.py b/head_extractor/build/lib/mmseg/models/backbones/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..67b28a96e15fe81e8213d67518d664383a4fd255 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/swin.py @@ -0,0 +1,757 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, build_dropout +from mmengine.logging import print_log +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, trunc_normal_, + trunc_normal_init) +from mmengine.runner import CheckpointLoader +from mmengine.utils import to_2tuple + +from mmseg.registry import MODELS +from ..utils.embed import PatchEmbed, PatchMerging + + +class WindowMSA(BaseModule): + """Window based multi-head self-attention (W-MSA) module with relative + position bias. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. 
+ attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + init_cfg (dict | None, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # About 2x faster than original impl + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def init_weights(self): + trunc_normal_(self.relative_position_bias_table, std=0.02) + + def forward(self, x, mask=None): + """ + Args: + + x (tensor): input features with shape of (num_windows*B, N, C) + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. + """ + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + # make torchscript happy (cannot use tensor as tuple) + q, k, v = qkv[0], qkv[1], qkv[2] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class ShiftWindowMSA(BaseModule): + """Shifted Window Multihead Self-Attention Module. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): The height and width of the window. + shift_size (int, optional): The shift step of each window towards + right-bottom. If zero, act as regular window-msa. Defaults to 0. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Defaults: None. 
+ attn_drop_rate (float, optional): Dropout ratio of attention weight. + Defaults: 0. + proj_drop_rate (float, optional): Dropout ratio of output. + Defaults: 0. + dropout_layer (dict, optional): The dropout_layer used before output. + Defaults: dict(type='DropPath', drop_prob=0.). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + window_size, + shift_size=0, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0, + proj_drop_rate=0, + dropout_layer=dict(type='DropPath', drop_prob=0.), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.window_size = window_size + self.shift_size = shift_size + assert 0 <= self.shift_size < self.window_size + + self.w_msa = WindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=to_2tuple(window_size), + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + init_cfg=None) + + self.drop = build_dropout(dropout_layer) + + def forward(self, query, hw_shape): + B, L, C = query.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + query = query.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) + H_pad, W_pad = query.shape[1], query.shape[2] + + # cyclic shift + if self.shift_size > 0: + shifted_query = torch.roll( + query, + shifts=(-self.shift_size, -self.shift_size), + dims=(1, 2)) + + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + # nW, window_size, window_size, 1 + mask_windows = self.window_partition(img_mask) + mask_windows = mask_windows.view( + -1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + else: + shifted_query = query + attn_mask = None + + # nW*B, window_size, window_size, C + query_windows = self.window_partition(shifted_query) + # nW*B, window_size*window_size, C + query_windows = query_windows.view(-1, self.window_size**2, C) + + # W-MSA/SW-MSA (nW*B, window_size*window_size, C) + attn_windows = self.w_msa(query_windows, mask=attn_mask) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, C) + + # B H' W' C + shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + x = self.drop(x) + return x + + def window_reverse(self, windows, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + window_size = self.window_size + B = int(windows.shape[0] / (H * W / window_size / 
window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + def window_partition(self, x): + """ + Args: + x: (B, H, W, C) + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + window_size = self.window_size + x = x.view(B, H // window_size, window_size, W // window_size, + window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() + windows = windows.view(-1, window_size, window_size, C) + return windows + + +class SwinBlock(BaseModule): + """" + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + window_size (int, optional): The local window scale. Default: 7. + shift (bool, optional): whether to shift window or not. Default False. + qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float, optional): Stochastic depth rate. Default: 0. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + window_size=7, + shift=False, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + + self.with_cp = with_cp + + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + self.attn = ShiftWindowMSA( + embed_dims=embed_dims, + num_heads=num_heads, + window_size=window_size, + shift_size=window_size // 2 if shift else 0, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + init_cfg=None) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=2, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=True, + init_cfg=None) + + def forward(self, x, hw_shape): + + def _inner_forward(x): + identity = x + x = self.norm1(x) + x = self.attn(x, hw_shape) + + x = x + identity + + identity = x + x = self.norm2(x) + x = self.ffn(x, identity=identity) + + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + + return x + + +class SwinBlockSequence(BaseModule): + """Implements one stage in Swin Transformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + depth (int): The number of blocks in this stage. + window_size (int, optional): The local window scale. Default: 7. 
+ qkv_bias (bool, optional): enable bias for qkv if True. Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. Default: 0. + drop_path_rate (float | list[float], optional): Stochastic depth + rate. Default: 0. + downsample (BaseModule | None, optional): The downsample operation + module. Default: None. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + depth, + window_size=7, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + downsample=None, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + if isinstance(drop_path_rate, list): + drop_path_rates = drop_path_rate + assert len(drop_path_rates) == depth + else: + drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)] + + self.blocks = ModuleList() + for i in range(depth): + block = SwinBlock( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=feedforward_channels, + window_size=window_size, + shift=False if i % 2 == 0 else True, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=drop_path_rates[i], + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + init_cfg=None) + self.blocks.append(block) + + self.downsample = downsample + + def forward(self, x, hw_shape): + for block in self.blocks: + x = block(x, hw_shape) + + if self.downsample: + x_down, down_hw_shape = self.downsample(x, hw_shape) + return x_down, down_hw_shape, x, hw_shape + else: + return x, hw_shape, x, hw_shape + + +@MODELS.register_module() +class SwinTransformer(BaseModule): + """Swin Transformer backbone. + + This backbone is the implementation of `Swin Transformer: + Hierarchical Vision Transformer using Shifted + Windows `_. + Inspiration from https://github.com/microsoft/Swin-Transformer. + + Args: + pretrain_img_size (int | tuple[int]): The size of input image when + pretrain. Defaults: 224. + in_channels (int): The num of input channels. + Defaults: 3. + embed_dims (int): The feature dimension. Default: 96. + patch_size (int | tuple[int]): Patch size. Default: 4. + window_size (int): Window size. Default: 7. + mlp_ratio (int | float): Ratio of mlp hidden dim to embedding dim. + Default: 4. + depths (tuple[int]): Depths of each Swin Transformer stage. + Default: (2, 2, 6, 2). + num_heads (tuple[int]): Parallel attention heads of each Swin + Transformer stage. Default: (3, 6, 12, 24). + strides (tuple[int]): The patch merging or patch embedding stride of + each Swin Transformer stage. (In swin, we set kernel size equal to + stride.) Default: (4, 2, 2, 2). + out_indices (tuple[int]): Output from which stages. + Default: (0, 1, 2, 3). + qkv_bias (bool, optional): If True, add a learnable bias to query, key, + value. Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. 
Default: None. + patch_norm (bool): Whether to add a norm layer for patch embed and + patch merging. Default: True. + drop_rate (float): Dropout rate. Defaults: 0. + attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. + use_abs_pos_embed (bool): If True, add absolute position embedding to + the patch embedding. Defaults: False. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer at + output of backbone. Defaults: dict(type='LN'). + with_cp (bool, optional): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. + pretrained (str, optional): Model pretrained path. Default: None. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. + """ + + def __init__(self, + pretrain_img_size=224, + in_channels=3, + embed_dims=96, + patch_size=4, + window_size=7, + mlp_ratio=4, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + strides=(4, 2, 2, 2), + out_indices=(0, 1, 2, 3), + qkv_bias=True, + qk_scale=None, + patch_norm=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + use_abs_pos_embed=False, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + with_cp=False, + pretrained=None, + frozen_stages=-1, + init_cfg=None): + self.frozen_stages = frozen_stages + + if isinstance(pretrain_img_size, int): + pretrain_img_size = to_2tuple(pretrain_img_size) + elif isinstance(pretrain_img_size, tuple): + if len(pretrain_img_size) == 1: + pretrain_img_size = to_2tuple(pretrain_img_size[0]) + assert len(pretrain_img_size) == 2, \ + f'The size of image should have length 1 or 2, ' \ + f'but got {len(pretrain_img_size)}' + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be specified at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + init_cfg = init_cfg + else: + raise TypeError('pretrained must be a str or None') + + super().__init__(init_cfg=init_cfg) + + num_layers = len(depths) + self.out_indices = out_indices + self.use_abs_pos_embed = use_abs_pos_embed + + assert strides[0] == patch_size, 'Use non-overlapping patch embed.'
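+ # Since strides[0] == patch_size, the patch embedding below is a + # single non-overlapping Conv2d: an (H, W) input becomes + # H//patch_size * W//patch_size tokens of dimension embed_dims.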
+ + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=strides[0], + padding='corner', + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + + if self.use_abs_pos_embed: + patch_row = pretrain_img_size[0] // patch_size + patch_col = pretrain_img_size[1] // patch_size + num_patches = patch_row * patch_col + self.absolute_pos_embed = nn.Parameter( + torch.zeros((1, num_patches, embed_dims))) + + self.drop_after_pos = nn.Dropout(p=drop_rate) + + # set stochastic depth decay rule + total_depth = sum(depths) + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, total_depth) + ] + + self.stages = ModuleList() + in_channels = embed_dims + for i in range(num_layers): + if i < num_layers - 1: + downsample = PatchMerging( + in_channels=in_channels, + out_channels=2 * in_channels, + stride=strides[i + 1], + norm_cfg=norm_cfg if patch_norm else None, + init_cfg=None) + else: + downsample = None + + stage = SwinBlockSequence( + embed_dims=in_channels, + num_heads=num_heads[i], + feedforward_channels=int(mlp_ratio * in_channels), + depth=depths[i], + window_size=window_size, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop_rate=drop_rate, + attn_drop_rate=attn_drop_rate, + drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])], + downsample=downsample, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp, + init_cfg=None) + self.stages.append(stage) + if downsample: + in_channels = downsample.out_channels + + self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] + # Add a norm layer for each output + for i in out_indices: + layer = build_norm_layer(norm_cfg, self.num_features[i])[1] + layer_name = f'norm{i}' + self.add_module(layer_name, layer) + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super().train(mode) + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + if self.use_abs_pos_embed: + self.absolute_pos_embed.requires_grad = False + self.drop_after_pos.eval() + + for i in range(1, self.frozen_stages + 1): + + if (i - 1) in self.out_indices: + norm_layer = getattr(self, f'norm{i-1}') + norm_layer.eval() + for param in norm_layer.parameters(): + param.requires_grad = False + + m = self.stages[i - 1] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self): + if self.init_cfg is None: + print_log(f'No pre-trained weights for ' + f'{self.__class__.__name__}, ' + f'training start from scratch') + if self.use_abs_pos_embed: + trunc_normal_(self.absolute_pos_embed, std=0.02) + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, val=1.0, bias=0.) 
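+ # Otherwise `init_cfg` is expected to hold a `Pretrained` checkpoint; + # it is loaded below and adapted to this model (prefix stripping, + # absolute position embedding reshape, relative position bias table + # interpolation) before `load_state_dict`.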
+ else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + ckpt = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + if 'state_dict' in ckpt: + _state_dict = ckpt['state_dict'] + elif 'model' in ckpt: + _state_dict = ckpt['model'] + else: + _state_dict = ckpt + + state_dict = OrderedDict() + for k, v in _state_dict.items(): + if k.startswith('backbone.'): + state_dict[k[9:]] = v + else: + state_dict[k] = v + + # strip prefix of state_dict + if list(state_dict.keys())[0].startswith('module.'): + state_dict = {k[7:]: v for k, v in state_dict.items()} + + # reshape absolute position embedding + if state_dict.get('absolute_pos_embed') is not None: + absolute_pos_embed = state_dict['absolute_pos_embed'] + N1, L, C1 = absolute_pos_embed.size() + N2, C2, H, W = self.absolute_pos_embed.size() + if N1 != N2 or C1 != C2 or L != H * W: + print_log('Error in loading absolute_pos_embed, pass') + else: + state_dict['absolute_pos_embed'] = absolute_pos_embed.view( + N2, H, W, C2).permute(0, 3, 1, 2).contiguous() + + # interpolate position bias table if needed + relative_position_bias_table_keys = [ + k for k in state_dict.keys() + if 'relative_position_bias_table' in k + ] + for table_key in relative_position_bias_table_keys: + table_pretrained = state_dict[table_key] + if table_key in self.state_dict(): + table_current = self.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + print_log(f'Error in loading {table_key}, pass') + elif L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = F.interpolate( + table_pretrained.permute(1, 0).reshape( + 1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + state_dict[table_key] = table_pretrained_resized.view( + nH2, L2).permute(1, 0).contiguous() + + # load state_dict + self.load_state_dict(state_dict, strict=False) + + def forward(self, x): + x, hw_shape = self.patch_embed(x) + + if self.use_abs_pos_embed: + x = x + self.absolute_pos_embed + x = self.drop_after_pos(x) + + outs = [] + for i, stage in enumerate(self.stages): + x, hw_shape, out, out_hw_shape = stage(x, hw_shape) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(out) + out = out.view(-1, *out_hw_shape, + self.num_features[i]).permute(0, 3, 1, + 2).contiguous() + outs.append(out) + + return outs diff --git a/head_extractor/build/lib/mmseg/models/backbones/timm_backbone.py b/head_extractor/build/lib/mmseg/models/backbones/timm_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..1eef302bddeac3cee71412bcb481b68b796e515f --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/timm_backbone.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +try: + import timm +except ImportError: + timm = None + +from mmengine.model import BaseModule +from mmengine.registry import MODELS as MMENGINE_MODELS + +from mmseg.registry import MODELS + + +@MODELS.register_module() +class TIMMBackbone(BaseModule): + """Wrapper to use backbones from timm library. More details can be found in + `timm `_ . + + Args: + model_name (str): Name of timm model to instantiate. + pretrained (bool): Load pretrained weights if True. + checkpoint_path (str): Path of checkpoint to load after + model is initialized. + in_channels (int): Number of input image channels. Default: 3. 
+ init_cfg (dict, optional): Initialization config dict. + **kwargs: Other timm & model specific arguments. + """ + + def __init__( + self, + model_name, + features_only=True, + pretrained=True, + checkpoint_path='', + in_channels=3, + init_cfg=None, + **kwargs, + ): + if timm is None: + raise RuntimeError('timm is not installed') + super().__init__(init_cfg) + if 'norm_layer' in kwargs: + kwargs['norm_layer'] = MMENGINE_MODELS.get(kwargs['norm_layer']) + self.timm_model = timm.create_model( + model_name=model_name, + features_only=features_only, + pretrained=pretrained, + in_chans=in_channels, + checkpoint_path=checkpoint_path, + **kwargs, + ) + + # Make unused parameters None + self.timm_model.global_pool = None + self.timm_model.fc = None + self.timm_model.classifier = None + + # Hack to use pretrained weights from timm + if pretrained or checkpoint_path: + self._is_init = True + + def forward(self, x): + features = self.timm_model(x) + return features diff --git a/head_extractor/build/lib/mmseg/models/backbones/twins.py b/head_extractor/build/lib/mmseg/models/backbones/twins.py new file mode 100644 index 0000000000000000000000000000000000000000..b6a6eea795cf53bee6b52ece80d5d90ecc969970 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/twins.py @@ -0,0 +1,588 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.transformer import FFN +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, normal_init, + trunc_normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmseg.models.backbones.mit import EfficientMultiheadAttention +from mmseg.registry import MODELS +from ..utils.embed import PatchEmbed + + +class GlobalSubsampledAttention(EfficientMultiheadAttention): + """Global Sub-sampled Attention (Spatial Reduction Attention). + + This module is modified from EfficientMultiheadAttention, + which is a module from mmseg.models.backbones.mit.py. + Specifically, there is no difference between + `GlobalSubsampledAttention` and `EfficientMultiheadAttention`; + `GlobalSubsampledAttention` is built as a brand-new class only + because the operation is named `Global sub-sampled attention (GSA)` + in the paper. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. Default: None. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dims) + or (n, batch, embed_dims). Default: True. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (int): The ratio of spatial reduction of GSA of PCPVT. + Default: 1. + init_cfg (dict, optional): The Config for initialization. + Defaults to None.
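+ + Example: + >>> # a minimal sketch; shapes follow batch_first=True, and sr_ratio=2 + >>> # sub-samples the 16x16 token grid before computing keys/values + >>> import torch + >>> from mmseg.models.backbones.twins import GlobalSubsampledAttention + >>> self = GlobalSubsampledAttention(embed_dims=64, num_heads=2, sr_ratio=2) + >>> self.eval() + >>> x = torch.rand(1, 16 * 16, 64) + >>> self(x, (16, 16)).shape + torch.Size([1, 256, 64])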
+ """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=None, + batch_first=True, + qkv_bias=True, + norm_cfg=dict(type='LN'), + sr_ratio=1, + init_cfg=None): + super().__init__( + embed_dims, + num_heads, + attn_drop=attn_drop, + proj_drop=proj_drop, + dropout_layer=dropout_layer, + batch_first=batch_first, + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + sr_ratio=sr_ratio, + init_cfg=init_cfg) + + +class GSAEncoderLayer(BaseModule): + """Implements one encoder layer with GSA. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed + after the feed forward layer. Default: 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): Stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): Enable bias for qkv if True. Default: True + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + sr_ratio (float): Kernel_size of conv in Attention modules. Default: 1. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. + """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + num_fcs=2, + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + sr_ratio=1., + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1] + self.attn = GlobalSubsampledAttention( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + qkv_bias=qkv_bias, + norm_cfg=norm_cfg, + sr_ratio=sr_ratio) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=num_fcs, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=False) + + self.drop_path = build_dropout( + dict(type='DropPath', drop_prob=drop_path_rate) + ) if drop_path_rate > 0. else nn.Identity() + + def forward(self, x, hw_shape): + x = x + self.drop_path(self.attn(self.norm1(x), hw_shape, identity=0.)) + x = x + self.drop_path(self.ffn(self.norm2(x))) + return x + + +class LocallyGroupedSelfAttention(BaseModule): + """Locally-grouped Self Attention (LSA) module. + + Args: + embed_dims (int): Number of input channels. + num_heads (int): Number of attention heads. Default: 8 + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: False. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + proj_drop_rate (float, optional): Dropout ratio of output. Default: 0. + window_size(int): Window size of LSA. Default: 1. + init_cfg (dict, optional): The Config for initialization. + Defaults to None. 
+ """ + + def __init__(self, + embed_dims, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + window_size=1, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + assert embed_dims % num_heads == 0, f'dim {embed_dims} should be ' \ + f'divided by num_heads ' \ + f'{num_heads}.' + self.embed_dims = embed_dims + self.num_heads = num_heads + head_dim = embed_dims // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + self.window_size = window_size + + def forward(self, x, hw_shape): + b, n, c = x.shape + h, w = hw_shape + x = x.view(b, h, w, c) + + # pad feature maps to multiples of Local-groups + pad_l = pad_t = 0 + pad_r = (self.window_size - w % self.window_size) % self.window_size + pad_b = (self.window_size - h % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + + # calculate attention mask for LSA + Hp, Wp = x.shape[1:-1] + _h, _w = Hp // self.window_size, Wp // self.window_size + mask = torch.zeros((1, Hp, Wp), device=x.device) + mask[:, -pad_b:, :].fill_(1) + mask[:, :, -pad_r:].fill_(1) + + # [B, _h, _w, window_size, window_size, C] + x = x.reshape(b, _h, self.window_size, _w, self.window_size, + c).transpose(2, 3) + mask = mask.reshape(1, _h, self.window_size, _w, + self.window_size).transpose(2, 3).reshape( + 1, _h * _w, + self.window_size * self.window_size) + # [1, _h*_w, window_size*window_size, window_size*window_size] + attn_mask = mask.unsqueeze(2) - mask.unsqueeze(3) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-1000.0)).masked_fill( + attn_mask == 0, float(0.0)) + + # [3, B, _w*_h, nhead, window_size*window_size, dim] + qkv = self.qkv(x).reshape(b, _h * _w, + self.window_size * self.window_size, 3, + self.num_heads, c // self.num_heads).permute( + 3, 0, 1, 4, 2, 5) + q, k, v = qkv[0], qkv[1], qkv[2] + # [B, _h*_w, n_head, window_size*window_size, window_size*window_size] + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn + attn_mask.unsqueeze(2) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + attn = (attn @ v).transpose(2, 3).reshape(b, _h, _w, self.window_size, + self.window_size, c) + x = attn.transpose(2, 3).reshape(b, _h * self.window_size, + _w * self.window_size, c) + if pad_r > 0 or pad_b > 0: + x = x[:, :h, :w, :].contiguous() + + x = x.reshape(b, n, c) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LSAEncoderLayer(BaseModule): + """Implements one encoder layer in Twins-SVT. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed + after the feed forward layer. Default: 0.0. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Default: 0.0 + drop_path_rate (float): Stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): Enable bias for qkv if True. Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). 
+        window_size (int): Window size of LSA. Default: 1.
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 num_heads,
+                 feedforward_channels,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 num_fcs=2,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='LN'),
+                 window_size=1,
+                 init_cfg=None):
+
+        super().__init__(init_cfg=init_cfg)
+
+        self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1]
+        self.attn = LocallyGroupedSelfAttention(embed_dims, num_heads,
+                                                qkv_bias, qk_scale,
+                                                attn_drop_rate, drop_rate,
+                                                window_size)
+
+        self.norm2 = build_norm_layer(norm_cfg, embed_dims, postfix=2)[1]
+        self.ffn = FFN(
+            embed_dims=embed_dims,
+            feedforward_channels=feedforward_channels,
+            num_fcs=num_fcs,
+            ffn_drop=drop_rate,
+            dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
+            act_cfg=act_cfg,
+            add_identity=False)
+
+        self.drop_path = build_dropout(
+            dict(type='DropPath', drop_prob=drop_path_rate)
+        ) if drop_path_rate > 0. else nn.Identity()
+
+    def forward(self, x, hw_shape):
+        x = x + self.drop_path(self.attn(self.norm1(x), hw_shape))
+        x = x + self.drop_path(self.ffn(self.norm2(x)))
+        return x
+
+
+class ConditionalPositionEncoding(BaseModule):
+    """The Conditional Position Encoding (CPE) module.
+
+    The CPE is the implementation of `Conditional Positional Encodings
+    for Vision Transformers <https://arxiv.org/abs/2102.10882>`_.
+
+    Args:
+        in_channels (int): Number of input channels.
+        embed_dims (int): The feature dimension. Default: 768.
+        stride (int): Stride of conv layer. Default: 1.
+    """
+
+    def __init__(self, in_channels, embed_dims=768, stride=1, init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.proj = nn.Conv2d(
+            in_channels,
+            embed_dims,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            bias=True,
+            groups=embed_dims)
+        self.stride = stride
+
+    def forward(self, x, hw_shape):
+        b, n, c = x.shape
+        h, w = hw_shape
+        feat_token = x
+        cnn_feat = feat_token.transpose(1, 2).view(b, c, h, w)
+        if self.stride == 1:
+            x = self.proj(cnn_feat) + cnn_feat
+        else:
+            x = self.proj(cnn_feat)
+        x = x.flatten(2).transpose(1, 2)
+        return x
+
+
+@MODELS.register_module()
+class PCPVT(BaseModule):
+    """The backbone of Twins-PCPVT.
+
+    This backbone is the implementation of `Twins: Revisiting the Design
+    of Spatial Attention in Vision Transformers
+    <https://arxiv.org/abs/2104.13840>`_.
+
+    Args:
+        in_channels (int): Number of input channels. Default: 3.
+        embed_dims (list): Embedding dimension. Default: [64, 128, 256, 512].
+        patch_sizes (list): The patch sizes. Default: [4, 2, 2, 2].
+        strides (list): The strides. Default: [4, 2, 2, 2].
+        num_heads (list): Number of attention heads. Default: [1, 2, 4, 8].
+        mlp_ratios (list): Ratio of mlp hidden dim to embedding dim.
+            Default: [4, 4, 4, 4].
+        out_indices (tuple[int]): Output from which stages.
+            Default: (0, 1, 2, 3).
+        qkv_bias (bool): Enable bias for qkv if True. Default: False.
+        drop_rate (float): Probability of an element to be zeroed.
+            Default 0.
+        attn_drop_rate (float): The drop out rate for attention layer.
+            Default 0.0
+        drop_path_rate (float): Stochastic depth rate. Default 0.0
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN')
+        depths (list): Depths of each stage. Default [3, 4, 6, 3]
+        sr_ratios (list): Kernel size of the spatial-reduction conv in each
+            Attn module of the Transformer encoder layers.
+            Default: [8, 4, 2, 1].
+        norm_after_stage (bool): Add extra norm. Default False.
+        init_cfg (dict, optional): The Config for initialization.
+ Defaults to None. + """ + + def __init__(self, + in_channels=3, + embed_dims=[64, 128, 256, 512], + patch_sizes=[4, 2, 2, 2], + strides=[4, 2, 2, 2], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + out_indices=(0, 1, 2, 3), + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN'), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + norm_after_stage=False, + pretrained=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be set at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + self.depths = depths + + # patch_embed + self.patch_embeds = ModuleList() + self.position_encoding_drops = ModuleList() + self.layers = ModuleList() + + for i in range(len(depths)): + self.patch_embeds.append( + PatchEmbed( + in_channels=in_channels if i == 0 else embed_dims[i - 1], + embed_dims=embed_dims[i], + conv_type='Conv2d', + kernel_size=patch_sizes[i], + stride=strides[i], + padding='corner', + norm_cfg=norm_cfg)) + + self.position_encoding_drops.append(nn.Dropout(p=drop_rate)) + + self.position_encodings = ModuleList([ + ConditionalPositionEncoding(embed_dim, embed_dim) + for embed_dim in embed_dims + ]) + + # transformer encoder + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + + for k in range(len(depths)): + _block = ModuleList([ + GSAEncoderLayer( + embed_dims=embed_dims[k], + num_heads=num_heads[k], + feedforward_channels=mlp_ratios[k] * embed_dims[k], + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate, + drop_path_rate=dpr[cur + i], + num_fcs=2, + qkv_bias=qkv_bias, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + sr_ratio=sr_ratios[k]) for i in range(depths[k]) + ]) + self.layers.append(_block) + cur += depths[k] + + self.norm_name, norm = build_norm_layer( + norm_cfg, embed_dims[-1], postfix=1) + + self.out_indices = out_indices + self.norm_after_stage = norm_after_stage + if self.norm_after_stage: + self.norm_list = ModuleList() + for dim in embed_dims: + self.norm_list.append(build_norm_layer(norm_cfg, dim)[1]) + + def init_weights(self): + if self.init_cfg is not None: + super().init_weights() + else: + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m, val=1.0, bias=0.) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init( + m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0) + + def forward(self, x): + outputs = list() + + b = x.shape[0] + + for i in range(len(self.depths)): + x, hw_shape = self.patch_embeds[i](x) + h, w = hw_shape + x = self.position_encoding_drops[i](x) + for j, blk in enumerate(self.layers[i]): + x = blk(x, hw_shape) + if j == 0: + x = self.position_encodings[i](x, hw_shape) + if self.norm_after_stage: + x = self.norm_list[i](x) + x = x.reshape(b, h, w, -1).permute(0, 3, 1, 2).contiguous() + + if i in self.out_indices: + outputs.append(x) + + return tuple(outputs) + + +@MODELS.register_module() +class SVT(PCPVT): + """The backbone of Twins-SVT. 
+
+    This backbone is the implementation of `Twins: Revisiting the Design
+    of Spatial Attention in Vision Transformers
+    <https://arxiv.org/abs/2104.13840>`_.
+
+    Args:
+        in_channels (int): Number of input channels. Default: 3.
+        embed_dims (list): Embedding dimension. Default: [64, 128, 256].
+        patch_sizes (list): The patch sizes. Default: [4, 2, 2, 2].
+        strides (list): The strides. Default: [4, 2, 2, 2].
+        num_heads (list): Number of attention heads. Default: [1, 2, 4].
+        mlp_ratios (list): Ratio of mlp hidden dim to embedding dim.
+            Default: [4, 4, 4].
+        out_indices (tuple[int]): Output from which stages.
+            Default: (0, 1, 2, 3).
+        qkv_bias (bool): Enable bias for qkv if True. Default: False.
+        drop_rate (float): Dropout rate. Default 0.
+        attn_drop_rate (float): Dropout ratio of attention weight.
+            Default 0.0
+        drop_path_rate (float): Stochastic depth rate. Default 0.2.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN')
+        depths (list): Depths of each stage. Default [4, 4, 4].
+        sr_ratios (list): Kernel size of the spatial-reduction conv in each
+            Attn module of the Transformer encoder layers. Default: [4, 2, 1].
+        window_sizes (list): Window sizes of LSA. Default: [7, 7, 7].
+        norm_after_stage (bool): Add extra norm. Default False.
+        init_cfg (dict, optional): The Config for initialization.
+            Defaults to None.
+    """
+
+    def __init__(self,
+                 in_channels=3,
+                 embed_dims=[64, 128, 256],
+                 patch_sizes=[4, 2, 2, 2],
+                 strides=[4, 2, 2, 2],
+                 num_heads=[1, 2, 4],
+                 mlp_ratios=[4, 4, 4],
+                 out_indices=(0, 1, 2, 3),
+                 qkv_bias=False,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_cfg=dict(type='LN'),
+                 depths=[4, 4, 4],
+                 sr_ratios=[4, 2, 1],
+                 window_sizes=[7, 7, 7],
+                 norm_after_stage=True,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(in_channels, embed_dims, patch_sizes, strides,
+                         num_heads, mlp_ratios, out_indices, qkv_bias,
+                         drop_rate, attn_drop_rate, drop_path_rate, norm_cfg,
+                         depths, sr_ratios, norm_after_stage, pretrained,
+                         init_cfg)
+        # transformer encoder
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
+        ]  # stochastic depth decay rule
+
+        for k in range(len(depths)):
+            for i in range(depths[k]):
+                if i % 2 == 0:
+                    self.layers[k][i] = \
+                        LSAEncoderLayer(
+                            embed_dims=embed_dims[k],
+                            num_heads=num_heads[k],
+                            feedforward_channels=mlp_ratios[k] * embed_dims[k],
+                            drop_rate=drop_rate,
+                            attn_drop_rate=attn_drop_rate,
+                            drop_path_rate=dpr[sum(depths[:k]) + i],
+                            qkv_bias=qkv_bias,
+                            window_size=window_sizes[k])
diff --git a/head_extractor/build/lib/mmseg/models/backbones/unet.py b/head_extractor/build/lib/mmseg/models/backbones/unet.py
new file mode 100644
index 0000000000000000000000000000000000000000..545921db8e14668e454f5834f9a1618fe0c04ffe
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/backbones/unet.py
@@ -0,0 +1,436 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer
+from mmengine.model import BaseModule
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm
+
+from mmseg.registry import MODELS
+from ..utils import UpConvBlock, Upsample
+
+
+class BasicConvBlock(nn.Module):
+    """Basic convolutional block for UNet.
+
+    This module consists of several plain convolutional layers.
+ + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_convs (int): Number of convolutional layers. Default: 2. + stride (int): Whether use stride convolution to downsample + the input feature map. If stride=2, it only uses stride convolution + in the first convolutional layer to downsample the input feature + map. Options are 1 or 2. Default: 1. + dilation (int): Whether use dilated convolution to expand the + receptive field. Set dilation rate of each convolutional layer and + the dilation rate of the first convolutional layer is always 1. + Default: 1. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + conv_cfg (dict | None): Config dict for convolution layer. + Default: None. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + dcn (bool): Use deformable convolution in convolutional layer or not. + Default: None. + plugins (dict): plugins for convolutional layers. Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_convs=2, + stride=1, + dilation=1, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + dcn=None, + plugins=None): + super().__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + + self.with_cp = with_cp + convs = [] + for i in range(num_convs): + convs.append( + ConvModule( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride if i == 0 else 1, + dilation=1 if i == 0 else dilation, + padding=1 if i == 0 else dilation, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.convs = nn.Sequential(*convs) + + def forward(self, x): + """Forward function.""" + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(self.convs, x) + else: + out = self.convs(x) + return out + + +@MODELS.register_module() +class DeconvModule(nn.Module): + """Deconvolution upsample module in decoder for UNet (2X upsample). + + This module uses deconvolution to upsample feature map in the decoder + of UNet. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + kernel_size (int): Kernel size of the convolutional layer. Default: 4. + """ + + def __init__(self, + in_channels, + out_channels, + with_cp=False, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + *, + kernel_size=4, + scale_factor=2): + super().__init__() + + assert (kernel_size - scale_factor >= 0) and\ + (kernel_size - scale_factor) % 2 == 0,\ + f'kernel_size should be greater than or equal to scale_factor '\ + f'and (kernel_size - scale_factor) should be even numbers, '\ + f'while the kernel size is {kernel_size} and scale_factor is '\ + f'{scale_factor}.' 
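+
+        # Why the assert above suffices: for nn.ConvTranspose2d,
+        #   out = (in - 1) * stride - 2 * padding + kernel_size,
+        # and with stride = scale_factor and
+        # padding = (kernel_size - scale_factor) // 2 this reduces to
+        # out = in * scale_factor, e.g. (in - 1) * 2 - 2 * 1 + 4 = 2 * in
+        # for the defaults (kernel_size=4, scale_factor=2).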
+
+        stride = scale_factor
+        padding = (kernel_size - scale_factor) // 2
+        self.with_cp = with_cp
+        deconv = nn.ConvTranspose2d(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding)
+
+        norm_name, norm = build_norm_layer(norm_cfg, out_channels)
+        activate = build_activation_layer(act_cfg)
+        self.deconv_upsamping = nn.Sequential(deconv, norm, activate)
+
+    def forward(self, x):
+        """Forward function."""
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(self.deconv_upsamping, x)
+        else:
+            out = self.deconv_upsamping(x)
+        return out
+
+
+@MODELS.register_module()
+class InterpConv(nn.Module):
+    """Interpolation upsample module in decoder for UNet.
+
+    This module uses interpolation to upsample feature map in the decoder
+    of UNet. It consists of one interpolation upsample layer and one
+    convolutional layer. It can be one interpolation upsample layer followed
+    by one convolutional layer (conv_first=False) or one convolutional layer
+    followed by one interpolation upsample layer (conv_first=True).
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        norm_cfg (dict | None): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict | None): Config dict for activation layer in ConvModule.
+            Default: dict(type='ReLU').
+        conv_cfg (dict | None): Config dict for convolution layer.
+            Default: None.
+        conv_first (bool): Whether convolutional layer or interpolation
+            upsample layer first. Default: False. It means interpolation
+            upsample layer followed by one convolutional layer.
+        kernel_size (int): Kernel size of the convolutional layer. Default: 1.
+        stride (int): Stride of the convolutional layer. Default: 1.
+        padding (int): Padding of the convolutional layer. Default: 0.
+        upsample_cfg (dict): Interpolation config of the upsample layer.
+            Default: dict(
+                scale_factor=2, mode='bilinear', align_corners=False).
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 with_cp=False,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 *,
+                 conv_cfg=None,
+                 conv_first=False,
+                 kernel_size=1,
+                 stride=1,
+                 padding=0,
+                 upsample_cfg=dict(
+                     scale_factor=2, mode='bilinear', align_corners=False)):
+        super().__init__()
+
+        self.with_cp = with_cp
+        conv = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        upsample = Upsample(**upsample_cfg)
+        if conv_first:
+            self.interp_upsample = nn.Sequential(conv, upsample)
+        else:
+            self.interp_upsample = nn.Sequential(upsample, conv)
+
+    def forward(self, x):
+        """Forward function."""
+
+        if self.with_cp and x.requires_grad:
+            out = cp.checkpoint(self.interp_upsample, x)
+        else:
+            out = self.interp_upsample(x)
+        return out
+
+
+@MODELS.register_module()
+class UNet(BaseModule):
+    """UNet backbone.
+
+    This backbone is the implementation of `U-Net: Convolutional Networks
+    for Biomedical Image Segmentation <https://arxiv.org/abs/1505.04597>`_.
+
+    Args:
+        in_channels (int): Number of input image channels. Default: 3.
+        base_channels (int): Number of base channels of each stage.
+            The output channels of the first stage. Default: 64.
+        num_stages (int): Number of stages in encoder, normally 5. Default: 5.
+        strides (Sequence[int]): Strides of each stage in encoder; each
+            element should be 1 or 2. len(strides) is equal to num_stages.
+            Normally the stride of the first stage in encoder is 1. If
+            strides[i]=2, it uses stride convolution to downsample in the
+            correspondence encoder stage. Default: (1, 1, 1, 1, 1).
+        enc_num_convs (Sequence[int]): Number of convolutional layers in the
+            convolution block of the correspondence encoder stage.
+            Default: (2, 2, 2, 2, 2).
+        dec_num_convs (Sequence[int]): Number of convolutional layers in the
+            convolution block of the correspondence decoder stage.
+            Default: (2, 2, 2, 2).
+        downsamples (Sequence[bool]): Whether to use MaxPool to downsample the
+            feature map after the first stage of encoder
+            (stages: [1, num_stages)). If the correspondence encoder stage
+            uses stride convolution (strides[i]=2), it will never use MaxPool
+            to downsample, even if downsamples[i-1] is True.
+            Default: (True, True, True, True).
+        enc_dilations (Sequence[int]): Dilation rate of each stage in encoder.
+            Default: (1, 1, 1, 1, 1).
+        dec_dilations (Sequence[int]): Dilation rate of each stage in decoder.
+            Default: (1, 1, 1, 1).
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        conv_cfg (dict | None): Config dict for convolution layer.
+            Default: None.
+        norm_cfg (dict | None): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict | None): Config dict for activation layer in ConvModule.
+            Default: dict(type='ReLU').
+        upsample_cfg (dict): The upsample config of the upsample module in
+            decoder. Default: dict(type='InterpConv').
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        dcn (bool): Use deformable convolution in convolutional layer or not.
+            Default: None.
+        plugins (dict): plugins for convolutional layers. Default: None.
+        pretrained (str, optional): model pretrained path. Default: None
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None
+
+    Notice:
+        The input image size should be divisible by the whole downsample rate
+        of the encoder. More detail of the whole downsample rate can be found
+        in UNet._check_input_divisible.
+    """
+
+    def __init__(self,
+                 in_channels=3,
+                 base_channels=64,
+                 num_stages=5,
+                 strides=(1, 1, 1, 1, 1),
+                 enc_num_convs=(2, 2, 2, 2, 2),
+                 dec_num_convs=(2, 2, 2, 2),
+                 downsamples=(True, True, True, True),
+                 enc_dilations=(1, 1, 1, 1, 1),
+                 dec_dilations=(1, 1, 1, 1),
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 upsample_cfg=dict(type='InterpConv'),
+                 norm_eval=False,
+                 dcn=None,
+                 plugins=None,
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(init_cfg)
+
+        self.pretrained = pretrained
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be set at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is None:
+            if init_cfg is None:
+                self.init_cfg = [
+                    dict(type='Kaiming', layer='Conv2d'),
+                    dict(
+                        type='Constant',
+                        val=1,
+                        layer=['_BatchNorm', 'GroupNorm'])
+                ]
+        else:
+            raise TypeError('pretrained must be a str or None')
+
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
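+
+        # A worked example with the defaults (strides all 1, four MaxPool
+        # downsamples): the whole downsample rate is 2**4 = 16, so a 256x256
+        # input passes `_check_input_divisible`, while a 250x250 input
+        # would fail it.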
+ assert len(strides) == num_stages, \ + 'The length of strides should be equal to num_stages, '\ + f'while the strides is {strides}, the length of '\ + f'strides is {len(strides)}, and the num_stages is '\ + f'{num_stages}.' + assert len(enc_num_convs) == num_stages, \ + 'The length of enc_num_convs should be equal to num_stages, '\ + f'while the enc_num_convs is {enc_num_convs}, the length of '\ + f'enc_num_convs is {len(enc_num_convs)}, and the num_stages is '\ + f'{num_stages}.' + assert len(dec_num_convs) == (num_stages-1), \ + 'The length of dec_num_convs should be equal to (num_stages-1), '\ + f'while the dec_num_convs is {dec_num_convs}, the length of '\ + f'dec_num_convs is {len(dec_num_convs)}, and the num_stages is '\ + f'{num_stages}.' + assert len(downsamples) == (num_stages-1), \ + 'The length of downsamples should be equal to (num_stages-1), '\ + f'while the downsamples is {downsamples}, the length of '\ + f'downsamples is {len(downsamples)}, and the num_stages is '\ + f'{num_stages}.' + assert len(enc_dilations) == num_stages, \ + 'The length of enc_dilations should be equal to num_stages, '\ + f'while the enc_dilations is {enc_dilations}, the length of '\ + f'enc_dilations is {len(enc_dilations)}, and the num_stages is '\ + f'{num_stages}.' + assert len(dec_dilations) == (num_stages-1), \ + 'The length of dec_dilations should be equal to (num_stages-1), '\ + f'while the dec_dilations is {dec_dilations}, the length of '\ + f'dec_dilations is {len(dec_dilations)}, and the num_stages is '\ + f'{num_stages}.' + self.num_stages = num_stages + self.strides = strides + self.downsamples = downsamples + self.norm_eval = norm_eval + self.base_channels = base_channels + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + for i in range(num_stages): + enc_conv_block = [] + if i != 0: + if strides[i] == 1 and downsamples[i - 1]: + enc_conv_block.append(nn.MaxPool2d(kernel_size=2)) + upsample = (strides[i] != 1 or downsamples[i - 1]) + self.decoder.append( + UpConvBlock( + conv_block=BasicConvBlock, + in_channels=base_channels * 2**i, + skip_channels=base_channels * 2**(i - 1), + out_channels=base_channels * 2**(i - 1), + num_convs=dec_num_convs[i - 1], + stride=1, + dilation=dec_dilations[i - 1], + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + upsample_cfg=upsample_cfg if upsample else None, + dcn=None, + plugins=None)) + + enc_conv_block.append( + BasicConvBlock( + in_channels=in_channels, + out_channels=base_channels * 2**i, + num_convs=enc_num_convs[i], + stride=strides[i], + dilation=enc_dilations[i], + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dcn=None, + plugins=None)) + self.encoder.append(nn.Sequential(*enc_conv_block)) + in_channels = base_channels * 2**i + + def forward(self, x): + self._check_input_divisible(x) + enc_outs = [] + for enc in self.encoder: + x = enc(x) + enc_outs.append(x) + dec_outs = [x] + for i in reversed(range(len(self.decoder))): + x = self.decoder[i](enc_outs[i], x) + dec_outs.append(x) + + return dec_outs + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super().train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + def _check_input_divisible(self, x): + h, w = x.shape[-2:] + whole_downsample_rate = 1 + for i in range(1, self.num_stages): + if self.strides[i] == 2 or self.downsamples[i - 
1]: + whole_downsample_rate *= 2 + assert (h % whole_downsample_rate == 0) \ + and (w % whole_downsample_rate == 0),\ + f'The input image size {(h, w)} should be divisible by the whole '\ + f'downsample rate {whole_downsample_rate}, when num_stages is '\ + f'{self.num_stages}, strides is {self.strides}, and downsamples '\ + f'is {self.downsamples}.' diff --git a/head_extractor/build/lib/mmseg/models/backbones/vit.py b/head_extractor/build/lib/mmseg/models/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..dd0f688fcc46680b13904a26f14269b3d19d6ce3 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/backbones/vit.py @@ -0,0 +1,501 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +import warnings + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention +from mmengine.logging import print_log +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, kaiming_init, + trunc_normal_) +from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict +from torch.nn.modules.batchnorm import _BatchNorm +from torch.nn.modules.utils import _pair as to_2tuple + +from mmseg.registry import MODELS +from ..utils import PatchEmbed, resize + + +class TransformerEncoderLayer(BaseModule): + """Implements one encoder layer in Vision Transformer. + + Args: + embed_dims (int): The feature dimension. + num_heads (int): Parallel attention heads. + feedforward_channels (int): The hidden dimension for FFNs. + drop_rate (float): Probability of an element to be zeroed + after the feed forward layer. Default: 0.0. + attn_drop_rate (float): The drop out rate for attention layer. + Default: 0.0. + drop_path_rate (float): stochastic depth rate. Default 0.0. + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): enable bias for qkv if True. Default: True + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. Default: False. 
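+
+    Example:
+        A minimal shape-level sketch (illustrative only, using ViT-Base
+        sizes):
+
+        >>> import torch
+        >>> layer = TransformerEncoderLayer(
+        ...     embed_dims=768, num_heads=12, feedforward_channels=3072)
+        >>> tokens = torch.randn(1, 197, 768)
+        >>> layer(tokens).shape
+        torch.Size([1, 197, 768])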
+ """ + + def __init__(self, + embed_dims, + num_heads, + feedforward_channels, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + num_fcs=2, + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + batch_first=True, + attn_cfg=dict(), + ffn_cfg=dict(), + with_cp=False): + super().__init__() + + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, embed_dims, postfix=1) + self.add_module(self.norm1_name, norm1) + + attn_cfg.update( + dict( + embed_dims=embed_dims, + num_heads=num_heads, + attn_drop=attn_drop_rate, + proj_drop=drop_rate, + batch_first=batch_first, + bias=qkv_bias)) + + self.build_attn(attn_cfg) + + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, embed_dims, postfix=2) + self.add_module(self.norm2_name, norm2) + + ffn_cfg.update( + dict( + embed_dims=embed_dims, + feedforward_channels=feedforward_channels, + num_fcs=num_fcs, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate) + if drop_path_rate > 0 else None, + act_cfg=act_cfg)) + self.build_ffn(ffn_cfg) + self.with_cp = with_cp + + def build_attn(self, attn_cfg): + self.attn = MultiheadAttention(**attn_cfg) + + def build_ffn(self, ffn_cfg): + self.ffn = FFN(**ffn_cfg) + + @property + def norm1(self): + return getattr(self, self.norm1_name) + + @property + def norm2(self): + return getattr(self, self.norm2_name) + + def forward(self, x): + + def _inner_forward(x): + x = self.attn(self.norm1(x), identity=x) + x = self.ffn(self.norm2(x), identity=x) + return x + + if self.with_cp and x.requires_grad: + x = cp.checkpoint(_inner_forward, x) + else: + x = _inner_forward(x) + return x + + +@MODELS.register_module() +class VisionTransformer(BaseModule): + """Vision Transformer. + + This backbone is the implementation of `An Image is Worth 16x16 Words: + Transformers for Image Recognition at + Scale `_. + + Args: + img_size (int | tuple): Input image size. Default: 224. + patch_size (int): The patch size. Default: 16. + patch_pad (str | int | None): The padding method in patch embedding. + Default: 'corner'. + in_channels (int): Number of input channels. Default: 3. + embed_dims (int): embedding dimension. Default: 768. + num_layers (int): depth of transformer. Default: 12. + num_heads (int): number of attention heads. Default: 12. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_origin (bool): Whether to output the original input embedding. + Default: False + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qkv_bias (bool): enable bias for qkv if True. Default: True. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0 + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + drop_path_rate (float): stochastic depth rate. Default 0.0 + with_cls_token (bool): Whether concatenating class token into image + tokens as transformer input. Default: True. + output_cls_token (bool): Whether output the cls_token. If set True, + `with_cls_token` must be True. Default: False. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + patch_bias (dict): Whether use bias in convolution of PatchEmbed Block. + Default: True. + patch_norm (bool): Whether to add a norm in PatchEmbed Block. + Default: False. + pre_norm (bool): Whether to add a norm before Transformer Layers. + Default: False. 
+        final_norm (bool): Whether to add an additional layer to normalize
+            final feature map. Default: False.
+        interpolate_mode (str): Select the interpolate mode for position
+            embedding vector resize. Default: bicubic.
+        num_fcs (int): The number of fully-connected layers for FFNs.
+            Default: 2.
+        norm_eval (bool): Whether to set norm layers to eval mode, namely,
+            freeze running stats (mean and var). Note: Effect on Batch Norm
+            and its variants only. Default: False.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save
+            some memory while slowing down the training speed. Default: False.
+        frozen_exclude (List): List of parameters that are not to be frozen.
+            Default: ["all"], "all" means there are no frozen parameters.
+        pretrained (str, optional): model pretrained path. Default: None.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 img_size=224,
+                 patch_size=16,
+                 patch_pad='corner',
+                 in_channels=3,
+                 embed_dims=768,
+                 num_layers=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 out_origin=False,
+                 out_indices=-1,
+                 qkv_bias=True,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 with_cls_token=True,
+                 output_cls_token=False,
+                 norm_cfg=dict(type='LN'),
+                 act_cfg=dict(type='GELU'),
+                 patch_norm=False,
+                 patch_bias=False,
+                 pre_norm=False,
+                 final_norm=False,
+                 interpolate_mode='bicubic',
+                 num_fcs=2,
+                 norm_eval=False,
+                 with_cp=False,
+                 frozen_exclude=['all'],
+                 pretrained=None,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+
+        if isinstance(img_size, int):
+            img_size = to_2tuple(img_size)
+        elif isinstance(img_size, tuple):
+            if len(img_size) == 1:
+                img_size = to_2tuple(img_size[0])
+            assert len(img_size) == 2, \
+                f'The size of image should have length 1 or 2, ' \
+                f'but got {len(img_size)}'
+
+        if output_cls_token:
+            assert with_cls_token is True, f'with_cls_token must be True if ' \
+                f'set output_cls_token to True, but got {with_cls_token}'
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be set at the same time'
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, '
+                          'please use "init_cfg" instead')
+            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+        elif pretrained is not None:
+            raise TypeError('pretrained must be a str or None')
+
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.interpolate_mode = interpolate_mode
+        self.norm_eval = norm_eval
+        self.with_cp = with_cp
+        self.pretrained = pretrained
+        self.out_origin = out_origin
+        self.frozen_exclude = frozen_exclude
+
+        self.patch_embed = PatchEmbed(
+            in_channels=in_channels,
+            embed_dims=embed_dims,
+            conv_type='Conv2d',
+            kernel_size=patch_size,
+            stride=patch_size,
+            padding=patch_pad,
+            bias=patch_bias,
+            norm_cfg=norm_cfg if patch_norm else None,
+            init_cfg=None,
+        )
+
+        num_patches = (img_size[0] // patch_size) * \
+            (img_size[1] // patch_size)
+
+        self.with_cls_token = with_cls_token
+        self.output_cls_token = output_cls_token
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims))
+        self.pos_embed = nn.Parameter(
+            torch.zeros(1, num_patches + 1, embed_dims))
+        self.drop_after_pos = nn.Dropout(p=drop_rate)
+        self.pre_norm = pre_norm
+
+        if self.pre_norm:
+            self.pre_ln_name, pre_ln = build_norm_layer(
+                norm_cfg, embed_dims, postfix='_pre')
+            self.add_module(self.pre_ln_name, pre_ln)
+
+        if isinstance(out_indices, int):
+            if out_indices == -1:
+                out_indices = num_layers - 1
+            self.out_indices = [out_indices]
+        elif isinstance(out_indices, list) or isinstance(out_indices, tuple):
+            self.out_indices = out_indices
+        else:
+            raise TypeError('out_indices must be type of int, list or tuple')
+
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, num_layers)
+        ]  # stochastic depth decay rule
+
+        self.layers = ModuleList()
+        for i in range(num_layers):
+            self.layers.append(
+                TransformerEncoderLayer(
+                    embed_dims=embed_dims,
+                    num_heads=num_heads,
+                    feedforward_channels=mlp_ratio * embed_dims,
+                    attn_drop_rate=attn_drop_rate,
+                    drop_rate=drop_rate,
+                    drop_path_rate=dpr[i],
+                    num_fcs=num_fcs,
+                    qkv_bias=qkv_bias,
+                    act_cfg=act_cfg,
+                    norm_cfg=norm_cfg,
+                    with_cp=with_cp,
+                    batch_first=True))
+
+        self.final_norm = final_norm
+        if final_norm:
+            self.norm1_name, norm1 = build_norm_layer(
+                norm_cfg, embed_dims, postfix=1)
+            self.add_module(self.norm1_name, norm1)
+
+        self._freeze()
+
+    @property
+    def pre_ln(self):
+        return getattr(self, self.pre_ln_name)
+
+    @property
+    def norm1(self):
+        return getattr(self, self.norm1_name)
+
+    def init_weights(self):
+        if isinstance(self.init_cfg, dict) and \
+                self.init_cfg.get('type') in ['Pretrained',
+                                              'Pretrained_Part']:
+            checkpoint = CheckpointLoader.load_checkpoint(
+                self.init_cfg['checkpoint'], logger=None, map_location='cpu')
+
+            if self.init_cfg.get('type') == 'Pretrained':
+                if 'state_dict' in checkpoint:
+                    state_dict = checkpoint['state_dict']
+                else:
+                    state_dict = checkpoint
+
+            elif self.init_cfg.get('type') == 'Pretrained_Part':
+                state_dict = checkpoint.copy()
+                para_prefix = 'image_encoder'
+                prefix_len = len(para_prefix) + 1
+                for k, v in checkpoint.items():
+                    state_dict.pop(k)
+                    if para_prefix in k:
+                        state_dict[k[prefix_len:]] = v
+
+            if 'pos_embed' in state_dict.keys():
+                if self.pos_embed.shape != state_dict['pos_embed'].shape:
+                    print_log(msg=f'Resize the pos_embed shape from '
+                              f'{state_dict["pos_embed"].shape} to '
+                              f'{self.pos_embed.shape}')
+                    h, w = self.img_size
+                    pos_size = int(
+                        math.sqrt(state_dict['pos_embed'].shape[1] - 1))
+                    state_dict['pos_embed'] = self.resize_pos_embed(
+                        state_dict['pos_embed'],
+                        (h // self.patch_size, w // self.patch_size),
+                        (pos_size, pos_size), self.interpolate_mode)
+
+            load_state_dict(self, state_dict, strict=False, logger=None)
+        elif self.init_cfg is not None:
+            super().init_weights()
+        else:
+            # We only implement the 'jax_impl' initialization implemented at
+            # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353  # noqa: E501
+            trunc_normal_(self.pos_embed, std=.02)
+            trunc_normal_(self.cls_token, std=.02)
+            for n, m in self.named_modules():
+                if isinstance(m, nn.Linear):
+                    trunc_normal_(m.weight, std=.02)
+                    if m.bias is not None:
+                        if 'ffn' in n:
+                            nn.init.normal_(m.bias, mean=0., std=1e-6)
+                        else:
+                            nn.init.constant_(m.bias, 0)
+                elif isinstance(m, nn.Conv2d):
+                    kaiming_init(m, mode='fan_in', bias=0.)
+                elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)):
+                    constant_init(m, val=1.0, bias=0.)
+
+    def _freeze(self):
+        if 'all' in self.frozen_exclude:
+            return
+        for name, param in self.named_parameters():
+            if not any([exclude in name for exclude in self.frozen_exclude]):
+                param.requires_grad = False
+
+    def _pos_embeding(self, patched_img, hw_shape, pos_embed):
+        """Position embedding method.
+
+        Resize the pos_embed, if the input image size doesn't match
+        the training size.
+        Args:
+            patched_img (torch.Tensor): The patched image, it should be
+                shape of [B, L1, C].
+            hw_shape (tuple): The downsampled image resolution.
+            pos_embed (torch.Tensor): The pos_embed weights, it should be
+                shape of [B, L2, C].
+        Return:
+            torch.Tensor: The pos encoded image feature.
+        """
+        assert patched_img.ndim == 3 and pos_embed.ndim == 3, \
+            'the shapes of patched_img and pos_embed must be [B, L, C]'
+        x_len, pos_len = patched_img.shape[1], pos_embed.shape[1]
+        if x_len != pos_len:
+            if pos_len == (self.img_size[0] // self.patch_size) * (
+                    self.img_size[1] // self.patch_size) + 1:
+                pos_h = self.img_size[0] // self.patch_size
+                pos_w = self.img_size[1] // self.patch_size
+            else:
+                raise ValueError(
+                    'Unexpected shape of pos_embed, got {}.'.format(
+                        pos_embed.shape))
+            pos_embed = self.resize_pos_embed(pos_embed, hw_shape,
+                                              (pos_h, pos_w),
+                                              self.interpolate_mode)
+        return self.drop_after_pos(patched_img + pos_embed)
+
+    @staticmethod
+    def resize_pos_embed(pos_embed, input_shape, pos_shape, mode):
+        """Resize pos_embed weights.
+
+        Resize pos_embed using bicubic interpolate method.
+        Args:
+            pos_embed (torch.Tensor): Position embedding weights.
+            input_shape (tuple): Tuple for (downsampled input image height,
+                downsampled input image width).
+            pos_shape (tuple): The resolution of downsampled origin training
+                image.
+            mode (str): Algorithm used for upsampling:
+                ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'``
+                | ``'trilinear'``.
+        Return:
+            torch.Tensor: The resized pos_embed of shape [B, L_new, C]
+        """
+        assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
+        pos_h, pos_w = pos_shape
+        cls_token_weight = pos_embed[:, 0]
+        pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
+        pos_embed_weight = pos_embed_weight.reshape(
+            1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
+        pos_embed_weight = resize(
+            pos_embed_weight, size=input_shape, align_corners=False,
+            mode=mode)
+        cls_token_weight = cls_token_weight.unsqueeze(1)
+        pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
+        pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1)
+        return pos_embed
+
+    def forward(self, inputs):
+        B = inputs.shape[0]
+
+        x, hw_shape = self.patch_embed(inputs)
+
+        # stole cls_tokens impl from Phil Wang, thanks
+        cls_tokens = self.cls_token.expand(B, -1, -1)
+        x = torch.cat((cls_tokens, x), dim=1)
+        x = self._pos_embeding(x, hw_shape, self.pos_embed)
+
+        if not self.with_cls_token:
+            # Remove class token for transformer encoder input
+            x = x[:, 1:]
+
+        if self.pre_norm:
+            x = self.pre_ln(x)
+
+        outs = []
+        if self.out_origin:
+            if self.with_cls_token:
+                # Remove class token and reshape token for decoder head
+                out = x[:, 1:]
+            else:
+                out = x
+            B, _, C = out.shape
+            out = out.reshape(B, hw_shape[0], hw_shape[1],
+                              C).permute(0, 3, 1, 2).contiguous()
+            if self.output_cls_token:
+                out = [out, x[:, 0]]
+            outs.append(out)
+
+        for i, layer in enumerate(self.layers):
+            x = layer(x)
+            if i == len(self.layers) - 1:
+                if self.final_norm:
+                    x = self.norm1(x)
+            if i in self.out_indices:
+                if self.with_cls_token:
+                    # Remove class token and reshape token for decoder head
+                    out = x[:, 1:]
+                else:
+                    out = x
+                B, _, C = out.shape
+                out = out.reshape(B, hw_shape[0], hw_shape[1],
+                                  C).permute(0, 3, 1, 2).contiguous()
+                if self.output_cls_token:
+                    out = [out, x[:, 0]]
+                outs.append(out)
+
+        return tuple(outs)
+
+    def train(self, mode=True):
+        super().train(mode)
+        if mode and self.norm_eval:
+            for m in self.modules():
+                if isinstance(m, nn.LayerNorm):
+                    m.eval()
diff --git a/head_extractor/build/lib/mmseg/models/backbones/vpd.py b/head_extractor/build/lib/mmseg/models/backbones/vpd.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0536d31c64f82fb66117d9ebd2161d5f2df57bd
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/backbones/vpd.py
@@ -0,0 +1,395 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------------
+# Adapted from https://github.com/wl-zhao/VPD/blob/main/vpd/models.py
+# Original licence: MIT License
+# ------------------------------------------------------------------------------
+
+import math
+from typing import List, Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModule
+from mmengine.runner import CheckpointLoader, load_checkpoint
+
+from mmseg.registry import MODELS
+from mmseg.utils import ConfigType, OptConfigType
+
+try:
+    from ldm.modules.diffusionmodules.util import timestep_embedding
+    from ldm.util import instantiate_from_config
+    has_ldm = True
+except ImportError:
+    has_ldm = False
+
+
+def register_attention_control(model, controller):
+    """Registers a control function to manage attention within a model.
+
+    Args:
+        model: The model to which attention is to be registered.
+        controller: The control function responsible for managing attention.
+    """
+
+    def ca_forward(self, place_in_unet):
+        """Custom forward method for attention.
+
+        Args:
+            self: Reference to the current object.
+            place_in_unet: The location in UNet (down/mid/up).
+
+        Returns:
+            The modified forward method.
+        """
+
+        def forward(x, context=None, mask=None):
+            h = self.heads
+            is_cross = context is not None
+            # if context is None (self-attention), attend to x itself
+            context = x if context is None else context
+
+            q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
+            q, k, v = (
+                tensor.view(tensor.shape[0] * h, tensor.shape[1],
+                            tensor.shape[2] // h) for tensor in [q, k, v])
+
+            sim = torch.matmul(q, k.transpose(-2, -1)) * self.scale
+
+            if mask is not None:
+                mask = mask.flatten(1).unsqueeze(1).repeat(h, 1, 1)
+                max_neg_value = -torch.finfo(sim.dtype).max
+                sim.masked_fill_(~mask, max_neg_value)
+
+            attn = sim.softmax(dim=-1)
+            attn_mean = attn.view(h, attn.shape[0] // h,
+                                  *attn.shape[1:]).mean(0)
+            controller(attn_mean, is_cross, place_in_unet)
+
+            out = torch.matmul(attn, v)
+            out = out.view(out.shape[0] // h, out.shape[1], out.shape[2] * h)
+            return self.to_out(out)
+
+        return forward
+
+    def register_recr(net_, count, place_in_unet):
+        """Recursive function to register the custom forward method to all
+        CrossAttention layers.
+
+        Args:
+            net_: The network layer currently being processed.
+            count: The current count of layers processed.
+            place_in_unet: The location in UNet (down/mid/up).
+
+        Returns:
+            The updated count of layers processed.
+        """
+        if net_.__class__.__name__ == 'CrossAttention':
+            net_.forward = ca_forward(net_, place_in_unet)
+            return count + 1
+        if hasattr(net_, 'children'):
+            return sum(
+                register_recr(child, 0, place_in_unet)
+                for child in net_.children())
+        return count
+
+    cross_att_count = sum(
+        register_recr(net[1], 0, place) for net, place in [
+            (child, 'down') if 'input_blocks' in name else (
+                child, 'up') if 'output_blocks' in name else
+            (child,
+             'mid') if 'middle_block' in name else (None, None)  # Default case
+            for name, child in model.diffusion_model.named_children()
+        ] if net is not None)
+
+    controller.num_att_layers = cross_att_count
+
+
+class AttentionStore:
+    """A class for storing attention information in the UNet model.
+ + Attributes: + base_size (int): Base size for storing attention information. + max_size (int): Maximum size for storing attention information. + """ + + def __init__(self, base_size=64, max_size=None): + """Initialize AttentionStore with default or custom sizes.""" + self.reset() + self.base_size = base_size + self.max_size = max_size or (base_size // 2) + self.num_att_layers = -1 + + @staticmethod + def get_empty_store(): + """Returns an empty store for holding attention values.""" + return { + key: [] + for key in [ + 'down_cross', 'mid_cross', 'up_cross', 'down_self', 'mid_self', + 'up_self' + ] + } + + def reset(self): + """Resets the step and attention stores to their initial states.""" + self.cur_step = 0 + self.cur_att_layer = 0 + self.step_store = self.get_empty_store() + self.attention_store = {} + + def forward(self, attn, is_cross: bool, place_in_unet: str): + """Processes a single forward step, storing the attention. + + Args: + attn: The attention tensor. + is_cross (bool): Whether it's cross attention. + place_in_unet (str): The location in UNet (down/mid/up). + + Returns: + The unmodified attention tensor. + """ + key = f"{place_in_unet}_{'cross' if is_cross else 'self'}" + if attn.shape[1] <= (self.max_size)**2: + self.step_store[key].append(attn) + return attn + + def between_steps(self): + """Processes and stores attention information between steps.""" + if not self.attention_store: + self.attention_store = self.step_store + else: + for key in self.attention_store: + self.attention_store[key] = [ + stored + step for stored, step in zip( + self.attention_store[key], self.step_store[key]) + ] + self.step_store = self.get_empty_store() + + def get_average_attention(self): + """Calculates and returns the average attention across all steps.""" + return { + key: [item for item in self.step_store[key]] + for key in self.step_store + } + + def __call__(self, attn, is_cross: bool, place_in_unet: str): + """Allows the class instance to be callable.""" + return self.forward(attn, is_cross, place_in_unet) + + @property + def num_uncond_att_layers(self): + """Returns the number of unconditional attention layers (default is + 0).""" + return 0 + + def step_callback(self, x_t): + """A placeholder for a step callback. + + Returns the input unchanged. + """ + return x_t + + +class UNetWrapper(nn.Module): + """A wrapper for UNet with optional attention mechanisms. + + Args: + unet (nn.Module): The UNet model to wrap + use_attn (bool): Whether to use attention. Defaults to True + base_size (int): Base size for the attention store. Defaults to 512 + max_attn_size (int, optional): Maximum size for the attention store. + Defaults to None + attn_selector (str): The types of attention to use. + Defaults to 'up_cross+down_cross' + """ + + def __init__(self, + unet, + use_attn=True, + base_size=512, + max_attn_size=None, + attn_selector='up_cross+down_cross'): + super().__init__() + + assert has_ldm, 'To use UNetWrapper, please install required ' \ + 'packages via `pip install -r requirements/optional.txt`.' 
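+
+        # Note (an assumption documented here, not stated in the original
+        # code): Stable Diffusion's VAE downsamples inputs by a factor of 8,
+        # so `base_size` is measured in image pixels while the attention
+        # store below works at latent resolution, hence `base_size // 8`.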
+ + self.unet = unet + self.attention_store = AttentionStore( + base_size=base_size // 8, max_size=max_attn_size) + self.attn_selector = attn_selector.split('+') + self.use_attn = use_attn + self.init_sizes(base_size) + if self.use_attn: + register_attention_control(unet, self.attention_store) + + def init_sizes(self, base_size): + """Initialize sizes based on the base size.""" + self.size16 = base_size // 32 + self.size32 = base_size // 16 + self.size64 = base_size // 8 + + def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + """Forward pass through the model.""" + diffusion_model = self.unet.diffusion_model + if self.use_attn: + self.attention_store.reset() + hs, emb, out_list = self._unet_forward(x, timesteps, context, y, + diffusion_model) + if self.use_attn: + self._append_attn_to_output(out_list) + return out_list[::-1] + + def _unet_forward(self, x, timesteps, context, y, diffusion_model): + hs = [] + t_emb = timestep_embedding( + timesteps, diffusion_model.model_channels, repeat_only=False) + emb = diffusion_model.time_embed(t_emb) + h = x.type(diffusion_model.dtype) + for module in diffusion_model.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = diffusion_model.middle_block(h, emb, context) + out_list = [] + for i_out, module in enumerate(diffusion_model.output_blocks): + h = torch.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + if i_out in [1, 4, 7]: + out_list.append(h) + h = h.type(x.dtype) + out_list.append(h) + return hs, emb, out_list + + def _append_attn_to_output(self, out_list): + avg_attn = self.attention_store.get_average_attention() + attns = {self.size16: [], self.size32: [], self.size64: []} + for k in self.attn_selector: + for up_attn in avg_attn[k]: + size = int(math.sqrt(up_attn.shape[1])) + up_attn = up_attn.transpose(-1, -2).reshape( + *up_attn.shape[:2], size, -1) + attns[size].append(up_attn) + attn16 = torch.stack(attns[self.size16]).mean(0) + attn32 = torch.stack(attns[self.size32]).mean(0) + attn64 = torch.stack(attns[self.size64]).mean(0) if len( + attns[self.size64]) > 0 else None + out_list[1] = torch.cat([out_list[1], attn16], dim=1) + out_list[2] = torch.cat([out_list[2], attn32], dim=1) + if attn64 is not None: + out_list[3] = torch.cat([out_list[3], attn64], dim=1) + + +class TextAdapter(nn.Module): + """A PyTorch Module that serves as a text adapter. + + This module takes text embeddings and adjusts them based on a scaling + factor gamma. + """ + + def __init__(self, text_dim=768): + super().__init__() + self.fc = nn.Sequential( + nn.Linear(text_dim, text_dim), nn.GELU(), + nn.Linear(text_dim, text_dim)) + + def forward(self, texts, gamma): + texts_after = self.fc(texts) + texts = texts + gamma * texts_after + return texts + + +@MODELS.register_module() +class VPD(BaseModule): + """VPD (Visual Perception Diffusion) model. + + .. _`VPD`: https://arxiv.org/abs/2303.02153 + + Args: + diffusion_cfg (dict): Configuration for diffusion model. + class_embed_path (str): Path for class embeddings. + unet_cfg (dict, optional): Configuration for U-Net. + gamma (float, optional): Gamma for text adaptation. Defaults to 1e-4. + class_embed_select (bool, optional): If True, enables class embedding + selection. Defaults to False. + pad_shape (Optional[Union[int, List[int]]], optional): Padding shape. + Defaults to None. + pad_val (Union[int, List[int]], optional): Padding value. + Defaults to 0. + init_cfg (dict, optional): Configuration for network initialization. 
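+
+    Example:
+        A hypothetical configuration sketch; the checkpoint and embedding
+        paths are placeholders, and ``diffusion_cfg`` follows ``ldm``'s
+        ``instantiate_from_config`` layout:
+
+        >>> backbone = VPD(  # doctest: +SKIP
+        ...     diffusion_cfg=dict(
+        ...         target='ldm.models.diffusion.ddpm.LatentDiffusion',
+        ...         checkpoint='path/to/stable-diffusion.ckpt'),
+        ...     class_embed_path='path/to/class_embeddings.pth')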
+ """ + + def __init__(self, + diffusion_cfg: ConfigType, + class_embed_path: str, + unet_cfg: OptConfigType = dict(), + gamma: float = 1e-4, + class_embed_select=False, + pad_shape: Optional[Union[int, List[int]]] = None, + pad_val: Union[int, List[int]] = 0, + init_cfg: OptConfigType = None): + + super().__init__(init_cfg=init_cfg) + + assert has_ldm, 'To use VPD model, please install required packages' \ + ' via `pip install -r requirements/optional.txt`.' + + if pad_shape is not None: + if not isinstance(pad_shape, (list, tuple)): + pad_shape = (pad_shape, pad_shape) + + self.pad_shape = pad_shape + self.pad_val = pad_val + + # diffusion model + diffusion_checkpoint = diffusion_cfg.pop('checkpoint', None) + sd_model = instantiate_from_config(diffusion_cfg) + if diffusion_checkpoint is not None: + load_checkpoint(sd_model, diffusion_checkpoint, strict=False) + + self.encoder_vq = sd_model.first_stage_model + self.unet = UNetWrapper(sd_model.model, **unet_cfg) + + # class embeddings & text adapter + class_embeddings = CheckpointLoader.load_checkpoint(class_embed_path) + text_dim = class_embeddings.size(-1) + self.text_adapter = TextAdapter(text_dim=text_dim) + self.class_embed_select = class_embed_select + if class_embed_select: + class_embeddings = torch.cat( + (class_embeddings, class_embeddings.mean(dim=0, + keepdims=True)), + dim=0) + self.register_buffer('class_embeddings', class_embeddings) + self.gamma = nn.Parameter(torch.ones(text_dim) * gamma) + + def forward(self, x): + """Extract features from images.""" + + # calculate cross-attn map + if self.class_embed_select: + if isinstance(x, (tuple, list)): + x, class_ids = x[:2] + class_ids = class_ids.tolist() + else: + class_ids = [-1] * x.size(0) + class_embeddings = self.class_embeddings[class_ids] + c_crossattn = self.text_adapter(class_embeddings, self.gamma) + c_crossattn = c_crossattn.unsqueeze(1) + else: + class_embeddings = self.class_embeddings + c_crossattn = self.text_adapter(class_embeddings, self.gamma) + c_crossattn = c_crossattn.unsqueeze(0).repeat(x.size(0), 1, 1) + + # pad to required input shape for pretrained diffusion model + if self.pad_shape is not None: + pad_width = max(0, self.pad_shape[1] - x.shape[-1]) + pad_height = max(0, self.pad_shape[0] - x.shape[-2]) + x = F.pad(x, (0, pad_width, 0, pad_height), value=self.pad_val) + + # forward the denoising model + with torch.no_grad(): + latents = self.encoder_vq.encode(x).mode().detach() + t = torch.ones((x.shape[0], ), device=x.device).long() + outs = self.unet(latents, t, context=c_crossattn) + + return outs diff --git a/head_extractor/build/lib/mmseg/models/builder.py b/head_extractor/build/lib/mmseg/models/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..081c646b49b8ff1ea6c42d1ea4e24e63cdf6b43a --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/builder.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +from mmseg.registry import MODELS + +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +LOSSES = MODELS +SEGMENTORS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + warnings.warn('``build_backbone`` would be deprecated soon, please use ' + '``mmseg.registry.MODELS.build()`` ') + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + warnings.warn('``build_neck`` would be deprecated soon, please use ' + '``mmseg.registry.MODELS.build()`` ') + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + warnings.warn('``build_head`` would be deprecated soon, please use ' + '``mmseg.registry.MODELS.build()`` ') + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + warnings.warn('``build_loss`` would be deprecated soon, please use ' + '``mmseg.registry.MODELS.build()`` ') + return LOSSES.build(cfg) + + +def build_segmentor(cfg, train_cfg=None, test_cfg=None): + """Build segmentor.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model', UserWarning) + assert cfg.get('train_cfg') is None or train_cfg is None, \ + 'train_cfg specified in both outer field and model field ' + assert cfg.get('test_cfg') is None or test_cfg is None, \ + 'test_cfg specified in both outer field and model field ' + return SEGMENTORS.build( + cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/head_extractor/build/lib/mmseg/models/data_preprocessor.py b/head_extractor/build/lib/mmseg/models/data_preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..8d32bc647b7d48183590408e36ec42ea36aea91c --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/data_preprocessor.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from numbers import Number +from typing import Any, Dict, List, Optional, Sequence + +import torch +from mmengine.model import BaseDataPreprocessor + +from mmseg.registry import MODELS +from mmseg.utils import stack_batch + + +@MODELS.register_module() +class SegDataPreProcessor(BaseDataPreprocessor): + """Image pre-processor for segmentation tasks. + + Comparing with the :class:`mmengine.ImgDataPreprocessor`, + + 1. It won't do normalization if ``mean`` is not specified. + 2. It does normalization and color space conversion after stacking batch. + 3. It supports batch augmentations like mixup and cutmix. + + + It provides the data pre-processing as follows + + - Collate and move data to the target device. + - Pad inputs to the input size with defined ``pad_val``, and pad seg map + with defined ``seg_pad_val``. + - Stack inputs to batch_inputs. + - Convert inputs from bgr to rgb if the shape of input is (3, H, W). + - Normalize image with defined std and mean. + - Do batch augmentations like Mixup and Cutmix during training. + + Args: + mean (Sequence[Number], optional): The pixel mean of R, G, B channels. + Defaults to None. + std (Sequence[Number], optional): The pixel standard deviation of + R, G, B channels. Defaults to None. + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value. Default: 0. + seg_pad_val (float, optional): Padding value of segmentation map. + Default: 255. + padding_mode (str): Type of padding. Default: constant. + - constant: pads with a constant value, this value is specified + with pad_val. 
+        bgr_to_rgb (bool): whether to convert image from BGR to RGB.
+            Defaults to False.
+        rgb_to_bgr (bool): whether to convert image from RGB to BGR.
+            Defaults to False.
+        batch_augments (list[dict], optional): Batch-level augmentations.
+        test_cfg (dict, optional): The padding size config in testing; if not
+            specified, `size` and `size_divisor` are used as defaults.
+            Defaults to None; only the keys `size` and `size_divisor` are
+            supported.
+    """
+
+    def __init__(
+        self,
+        mean: Sequence[Number] = None,
+        std: Sequence[Number] = None,
+        size: Optional[tuple] = None,
+        size_divisor: Optional[int] = None,
+        pad_val: Number = 0,
+        seg_pad_val: Number = 255,
+        bgr_to_rgb: bool = False,
+        rgb_to_bgr: bool = False,
+        batch_augments: Optional[List[dict]] = None,
+        test_cfg: dict = None,
+    ):
+        super().__init__()
+        self.size = size
+        self.size_divisor = size_divisor
+        self.pad_val = pad_val
+        self.seg_pad_val = seg_pad_val
+
+        assert not (bgr_to_rgb and rgb_to_bgr), (
+            '`bgr2rgb` and `rgb2bgr` cannot be set to True at the same time')
+        self.channel_conversion = rgb_to_bgr or bgr_to_rgb
+
+        if mean is not None:
+            assert std is not None, 'To enable the normalization in ' \
+                                    'preprocessing, please specify both ' \
+                                    '`mean` and `std`.'
+            # Enable the normalization in preprocessing.
+            self._enable_normalize = True
+            self.register_buffer('mean',
+                                 torch.tensor(mean).view(-1, 1, 1), False)
+            self.register_buffer('std',
+                                 torch.tensor(std).view(-1, 1, 1), False)
+        else:
+            self._enable_normalize = False
+
+        # TODO: support batch augmentations.
+        self.batch_augments = batch_augments
+
+        # Support different padding methods in testing
+        self.test_cfg = test_cfg
+
+    def forward(self, data: dict, training: bool = False) -> Dict[str, Any]:
+        """Perform normalization, padding and bgr2rgb conversion based on
+        ``BaseDataPreprocessor``.
+
+        Args:
+            data (dict): data sampled from dataloader.
+            training (bool): Whether to enable training time augmentation.
+
+        Returns:
+            Dict: Data in the same format as the model input.
+        """
+        data = self.cast_data(data)  # type: ignore
+        inputs = data['inputs']
+        data_samples = data.get('data_samples', None)
+        # TODO: whether normalize should be after stack_batch
+        if self.channel_conversion and inputs[0].size(0) == 3:
+            inputs = [_input[[2, 1, 0], ...] for _input in inputs]
+
+        inputs = [_input.float() for _input in inputs]
+        if self._enable_normalize:
+            inputs = [(_input - self.mean) / self.std for _input in inputs]
+
+        if training:
+            assert data_samples is not None, (
+                'During training, `data_samples` must be defined.')
+            inputs, data_samples = stack_batch(
+                inputs=inputs,
+                data_samples=data_samples,
+                size=self.size,
+                size_divisor=self.size_divisor,
+                pad_val=self.pad_val,
+                seg_pad_val=self.seg_pad_val)
+
+            if self.batch_augments is not None:
+                inputs, data_samples = self.batch_augments(
+                    inputs, data_samples)
+        else:
+            img_size = inputs[0].shape[1:]
+            assert all(input_.shape[1:] == img_size for input_ in inputs), \
+                'The image size in a batch should be the same.'
+            # pad images when testing
+            if self.test_cfg:
+                inputs, padded_samples = stack_batch(
+                    inputs=inputs,
+                    size=self.test_cfg.get('size', None),
+                    size_divisor=self.test_cfg.get('size_divisor', None),
+                    pad_val=self.pad_val,
+                    seg_pad_val=self.seg_pad_val)
+                for data_sample, pad_info in zip(data_samples, padded_samples):
+                    data_sample.set_metainfo({**pad_info})
+            else:
+                inputs = torch.stack(inputs, dim=0)
+
+        return dict(inputs=inputs, data_samples=data_samples)
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/__init__.py b/head_extractor/build/lib/mmseg/models/decode_heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4229763816e4100ab6718e4698a21ce92199371b
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/__init__.py
@@ -0,0 +1,48 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .ann_head import ANNHead
+from .apc_head import APCHead
+from .aspp_head import ASPPHead
+from .cc_head import CCHead
+from .da_head import DAHead
+from .ddr_head import DDRHead
+from .dm_head import DMHead
+from .dnl_head import DNLHead
+from .dpt_head import DPTHead
+from .ema_head import EMAHead
+from .enc_head import EncHead
+from .fcn_head import FCNHead
+from .fpn_head import FPNHead
+from .gc_head import GCHead
+from .ham_head import LightHamHead
+from .isa_head import ISAHead
+from .knet_head import IterativeDecodeHead, KernelUpdateHead, KernelUpdator
+from .lraspp_head import LRASPPHead
+from .mask2former_head import Mask2FormerHead
+from .maskformer_head import MaskFormerHead
+from .nl_head import NLHead
+from .ocr_head import OCRHead
+from .pid_head import PIDHead
+from .point_head import PointHead
+from .psa_head import PSAHead
+from .psp_head import PSPHead
+from .san_head import SideAdapterCLIPHead
+from .segformer_head import SegformerHead
+from .segmenter_mask_head import SegmenterMaskTransformerHead
+from .sep_aspp_head import DepthwiseSeparableASPPHead
+from .sep_fcn_head import DepthwiseSeparableFCNHead
+from .setr_mla_head import SETRMLAHead
+from .setr_up_head import SETRUPHead
+from .stdc_head import STDCHead
+from .uper_head import UPerHead
+from .vpd_depth_head import VPDDepthHead
+
+__all__ = [
+    'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead',
+    'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead',
+    'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead',
+    'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SETRUPHead',
+    'SETRMLAHead', 'DPTHead', 'SegmenterMaskTransformerHead',
+    'SegformerHead', 'ISAHead', 'STDCHead', 'IterativeDecodeHead',
+    'KernelUpdateHead', 'KernelUpdator', 'MaskFormerHead', 'Mask2FormerHead',
+    'LightHamHead', 'PIDHead', 'DDRHead', 'VPDDepthHead', 'SideAdapterCLIPHead'
+]
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/ann_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/ann_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b40ef5aa1da0bc2473597fedca5b3f33973beb0
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/ann_head.py
@@ -0,0 +1,245 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from mmseg.registry import MODELS
+from ..utils import SelfAttentionBlock as _SelfAttentionBlock
+from .decode_head import BaseDecodeHead
+
+
+class PPMConcat(nn.ModuleList):
+    """Pyramid Pooling Module that only concatenates the features of each
+    layer.
+
+    Args:
+        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
+            Module.
+    """
+
+    def __init__(self, pool_scales=(1, 3, 6, 8)):
+        super().__init__(
+            [nn.AdaptiveAvgPool2d(pool_scale) for pool_scale in pool_scales])
+
+    def forward(self, feats):
+        """Forward function."""
+        ppm_outs = []
+        for ppm in self:
+            ppm_out = ppm(feats)
+            ppm_outs.append(ppm_out.view(*feats.shape[:2], -1))
+        concat_outs = torch.cat(ppm_outs, dim=2)
+        return concat_outs
+
+
+class SelfAttentionBlock(_SelfAttentionBlock):
+    """SelfAttentionBlock used in ANN.
+
+    Args:
+        low_in_channels (int): Input channels of lower level feature,
+            which is the key feature for self-attention.
+        high_in_channels (int): Input channels of higher level feature,
+            which is the query feature for self-attention.
+        channels (int): Output channels of key/query transform.
+        out_channels (int): Output channels.
+        share_key_query (bool): Whether share projection weight between key
+            and query projection.
+        query_scale (int): The scale of query feature map.
+        key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
+            Module of key feature.
+        conv_cfg (dict|None): Config of conv layers.
+        norm_cfg (dict|None): Config of norm layers.
+        act_cfg (dict|None): Config of activation layers.
+    """
+
+    def __init__(self, low_in_channels, high_in_channels, channels,
+                 out_channels, share_key_query, query_scale, key_pool_scales,
+                 conv_cfg, norm_cfg, act_cfg):
+        key_psp = PPMConcat(key_pool_scales)
+        if query_scale > 1:
+            query_downsample = nn.MaxPool2d(kernel_size=query_scale)
+        else:
+            query_downsample = None
+        super().__init__(
+            key_in_channels=low_in_channels,
+            query_in_channels=high_in_channels,
+            channels=channels,
+            out_channels=out_channels,
+            share_key_query=share_key_query,
+            query_downsample=query_downsample,
+            key_downsample=key_psp,
+            key_query_num_convs=1,
+            key_query_norm=True,
+            value_out_num_convs=1,
+            value_out_norm=False,
+            matmul_norm=True,
+            with_out=True,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+
+class AFNB(nn.Module):
+    """Asymmetric Fusion Non-local Block (AFNB).
+
+    Args:
+        low_in_channels (int): Input channels of lower level feature,
+            which is the key feature for self-attention.
+        high_in_channels (int): Input channels of higher level feature,
+            which is the query feature for self-attention.
+        channels (int): Output channels of key/query transform.
+        out_channels (int): Output channels.
+        query_scales (tuple[int]): The scales of query feature map.
+            Default: (1,)
+        key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
+            Module of key feature.
+        conv_cfg (dict|None): Config of conv layers.
+        norm_cfg (dict|None): Config of norm layers.
+        act_cfg (dict|None): Config of activation layers.
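+
+    Example (an illustrative sketch, not from a released config; the
+    channel sizes and shapes below are arbitrary placeholders):
+        >>> import torch
+        >>> afnb = AFNB(low_in_channels=16, high_in_channels=32, channels=8,
+        ...             out_channels=32, query_scales=(1, ),
+        ...             key_pool_scales=(1, 3, 6, 8), conv_cfg=None,
+        ...             norm_cfg=None, act_cfg=None)
+        >>> low = torch.rand(1, 16, 32, 32)
+        >>> high = torch.rand(1, 32, 16, 16)
+        >>> afnb(low, high).shape
+        torch.Size([1, 32, 16, 16])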
+ """ + + def __init__(self, low_in_channels, high_in_channels, channels, + out_channels, query_scales, key_pool_scales, conv_cfg, + norm_cfg, act_cfg): + super().__init__() + self.stages = nn.ModuleList() + for query_scale in query_scales: + self.stages.append( + SelfAttentionBlock( + low_in_channels=low_in_channels, + high_in_channels=high_in_channels, + channels=channels, + out_channels=out_channels, + share_key_query=False, + query_scale=query_scale, + key_pool_scales=key_pool_scales, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottleneck = ConvModule( + out_channels + high_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, low_feats, high_feats): + """Forward function.""" + priors = [stage(high_feats, low_feats) for stage in self.stages] + context = torch.stack(priors, dim=0).sum(dim=0) + output = self.bottleneck(torch.cat([context, high_feats], 1)) + return output + + +class APNB(nn.Module): + """Asymmetric Pyramid Non-local Block (APNB) + + Args: + in_channels (int): Input channels of key/query feature, + which is the key feature for self-attention. + channels (int): Output channels of key/query transform. + out_channels (int): Output channels. + query_scales (tuple[int]): The scales of query feature map. + Default: (1,) + key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module of key feature. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict|None): Config of activation layers. + """ + + def __init__(self, in_channels, channels, out_channels, query_scales, + key_pool_scales, conv_cfg, norm_cfg, act_cfg): + super().__init__() + self.stages = nn.ModuleList() + for query_scale in query_scales: + self.stages.append( + SelfAttentionBlock( + low_in_channels=in_channels, + high_in_channels=in_channels, + channels=channels, + out_channels=out_channels, + share_key_query=True, + query_scale=query_scale, + key_pool_scales=key_pool_scales, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottleneck = ConvModule( + 2 * in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, feats): + """Forward function.""" + priors = [stage(feats, feats) for stage in self.stages] + context = torch.stack(priors, dim=0).sum(dim=0) + output = self.bottleneck(torch.cat([context, feats], 1)) + return output + + +@MODELS.register_module() +class ANNHead(BaseDecodeHead): + """Asymmetric Non-local Neural Networks for Semantic Segmentation. + + This head is the implementation of `ANNNet + `_. + + Args: + project_channels (int): Projection channels for Nonlocal. + query_scales (tuple[int]): The scales of query feature map. + Default: (1,) + key_pool_scales (tuple[int]): The pooling scales of key feature map. + Default: (1, 3, 6, 8). 
+ """ + + def __init__(self, + project_channels, + query_scales=(1, ), + key_pool_scales=(1, 3, 6, 8), + **kwargs): + super().__init__(input_transform='multiple_select', **kwargs) + assert len(self.in_channels) == 2 + low_in_channels, high_in_channels = self.in_channels + self.project_channels = project_channels + self.fusion = AFNB( + low_in_channels=low_in_channels, + high_in_channels=high_in_channels, + out_channels=high_in_channels, + channels=project_channels, + query_scales=query_scales, + key_pool_scales=key_pool_scales, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.bottleneck = ConvModule( + high_in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.context = APNB( + in_channels=self.channels, + out_channels=self.channels, + channels=project_channels, + query_scales=query_scales, + key_pool_scales=key_pool_scales, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + low_feats, high_feats = self._transform_inputs(inputs) + output = self.fusion(low_feats, high_feats) + output = self.dropout(output) + output = self.bottleneck(output) + output = self.context(output) + output = self.cls_seg(output) + + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/apc_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/apc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..728f39659c63680944306fddc9e33b7c9172c1ba --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/apc_head.py @@ -0,0 +1,159 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead + + +class ACM(nn.Module): + """Adaptive Context Module used in APCNet. + + Args: + pool_scale (int): Pooling scale used in Adaptive Context + Module to extract region features. + fusion (bool): Add one conv to fuse residual feature. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict | None): Config of conv layers. + norm_cfg (dict | None): Config of norm layers. + act_cfg (dict): Config of activation layers. 
+ """ + + def __init__(self, pool_scale, fusion, in_channels, channels, conv_cfg, + norm_cfg, act_cfg): + super().__init__() + self.pool_scale = pool_scale + self.fusion = fusion + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.pooled_redu_conv = ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.input_redu_conv = ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.global_info = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.gla = nn.Conv2d(self.channels, self.pool_scale**2, 1, 1, 0) + + self.residual_conv = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + if self.fusion: + self.fusion_conv = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x): + """Forward function.""" + pooled_x = F.adaptive_avg_pool2d(x, self.pool_scale) + # [batch_size, channels, h, w] + x = self.input_redu_conv(x) + # [batch_size, channels, pool_scale, pool_scale] + pooled_x = self.pooled_redu_conv(pooled_x) + batch_size = x.size(0) + # [batch_size, pool_scale * pool_scale, channels] + pooled_x = pooled_x.view(batch_size, self.channels, + -1).permute(0, 2, 1).contiguous() + # [batch_size, h * w, pool_scale * pool_scale] + affinity_matrix = self.gla(x + resize( + self.global_info(F.adaptive_avg_pool2d(x, 1)), size=x.shape[2:]) + ).permute(0, 2, 3, 1).reshape( + batch_size, -1, self.pool_scale**2) + affinity_matrix = F.sigmoid(affinity_matrix) + # [batch_size, h * w, channels] + z_out = torch.matmul(affinity_matrix, pooled_x) + # [batch_size, channels, h * w] + z_out = z_out.permute(0, 2, 1).contiguous() + # [batch_size, channels, h, w] + z_out = z_out.view(batch_size, self.channels, x.size(2), x.size(3)) + z_out = self.residual_conv(z_out) + z_out = F.relu(z_out + x) + if self.fusion: + z_out = self.fusion_conv(z_out) + + return z_out + + +@MODELS.register_module() +class APCHead(BaseDecodeHead): + """Adaptive Pyramid Context Network for Semantic Segmentation. + + This head is the implementation of + `APCNet `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Adaptive Context + Module. Default: (1, 2, 3, 6). + fusion (bool): Add one conv to fuse residual feature. 
+ """ + + def __init__(self, pool_scales=(1, 2, 3, 6), fusion=True, **kwargs): + super().__init__(**kwargs) + assert isinstance(pool_scales, (list, tuple)) + self.pool_scales = pool_scales + self.fusion = fusion + acm_modules = [] + for pool_scale in self.pool_scales: + acm_modules.append( + ACM(pool_scale, + self.fusion, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.acm_modules = nn.ModuleList(acm_modules) + self.bottleneck = ConvModule( + self.in_channels + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + acm_outs = [x] + for acm_module in self.acm_modules: + acm_outs.append(acm_module(x)) + acm_outs = torch.cat(acm_outs, dim=1) + output = self.bottleneck(acm_outs) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/aspp_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/aspp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6d7185d7de58d35ef17e5d54e0e75b045e8724c4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/aspp_head.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead + + +class ASPPModule(nn.ModuleList): + """Atrous Spatial Pyramid Pooling (ASPP) Module. + + Args: + dilations (tuple[int]): Dilation rate of each layer. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + """ + + def __init__(self, dilations, in_channels, channels, conv_cfg, norm_cfg, + act_cfg): + super().__init__() + self.dilations = dilations + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + for dilation in dilations: + self.append( + ConvModule( + self.in_channels, + self.channels, + 1 if dilation == 1 else 3, + dilation=dilation, + padding=0 if dilation == 1 else dilation, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def forward(self, x): + """Forward function.""" + aspp_outs = [] + for aspp_module in self: + aspp_outs.append(aspp_module(x)) + + return aspp_outs + + +@MODELS.register_module() +class ASPPHead(BaseDecodeHead): + """Rethinking Atrous Convolution for Semantic Image Segmentation. + + This head is the implementation of `DeepLabV3 + `_. + + Args: + dilations (tuple[int]): Dilation rates for ASPP module. + Default: (1, 6, 12, 18). 
+ """ + + def __init__(self, dilations=(1, 6, 12, 18), **kwargs): + super().__init__(**kwargs) + assert isinstance(dilations, (list, tuple)) + self.dilations = dilations + self.image_pool = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.aspp_modules = ASPPModule( + dilations, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.bottleneck = ConvModule( + (len(dilations) + 1) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _forward_feature(self, inputs): + """Forward function for feature maps before classifying each pixel with + ``self.cls_seg`` fc. + + Args: + inputs (list[Tensor]): List of multi-level img features. + + Returns: + feats (Tensor): A tensor of shape (batch_size, self.channels, + H, W) which is feature map for last layer of decoder head. + """ + x = self._transform_inputs(inputs) + aspp_outs = [ + resize( + self.image_pool(x), + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ] + aspp_outs.extend(self.aspp_modules(x)) + aspp_outs = torch.cat(aspp_outs, dim=1) + feats = self.bottleneck(aspp_outs) + return feats + + def forward(self, inputs): + """Forward function.""" + output = self._forward_feature(inputs) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/cascade_decode_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/cascade_decode_head.py new file mode 100644 index 0000000000000000000000000000000000000000..fe2bcb9302235e3881696dff6657e3e7fb12609b --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/cascade_decode_head.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List + +from torch import Tensor + +from mmseg.utils import ConfigType +from .decode_head import BaseDecodeHead + + +class BaseCascadeDecodeHead(BaseDecodeHead, metaclass=ABCMeta): + """Base class for cascade decode head used in + :class:`CascadeEncoderDecoder.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @abstractmethod + def forward(self, inputs, prev_output): + """Placeholder of forward function.""" + pass + + def loss(self, inputs: List[Tensor], prev_output: Tensor, + batch_data_samples: List[dict], train_cfg: ConfigType) -> Tensor: + """Forward function for training. + + Args: + inputs (List[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + batch_data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_sem_seg`. + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logits = self.forward(inputs, prev_output) + losses = self.loss_by_feat(seg_logits, batch_data_samples) + + return losses + + def predict(self, inputs: List[Tensor], prev_output: Tensor, + batch_img_metas: List[dict], tese_cfg: ConfigType): + """Forward function for testing. + + Args: + inputs (List[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. 
+ batch_img_metas (dict): List Image info where each dict may also + contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output segmentation map. + """ + seg_logits = self.forward(inputs, prev_output) + + return self.predict_by_feat(seg_logits, batch_img_metas) diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/cc_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/cc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e9075a2648d77f6bca6bb29f3e7db52a329f7afb --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/cc_head.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmseg.registry import MODELS +from .fcn_head import FCNHead + +try: + from mmcv.ops import CrissCrossAttention +except ModuleNotFoundError: + CrissCrossAttention = None + + +@MODELS.register_module() +class CCHead(FCNHead): + """CCNet: Criss-Cross Attention for Semantic Segmentation. + + This head is the implementation of `CCNet + `_. + + Args: + recurrence (int): Number of recurrence of Criss Cross Attention + module. Default: 2. + """ + + def __init__(self, recurrence=2, **kwargs): + if CrissCrossAttention is None: + raise RuntimeError('Please install mmcv-full for ' + 'CrissCrossAttention ops') + super().__init__(num_convs=2, **kwargs) + self.recurrence = recurrence + self.cca = CrissCrossAttention(self.channels) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs[0](x) + for _ in range(self.recurrence): + output = self.cca(output) + output = self.convs[1](output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/da_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/da_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d87214365d2f8695b60ccab0c1850669ff8dd295 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/da_head.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from mmcv.cnn import ConvModule, Scale +from torch import Tensor, nn + +from mmseg.registry import MODELS +from mmseg.utils import SampleList, add_prefix +from ..utils import SelfAttentionBlock as _SelfAttentionBlock +from .decode_head import BaseDecodeHead + + +class PAM(_SelfAttentionBlock): + """Position Attention Module (PAM) + + Args: + in_channels (int): Input channels of key/query feature. + channels (int): Output channels of key/query transform. 
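+
+    Example (an illustrative sketch; shapes are arbitrary placeholders):
+        >>> import torch
+        >>> pam = PAM(in_channels=64, channels=16)
+        >>> pam(torch.rand(2, 64, 32, 32)).shape
+        torch.Size([2, 64, 32, 32])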
+ """ + + def __init__(self, in_channels, channels): + super().__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=None, + key_downsample=None, + key_query_num_convs=1, + key_query_norm=False, + value_out_num_convs=1, + value_out_norm=False, + matmul_norm=False, + with_out=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None) + + self.gamma = Scale(0) + + def forward(self, x): + """Forward function.""" + out = super().forward(x, x) + + out = self.gamma(out) + x + return out + + +class CAM(nn.Module): + """Channel Attention Module (CAM)""" + + def __init__(self): + super().__init__() + self.gamma = Scale(0) + + def forward(self, x): + """Forward function.""" + batch_size, channels, height, width = x.size() + proj_query = x.view(batch_size, channels, -1) + proj_key = x.view(batch_size, channels, -1).permute(0, 2, 1) + energy = torch.bmm(proj_query, proj_key) + energy_new = torch.max( + energy, -1, keepdim=True)[0].expand_as(energy) - energy + attention = F.softmax(energy_new, dim=-1) + proj_value = x.view(batch_size, channels, -1) + + out = torch.bmm(attention, proj_value) + out = out.view(batch_size, channels, height, width) + + out = self.gamma(out) + x + return out + + +@MODELS.register_module() +class DAHead(BaseDecodeHead): + """Dual Attention Network for Scene Segmentation. + + This head is the implementation of `DANet + `_. + + Args: + pam_channels (int): The channels of Position Attention Module(PAM). + """ + + def __init__(self, pam_channels, **kwargs): + super().__init__(**kwargs) + self.pam_channels = pam_channels + self.pam_in_conv = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.pam = PAM(self.channels, pam_channels) + self.pam_out_conv = ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.pam_conv_seg = nn.Conv2d( + self.channels, self.num_classes, kernel_size=1) + + self.cam_in_conv = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.cam = CAM() + self.cam_out_conv = ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.cam_conv_seg = nn.Conv2d( + self.channels, self.num_classes, kernel_size=1) + + def pam_cls_seg(self, feat): + """PAM feature classification.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.pam_conv_seg(feat) + return output + + def cam_cls_seg(self, feat): + """CAM feature classification.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.cam_conv_seg(feat) + return output + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + pam_feat = self.pam_in_conv(x) + pam_feat = self.pam(pam_feat) + pam_feat = self.pam_out_conv(pam_feat) + pam_out = self.pam_cls_seg(pam_feat) + + cam_feat = self.cam_in_conv(x) + cam_feat = self.cam(cam_feat) + cam_feat = self.cam_out_conv(cam_feat) + cam_out = self.cam_cls_seg(cam_feat) + + feat_sum = pam_feat + cam_feat + pam_cam_out = self.cls_seg(feat_sum) + + return pam_cam_out, pam_out, cam_out + + def predict(self, inputs, batch_img_metas: List[dict], test_cfg, + **kwargs) -> List[Tensor]: + """Forward function for testing, only 
``pam_cam`` is used.""" + seg_logits = self.forward(inputs)[0] + return self.predict_by_feat(seg_logits, batch_img_metas, **kwargs) + + def loss_by_feat(self, seg_logit: Tuple[Tensor], + batch_data_samples: SampleList, **kwargs) -> dict: + """Compute ``pam_cam``, ``pam``, ``cam`` loss.""" + pam_cam_seg_logit, pam_seg_logit, cam_seg_logit = seg_logit + loss = dict() + loss.update( + add_prefix( + super().loss_by_feat(pam_cam_seg_logit, batch_data_samples), + 'pam_cam')) + loss.update( + add_prefix(super().loss_by_feat(pam_seg_logit, batch_data_samples), + 'pam')) + loss.update( + add_prefix(super().loss_by_feat(cam_seg_logit, batch_data_samples), + 'cam')) + return loss diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/ddr_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/ddr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ba26d6503c09d7efb3ca6664c7baf59c9e6e3ce9 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/ddr_head.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer +from torch import Tensor + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.models.losses import accuracy +from mmseg.models.utils import resize +from mmseg.registry import MODELS +from mmseg.utils import OptConfigType, SampleList + + +@MODELS.register_module() +class DDRHead(BaseDecodeHead): + """Decode head for DDRNet. + + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_classes (int): Number of classes. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict, optional): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). 
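+
+    Example (a minimal construction sketch; the channel values are
+    placeholders, not taken from a released DDRNet config):
+        >>> head = DDRHead(in_channels=128, channels=64, num_classes=19)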
+ """ + + def __init__(self, + in_channels: int, + channels: int, + num_classes: int, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + **kwargs): + super().__init__( + in_channels, + channels, + num_classes=num_classes, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs) + + self.head = self._make_base_head(self.in_channels, self.channels) + self.aux_head = self._make_base_head(self.in_channels // 2, + self.channels) + self.aux_cls_seg = nn.Conv2d( + self.channels, self.out_channels, kernel_size=1) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward( + self, + inputs: Union[Tensor, + Tuple[Tensor]]) -> Union[Tensor, Tuple[Tensor]]: + if self.training: + c3_feat, c5_feat = inputs + x_c = self.head(c5_feat) + x_c = self.cls_seg(x_c) + x_s = self.aux_head(c3_feat) + x_s = self.aux_cls_seg(x_s) + + return x_c, x_s + else: + x_c = self.head(inputs) + x_c = self.cls_seg(x_c) + return x_c + + def _make_base_head(self, in_channels: int, + channels: int) -> nn.Sequential: + layers = [ + ConvModule( + in_channels, + channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + order=('norm', 'act', 'conv')), + build_norm_layer(self.norm_cfg, channels)[1], + build_activation_layer(self.act_cfg), + ] + + return nn.Sequential(*layers) + + def loss_by_feat(self, seg_logits: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + loss = dict() + context_logit, spatial_logit = seg_logits + seg_label = self._stack_batch_gt(batch_data_samples) + + context_logit = resize( + context_logit, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + spatial_logit = resize( + spatial_logit, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + seg_label = seg_label.squeeze(1) + + loss['loss_context'] = self.loss_decode[0](context_logit, seg_label) + loss['loss_spatial'] = self.loss_decode[1](spatial_logit, seg_label) + loss['acc_seg'] = accuracy( + context_logit, seg_label, ignore_index=self.ignore_index) + + return loss diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/decode_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/decode_head.py new file mode 100644 index 0000000000000000000000000000000000000000..fd53afe22d9350482e84c989d3d87e4e07d1ee6b --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/decode_head.py @@ -0,0 +1,366 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import ABCMeta, abstractmethod +from typing import List, Tuple + +import torch +import torch.nn as nn +from mmengine.model import BaseModule +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures import build_pixel_sampler +from mmseg.utils import ConfigType, SampleList +from ..losses import accuracy +from ..utils import resize + + +class BaseDecodeHead(BaseModule, metaclass=ABCMeta): + """Base class for BaseDecodeHead. + + 1. The ``init_weights`` method is used to initialize decode_head's + model parameters. After segmentor initialization, ``init_weights`` + is triggered when ``segmentor.init_weights()`` is called externally. + + 2. 
The ``loss`` method is used to calculate the loss of decode_head,
+    which includes two steps: (1) the decode_head model performs forward
+    propagation to obtain the feature maps; (2) the ``loss_by_feat`` method
+    is called based on the feature maps to calculate the loss.
+
+    .. code:: text
+
+        loss(): forward() -> loss_by_feat()
+
+    3. The ``predict`` method is used to predict segmentation results,
+    which includes two steps: (1) the decode_head model performs forward
+    propagation to obtain the feature maps; (2) the ``predict_by_feat``
+    method is called based on the feature maps to predict segmentation
+    results including post-processing.
+
+    .. code:: text
+
+        predict(): forward() -> predict_by_feat()
+
+    Args:
+        in_channels (int|Sequence[int]): Input channels.
+        channels (int): Channels after modules, before conv_seg.
+        num_classes (int): Number of classes.
+        out_channels (int): Output channels of conv_seg. Default: None.
+        threshold (float): Threshold for binary segmentation in the case of
+            `num_classes==1`. Default: None.
+        dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
+        conv_cfg (dict|None): Config of conv layers. Default: None.
+        norm_cfg (dict|None): Config of norm layers. Default: None.
+        act_cfg (dict): Config of activation layers.
+            Default: dict(type='ReLU')
+        in_index (int|Sequence[int]): Input feature index. Default: -1
+        input_transform (str|None): Transformation type of input features.
+            Options: 'resize_concat', 'multiple_select', None.
+            'resize_concat': Multiple feature maps will be resized to the
+                same size as the first one and then concatenated together.
+                Usually used in FCN head of HRNet.
+            'multiple_select': Multiple feature maps will be bundled into
+                a list and passed into decode head.
+            None: Only a single feature map will be selected.
+            Default: None.
+        loss_decode (dict | Sequence[dict]): Config of decode loss.
+            The `loss_name` is a property of the corresponding loss function,
+            which is shown in the training log. If you want this loss
+            item to be included into the backward graph, `loss_` must be the
+            prefix of the name. Defaults to 'loss_ce'.
+            e.g. dict(type='CrossEntropyLoss'),
+            [dict(type='CrossEntropyLoss', loss_name='loss_ce'),
+             dict(type='DiceLoss', loss_name='loss_dice')]
+            Default: dict(type='CrossEntropyLoss').
+        ignore_index (int | None): The label index to be ignored. When using
+            masked BCE loss, ignore_index should be set to None. Default: 255.
+        sampler (dict|None): The config of segmentation map sampler.
+            Default: None.
+        align_corners (bool): align_corners argument of F.interpolate.
+            Default: False.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
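+
+    Example (an illustrative minimal subclass, not part of the original
+    API; it only wires ``_transform_inputs`` to ``cls_seg``):
+        >>> import torch
+        >>> class ToyHead(BaseDecodeHead):
+        ...     def forward(self, inputs):
+        ...         return self.cls_seg(self._transform_inputs(inputs))
+        >>> head = ToyHead(in_channels=8, in_index=0, channels=8,
+        ...                num_classes=2)
+        >>> head([torch.rand(1, 8, 16, 16)]).shape
+        torch.Size([1, 2, 16, 16])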
+ """ + + def __init__(self, + in_channels, + channels, + *, + num_classes, + out_channels=None, + threshold=None, + dropout_ratio=0.1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + in_index=-1, + input_transform=None, + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + ignore_index=255, + sampler=None, + align_corners=False, + init_cfg=dict( + type='Normal', std=0.01, override=dict(name='conv_seg'))): + super().__init__(init_cfg) + self._init_inputs(in_channels, in_index, input_transform) + self.channels = channels + self.dropout_ratio = dropout_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_index = in_index + + self.ignore_index = ignore_index + self.align_corners = align_corners + + if out_channels is None: + if num_classes == 2: + warnings.warn('For binary segmentation, we suggest using' + '`out_channels = 1` to define the output' + 'channels of segmentor, and use `threshold`' + 'to convert `seg_logits` into a prediction' + 'applying a threshold') + out_channels = num_classes + + if out_channels != num_classes and out_channels != 1: + raise ValueError( + 'out_channels should be equal to num_classes,' + 'except binary segmentation set out_channels == 1 and' + f'num_classes == 2, but got out_channels={out_channels}' + f'and num_classes={num_classes}') + + if out_channels == 1 and threshold is None: + threshold = 0.3 + warnings.warn('threshold is not defined for binary, and defaults' + 'to 0.3') + self.num_classes = num_classes + self.out_channels = out_channels + self.threshold = threshold + + if isinstance(loss_decode, dict): + self.loss_decode = MODELS.build(loss_decode) + elif isinstance(loss_decode, (list, tuple)): + self.loss_decode = nn.ModuleList() + for loss in loss_decode: + self.loss_decode.append(MODELS.build(loss)) + else: + raise TypeError(f'loss_decode must be a dict or sequence of dict,\ + but got {type(loss_decode)}') + + if sampler is not None: + self.sampler = build_pixel_sampler(sampler, context=self) + else: + self.sampler = None + + self.conv_seg = nn.Conv2d(channels, self.out_channels, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + + def extra_repr(self): + """Extra repr.""" + s = f'input_transform={self.input_transform}, ' \ + f'ignore_index={self.ignore_index}, ' \ + f'align_corners={self.align_corners}' + return s + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. 
+ """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. + + Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + @abstractmethod + def forward(self, inputs): + """Placeholder of forward function.""" + pass + + def cls_seg(self, feat): + """Classify each pixel.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + def loss(self, inputs: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Forward function for training. + + Args: + inputs (Tuple[Tensor]): List of multi-level img features. + batch_data_samples (list[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `img_metas` or `gt_semantic_seg`. + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logits = self.forward(inputs) + losses = self.loss_by_feat(seg_logits, batch_data_samples) + return losses + + def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tensor: + """Forward function for prediction. + + Args: + inputs (Tuple[Tensor]): List of multi-level img features. + batch_img_metas (dict): List Image info where each dict may also + contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Outputs segmentation logits map. + """ + seg_logits = self.forward(inputs) + + return self.predict_by_feat(seg_logits, batch_img_metas) + + def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tensor: + gt_semantic_segs = [ + data_sample.gt_sem_seg.data for data_sample in batch_data_samples + ] + return torch.stack(gt_semantic_segs, dim=0) + + def loss_by_feat(self, seg_logits: Tensor, + batch_data_samples: SampleList) -> dict: + """Compute segmentation loss. + + Args: + seg_logits (Tensor): The output from decode head forward function. + batch_data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + seg_label = self._stack_batch_gt(batch_data_samples) + loss = dict() + seg_logits = resize( + input=seg_logits, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + if self.sampler is not None: + seg_weight = self.sampler.sample(seg_logits, seg_label) + else: + seg_weight = None + seg_label = seg_label.squeeze(1) + + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + for loss_decode in losses_decode: + if loss_decode.loss_name not in loss: + loss[loss_decode.loss_name] = loss_decode( + seg_logits, + seg_label, + weight=seg_weight, + ignore_index=self.ignore_index) + else: + loss[loss_decode.loss_name] += loss_decode( + seg_logits, + seg_label, + weight=seg_weight, + ignore_index=self.ignore_index) + + loss['acc_seg'] = accuracy( + seg_logits, seg_label, ignore_index=self.ignore_index) + return loss + + def predict_by_feat(self, seg_logits: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Transform a batch of output seg_logits to the input shape. + + Args: + seg_logits (Tensor): The output from decode head forward function. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tensor: Outputs segmentation logits map. + """ + + if isinstance(batch_img_metas[0]['img_shape'], torch.Size): + # slide inference + size = batch_img_metas[0]['img_shape'] + elif 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'][:2] + else: + size = batch_img_metas[0]['img_shape'] + + seg_logits = resize( + input=seg_logits, + size=size, + mode='bilinear', + align_corners=self.align_corners) + return seg_logits diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/dm_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/dm_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7694abd8ac3a470d543c580bd97adceb5b647f7c --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/dm_head.py @@ -0,0 +1,141 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer + +from mmseg.registry import MODELS +from .decode_head import BaseDecodeHead + + +class DCM(nn.Module): + """Dynamic Convolutional Module used in DMNet. + + Args: + filter_size (int): The filter size of generated convolution kernel + used in Dynamic Convolutional Module. + fusion (bool): Add one conv to fuse DCM output feature. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict | None): Config of conv layers. + norm_cfg (dict | None): Config of norm layers. + act_cfg (dict): Config of activation layers. 
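+
+    Example (an illustrative sketch; shapes are arbitrary placeholders):
+        >>> import torch
+        >>> dcm = DCM(filter_size=3, fusion=False, in_channels=64,
+        ...           channels=16, conv_cfg=None, norm_cfg=None,
+        ...           act_cfg=dict(type='ReLU'))
+        >>> dcm(torch.rand(1, 64, 32, 32)).shape
+        torch.Size([1, 16, 32, 32])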
+ """ + + def __init__(self, filter_size, fusion, in_channels, channels, conv_cfg, + norm_cfg, act_cfg): + super().__init__() + self.filter_size = filter_size + self.fusion = fusion + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.filter_gen_conv = nn.Conv2d(self.in_channels, self.channels, 1, 1, + 0) + + self.input_redu_conv = ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + if self.norm_cfg is not None: + self.norm = build_norm_layer(self.norm_cfg, self.channels)[1] + else: + self.norm = None + self.activate = build_activation_layer(self.act_cfg) + + if self.fusion: + self.fusion_conv = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x): + """Forward function.""" + generated_filter = self.filter_gen_conv( + F.adaptive_avg_pool2d(x, self.filter_size)) + x = self.input_redu_conv(x) + b, c, h, w = x.shape + # [1, b * c, h, w], c = self.channels + x = x.view(1, b * c, h, w) + # [b * c, 1, filter_size, filter_size] + generated_filter = generated_filter.view(b * c, 1, self.filter_size, + self.filter_size) + pad = (self.filter_size - 1) // 2 + if (self.filter_size - 1) % 2 == 0: + p2d = (pad, pad, pad, pad) + else: + p2d = (pad + 1, pad, pad + 1, pad) + x = F.pad(input=x, pad=p2d, mode='constant', value=0) + # [1, b * c, h, w] + output = F.conv2d(input=x, weight=generated_filter, groups=b * c) + # [b, c, h, w] + output = output.view(b, c, h, w) + if self.norm is not None: + output = self.norm(output) + output = self.activate(output) + + if self.fusion: + output = self.fusion_conv(output) + + return output + + +@MODELS.register_module() +class DMHead(BaseDecodeHead): + """Dynamic Multi-scale Filters for Semantic Segmentation. + + This head is the implementation of + `DMNet `_. + + Args: + filter_sizes (tuple[int]): The size of generated convolutional filters + used in Dynamic Convolutional Module. Default: (1, 3, 5, 7). + fusion (bool): Add one conv to fuse DCM output feature. 
+ """ + + def __init__(self, filter_sizes=(1, 3, 5, 7), fusion=False, **kwargs): + super().__init__(**kwargs) + assert isinstance(filter_sizes, (list, tuple)) + self.filter_sizes = filter_sizes + self.fusion = fusion + dcm_modules = [] + for filter_size in self.filter_sizes: + dcm_modules.append( + DCM(filter_size, + self.fusion, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.dcm_modules = nn.ModuleList(dcm_modules) + self.bottleneck = ConvModule( + self.in_channels + len(filter_sizes) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + dcm_outs = [x] + for dcm_module in self.dcm_modules: + dcm_outs.append(dcm_module(x)) + dcm_outs = torch.cat(dcm_outs, dim=1) + output = self.bottleneck(dcm_outs) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/dnl_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/dnl_head.py new file mode 100644 index 0000000000000000000000000000000000000000..248c11814108d02e88fa7e0cada061b3366e33ff --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/dnl_head.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import NonLocal2d +from torch import nn + +from mmseg.registry import MODELS +from .fcn_head import FCNHead + + +class DisentangledNonLocal2d(NonLocal2d): + """Disentangled Non-Local Blocks. + + Args: + temperature (float): Temperature to adjust attention. Default: 0.05 + """ + + def __init__(self, *arg, temperature, **kwargs): + super().__init__(*arg, **kwargs) + self.temperature = temperature + self.conv_mask = nn.Conv2d(self.in_channels, 1, kernel_size=1) + + def embedded_gaussian(self, theta_x, phi_x): + """Embedded gaussian with temperature.""" + + # NonLocal2d pairwise_weight: [N, HxW, HxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + if self.use_scale: + # theta_x.shape[-1] is `self.inter_channels` + pairwise_weight /= torch.tensor( + theta_x.shape[-1], + dtype=torch.float, + device=pairwise_weight.device)**torch.tensor( + 0.5, device=pairwise_weight.device) + pairwise_weight /= torch.tensor( + self.temperature, device=pairwise_weight.device) + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def forward(self, x): + # x: [N, C, H, W] + n = x.size(0) + + # g_x: [N, HxW, C] + g_x = self.g(x).view(n, self.inter_channels, -1) + g_x = g_x.permute(0, 2, 1) + + # theta_x: [N, HxW, C], phi_x: [N, C, HxW] + if self.mode == 'gaussian': + theta_x = x.view(n, self.in_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + if self.sub_sample: + phi_x = self.phi(x).view(n, self.in_channels, -1) + else: + phi_x = x.view(n, self.in_channels, -1) + elif self.mode == 'concatenation': + theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) + phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) + else: + theta_x = self.theta(x).view(n, self.inter_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + phi_x = self.phi(x).view(n, self.inter_channels, -1) + + # subtract mean + theta_x -= theta_x.mean(dim=-2, keepdim=True) + phi_x -= phi_x.mean(dim=-1, keepdim=True) + + pairwise_func = getattr(self, self.mode) + # pairwise_weight: [N, HxW, HxW] + pairwise_weight = pairwise_func(theta_x, phi_x) + + # y: [N, HxW, C] + y = torch.matmul(pairwise_weight, g_x) + # y: [N, 
C, H, W]
+        y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
+                                                    *x.size()[2:])
+
+        # unary_mask: [N, 1, HxW]
+        unary_mask = self.conv_mask(x)
+        unary_mask = unary_mask.view(n, 1, -1)
+        unary_mask = unary_mask.softmax(dim=-1)
+        # unary_x: [N, 1, C]
+        unary_x = torch.matmul(unary_mask, g_x)
+        # unary_x: [N, C, 1, 1]
+        unary_x = unary_x.permute(0, 2, 1).contiguous().reshape(
+            n, self.inter_channels, 1, 1)
+
+        output = x + self.conv_out(y + unary_x)
+
+        return output
+
+
+@MODELS.register_module()
+class DNLHead(FCNHead):
+    """Disentangled Non-Local Neural Networks.
+
+    This head is the implementation of `DNLNet
+    <https://arxiv.org/abs/2006.06668>`_.
+
+    Args:
+        reduction (int): Reduction factor of projection transform. Default: 2.
+        use_scale (bool): Whether to scale pairwise_weight by
+            sqrt(1/inter_channels). Default: True.
+        mode (str): The nonlocal mode. Options are 'embedded_gaussian',
+            'dot_product'. Default: 'embedded_gaussian'.
+        temperature (float): Temperature to adjust attention. Default: 0.05
+    """
+
+    def __init__(self,
+                 reduction=2,
+                 use_scale=True,
+                 mode='embedded_gaussian',
+                 temperature=0.05,
+                 **kwargs):
+        super().__init__(num_convs=2, **kwargs)
+        self.reduction = reduction
+        self.use_scale = use_scale
+        self.mode = mode
+        self.temperature = temperature
+        self.dnl_block = DisentangledNonLocal2d(
+            in_channels=self.channels,
+            reduction=self.reduction,
+            use_scale=self.use_scale,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            mode=self.mode,
+            temperature=self.temperature)
+
+    def forward(self, inputs):
+        """Forward function."""
+        x = self._transform_inputs(inputs)
+        output = self.convs[0](x)
+        output = self.dnl_block(output)
+        output = self.convs[1](output)
+        if self.concat_input:
+            output = self.conv_cat(torch.cat([x, output], dim=1))
+        output = self.cls_seg(output)
+        return output
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/dpt_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/dpt_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2cfd89daa4df48601e930cfd158dcf3c9a6a837
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/dpt_head.py
@@ -0,0 +1,294 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, Linear, build_activation_layer
+from mmengine.model import BaseModule
+
+from mmseg.registry import MODELS
+from ..utils import resize
+from .decode_head import BaseDecodeHead
+
+
+class ReassembleBlocks(BaseModule):
+    """ViTPostProcessBlock, process cls_token in ViT backbone output and
+    rearrange the feature vector to feature map.
+
+    Args:
+        in_channels (int): ViT feature channels. Default: 768.
+        out_channels (List): output channels of each stage.
+            Default: [96, 192, 384, 768].
+        readout_type (str): Type of readout operation. Default: 'ignore'.
+        patch_size (int): The patch size. Default: 16.
+        init_cfg (dict, optional): Initialization config dict. Default: None.
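+
+    Example (an illustrative sketch; a 14x14 ViT token grid with one
+    cls_token per stage is assumed):
+        >>> import torch
+        >>> blocks = ReassembleBlocks()
+        >>> feats = [(torch.rand(1, 768, 14, 14), torch.rand(1, 768))
+        ...          for _ in range(4)]
+        >>> [out.shape[-1] for out in blocks(feats)]
+        [56, 28, 14, 7]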
+ """ + + def __init__(self, + in_channels=768, + out_channels=[96, 192, 384, 768], + readout_type='ignore', + patch_size=16, + init_cfg=None): + super().__init__(init_cfg) + + assert readout_type in ['ignore', 'add', 'project'] + self.readout_type = readout_type + self.patch_size = patch_size + + self.projects = nn.ModuleList([ + ConvModule( + in_channels=in_channels, + out_channels=out_channel, + kernel_size=1, + act_cfg=None, + ) for out_channel in out_channels + ]) + + self.resize_layers = nn.ModuleList([ + nn.ConvTranspose2d( + in_channels=out_channels[0], + out_channels=out_channels[0], + kernel_size=4, + stride=4, + padding=0), + nn.ConvTranspose2d( + in_channels=out_channels[1], + out_channels=out_channels[1], + kernel_size=2, + stride=2, + padding=0), + nn.Identity(), + nn.Conv2d( + in_channels=out_channels[3], + out_channels=out_channels[3], + kernel_size=3, + stride=2, + padding=1) + ]) + if self.readout_type == 'project': + self.readout_projects = nn.ModuleList() + for _ in range(len(self.projects)): + self.readout_projects.append( + nn.Sequential( + Linear(2 * in_channels, in_channels), + build_activation_layer(dict(type='GELU')))) + + def forward(self, inputs): + assert isinstance(inputs, list) + out = [] + for i, x in enumerate(inputs): + assert len(x) == 2 + x, cls_token = x[0], x[1] + feature_shape = x.shape + if self.readout_type == 'project': + x = x.flatten(2).permute((0, 2, 1)) + readout = cls_token.unsqueeze(1).expand_as(x) + x = self.readout_projects[i](torch.cat((x, readout), -1)) + x = x.permute(0, 2, 1).reshape(feature_shape) + elif self.readout_type == 'add': + x = x.flatten(2) + cls_token.unsqueeze(-1) + x = x.reshape(feature_shape) + else: + pass + x = self.projects[i](x) + x = self.resize_layers[i](x) + out.append(x) + return out + + +class PreActResidualConvUnit(BaseModule): + """ResidualConvUnit, pre-activate residual unit. + + Args: + in_channels (int): number of channels in the input feature map. + act_cfg (dict): dictionary to construct and config activation layer. + norm_cfg (dict): dictionary to construct and config norm layer. + stride (int): stride of the first block. Default: 1 + dilation (int): dilation rate for convs layers. Default: 1. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ + + def __init__(self, + in_channels, + act_cfg, + norm_cfg, + stride=1, + dilation=1, + init_cfg=None): + super().__init__(init_cfg) + + self.conv1 = ConvModule( + in_channels, + in_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=False, + order=('act', 'conv', 'norm')) + + self.conv2 = ConvModule( + in_channels, + in_channels, + 3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=False, + order=('act', 'conv', 'norm')) + + def forward(self, inputs): + inputs_ = inputs.clone() + x = self.conv1(inputs) + x = self.conv2(x) + return x + inputs_ + + +class FeatureFusionBlock(BaseModule): + """FeatureFusionBlock, merge feature map from different stages. + + Args: + in_channels (int): Input channels. + act_cfg (dict): The activation config for ResidualConvUnit. + norm_cfg (dict): Config dict for normalization layer. + expand (bool): Whether expand the channels in post process block. + Default: False. + align_corners (bool): align_corner setting for bilinear upsample. + Default: True. + init_cfg (dict, optional): Initialization config dict. Default: None. 
+ """ + + def __init__(self, + in_channels, + act_cfg, + norm_cfg, + expand=False, + align_corners=True, + init_cfg=None): + super().__init__(init_cfg) + + self.in_channels = in_channels + self.expand = expand + self.align_corners = align_corners + + self.out_channels = in_channels + if self.expand: + self.out_channels = in_channels // 2 + + self.project = ConvModule( + self.in_channels, + self.out_channels, + kernel_size=1, + act_cfg=None, + bias=True) + + self.res_conv_unit1 = PreActResidualConvUnit( + in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) + self.res_conv_unit2 = PreActResidualConvUnit( + in_channels=self.in_channels, act_cfg=act_cfg, norm_cfg=norm_cfg) + + def forward(self, *inputs): + x = inputs[0] + if len(inputs) == 2: + if x.shape != inputs[1].shape: + res = resize( + inputs[1], + size=(x.shape[2], x.shape[3]), + mode='bilinear', + align_corners=False) + else: + res = inputs[1] + x = x + self.res_conv_unit1(res) + x = self.res_conv_unit2(x) + x = resize( + x, + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners) + x = self.project(x) + return x + + +@MODELS.register_module() +class DPTHead(BaseDecodeHead): + """Vision Transformers for Dense Prediction. + + This head is implemented of `DPT `_. + + Args: + embed_dims (int): The embed dimension of the ViT backbone. + Default: 768. + post_process_channels (List): Out channels of post process conv + layers. Default: [96, 192, 384, 768]. + readout_type (str): Type of readout operation. Default: 'ignore'. + patch_size (int): The patch size. Default: 16. + expand_channels (bool): Whether expand the channels in post process + block. Default: False. + act_cfg (dict): The activation config for residual conv unit. + Default dict(type='ReLU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). 
+ """ + + def __init__(self, + embed_dims=768, + post_process_channels=[96, 192, 384, 768], + readout_type='ignore', + patch_size=16, + expand_channels=False, + act_cfg=dict(type='ReLU'), + norm_cfg=dict(type='BN'), + **kwargs): + super().__init__(**kwargs) + + self.in_channels = self.in_channels + self.expand_channels = expand_channels + self.reassemble_blocks = ReassembleBlocks(embed_dims, + post_process_channels, + readout_type, patch_size) + + self.post_process_channels = [ + channel * math.pow(2, i) if expand_channels else channel + for i, channel in enumerate(post_process_channels) + ] + self.convs = nn.ModuleList() + for channel in self.post_process_channels: + self.convs.append( + ConvModule( + channel, + self.channels, + kernel_size=3, + padding=1, + act_cfg=None, + bias=False)) + self.fusion_blocks = nn.ModuleList() + for _ in range(len(self.convs)): + self.fusion_blocks.append( + FeatureFusionBlock(self.channels, act_cfg, norm_cfg)) + self.fusion_blocks[0].res_conv_unit1 = None + self.project = ConvModule( + self.channels, + self.channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg) + self.num_fusion_blocks = len(self.fusion_blocks) + self.num_reassemble_blocks = len(self.reassemble_blocks.resize_layers) + self.num_post_process_channels = len(self.post_process_channels) + assert self.num_fusion_blocks == self.num_reassemble_blocks + assert self.num_reassemble_blocks == self.num_post_process_channels + + def forward(self, inputs): + assert len(inputs) == self.num_reassemble_blocks + x = self._transform_inputs(inputs) + x = self.reassemble_blocks(x) + x = [self.convs[i](feature) for i, feature in enumerate(x)] + out = self.fusion_blocks[0](x[-1]) + for i in range(1, len(self.fusion_blocks)): + out = self.fusion_blocks[i](out, x[-(i + 1)]) + out = self.project(out) + out = self.cls_seg(out) + return out diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/ema_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/ema_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ab8dbb0c29b9b533dad962e48d71ae055f20aa07 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/ema_head.py @@ -0,0 +1,169 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from .decode_head import BaseDecodeHead + + +def reduce_mean(tensor): + """Reduce mean when distributed training.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor + + +class EMAModule(nn.Module): + """Expectation Maximization Attention Module used in EMANet. + + Args: + channels (int): Channels of the whole module. + num_bases (int): Number of bases. + num_stages (int): Number of the EM iterations. + """ + + def __init__(self, channels, num_bases, num_stages, momentum): + super().__init__() + assert num_stages >= 1, 'num_stages must be at least 1!' + self.num_bases = num_bases + self.num_stages = num_stages + self.momentum = momentum + + bases = torch.zeros(1, channels, self.num_bases) + bases.normal_(0, math.sqrt(2. 
/ self.num_bases)) + # [1, channels, num_bases] + bases = F.normalize(bases, dim=1, p=2) + self.register_buffer('bases', bases) + + def forward(self, feats): + """Forward function.""" + batch_size, channels, height, width = feats.size() + # [batch_size, channels, height*width] + feats = feats.view(batch_size, channels, height * width) + # [batch_size, channels, num_bases] + bases = self.bases.repeat(batch_size, 1, 1) + + with torch.no_grad(): + for i in range(self.num_stages): + # [batch_size, height*width, num_bases] + attention = torch.einsum('bcn,bck->bnk', feats, bases) + attention = F.softmax(attention, dim=2) + # l1 norm + attention_normed = F.normalize(attention, dim=1, p=1) + # [batch_size, channels, num_bases] + bases = torch.einsum('bcn,bnk->bck', feats, attention_normed) + # l2 norm + bases = F.normalize(bases, dim=1, p=2) + + feats_recon = torch.einsum('bck,bnk->bcn', bases, attention) + feats_recon = feats_recon.view(batch_size, channels, height, width) + + if self.training: + bases = bases.mean(dim=0, keepdim=True) + bases = reduce_mean(bases) + # l2 norm + bases = F.normalize(bases, dim=1, p=2) + self.bases = (1 - + self.momentum) * self.bases + self.momentum * bases + + return feats_recon + + +@MODELS.register_module() +class EMAHead(BaseDecodeHead): + """Expectation Maximization Attention Networks for Semantic Segmentation. + + This head is the implementation of `EMANet + `_. + + Args: + ema_channels (int): EMA module channels + num_bases (int): Number of bases. + num_stages (int): Number of the EM iterations. + concat_input (bool): Whether concat the input and output of convs + before classification layer. Default: True + momentum (float): Momentum to update the base. Default: 0.1. + """ + + def __init__(self, + ema_channels, + num_bases, + num_stages, + concat_input=True, + momentum=0.1, + **kwargs): + super().__init__(**kwargs) + self.ema_channels = ema_channels + self.num_bases = num_bases + self.num_stages = num_stages + self.concat_input = concat_input + self.momentum = momentum + self.ema_module = EMAModule(self.ema_channels, self.num_bases, + self.num_stages, self.momentum) + + self.ema_in_conv = ConvModule( + self.in_channels, + self.ema_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + # project (0, inf) -> (-inf, inf) + self.ema_mid_conv = ConvModule( + self.ema_channels, + self.ema_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=None, + act_cfg=None) + for param in self.ema_mid_conv.parameters(): + param.requires_grad = False + + self.ema_out_conv = ConvModule( + self.ema_channels, + self.ema_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.bottleneck = ConvModule( + self.ema_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if self.concat_input: + self.conv_cat = ConvModule( + self.in_channels + self.channels, + self.channels, + kernel_size=3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + feats = self.ema_in_conv(x) + identity = feats + feats = self.ema_mid_conv(feats) + recon = self.ema_module(feats) + recon = F.relu(recon, inplace=True) + recon = self.ema_out_conv(recon) + output = F.relu(identity + recon, inplace=True) + output = self.bottleneck(output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = 
self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/enc_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/enc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2bba73b3010b31cd39547b79bd507073f14bdb32 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/enc_head.py @@ -0,0 +1,196 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, build_norm_layer +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.utils import ConfigType, SampleList +from ..utils import Encoding, resize +from .decode_head import BaseDecodeHead + + +class EncModule(nn.Module): + """Encoding Module used in EncNet. + + Args: + in_channels (int): Input channels. + num_codes (int): Number of code words. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + """ + + def __init__(self, in_channels, num_codes, conv_cfg, norm_cfg, act_cfg): + super().__init__() + self.encoding_project = ConvModule( + in_channels, + in_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + # TODO: resolve this hack + # change to 1d + if norm_cfg is not None: + encoding_norm_cfg = norm_cfg.copy() + if encoding_norm_cfg['type'] in ['BN', 'IN']: + encoding_norm_cfg['type'] += '1d' + else: + encoding_norm_cfg['type'] = encoding_norm_cfg['type'].replace( + '2d', '1d') + else: + # fallback to BN1d + encoding_norm_cfg = dict(type='BN1d') + self.encoding = nn.Sequential( + Encoding(channels=in_channels, num_codes=num_codes), + build_norm_layer(encoding_norm_cfg, num_codes)[1], + nn.ReLU(inplace=True)) + self.fc = nn.Sequential( + nn.Linear(in_channels, in_channels), nn.Sigmoid()) + + def forward(self, x): + """Forward function.""" + encoding_projection = self.encoding_project(x) + encoding_feat = self.encoding(encoding_projection).mean(dim=1) + batch_size, channels, _, _ = x.size() + gamma = self.fc(encoding_feat) + y = gamma.view(batch_size, channels, 1, 1) + output = F.relu_(x + x * y) + return encoding_feat, output + + +@MODELS.register_module() +class EncHead(BaseDecodeHead): + """Context Encoding for Semantic Segmentation. + + This head is the implementation of `EncNet + `_. + + Args: + num_codes (int): Number of code words. Default: 32. + use_se_loss (bool): Whether use Semantic Encoding Loss (SE-loss) to + regularize the training. Default: True. + add_lateral (bool): Whether use lateral connection to fuse features. + Default: False. + loss_se_decode (dict): Config of decode loss. + Default: dict(type='CrossEntropyLoss', use_sigmoid=True). 
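+
+    Example:
+        An illustrative instantiation (channel numbers and indices are
+        assumptions for the sketch); ``input_transform`` is fixed to
+        ``'multiple_select'`` by ``__init__`` itself:
+
+        >>> head = EncHead(
+        ...     num_codes=32,
+        ...     use_se_loss=True,
+        ...     in_channels=[256, 512, 1024, 2048],
+        ...     in_index=(0, 1, 2, 3),
+        ...     channels=512,
+        ...     num_classes=19)
+        >>> # forward returns (seg_logits, se_logits) when use_se_loss=True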
+ """ + + def __init__(self, + num_codes=32, + use_se_loss=True, + add_lateral=False, + loss_se_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=0.2), + **kwargs): + super().__init__(input_transform='multiple_select', **kwargs) + self.use_se_loss = use_se_loss + self.add_lateral = add_lateral + self.num_codes = num_codes + self.bottleneck = ConvModule( + self.in_channels[-1], + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if add_lateral: + self.lateral_convs = nn.ModuleList() + for in_channels in self.in_channels[:-1]: # skip the last one + self.lateral_convs.append( + ConvModule( + in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.fusion = ConvModule( + len(self.in_channels) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.enc_module = EncModule( + self.channels, + num_codes=num_codes, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if self.use_se_loss: + self.loss_se_decode = MODELS.build(loss_se_decode) + self.se_layer = nn.Linear(self.channels, self.num_classes) + + def forward(self, inputs): + """Forward function.""" + inputs = self._transform_inputs(inputs) + feat = self.bottleneck(inputs[-1]) + if self.add_lateral: + laterals = [ + resize( + lateral_conv(inputs[i]), + size=feat.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + feat = self.fusion(torch.cat([feat, *laterals], 1)) + encode_feat, output = self.enc_module(feat) + output = self.cls_seg(output) + if self.use_se_loss: + se_output = self.se_layer(encode_feat) + return output, se_output + else: + return output + + def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType): + """Forward function for testing, ignore se_loss.""" + if self.use_se_loss: + seg_logits = self.forward(inputs)[0] + else: + seg_logits = self.forward(inputs) + return self.predict_by_feat(seg_logits, batch_img_metas) + + @staticmethod + def _convert_to_onehot_labels(seg_label, num_classes): + """Convert segmentation label to onehot. + + Args: + seg_label (Tensor): Segmentation label of shape (N, H, W). + num_classes (int): Number of classes. + + Returns: + Tensor: Onehot labels of shape (N, num_classes). 
+ """ + + batch_size = seg_label.size(0) + onehot_labels = seg_label.new_zeros((batch_size, num_classes)) + for i in range(batch_size): + hist = seg_label[i].float().histc( + bins=num_classes, min=0, max=num_classes - 1) + onehot_labels[i] = hist > 0 + return onehot_labels + + def loss_by_feat(self, seg_logit: Tuple[Tensor], + batch_data_samples: SampleList, **kwargs) -> dict: + """Compute segmentation and semantic encoding loss.""" + seg_logit, se_seg_logit = seg_logit + loss = dict() + loss.update(super().loss_by_feat(seg_logit, batch_data_samples)) + + seg_label = self._stack_batch_gt(batch_data_samples) + se_loss = self.loss_se_decode( + se_seg_logit, + self._convert_to_onehot_labels(seg_label, self.num_classes)) + loss['loss_se'] = se_loss + return loss diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/fcn_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/fcn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..341801888368d307da6b926a2c89f72b6b06476d --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/fcn_head.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from .decode_head import BaseDecodeHead + + +@MODELS.register_module() +class FCNHead(BaseDecodeHead): + """Fully Convolution Networks for Semantic Segmentation. + + This head is implemented of `FCNNet `_. + + Args: + num_convs (int): Number of convs in the head. Default: 2. + kernel_size (int): The kernel size for convs in the head. Default: 3. + concat_input (bool): Whether concat the input and output of convs + before classification layer. + dilation (int): The dilation rate for convs in the head. Default: 1. + """ + + def __init__(self, + num_convs=2, + kernel_size=3, + concat_input=True, + dilation=1, + **kwargs): + assert num_convs >= 0 and dilation > 0 and isinstance(dilation, int) + self.num_convs = num_convs + self.concat_input = concat_input + self.kernel_size = kernel_size + super().__init__(**kwargs) + if num_convs == 0: + assert self.in_channels == self.channels + + conv_padding = (kernel_size // 2) * dilation + convs = [] + convs.append( + ConvModule( + self.in_channels, + self.channels, + kernel_size=kernel_size, + padding=conv_padding, + dilation=dilation, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + for i in range(num_convs - 1): + convs.append( + ConvModule( + self.channels, + self.channels, + kernel_size=kernel_size, + padding=conv_padding, + dilation=dilation, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + if num_convs == 0: + self.convs = nn.Identity() + else: + self.convs = nn.Sequential(*convs) + if self.concat_input: + self.conv_cat = ConvModule( + self.in_channels + self.channels, + self.channels, + kernel_size=kernel_size, + padding=kernel_size // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _forward_feature(self, inputs): + """Forward function for feature maps before classifying each pixel with + ``self.cls_seg`` fc. + + Args: + inputs (list[Tensor]): List of multi-level img features. + + Returns: + feats (Tensor): A tensor of shape (batch_size, self.channels, + H, W) which is feature map for last layer of decoder head. 
+ """ + x = self._transform_inputs(inputs) + feats = self.convs(x) + if self.concat_input: + feats = self.conv_cat(torch.cat([x, feats], dim=1)) + return feats + + def forward(self, inputs): + """Forward function.""" + output = self._forward_feature(inputs) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/fpn_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/fpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..25f481fe81c5f4f0aa37903aaf135dc63c930bf8 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/fpn_head.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from ..utils import Upsample, resize +from .decode_head import BaseDecodeHead + + +@MODELS.register_module() +class FPNHead(BaseDecodeHead): + """Panoptic Feature Pyramid Networks. + + This head is the implementation of `Semantic FPN + `_. + + Args: + feature_strides (tuple[int]): The strides for input feature maps. + stack_lateral. All strides suppose to be power of 2. The first + one is of largest resolution. + """ + + def __init__(self, feature_strides, **kwargs): + super().__init__(input_transform='multiple_select', **kwargs) + assert len(feature_strides) == len(self.in_channels) + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + + self.scale_heads = nn.ModuleList() + for i in range(len(feature_strides)): + head_length = max( + 1, + int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) + scale_head = [] + for k in range(head_length): + scale_head.append( + ConvModule( + self.in_channels[i] if k == 0 else self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + if feature_strides[i] != feature_strides[0]: + scale_head.append( + Upsample( + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + def forward(self, inputs): + + x = self._transform_inputs(inputs) + + output = self.scale_heads[0](x[0]) + for i in range(1, len(self.feature_strides)): + # non inplace + output = output + resize( + self.scale_heads[i](x[i]), + size=output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/gc_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/gc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..14f0ef021c1143d493e17f347f1f4da1145470b8 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/gc_head.py @@ -0,0 +1,48 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ContextBlock + +from mmseg.registry import MODELS +from .fcn_head import FCNHead + + +@MODELS.register_module() +class GCHead(FCNHead): + """GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond. + + This head is the implementation of `GCNet + `_. + + Args: + ratio (float): Multiplier of channels ratio. Default: 1/4. + pooling_type (str): The pooling type of context aggregation. + Options are 'att', 'avg'. Default: 'avg'. + fusion_types (tuple[str]): The fusion type for feature fusion. + Options are 'channel_add', 'channel_mul'. 
Default: ('channel_add',) + """ + + def __init__(self, + ratio=1 / 4., + pooling_type='att', + fusion_types=('channel_add', ), + **kwargs): + super().__init__(num_convs=2, **kwargs) + self.ratio = ratio + self.pooling_type = pooling_type + self.fusion_types = fusion_types + self.gc_block = ContextBlock( + in_channels=self.channels, + ratio=self.ratio, + pooling_type=self.pooling_type, + fusion_types=self.fusion_types) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs[0](x) + output = self.gc_block(output) + output = self.convs[1](output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/ham_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/ham_head.py new file mode 100644 index 0000000000000000000000000000000000000000..073d8011b05dc8c5e8d48cc8b77484a27f7b2100 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/ham_head.py @@ -0,0 +1,255 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Originally from https://github.com/visual-attention-network/segnext +# Licensed under the Apache License, Version 2.0 (the "License") +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.device import get_device + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead + + +class Matrix_Decomposition_2D_Base(nn.Module): + """Base class of 2D Matrix Decomposition. + + Args: + MD_S (int): The number of spatial coefficient in + Matrix Decomposition, it may be used for calculation + of the number of latent dimension D in Matrix + Decomposition. Defaults: 1. + MD_R (int): The number of latent dimension R in + Matrix Decomposition. Defaults: 64. + train_steps (int): The number of iteration steps in + Multiplicative Update (MU) rule to solve Non-negative + Matrix Factorization (NMF) in training. Defaults: 6. + eval_steps (int): The number of iteration steps in + Multiplicative Update (MU) rule to solve Non-negative + Matrix Factorization (NMF) in evaluation. Defaults: 7. + inv_t (int): Inverted multiple number to make coefficient + smaller in softmax. Defaults: 100. + rand_init (bool): Whether to initialize randomly. + Defaults: True. 
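+
+    Example:
+        The base class is abstract; a runnable sketch uses the ``NMF2D``
+        subclass defined below (tensor sizes are assumptions):
+
+        >>> import torch
+        >>> ham = NMF2D(dict(MD_S=1, MD_R=64))
+        >>> x = torch.rand(2, 512, 32, 32)
+        >>> ham(x).shape  # low-rank reconstruction keeps the shape
+        torch.Size([2, 512, 32, 32])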
+ """ + + def __init__(self, + MD_S=1, + MD_R=64, + train_steps=6, + eval_steps=7, + inv_t=100, + rand_init=True): + super().__init__() + + self.S = MD_S + self.R = MD_R + + self.train_steps = train_steps + self.eval_steps = eval_steps + + self.inv_t = inv_t + + self.rand_init = rand_init + + def _build_bases(self, B, S, D, R, device=None): + raise NotImplementedError + + def local_step(self, x, bases, coef): + raise NotImplementedError + + def local_inference(self, x, bases): + # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R) + coef = torch.bmm(x.transpose(1, 2), bases) + coef = F.softmax(self.inv_t * coef, dim=-1) + + steps = self.train_steps if self.training else self.eval_steps + for _ in range(steps): + bases, coef = self.local_step(x, bases, coef) + + return bases, coef + + def compute_coef(self, x, bases, coef): + raise NotImplementedError + + def forward(self, x, return_bases=False): + """Forward Function.""" + B, C, H, W = x.shape + + # (B, C, H, W) -> (B * S, D, N) + D = C // self.S + N = H * W + x = x.view(B * self.S, D, N) + if not self.rand_init and not hasattr(self, 'bases'): + bases = self._build_bases(1, self.S, D, self.R, device=x.device) + self.register_buffer('bases', bases) + + # (S, D, R) -> (B * S, D, R) + if self.rand_init: + bases = self._build_bases(B, self.S, D, self.R, device=x.device) + else: + bases = self.bases.repeat(B, 1, 1) + + bases, coef = self.local_inference(x, bases) + + # (B * S, N, R) + coef = self.compute_coef(x, bases, coef) + + # (B * S, D, R) @ (B * S, N, R)^T -> (B * S, D, N) + x = torch.bmm(bases, coef.transpose(1, 2)) + + # (B * S, D, N) -> (B, C, H, W) + x = x.view(B, C, H, W) + + return x + + +class NMF2D(Matrix_Decomposition_2D_Base): + """Non-negative Matrix Factorization (NMF) module. + + It is inherited from ``Matrix_Decomposition_2D_Base`` module. + """ + + def __init__(self, args=dict()): + super().__init__(**args) + + self.inv_t = 1 + + def _build_bases(self, B, S, D, R, device=None): + """Build bases in initialization.""" + if device is None: + device = get_device() + bases = torch.rand((B * S, D, R)).to(device) + bases = F.normalize(bases, dim=1) + + return bases + + def local_step(self, x, bases, coef): + """Local step in iteration to renew bases and coefficient.""" + # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R) + numerator = torch.bmm(x.transpose(1, 2), bases) + # (B * S, N, R) @ [(B * S, D, R)^T @ (B * S, D, R)] -> (B * S, N, R) + denominator = coef.bmm(bases.transpose(1, 2).bmm(bases)) + # Multiplicative Update + coef = coef * numerator / (denominator + 1e-6) + + # (B * S, D, N) @ (B * S, N, R) -> (B * S, D, R) + numerator = torch.bmm(x, coef) + # (B * S, D, R) @ [(B * S, N, R)^T @ (B * S, N, R)] -> (B * S, D, R) + denominator = bases.bmm(coef.transpose(1, 2).bmm(coef)) + # Multiplicative Update + bases = bases * numerator / (denominator + 1e-6) + + return bases, coef + + def compute_coef(self, x, bases, coef): + """Compute coefficient.""" + # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R) + numerator = torch.bmm(x.transpose(1, 2), bases) + # (B * S, N, R) @ (B * S, D, R)^T @ (B * S, D, R) -> (B * S, N, R) + denominator = coef.bmm(bases.transpose(1, 2).bmm(bases)) + # multiplication update + coef = coef * numerator / (denominator + 1e-6) + + return coef + + +class Hamburger(nn.Module): + """Hamburger Module. It consists of one slice of "ham" (matrix + decomposition) and two slices of "bread" (linear transformation). + + Args: + ham_channels (int): Input and output channels of feature. 
+        ham_kwargs (dict): Config of matrix decomposition module.
+        norm_cfg (dict | None): Config of norm layers.
+    """
+
+    def __init__(self,
+                 ham_channels=512,
+                 ham_kwargs=dict(),
+                 norm_cfg=None,
+                 **kwargs):
+        super().__init__()
+
+        self.ham_in = ConvModule(
+            ham_channels, ham_channels, 1, norm_cfg=None, act_cfg=None)
+
+        self.ham = NMF2D(ham_kwargs)
+
+        self.ham_out = ConvModule(
+            ham_channels, ham_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+
+    def forward(self, x):
+        enjoy = self.ham_in(x)
+        enjoy = F.relu(enjoy, inplace=True)
+        enjoy = self.ham(enjoy)
+        enjoy = self.ham_out(enjoy)
+        ham = F.relu(x + enjoy, inplace=True)
+
+        return ham
+
+
+@MODELS.register_module()
+class LightHamHead(BaseDecodeHead):
+    """SegNeXt decode head.
+
+    This decode head is the implementation of `SegNeXt: Rethinking
+    Convolutional Attention Design for Semantic
+    Segmentation <https://arxiv.org/abs/2209.08575>`_.
+    Inspiration from https://github.com/visual-attention-network/segnext.
+
+    Specifically, LightHamHead is inspired by HamNet from
+    `Is Attention Better Than Matrix Decomposition?
+    <https://arxiv.org/abs/2109.04553>`_.
+
+    Args:
+        ham_channels (int): Input channels for Hamburger.
+            Defaults: 512.
+        ham_kwargs (dict): kwargs for Ham. Defaults: dict().
+    """
+
+    def __init__(self, ham_channels=512, ham_kwargs=dict(), **kwargs):
+        super().__init__(input_transform='multiple_select', **kwargs)
+        self.ham_channels = ham_channels
+
+        self.squeeze = ConvModule(
+            sum(self.in_channels),
+            self.ham_channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+        self.hamburger = Hamburger(ham_channels, ham_kwargs, **kwargs)
+
+        self.align = ConvModule(
+            self.ham_channels,
+            self.channels,
+            1,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg)
+
+    def forward(self, inputs):
+        """Forward function."""
+        inputs = self._transform_inputs(inputs)
+
+        inputs = [
+            resize(
+                level,
+                size=inputs[0].shape[2:],
+                mode='bilinear',
+                align_corners=self.align_corners) for level in inputs
+        ]
+
+        inputs = torch.cat(inputs, dim=1)
+        # apply a conv block to squeeze feature map
+        x = self.squeeze(inputs)
+        # apply hamburger module
+        x = self.hamburger(x)
+
+        # apply a conv block to align feature map
+        output = self.align(x)
+        output = self.cls_seg(output)
+        return output
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/isa_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/isa_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..355f215f39007d0153c2fdb3b22a40e7f11a01e3
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/isa_head.py
@@ -0,0 +1,143 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+
+from mmseg.registry import MODELS
+from ..utils import SelfAttentionBlock as _SelfAttentionBlock
+from .decode_head import BaseDecodeHead
+
+
+class SelfAttentionBlock(_SelfAttentionBlock):
+    """Self-Attention Module.
+
+    Args:
+        in_channels (int): Input channels of key/query feature.
+        channels (int): Output channels of key/query transform.
+        conv_cfg (dict | None): Config of conv layers.
+        norm_cfg (dict | None): Config of norm layers.
+        act_cfg (dict | None): Config of activation layers.
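+
+    Example:
+        Shape sketch (sizes are assumed for illustration):
+
+        >>> import torch
+        >>> block = SelfAttentionBlock(
+        ...     in_channels=512, channels=256, conv_cfg=None,
+        ...     norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'))
+        >>> x = torch.rand(2, 512, 8, 8)
+        >>> block(x).shape  # channels are preserved
+        torch.Size([2, 512, 8, 8])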
+ """ + + def __init__(self, in_channels, channels, conv_cfg, norm_cfg, act_cfg): + super().__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=None, + key_downsample=None, + key_query_num_convs=2, + key_query_norm=True, + value_out_num_convs=1, + value_out_norm=False, + matmul_norm=True, + with_out=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.output_project = self.build_project( + in_channels, + in_channels, + num_convs=1, + use_conv_module=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x): + """Forward function.""" + context = super().forward(x, x) + return self.output_project(context) + + +@MODELS.register_module() +class ISAHead(BaseDecodeHead): + """Interlaced Sparse Self-Attention for Semantic Segmentation. + + This head is the implementation of `ISA + `_. + + Args: + isa_channels (int): The channels of ISA Module. + down_factor (tuple[int]): The local group size of ISA. + """ + + def __init__(self, isa_channels, down_factor=(8, 8), **kwargs): + super().__init__(**kwargs) + self.down_factor = down_factor + + self.in_conv = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.global_relation = SelfAttentionBlock( + self.channels, + isa_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.local_relation = SelfAttentionBlock( + self.channels, + isa_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.out_conv = ConvModule( + self.channels * 2, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x_ = self._transform_inputs(inputs) + x = self.in_conv(x_) + residual = x + + n, c, h, w = x.size() + loc_h, loc_w = self.down_factor # size of local group in H- and W-axes + glb_h, glb_w = math.ceil(h / loc_h), math.ceil(w / loc_w) + pad_h, pad_w = glb_h * loc_h - h, glb_w * loc_w - w + if pad_h > 0 or pad_w > 0: # pad if the size is not divisible + padding = (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2) + x = F.pad(x, padding) + + # global relation + x = x.view(n, c, glb_h, loc_h, glb_w, loc_w) + # do permutation to gather global group + x = x.permute(0, 3, 5, 1, 2, 4) # (n, loc_h, loc_w, c, glb_h, glb_w) + x = x.reshape(-1, c, glb_h, glb_w) + # apply attention within each global group + x = self.global_relation(x) # (n * loc_h * loc_w, c, glb_h, glb_w) + + # local relation + x = x.view(n, loc_h, loc_w, c, glb_h, glb_w) + # do permutation to gather local group + x = x.permute(0, 4, 5, 3, 1, 2) # (n, glb_h, glb_w, c, loc_h, loc_w) + x = x.reshape(-1, c, loc_h, loc_w) + # apply attention within each local group + x = self.local_relation(x) # (n * glb_h * glb_w, c, loc_h, loc_w) + + # permute each pixel back to its original position + x = x.view(n, glb_h, glb_w, c, loc_h, loc_w) + x = x.permute(0, 3, 1, 4, 2, 5) # (n, c, glb_h, loc_h, glb_w, loc_w) + x = x.reshape(n, c, glb_h * loc_h, glb_w * loc_w) + if pad_h > 0 or pad_w > 0: # remove padding + x = x[:, :, pad_h // 2:pad_h // 2 + h, pad_w // 2:pad_w // 2 + w] + + x = self.out_conv(torch.cat([x, residual], dim=1)) + out = self.cls_seg(x) + + return out diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/knet_head.py 
b/head_extractor/build/lib/mmseg/models/decode_heads/knet_head.py new file mode 100644 index 0000000000000000000000000000000000000000..82d3a2807685cdc896c881095f46fd50a450018e --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/knet_head.py @@ -0,0 +1,461 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer +from mmcv.cnn.bricks.transformer import (FFN, MultiheadAttention, + build_transformer_layer) +from mmengine.logging import print_log +from torch import Tensor + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.registry import MODELS +from mmseg.utils import SampleList + + +@MODELS.register_module() +class KernelUpdator(nn.Module): + """Dynamic Kernel Updator in Kernel Update Head. + + Args: + in_channels (int): The number of channels of input feature map. + Default: 256. + feat_channels (int): The number of middle-stage channels in + the kernel updator. Default: 64. + out_channels (int): The number of output channels. + gate_sigmoid (bool): Whether use sigmoid function in gate + mechanism. Default: True. + gate_norm_act (bool): Whether add normalization and activation + layer in gate mechanism. Default: False. + activate_out: Whether add activation after gate mechanism. + Default: False. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='LN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + """ + + def __init__( + self, + in_channels=256, + feat_channels=64, + out_channels=None, + gate_sigmoid=True, + gate_norm_act=False, + activate_out=False, + norm_cfg=dict(type='LN'), + act_cfg=dict(type='ReLU', inplace=True), + ): + super().__init__() + self.in_channels = in_channels + self.feat_channels = feat_channels + self.out_channels_raw = out_channels + self.gate_sigmoid = gate_sigmoid + self.gate_norm_act = gate_norm_act + self.activate_out = activate_out + self.act_cfg = act_cfg + self.norm_cfg = norm_cfg + self.out_channels = out_channels if out_channels else in_channels + + self.num_params_in = self.feat_channels + self.num_params_out = self.feat_channels + self.dynamic_layer = nn.Linear( + self.in_channels, self.num_params_in + self.num_params_out) + self.input_layer = nn.Linear(self.in_channels, + self.num_params_in + self.num_params_out, + 1) + self.input_gate = nn.Linear(self.in_channels, self.feat_channels, 1) + self.update_gate = nn.Linear(self.in_channels, self.feat_channels, 1) + if self.gate_norm_act: + self.gate_norm = build_norm_layer(norm_cfg, self.feat_channels)[1] + + self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] + self.norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1] + self.input_norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] + self.input_norm_out = build_norm_layer(norm_cfg, self.feat_channels)[1] + + self.activation = build_activation_layer(act_cfg) + + self.fc_layer = nn.Linear(self.feat_channels, self.out_channels, 1) + self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1] + + def forward(self, update_feature, input_feature): + """Forward function of KernelUpdator. + + Args: + update_feature (torch.Tensor): Feature map assembled from + each group. It would be reshaped with last dimension + shape: `self.in_channels`. + input_feature (torch.Tensor): Intermediate feature + with shape: (N, num_classes, conv_kernel_size**2, channels). 
+ Returns: + Tensor: The output tensor of shape (N*C1/C2, K*K, C2), where N is + the number of classes, C1 and C2 are the feature map channels of + KernelUpdateHead and KernelUpdator, respectively. + """ + + update_feature = update_feature.reshape(-1, self.in_channels) + num_proposals = update_feature.size(0) + # dynamic_layer works for + # phi_1 and psi_3 in Eq.(4) and (5) of K-Net paper + parameters = self.dynamic_layer(update_feature) + param_in = parameters[:, :self.num_params_in].view( + -1, self.feat_channels) + param_out = parameters[:, -self.num_params_out:].view( + -1, self.feat_channels) + + # input_layer works for + # phi_2 and psi_4 in Eq.(4) and (5) of K-Net paper + input_feats = self.input_layer( + input_feature.reshape(num_proposals, -1, self.feat_channels)) + input_in = input_feats[..., :self.num_params_in] + input_out = input_feats[..., -self.num_params_out:] + + # `gate_feats` is F^G in K-Net paper + gate_feats = input_in * param_in.unsqueeze(-2) + if self.gate_norm_act: + gate_feats = self.activation(self.gate_norm(gate_feats)) + + input_gate = self.input_norm_in(self.input_gate(gate_feats)) + update_gate = self.norm_in(self.update_gate(gate_feats)) + if self.gate_sigmoid: + input_gate = input_gate.sigmoid() + update_gate = update_gate.sigmoid() + param_out = self.norm_out(param_out) + input_out = self.input_norm_out(input_out) + + if self.activate_out: + param_out = self.activation(param_out) + input_out = self.activation(input_out) + + # Gate mechanism. Eq.(5) in original paper. + # param_out has shape (batch_size, feat_channels, out_channels) + features = update_gate * param_out.unsqueeze( + -2) + input_gate * input_out + + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features + + +@MODELS.register_module() +class KernelUpdateHead(nn.Module): + """Kernel Update Head in K-Net. + + Args: + num_classes (int): Number of classes. Default: 150. + num_ffn_fcs (int): The number of fully-connected layers in + FFNs. Default: 2. + num_heads (int): The number of parallel attention heads. + Default: 8. + num_mask_fcs (int): The number of fully connected layers for + mask prediction. Default: 3. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 2048. + in_channels (int): The number of channels of input feature map. + Default: 256. + out_channels (int): The number of output channels. + Default: 256. + dropout (float): The Probability of an element to be + zeroed in MultiheadAttention and FFN. Default 0.0. + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + ffn_act_cfg (dict): Config of activation layers in FFN. + Default: dict(type='ReLU'). + conv_kernel_size (int): The kernel size of convolution in + Kernel Update Head for dynamic kernel updation. + Default: 1. + feat_transform_cfg (dict | None): Config of feature transform. + Default: None. + kernel_init (bool): Whether initiate mask kernel in mask head. + Default: False. + with_ffn (bool): Whether add FFN in kernel update head. + Default: True. + feat_gather_stride (int): Stride of convolution in feature transform. + Default: 1. + mask_transform_stride (int): Stride of mask transform. + Default: 1. + kernel_updator_cfg (dict): Config of kernel updator. + Default: dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN')). 
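+
+    Example:
+        Shape relations for one update step. All sizes below are
+        assumptions for the sketch; ``feat_channels`` is set equal to
+        ``in_channels`` so the reshape inside ``KernelUpdator.forward``
+        matches its linear layers:
+
+        >>> import torch
+        >>> head = KernelUpdateHead(
+        ...     num_classes=150,
+        ...     in_channels=256,
+        ...     out_channels=256,
+        ...     conv_kernel_size=1,
+        ...     kernel_updator_cfg=dict(
+        ...         type='KernelUpdator',
+        ...         in_channels=256,
+        ...         feat_channels=256,
+        ...         out_channels=256,
+        ...         act_cfg=dict(type='ReLU', inplace=True),
+        ...         norm_cfg=dict(type='LN')))
+        >>> x = torch.rand(2, 256, 32, 32)           # decoder feature
+        >>> kernels = torch.rand(2, 150, 256, 1, 1)  # one kernel per class
+        >>> masks = torch.rand(2, 150, 32, 32)       # previous mask logits
+        >>> new_masks, new_kernels = head(x, kernels, masks)
+        >>> new_masks.shape, new_kernels.shape
+        (torch.Size([2, 150, 32, 32]), torch.Size([2, 150, 256, 1, 1]))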
+ """ + + def __init__(self, + num_classes=150, + num_ffn_fcs=2, + num_heads=8, + num_mask_fcs=3, + feedforward_channels=2048, + in_channels=256, + out_channels=256, + dropout=0.0, + act_cfg=dict(type='ReLU', inplace=True), + ffn_act_cfg=dict(type='ReLU', inplace=True), + conv_kernel_size=1, + feat_transform_cfg=None, + kernel_init=False, + with_ffn=True, + feat_gather_stride=1, + mask_transform_stride=1, + kernel_updator_cfg=dict( + type='DynamicConv', + in_channels=256, + feat_channels=64, + out_channels=256, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'))): + super().__init__() + self.num_classes = num_classes + self.in_channels = in_channels + self.out_channels = out_channels + self.fp16_enabled = False + self.dropout = dropout + self.num_heads = num_heads + self.kernel_init = kernel_init + self.with_ffn = with_ffn + self.conv_kernel_size = conv_kernel_size + self.feat_gather_stride = feat_gather_stride + self.mask_transform_stride = mask_transform_stride + + self.attention = MultiheadAttention(in_channels * conv_kernel_size**2, + num_heads, dropout) + self.attention_norm = build_norm_layer( + dict(type='LN'), in_channels * conv_kernel_size**2)[1] + self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg) + + if feat_transform_cfg is not None: + kernel_size = feat_transform_cfg.pop('kernel_size', 1) + transform_channels = in_channels + self.feat_transform = ConvModule( + transform_channels, + in_channels, + kernel_size, + stride=feat_gather_stride, + padding=int(feat_gather_stride // 2), + **feat_transform_cfg) + else: + self.feat_transform = None + + if self.with_ffn: + self.ffn = FFN( + in_channels, + feedforward_channels, + num_ffn_fcs, + act_cfg=ffn_act_cfg, + dropout=dropout) + self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1] + + self.mask_fcs = nn.ModuleList() + for _ in range(num_mask_fcs): + self.mask_fcs.append( + nn.Linear(in_channels, in_channels, bias=False)) + self.mask_fcs.append( + build_norm_layer(dict(type='LN'), in_channels)[1]) + self.mask_fcs.append(build_activation_layer(act_cfg)) + + self.fc_mask = nn.Linear(in_channels, out_channels) + + def init_weights(self): + """Use xavier initialization for all weight parameter and set + classification head bias as a specific value when use focal loss.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + else: + # adopt the default initialization for + # the weight and bias of the layer norm + pass + if self.kernel_init: + print_log( + 'mask kernel in mask head is normal initialized by std 0.01') + nn.init.normal_(self.fc_mask.weight, mean=0, std=0.01) + + def forward(self, x, proposal_feat, mask_preds, mask_shape=None): + """Forward function of Dynamic Instance Interactive Head. + + Args: + x (Tensor): Feature map from FPN with shape + (batch_size, feature_dimensions, H , W). + proposal_feat (Tensor): Intermediate feature get from + diihead in last stage, has shape + (batch_size, num_proposals, feature_dimensions) + mask_preds (Tensor): mask prediction from the former stage in shape + (batch_size, num_proposals, H, W). + + Returns: + Tuple: The first tensor is predicted mask with shape + (N, num_classes, H, W), the second tensor is dynamic kernel + with shape (N, num_classes, channels, K, K). 
+ """ + N, num_proposals = proposal_feat.shape[:2] + if self.feat_transform is not None: + x = self.feat_transform(x) + + C, H, W = x.shape[-3:] + + mask_h, mask_w = mask_preds.shape[-2:] + if mask_h != H or mask_w != W: + gather_mask = F.interpolate( + mask_preds, (H, W), align_corners=False, mode='bilinear') + else: + gather_mask = mask_preds + + sigmoid_masks = gather_mask.softmax(dim=1) + + # Group Feature Assembling. Eq.(3) in original paper. + # einsum is faster than bmm by 30% + x_feat = torch.einsum('bnhw,bchw->bnc', sigmoid_masks, x) + + # obj_feat in shape [B, N, C, K, K] -> [B, N, C, K*K] -> [B, N, K*K, C] + proposal_feat = proposal_feat.reshape(N, num_proposals, + self.in_channels, + -1).permute(0, 1, 3, 2) + obj_feat = self.kernel_update_conv(x_feat, proposal_feat) + + # [B, N, K*K, C] -> [B, N, K*K*C] -> [N, B, K*K*C] + obj_feat = obj_feat.reshape(N, num_proposals, -1).permute(1, 0, 2) + obj_feat = self.attention_norm(self.attention(obj_feat)) + # [N, B, K*K*C] -> [B, N, K*K*C] + obj_feat = obj_feat.permute(1, 0, 2) + + # obj_feat in shape [B, N, K*K*C] -> [B, N, K*K, C] + obj_feat = obj_feat.reshape(N, num_proposals, -1, self.in_channels) + + # FFN + if self.with_ffn: + obj_feat = self.ffn_norm(self.ffn(obj_feat)) + + mask_feat = obj_feat + + for reg_layer in self.mask_fcs: + mask_feat = reg_layer(mask_feat) + + # [B, N, K*K, C] -> [B, N, C, K*K] + mask_feat = self.fc_mask(mask_feat).permute(0, 1, 3, 2) + + if (self.mask_transform_stride == 2 and self.feat_gather_stride == 1): + mask_x = F.interpolate( + x, scale_factor=0.5, mode='bilinear', align_corners=False) + H, W = mask_x.shape[-2:] + else: + mask_x = x + # group conv is 5x faster than unfold and uses about 1/5 memory + # Group conv vs. unfold vs. concat batch, 2.9ms :13.5ms :3.8ms + # Group conv vs. unfold vs. concat batch, 278 : 1420 : 369 + # but in real training group conv is slower than concat batch + # so we keep using concat batch. + # fold_x = F.unfold( + # mask_x, + # self.conv_kernel_size, + # padding=int(self.conv_kernel_size // 2)) + # mask_feat = mask_feat.reshape(N, num_proposals, -1) + # new_mask_preds = torch.einsum('bnc,bcl->bnl', mask_feat, fold_x) + # [B, N, C, K*K] -> [B*N, C, K, K] + mask_feat = mask_feat.reshape(N, num_proposals, C, + self.conv_kernel_size, + self.conv_kernel_size) + # [B, C, H, W] -> [1, B*C, H, W] + new_mask_preds = [] + for i in range(N): + new_mask_preds.append( + F.conv2d( + mask_x[i:i + 1], + mask_feat[i], + padding=int(self.conv_kernel_size // 2))) + + new_mask_preds = torch.cat(new_mask_preds, dim=0) + new_mask_preds = new_mask_preds.reshape(N, num_proposals, H, W) + if self.mask_transform_stride == 2: + new_mask_preds = F.interpolate( + new_mask_preds, + scale_factor=2, + mode='bilinear', + align_corners=False) + + if mask_shape is not None and mask_shape[0] != H: + new_mask_preds = F.interpolate( + new_mask_preds, + mask_shape, + align_corners=False, + mode='bilinear') + + return new_mask_preds, obj_feat.permute(0, 1, 3, 2).reshape( + N, num_proposals, self.in_channels, self.conv_kernel_size, + self.conv_kernel_size) + + +@MODELS.register_module() +class IterativeDecodeHead(BaseDecodeHead): + """K-Net: Towards Unified Image Segmentation. + + This head is the implementation of + `K-Net: `_. + + Args: + num_stages (int): The number of stages (kernel update heads) + in IterativeDecodeHead. Default: 3. + kernel_generate_head:(dict): Config of kernel generate head which + generate mask predictions, dynamic kernels and class predictions + for next kernel update heads. 
+ kernel_update_head (dict): Config of kernel update head which refine + dynamic kernels and class predictions iteratively. + + """ + + def __init__(self, num_stages, kernel_generate_head, kernel_update_head, + **kwargs): + # ``IterativeDecodeHead`` would skip initialization of + # ``BaseDecodeHead`` which would be called when building + # ``self.kernel_generate_head``. + super(BaseDecodeHead, self).__init__(**kwargs) + assert num_stages == len(kernel_update_head) + self.num_stages = num_stages + self.kernel_generate_head = MODELS.build(kernel_generate_head) + self.kernel_update_head = nn.ModuleList() + self.align_corners = self.kernel_generate_head.align_corners + self.num_classes = self.kernel_generate_head.num_classes + self.input_transform = self.kernel_generate_head.input_transform + self.ignore_index = self.kernel_generate_head.ignore_index + self.out_channels = self.num_classes + + for head_cfg in kernel_update_head: + self.kernel_update_head.append(MODELS.build(head_cfg)) + + def forward(self, inputs): + """Forward function.""" + feats = self.kernel_generate_head._forward_feature(inputs) + sem_seg = self.kernel_generate_head.cls_seg(feats) + seg_kernels = self.kernel_generate_head.conv_seg.weight.clone() + seg_kernels = seg_kernels[None].expand( + feats.size(0), *seg_kernels.size()) + + stage_segs = [sem_seg] + for i in range(self.num_stages): + sem_seg, seg_kernels = self.kernel_update_head[i](feats, + seg_kernels, + sem_seg) + stage_segs.append(sem_seg) + if self.training: + return stage_segs + # only return the prediction of the last stage during testing + return stage_segs[-1] + + def loss_by_feat(self, seg_logits: List[Tensor], + batch_data_samples: SampleList, **kwargs) -> dict: + losses = dict() + for i, logit in enumerate(seg_logits): + loss = self.kernel_generate_head.loss_by_feat( + logit, batch_data_samples) + for k, v in loss.items(): + losses[f'{k}.s{i}'] = v + + return losses diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/lraspp_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/lraspp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ba2465f27522e6ff106fcdf94a46aab42881260a --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/lraspp_head.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.utils import is_tuple_of + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead + + +@MODELS.register_module() +class LRASPPHead(BaseDecodeHead): + """Lite R-ASPP (LRASPP) head is proposed in Searching for MobileNetV3. + + This head is the improved implementation of `Searching for MobileNetV3 + `_. + + Args: + branch_channels (tuple[int]): The number of output channels in every + each branch. Default: (32, 64). + """ + + def __init__(self, branch_channels=(32, 64), **kwargs): + super().__init__(**kwargs) + if self.input_transform != 'multiple_select': + raise ValueError('in Lite R-ASPP (LRASPP) head, input_transform ' + f'must be \'multiple_select\'. 
But received '
+                             f'\'{self.input_transform}\'')
+        assert is_tuple_of(branch_channels, int)
+        assert len(branch_channels) == len(self.in_channels) - 1
+        self.branch_channels = branch_channels
+
+        self.convs = nn.Sequential()
+        self.conv_ups = nn.Sequential()
+        for i in range(len(branch_channels)):
+            self.convs.add_module(
+                f'conv{i}',
+                nn.Conv2d(
+                    self.in_channels[i], branch_channels[i], 1, bias=False))
+            self.conv_ups.add_module(
+                f'conv_up{i}',
+                ConvModule(
+                    self.channels + branch_channels[i],
+                    self.channels,
+                    1,
+                    norm_cfg=self.norm_cfg,
+                    act_cfg=self.act_cfg,
+                    bias=False))
+
+        self.conv_up_input = nn.Conv2d(self.channels, self.channels, 1)
+
+        self.aspp_conv = ConvModule(
+            self.in_channels[-1],
+            self.channels,
+            1,
+            norm_cfg=self.norm_cfg,
+            act_cfg=self.act_cfg,
+            bias=False)
+        self.image_pool = nn.Sequential(
+            nn.AvgPool2d(kernel_size=49, stride=(16, 20)),
+            ConvModule(
+                self.in_channels[2],
+                self.channels,
+                1,
+                act_cfg=dict(type='Sigmoid'),
+                bias=False))
+
+    def forward(self, inputs):
+        """Forward function."""
+        inputs = self._transform_inputs(inputs)
+
+        x = inputs[-1]
+
+        x = self.aspp_conv(x) * resize(
+            self.image_pool(x),
+            size=x.size()[2:],
+            mode='bilinear',
+            align_corners=self.align_corners)
+        x = self.conv_up_input(x)
+
+        for i in range(len(self.branch_channels) - 1, -1, -1):
+            x = resize(
+                x,
+                size=inputs[i].size()[2:],
+                mode='bilinear',
+                align_corners=self.align_corners)
+            x = torch.cat([x, self.convs[i](inputs[i])], 1)
+            x = self.conv_ups[i](x)
+
+        return self.cls_seg(x)
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/mask2former_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/mask2former_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..36121da6a1fa570e2fb6ba4d8fa5773ee76ccefc
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/mask2former_head.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.model import BaseModule
+
+try:
+    from mmdet.models.dense_heads import \
+        Mask2FormerHead as MMDET_Mask2FormerHead
+except ModuleNotFoundError:
+    MMDET_Mask2FormerHead = BaseModule
+
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.structures.seg_data_sample import SegDataSample
+from mmseg.utils import ConfigType, SampleList
+
+
+@MODELS.register_module()
+class Mask2FormerHead(MMDET_Mask2FormerHead):
+    """Implements the Mask2Former head.
+
+    See `Mask2Former: Masked-attention Mask Transformer for Universal Image
+    Segmentation <https://arxiv.org/abs/2112.01527>`_ for details.
+
+    Args:
+        num_classes (int): Number of classes. Default: 150.
+        align_corners (bool): align_corners argument of F.interpolate.
+            Default: False.
+        ignore_index (int): The label index to be ignored. Default: 255.
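+
+    Example:
+        A trimmed config-style sketch (all values are assumptions; a real
+        config additionally needs ``num_queries``, ``pixel_decoder``,
+        ``transformer_decoder`` and loss settings from MMDetection):
+
+        >>> cfg = dict(
+        ...     type='Mask2FormerHead',
+        ...     in_channels=[256, 512, 1024, 2048],
+        ...     feat_channels=256,
+        ...     out_channels=256,
+        ...     num_classes=150,
+        ...     align_corners=False,
+        ...     ignore_index=255)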
+ """ + + def __init__(self, + num_classes, + align_corners=False, + ignore_index=255, + **kwargs): + super().__init__(**kwargs) + + self.num_classes = num_classes + self.align_corners = align_corners + self.out_channels = num_classes + self.ignore_index = ignore_index + + feat_channels = kwargs['feat_channels'] + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + + def _seg_data_to_instance_data(self, batch_data_samples: SampleList): + """Perform forward propagation to convert paradigm from MMSegmentation + to MMDetection to ensure ``MMDET_Mask2FormerHead`` could be called + normally. Specifically, ``batch_gt_instances`` would be added. + + Args: + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two lists. + + - batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + unique ground truth label id of images, with + shape (num_gt, ) and ``masks``, each is ground truth + masks of each instances of a image, shape (num_gt, h, w). + - batch_img_metas (list[dict]): List of image meta information. + """ + batch_img_metas = [] + batch_gt_instances = [] + + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + gt_sem_seg = data_sample.gt_sem_seg.data + classes = torch.unique( + gt_sem_seg, + sorted=False, + return_inverse=False, + return_counts=False) + + # remove ignored region + gt_labels = classes[classes != self.ignore_index] + + masks = [] + for class_id in gt_labels: + masks.append(gt_sem_seg == class_id) + + if len(masks) == 0: + gt_masks = torch.zeros( + (0, gt_sem_seg.shape[-2], + gt_sem_seg.shape[-1])).to(gt_sem_seg).long() + else: + gt_masks = torch.stack(masks).squeeze(1).long() + + instance_data = InstanceData(labels=gt_labels, masks=gt_masks) + batch_gt_instances.append(instance_data) + return batch_gt_instances, batch_img_metas + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Perform forward propagation and loss calculation of the decoder head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + train_cfg (ConfigType): Training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + # batch SegDataSample to InstanceDataSample + batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data( + batch_data_samples) + + # forward + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tuple[Tensor]: + """Test without augmentaton. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_img_metas (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + test_cfg (ConfigType): Test config. + + Returns: + Tensor: A tensor of segmentation mask. 
+ """ + batch_data_samples = [ + SegDataSample(metainfo=metainfo) for metainfo in batch_img_metas + ] + + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + if 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'] + else: + size = batch_img_metas[0]['img_shape'] + # upsample mask + mask_pred_results = F.interpolate( + mask_pred_results, size=size, mode='bilinear', align_corners=False) + cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1] + mask_pred = mask_pred_results.sigmoid() + seg_logits = torch.einsum('bqc, bqhw->bchw', cls_score, mask_pred) + return seg_logits diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/maskformer_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/maskformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6e61a7f63a33a508955a866e57c139ce8c40e0f6 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/maskformer_head.py @@ -0,0 +1,174 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule + +try: + from mmdet.models.dense_heads import MaskFormerHead as MMDET_MaskFormerHead +except ModuleNotFoundError: + MMDET_MaskFormerHead = BaseModule + +from mmengine.structures import InstanceData +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures.seg_data_sample import SegDataSample +from mmseg.utils import ConfigType, SampleList + + +@MODELS.register_module() +class MaskFormerHead(MMDET_MaskFormerHead): + """Implements the MaskFormer head. + + See `Per-Pixel Classification is Not All You Need for Semantic Segmentation + `_ for details. + + Args: + num_classes (int): Number of classes. Default: 150. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + ignore_index (int): The label index to be ignored. Default: 255. + """ + + def __init__(self, + num_classes: int = 150, + align_corners: bool = False, + ignore_index: int = 255, + **kwargs) -> None: + super().__init__(**kwargs) + + self.out_channels = kwargs['out_channels'] + self.align_corners = True + self.num_classes = num_classes + self.align_corners = align_corners + self.out_channels = num_classes + self.ignore_index = ignore_index + + feat_channels = kwargs['feat_channels'] + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + + def _seg_data_to_instance_data(self, batch_data_samples: SampleList): + """Perform forward propagation to convert paradigm from MMSegmentation + to MMDetection to ensure ``MMDET_MaskFormerHead`` could be called + normally. Specifically, ``batch_gt_instances`` would be added. + + Args: + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two lists. + + - batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + unique ground truth label id of images, with + shape (num_gt, ) and ``masks``, each is ground truth + masks of each instances of a image, shape (num_gt, h, w). + - batch_img_metas (list[dict]): List of image meta information. 
+ """ + batch_img_metas = [] + batch_gt_instances = [] + for data_sample in batch_data_samples: + # Add `batch_input_shape` in metainfo of data_sample, which would + # be used in MaskFormerHead of MMDetection. + metainfo = data_sample.metainfo + metainfo['batch_input_shape'] = metainfo['img_shape'] + data_sample.set_metainfo(metainfo) + batch_img_metas.append(data_sample.metainfo) + gt_sem_seg = data_sample.gt_sem_seg.data + classes = torch.unique( + gt_sem_seg, + sorted=False, + return_inverse=False, + return_counts=False) + + # remove ignored region + gt_labels = classes[classes != self.ignore_index] + + masks = [] + for class_id in gt_labels: + masks.append(gt_sem_seg == class_id) + + if len(masks) == 0: + gt_masks = torch.zeros((0, gt_sem_seg.shape[-2], + gt_sem_seg.shape[-1])).to(gt_sem_seg) + else: + gt_masks = torch.stack(masks).squeeze(1) + + instance_data = InstanceData( + labels=gt_labels, masks=gt_masks.long()) + batch_gt_instances.append(instance_data) + return batch_gt_instances, batch_img_metas + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Perform forward propagation and loss calculation of the decoder head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + train_cfg (ConfigType): Training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + # batch SegDataSample to InstanceDataSample + batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data( + batch_data_samples) + + # forward + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tuple[Tensor]: + """Test without augmentaton. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_img_metas (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + test_cfg (ConfigType): Test config. + + Returns: + Tensor: A tensor of segmentation mask. + """ + + batch_data_samples = [] + for metainfo in batch_img_metas: + metainfo['batch_input_shape'] = metainfo['img_shape'] + batch_data_samples.append(SegDataSample(metainfo=metainfo)) + # Forward function of MaskFormerHead from MMDetection needs + # 'batch_data_samples' as inputs, which is image shape actually. 
+        all_cls_scores, all_mask_preds = self(x, batch_data_samples)
+        mask_cls_results = all_cls_scores[-1]
+        mask_pred_results = all_mask_preds[-1]
+
+        # upsample masks
+        img_shape = batch_img_metas[0]['batch_input_shape']
+        mask_pred_results = F.interpolate(
+            mask_pred_results,
+            size=img_shape,
+            mode='bilinear',
+            align_corners=False)
+
+        # semantic inference
+        cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1]
+        mask_pred = mask_pred_results.sigmoid()
+        seg_logits = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred)
+        return seg_logits
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/nl_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/nl_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ffcc2a2f081127f109deb0ad5bd1be0d6f50493
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/nl_head.py
@@ -0,0 +1,50 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.cnn import NonLocal2d
+
+from mmseg.registry import MODELS
+from .fcn_head import FCNHead
+
+
+@MODELS.register_module()
+class NLHead(FCNHead):
+    """Non-local Neural Networks.
+
+    This head is the implementation of `NLNet
+    <https://arxiv.org/abs/1711.07971>`_.
+
+    Args:
+        reduction (int): Reduction factor of projection transform. Default: 2.
+        use_scale (bool): Whether to scale pairwise_weight by
+            sqrt(1/inter_channels). Default: True.
+        mode (str): The nonlocal mode. Options are 'embedded_gaussian',
+            'dot_product'. Default: 'embedded_gaussian'.
+    """
+
+    def __init__(self,
+                 reduction=2,
+                 use_scale=True,
+                 mode='embedded_gaussian',
+                 **kwargs):
+        super().__init__(num_convs=2, **kwargs)
+        self.reduction = reduction
+        self.use_scale = use_scale
+        self.mode = mode
+        self.nl_block = NonLocal2d(
+            in_channels=self.channels,
+            reduction=self.reduction,
+            use_scale=self.use_scale,
+            conv_cfg=self.conv_cfg,
+            norm_cfg=self.norm_cfg,
+            mode=self.mode)
+
+    def forward(self, inputs):
+        """Forward function."""
+        x = self._transform_inputs(inputs)
+        output = self.convs[0](x)
+        output = self.nl_block(output)
+        output = self.convs[1](output)
+        if self.concat_input:
+            output = self.conv_cat(torch.cat([x, output], dim=1))
+        output = self.cls_seg(output)
+        return output
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/ocr_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/ocr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..9afe37bebd6c16ff184dc482ae358eb7ae9a093a
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/ocr_head.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+
+from mmseg.registry import MODELS
+from ..utils import SelfAttentionBlock as _SelfAttentionBlock
+from ..utils import resize
+from .cascade_decode_head import BaseCascadeDecodeHead
+
+
+class SpatialGatherModule(nn.Module):
+    """Aggregate the context features according to the initial predicted
+    probability distribution.
+
+    Employ the soft-weighted method to aggregate the context.
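+
+    A shape-level sketch of the aggregation (sizes are assumed):
+
+        >>> import torch
+        >>> import torch.nn.functional as F
+        >>> feats = torch.randn(2, 256, 32 * 32)  # (B, C, H*W), flattened
+        >>> probs = torch.randn(2, 19, 32 * 32)   # (B, K, H*W), seg logits
+        >>> probs = F.softmax(probs, dim=2)       # soft weights over pixels
+        >>> context = torch.matmul(probs, feats.permute(0, 2, 1))
+        >>> context.shape                         # one context vector per class
+        torch.Size([2, 19, 256])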
+ """ + + def __init__(self, scale): + super().__init__() + self.scale = scale + + def forward(self, feats, probs): + """Forward function.""" + batch_size, num_classes, height, width = probs.size() + channels = feats.size(1) + probs = probs.view(batch_size, num_classes, -1) + feats = feats.view(batch_size, channels, -1) + # [batch_size, height*width, num_classes] + feats = feats.permute(0, 2, 1) + # [batch_size, channels, height*width] + probs = F.softmax(self.scale * probs, dim=2) + # [batch_size, channels, num_classes] + ocr_context = torch.matmul(probs, feats) + ocr_context = ocr_context.permute(0, 2, 1).contiguous().unsqueeze(3) + return ocr_context + + +class ObjectAttentionBlock(_SelfAttentionBlock): + """Make a OCR used SelfAttentionBlock.""" + + def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg, + act_cfg): + if scale > 1: + query_downsample = nn.MaxPool2d(kernel_size=scale) + else: + query_downsample = None + super().__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=query_downsample, + key_downsample=None, + key_query_num_convs=2, + key_query_norm=True, + value_out_num_convs=1, + value_out_norm=True, + matmul_norm=True, + with_out=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.bottleneck = ConvModule( + in_channels * 2, + in_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, query_feats, key_feats): + """Forward function.""" + context = super().forward(query_feats, key_feats) + output = self.bottleneck(torch.cat([context, query_feats], dim=1)) + if self.query_downsample is not None: + output = resize(query_feats) + + return output + + +@MODELS.register_module() +class OCRHead(BaseCascadeDecodeHead): + """Object-Contextual Representations for Semantic Segmentation. + + This head is the implementation of `OCRNet + `_. + + Args: + ocr_channels (int): The intermediate channels of OCR block. + scale (int): The scale of probability map in SpatialGatherModule in + Default: 1. + """ + + def __init__(self, ocr_channels, scale=1, **kwargs): + super().__init__(**kwargs) + self.ocr_channels = ocr_channels + self.scale = scale + self.object_context_block = ObjectAttentionBlock( + self.channels, + self.ocr_channels, + self.scale, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.spatial_gather_module = SpatialGatherModule(self.scale) + + self.bottleneck = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs, prev_output): + """Forward function.""" + x = self._transform_inputs(inputs) + feats = self.bottleneck(x) + context = self.spatial_gather_module(feats, prev_output) + object_context = self.object_context_block(feats, context) + output = self.cls_seg(object_context) + + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/pid_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/pid_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c092cb32d07c279c1d6a45d2e02baccb8e5ffa33 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/pid_head.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer +from mmengine.model import BaseModule +from torch import Tensor + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.models.losses import accuracy +from mmseg.models.utils import resize +from mmseg.registry import MODELS +from mmseg.utils import OptConfigType, SampleList + + +class BasePIDHead(BaseModule): + """Base class for PID head. + + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). + init_cfg (dict or list[dict], optional): Init config dict. + Default: None. + """ + + def __init__(self, + in_channels: int, + channels: int, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + self.conv = ConvModule( + in_channels, + channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + order=('norm', 'act', 'conv')) + _, self.norm = build_norm_layer(norm_cfg, num_features=channels) + self.act = build_activation_layer(act_cfg) + + def forward(self, x: Tensor, cls_seg: Optional[nn.Module]) -> Tensor: + """Forward function. + Args: + x (Tensor): Input tensor. + cls_seg (nn.Module, optional): The classification head. + + Returns: + Tensor: Output tensor. + """ + x = self.conv(x) + x = self.norm(x) + x = self.act(x) + if cls_seg is not None: + x = cls_seg(x) + return x + + +@MODELS.register_module() +class PIDHead(BaseDecodeHead): + """Decode head for PIDNet. + + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_classes (int): Number of classes. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). + """ + + def __init__(self, + in_channels: int, + channels: int, + num_classes: int, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + **kwargs): + super().__init__( + in_channels, + channels, + num_classes=num_classes, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs) + self.i_head = BasePIDHead(in_channels, channels, norm_cfg, act_cfg) + self.p_head = BasePIDHead(in_channels // 2, channels, norm_cfg, + act_cfg) + self.d_head = BasePIDHead( + in_channels // 2, + in_channels // 4, + norm_cfg, + ) + self.p_cls_seg = nn.Conv2d(channels, self.out_channels, kernel_size=1) + self.d_cls_seg = nn.Conv2d(in_channels // 4, 1, kernel_size=1) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward( + self, + inputs: Union[Tensor, + Tuple[Tensor]]) -> Union[Tensor, Tuple[Tensor]]: + """Forward function. + Args: + inputs (Tensor | tuple[Tensor]): Input tensor or tuple of + Tensor. When training, the input is a tuple of three tensors, + (p_feat, i_feat, d_feat), and the output is a tuple of three + tensors, (p_seg_logit, i_seg_logit, d_seg_logit). 
+ When inference, only the head of integral branch is used, and + input is a tensor of integral feature map, and the output is + the segmentation logit. + + Returns: + Tensor | tuple[Tensor]: Output tensor or tuple of tensors. + """ + if self.training: + x_p, x_i, x_d = inputs + x_p = self.p_head(x_p, self.p_cls_seg) + x_i = self.i_head(x_i, self.cls_seg) + x_d = self.d_head(x_d, self.d_cls_seg) + return x_p, x_i, x_d + else: + return self.i_head(inputs, self.cls_seg) + + def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tuple[Tensor]: + gt_semantic_segs = [ + data_sample.gt_sem_seg.data for data_sample in batch_data_samples + ] + gt_edge_segs = [ + data_sample.gt_edge_map.data for data_sample in batch_data_samples + ] + gt_sem_segs = torch.stack(gt_semantic_segs, dim=0) + gt_edge_segs = torch.stack(gt_edge_segs, dim=0) + return gt_sem_segs, gt_edge_segs + + def loss_by_feat(self, seg_logits: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + loss = dict() + p_logit, i_logit, d_logit = seg_logits + sem_label, bd_label = self._stack_batch_gt(batch_data_samples) + p_logit = resize( + input=p_logit, + size=sem_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + i_logit = resize( + input=i_logit, + size=sem_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + d_logit = resize( + input=d_logit, + size=bd_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + sem_label = sem_label.squeeze(1) + bd_label = bd_label.squeeze(1) + loss['loss_sem_p'] = self.loss_decode[0]( + p_logit, sem_label, ignore_index=self.ignore_index) + loss['loss_sem_i'] = self.loss_decode[1](i_logit, sem_label) + loss['loss_bd'] = self.loss_decode[2](d_logit, bd_label) + filler = torch.ones_like(sem_label) * self.ignore_index + sem_bd_label = torch.where( + torch.sigmoid(d_logit[:, 0, :, :]) > 0.8, sem_label, filler) + loss['loss_sem_bd'] = self.loss_decode[3](i_logit, sem_bd_label) + loss['acc_seg'] = accuracy( + i_logit, sem_label, ignore_index=self.ignore_index) + return loss diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/point_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/point_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e8e433d66249a4690cea3e33e95ec54d58ee3a07 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/point_head.py @@ -0,0 +1,367 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +try: + from mmcv.ops import point_sample +except ModuleNotFoundError: + point_sample = None + +from typing import List + +from mmseg.registry import MODELS +from mmseg.utils import SampleList +from ..losses import accuracy +from ..utils import resize +from .cascade_decode_head import BaseCascadeDecodeHead + + +def calculate_uncertainty(seg_logits): + """Estimate uncertainty based on seg logits. + + For each location of the prediction ``seg_logits`` we estimate + uncertainty as the difference between top first and top second + predicted logits. + + Args: + seg_logits (Tensor): Semantic segmentation logits, + shape (batch_size, num_classes, height, width). 
+
+    Returns:
+        scores (Tensor): Uncertainty scores, where the most uncertain
+            locations have the highest uncertainty score, shape
+            (batch_size, 1, height, width).
+    """
+    top2_scores = torch.topk(seg_logits, k=2, dim=1)[0]
+    return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1)
+
+
+@MODELS.register_module()
+class PointHead(BaseCascadeDecodeHead):
+    """A mask point head used in PointRend.
+
+    This head is the implementation of `PointRend: Image Segmentation as
+    Rendering <https://arxiv.org/abs/1912.08193>`_.
+    ``PointHead`` uses a shared multi-layer perceptron (equivalent to
+    nn.Conv1d) to predict the logit of input points. The fine-grained
+    feature and coarse feature will be concatenated together for prediction.
+
+    Args:
+        num_fcs (int): Number of fc layers in the head. Default: 3.
+        in_channels (int): Number of input channels. Default: 256.
+        fc_channels (int): Number of fc channels. Default: 256.
+        num_classes (int): Number of classes for logits. Default: 80.
+        class_agnostic (bool): Whether to use class-agnostic classification.
+            If so, the output channels of logits will be 1. Default: False.
+        coarse_pred_each_layer (bool): Whether to concatenate the coarse
+            feature with the output of each fc layer. Default: True.
+        conv_cfg (dict|None): Dictionary to construct and config conv layer.
+            Default: dict(type='Conv1d'))
+        norm_cfg (dict|None): Dictionary to construct and config norm layer.
+            Default: None.
+        loss_point (dict): Dictionary to construct and config loss layer of
+            point head. Default: dict(type='CrossEntropyLoss', use_mask=True,
+            loss_weight=1.0).
+    """
+
+    def __init__(self,
+                 num_fcs=3,
+                 coarse_pred_each_layer=True,
+                 conv_cfg=dict(type='Conv1d'),
+                 norm_cfg=None,
+                 act_cfg=dict(type='ReLU', inplace=False),
+                 **kwargs):
+        super().__init__(
+            input_transform='multiple_select',
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            init_cfg=dict(
+                type='Normal', std=0.01, override=dict(name='fc_seg')),
+            **kwargs)
+        if point_sample is None:
+            raise RuntimeError('Please install mmcv-full for '
+                               'point_sample ops')
+
+        self.num_fcs = num_fcs
+        self.coarse_pred_each_layer = coarse_pred_each_layer
+
+        fc_in_channels = sum(self.in_channels) + self.num_classes
+        fc_channels = self.channels
+        self.fcs = nn.ModuleList()
+        for k in range(num_fcs):
+            fc = ConvModule(
+                fc_in_channels,
+                fc_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+            self.fcs.append(fc)
+            fc_in_channels = fc_channels
+            fc_in_channels += self.num_classes if self.coarse_pred_each_layer \
+                else 0
+        self.fc_seg = nn.Conv1d(
+            fc_in_channels,
+            self.num_classes,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        if self.dropout_ratio > 0:
+            self.dropout = nn.Dropout(self.dropout_ratio)
+        delattr(self, 'conv_seg')
+
+    def cls_seg(self, feat):
+        """Classify each pixel with fc."""
+        if self.dropout is not None:
+            feat = self.dropout(feat)
+        output = self.fc_seg(feat)
+        return output
+
+    def forward(self, fine_grained_point_feats, coarse_point_feats):
+        x = torch.cat([fine_grained_point_feats, coarse_point_feats], dim=1)
+        for fc in self.fcs:
+            x = fc(x)
+            if self.coarse_pred_each_layer:
+                x = torch.cat((x, coarse_point_feats), dim=1)
+        return self.cls_seg(x)
+
+    def _get_fine_grained_point_feats(self, x, points):
+        """Sample from fine grained features.
+
+        Args:
+            x (list[Tensor]): Feature pyramid from neck or backbone.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+
+        Returns:
+            fine_grained_feats (Tensor): Sampled fine grained feature,
+                shape (batch_size, sum(channels of x), num_points).
+        """
+
+        fine_grained_feats_list = [
+            point_sample(_, points, align_corners=self.align_corners)
+            for _ in x
+        ]
+        if len(fine_grained_feats_list) > 1:
+            fine_grained_feats = torch.cat(fine_grained_feats_list, dim=1)
+        else:
+            fine_grained_feats = fine_grained_feats_list[0]
+
+        return fine_grained_feats
+
+    def _get_coarse_point_feats(self, prev_output, points):
+        """Sample from the coarse prediction of the previous decode head.
+
+        Args:
+            prev_output (Tensor): Prediction of previous decode head.
+            points (Tensor): Point coordinates, shape (batch_size,
+                num_points, 2).
+
+        Returns:
+            coarse_feats (Tensor): Sampled coarse feature, shape (batch_size,
+                num_classes, num_points).
+        """
+
+        coarse_feats = point_sample(
+            prev_output, points, align_corners=self.align_corners)
+
+        return coarse_feats
+
+    def loss(self, inputs, prev_output, batch_data_samples: SampleList,
+             train_cfg, **kwargs):
+        """Forward function for training.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            prev_output (Tensor): The output of previous decode head.
+            batch_data_samples (list[:obj:`SegDataSample`]): The seg
+                data samples. It usually includes information such
+                as `img_metas` or `gt_semantic_seg`.
+            train_cfg (dict): The training config.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components
+        """
+        x = self._transform_inputs(inputs)
+        with torch.no_grad():
+            points = self.get_points_train(
+                prev_output, calculate_uncertainty, cfg=train_cfg)
+        fine_grained_point_feats = self._get_fine_grained_point_feats(
+            x, points)
+        coarse_point_feats = self._get_coarse_point_feats(prev_output, points)
+        point_logits = self.forward(fine_grained_point_feats,
+                                    coarse_point_feats)
+
+        losses = self.loss_by_feat(point_logits, points, batch_data_samples)
+
+        return losses
+
+    def predict(self, inputs, prev_output, batch_img_metas: List[dict],
+                test_cfg, **kwargs):
+        """Forward function for testing.
+
+        Args:
+            inputs (list[Tensor]): List of multi-level img features.
+            prev_output (Tensor): The output of previous decode head.
+            batch_img_metas (list[dict]): List of image info dict where each
+                dict has: 'img_shape', 'scale_factor', 'flip', and may also
+                contain 'filename', 'ori_shape', 'pad_shape', and
+                'img_norm_cfg'. For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:Collect`.
+            test_cfg (dict): The testing config.
+
+        Returns:
+            Tensor: Output segmentation map.
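+
+        The in-place refinement is easiest to see on toy tensors (sizes and
+        the number of refined points are made up):
+
+            >>> import torch
+            >>> B, C, H, W = 1, 19, 4, 4
+            >>> seg_logits = torch.randn(B, C, H, W)
+            >>> point_logits = torch.randn(B, C, 3)        # 3 refined points
+            >>> point_indices = torch.tensor([[1, 5, 9]])  # flat h*w indices
+            >>> point_indices = point_indices.unsqueeze(1).expand(-1, C, -1)
+            >>> flat = seg_logits.reshape(B, C, H * W)
+            >>> flat = flat.scatter_(2, point_indices, point_logits)
+            >>> flat.view(B, C, H, W).shape
+            torch.Size([1, 19, 4, 4])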
+ """ + + x = self._transform_inputs(inputs) + refined_seg_logits = prev_output.clone() + for _ in range(test_cfg.subdivision_steps): + refined_seg_logits = resize( + refined_seg_logits, + scale_factor=test_cfg.scale_factor, + mode='bilinear', + align_corners=self.align_corners) + batch_size, channels, height, width = refined_seg_logits.shape + point_indices, points = self.get_points_test( + refined_seg_logits, calculate_uncertainty, cfg=test_cfg) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) + coarse_point_feats = self._get_coarse_point_feats( + prev_output, points) + point_logits = self.forward(fine_grained_point_feats, + coarse_point_feats) + + point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1) + refined_seg_logits = refined_seg_logits.reshape( + batch_size, channels, height * width) + refined_seg_logits = refined_seg_logits.scatter_( + 2, point_indices, point_logits) + refined_seg_logits = refined_seg_logits.view( + batch_size, channels, height, width) + + return self.predict_by_feat(refined_seg_logits, batch_img_metas, + **kwargs) + + def loss_by_feat(self, point_logits, points, batch_data_samples, **kwargs): + """Compute segmentation loss.""" + gt_semantic_seg = self._stack_batch_gt(batch_data_samples) + point_label = point_sample( + gt_semantic_seg.float(), + points, + mode='nearest', + align_corners=self.align_corners) + point_label = point_label.squeeze(1).long() + + loss = dict() + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + for loss_module in losses_decode: + loss['point' + loss_module.loss_name] = loss_module( + point_logits, point_label, ignore_index=self.ignore_index) + + loss['acc_point'] = accuracy( + point_logits, point_label, ignore_index=self.ignore_index) + return loss + + def get_points_train(self, seg_logits, uncertainty_func, cfg): + """Sample points for training. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + 'uncertainty_func' function that takes point's logit prediction as + input. + + Args: + seg_logits (Tensor): Semantic segmentation logits, shape ( + batch_size, num_classes, height, width). + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Training config of point head. + + Returns: + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains the coordinates of ``num_points`` sampled + points. + """ + num_points = cfg.num_points + oversample_ratio = cfg.oversample_ratio + importance_sample_ratio = cfg.importance_sample_ratio + assert oversample_ratio >= 1 + assert 0 <= importance_sample_ratio <= 1 + batch_size = seg_logits.shape[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = torch.rand( + batch_size, num_sampled, 2, device=seg_logits.device) + point_logits = point_sample(seg_logits, point_coords) + # It is crucial to calculate uncertainty based on the sampled + # prediction value for the points. Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. 
+ point_uncertainties = uncertainty_func(point_logits) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=seg_logits.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_point_coords = torch.rand( + batch_size, num_random_points, 2, device=seg_logits.device) + point_coords = torch.cat((point_coords, rand_point_coords), dim=1) + return point_coords + + def get_points_test(self, seg_logits, uncertainty_func, cfg): + """Sample points for testing. + + Find ``num_points`` most uncertain points from ``uncertainty_map``. + + Args: + seg_logits (Tensor): A tensor of shape (batch_size, num_classes, + height, width) for class-specific or class-agnostic prediction. + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Testing config of point head. + + Returns: + point_indices (Tensor): A tensor of shape (batch_size, num_points) + that contains indices from [0, height x width) of the most + uncertain points. + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the ``height x width`` grid . + """ + + num_points = cfg.subdivision_num_points + uncertainty_map = uncertainty_func(seg_logits) + batch_size, _, height, width = uncertainty_map.shape + h_step = 1.0 / height + w_step = 1.0 / width + + uncertainty_map = uncertainty_map.view(batch_size, height * width) + num_points = min(height * width, num_points) + point_indices = uncertainty_map.topk(num_points, dim=1)[1] + point_coords = torch.zeros( + batch_size, + num_points, + 2, + dtype=torch.float, + device=seg_logits.device) + point_coords[:, :, 0] = w_step / 2.0 + (point_indices % + width).float() * w_step + point_coords[:, :, 1] = h_step / 2.0 + (point_indices // + width).float() * h_step + return point_indices, point_coords diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/psa_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/psa_head.py new file mode 100644 index 0000000000000000000000000000000000000000..13ee5c58a569bb46612625b85685cd61b7e9df3e --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/psa_head.py @@ -0,0 +1,197 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead + +try: + from mmcv.ops import PSAMask +except ModuleNotFoundError: + PSAMask = None + + +@MODELS.register_module() +class PSAHead(BaseDecodeHead): + """Point-wise Spatial Attention Network for Scene Parsing. + + This head is the implementation of `PSANet + `_. + + Args: + mask_size (tuple[int]): The PSA mask size. It usually equals input + size. + psa_type (str): The type of psa module. Options are 'collect', + 'distribute', 'bi-direction'. Default: 'bi-direction' + compact (bool): Whether use compact map for 'collect' mode. + Default: True. + shrink_factor (int): The downsample factors of psa mask. Default: 2. + normalization_factor (float): The normalize factor of attention. + psa_softmax (bool): Whether use softmax for attention. 
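+
+    A shape-level sketch of how the learned attention map is applied
+    (sizes are assumed):
+
+        >>> import torch
+        >>> n, c, h, w = 2, 64, 8, 8
+        >>> out = torch.randn(n, c, h, w)
+        >>> y = torch.softmax(torch.randn(n, h * w, h * w), dim=1)
+        >>> out = torch.bmm(out.view(n, c, h * w), y).view(n, c, h, w)
+        >>> out.shape
+        torch.Size([2, 64, 8, 8])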
+ """ + + def __init__(self, + mask_size, + psa_type='bi-direction', + compact=False, + shrink_factor=2, + normalization_factor=1.0, + psa_softmax=True, + **kwargs): + if PSAMask is None: + raise RuntimeError('Please install mmcv-full for PSAMask ops') + super().__init__(**kwargs) + assert psa_type in ['collect', 'distribute', 'bi-direction'] + self.psa_type = psa_type + self.compact = compact + self.shrink_factor = shrink_factor + self.mask_size = mask_size + mask_h, mask_w = mask_size + self.psa_softmax = psa_softmax + if normalization_factor is None: + normalization_factor = mask_h * mask_w + self.normalization_factor = normalization_factor + + self.reduce = ConvModule( + self.in_channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.attention = nn.Sequential( + ConvModule( + self.channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + self.channels, mask_h * mask_w, kernel_size=1, bias=False)) + if psa_type == 'bi-direction': + self.reduce_p = ConvModule( + self.in_channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.attention_p = nn.Sequential( + ConvModule( + self.channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + self.channels, mask_h * mask_w, kernel_size=1, bias=False)) + self.psamask_collect = PSAMask('collect', mask_size) + self.psamask_distribute = PSAMask('distribute', mask_size) + else: + self.psamask = PSAMask(psa_type, mask_size) + self.proj = ConvModule( + self.channels * (2 if psa_type == 'bi-direction' else 1), + self.in_channels, + kernel_size=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.bottleneck = ConvModule( + self.in_channels * 2, + self.channels, + kernel_size=3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + identity = x + align_corners = self.align_corners + if self.psa_type in ['collect', 'distribute']: + out = self.reduce(x) + n, c, h, w = out.size() + if self.shrink_factor != 1: + if h % self.shrink_factor and w % self.shrink_factor: + h = (h - 1) // self.shrink_factor + 1 + w = (w - 1) // self.shrink_factor + 1 + align_corners = True + else: + h = h // self.shrink_factor + w = w // self.shrink_factor + align_corners = False + out = resize( + out, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + y = self.attention(out) + if self.compact: + if self.psa_type == 'collect': + y = y.view(n, h * w, + h * w).transpose(1, 2).view(n, h * w, h, w) + else: + y = self.psamask(y) + if self.psa_softmax: + y = F.softmax(y, dim=1) + out = torch.bmm( + out.view(n, c, h * w), y.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + else: + x_col = self.reduce(x) + x_dis = self.reduce_p(x) + n, c, h, w = x_col.size() + if self.shrink_factor != 1: + if h % self.shrink_factor and w % self.shrink_factor: + h = (h - 1) // self.shrink_factor + 1 + w = (w - 1) // self.shrink_factor + 1 + align_corners = True + else: + h = h // self.shrink_factor + w = w // self.shrink_factor + align_corners = False + x_col = resize( + x_col, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + x_dis = resize( + x_dis, + size=(h, w), + 
mode='bilinear', + align_corners=align_corners) + y_col = self.attention(x_col) + y_dis = self.attention_p(x_dis) + if self.compact: + y_dis = y_dis.view(n, h * w, + h * w).transpose(1, 2).view(n, h * w, h, w) + else: + y_col = self.psamask_collect(y_col) + y_dis = self.psamask_distribute(y_dis) + if self.psa_softmax: + y_col = F.softmax(y_col, dim=1) + y_dis = F.softmax(y_dis, dim=1) + x_col = torch.bmm( + x_col.view(n, c, h * w), y_col.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + x_dis = torch.bmm( + x_dis.view(n, c, h * w), y_dis.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + out = torch.cat([x_col, x_dis], 1) + out = self.proj(out) + out = resize( + out, + size=identity.shape[2:], + mode='bilinear', + align_corners=align_corners) + out = self.bottleneck(torch.cat((identity, out), dim=1)) + out = self.cls_seg(out) + return out diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/psp_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/psp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a40ec41dec281e53815e9753ee2ba1a5da76bd05 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/psp_head.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead + + +class PPM(nn.ModuleList): + """Pooling Pyramid Module used in PSPNet. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + align_corners (bool): align_corners argument of F.interpolate. + """ + + def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, + act_cfg, align_corners, **kwargs): + super().__init__() + self.pool_scales = pool_scales + self.align_corners = align_corners + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + for pool_scale in pool_scales: + self.append( + nn.Sequential( + nn.AdaptiveAvgPool2d(pool_scale), + ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + **kwargs))) + + def forward(self, x): + """Forward function.""" + ppm_outs = [] + for ppm in self: + ppm_out = ppm(x) + upsampled_ppm_out = resize( + ppm_out, + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ppm_outs.append(upsampled_ppm_out) + return ppm_outs + + +@MODELS.register_module() +class PSPHead(BaseDecodeHead): + """Pyramid Scene Parsing Network. + + This head is the implementation of + `PSPNet `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. Default: (1, 2, 3, 6). 
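+
+        A sketch of what a single pyramid level does (the 1x1 ConvModule
+        between pooling and upsampling is omitted; sizes are assumed):
+
+            >>> import torch
+            >>> import torch.nn.functional as F
+            >>> x = torch.randn(2, 2048, 64, 64)
+            >>> pooled = F.adaptive_avg_pool2d(x, 6)  # pool_scale = 6
+            >>> up = F.interpolate(
+            ...     pooled, size=x.shape[2:], mode='bilinear',
+            ...     align_corners=False)
+            >>> up.shape
+            torch.Size([2, 2048, 64, 64])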
+ """ + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super().__init__(**kwargs) + assert isinstance(pool_scales, (list, tuple)) + self.pool_scales = pool_scales + self.psp_modules = PPM( + self.pool_scales, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.bottleneck = ConvModule( + self.in_channels + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _forward_feature(self, inputs): + """Forward function for feature maps before classifying each pixel with + ``self.cls_seg`` fc. + + Args: + inputs (list[Tensor]): List of multi-level img features. + + Returns: + feats (Tensor): A tensor of shape (batch_size, self.channels, + H, W) which is feature map for last layer of decoder head. + """ + x = self._transform_inputs(inputs) + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = torch.cat(psp_outs, dim=1) + feats = self.bottleneck(psp_outs) + return feats + + def forward(self, inputs): + """Forward function.""" + output = self._forward_feature(inputs) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/san_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/san_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d20da801924080efeee30a246331af2e2e5df352 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/san_head.py @@ -0,0 +1,736 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, build_norm_layer +from mmcv.cnn.bricks.transformer import BaseTransformerLayer +from mmcv.ops import point_sample +from mmengine.dist import all_reduce +from mmengine.model.weight_init import (caffe2_xavier_init, normal_init, + trunc_normal_) +from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn import functional as F + +from mmseg.models.backbones.vit import TransformerEncoderLayer +from mmseg.registry import MODELS +from mmseg.utils import (ConfigType, MatchMasks, SampleList, + seg_data_to_instance_data) +from ..utils import (MLP, LayerNorm2d, PatchEmbed, cross_attn_layer, + get_uncertain_point_coords_with_randomness, resize) +from .decode_head import BaseDecodeHead + + +class MLPMaskDecoder(nn.Module): + """Module for decoding query and visual features with MLP layers to + generate the attention biases and the mask proposals.""" + + def __init__( + self, + *, + in_channels: int, + total_heads: int = 1, + total_layers: int = 1, + embed_channels: int = 256, + mlp_channels: int = 256, + mlp_num_layers: int = 3, + rescale_attn_bias: bool = False, + ): + super().__init__() + self.total_heads = total_heads + self.total_layers = total_layers + + dense_affine_func = partial(nn.Conv2d, kernel_size=1) + # Query Branch + self.query_mlp = MLP(in_channels, mlp_channels, embed_channels, + mlp_num_layers) + # Pixel Branch + self.pix_mlp = MLP( + in_channels, + mlp_channels, + embed_channels, + mlp_num_layers, + affine_func=dense_affine_func, + ) + # Attention Bias Branch + self.attn_mlp = MLP( + in_channels, + mlp_channels, + embed_channels * self.total_heads * self.total_layers, + mlp_num_layers, + affine_func=dense_affine_func, + ) + if 
rescale_attn_bias:
+            self.bias_scaling = nn.Linear(1, 1)
+        else:
+            self.bias_scaling = nn.Identity()
+
+    def forward(self, query: torch.Tensor,
+                x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """Forward function.
+
+        Args:
+            query (Tensor): Query Tokens [B,N,C].
+            x (Tensor): Visual features [B,C,H,W].
+
+        Returns:
+            mask_preds (Tensor): Mask proposals.
+            attn_bias (List[Tensor]): List of attention bias.
+        """
+        query = self.query_mlp(query)
+        pix = self.pix_mlp(x)
+        b, c, h, w = pix.shape
+        # predict masks
+        mask_preds = torch.einsum('bqc,bchw->bqhw', query, pix)
+        # generate attn bias
+        attn = self.attn_mlp(x)
+        attn = attn.reshape(b, self.total_layers, self.total_heads, c, h, w)
+        attn_bias = torch.einsum('bqc,blnchw->blnqhw', query, attn)
+        attn_bias = self.bias_scaling(attn_bias[..., None]).squeeze(-1)
+        attn_bias = attn_bias.chunk(self.total_layers, dim=1)
+        attn_bias = [attn.squeeze(1) for attn in attn_bias]
+        return mask_preds, attn_bias
+
+
+class SideAdapterNetwork(nn.Module):
+    """Side Adapter Network for predicting mask proposals and attention bias.
+
+    Args:
+        in_channels (int): Number of input channels. Default: 3.
+        clip_channels (int): Number of channels of visual features.
+            Default: 768.
+        embed_dims (int): embedding dimension. Default: 240.
+        patch_size (int): The patch size. Default: 16.
+        patch_bias (bool): Whether use bias in patch embedding.
+            Default: True.
+        num_queries (int): Number of queries for mask proposals.
+            Default: 100.
+        fusion_index (List[int]): The layer number of the encode
+            transformer to fuse with the CLIP feature.
+            Default: [0, 1, 2, 3].
+        cfg_encoder (ConfigType): Configs for the encode layers.
+        cfg_decoder (ConfigType): Configs for the decode layers.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
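+
+    The mask decoding in ``MLPMaskDecoder`` above reduces to einsums between
+    query tokens and pixel features; a shape sketch (all sizes assumed):
+
+        >>> import torch
+        >>> query = torch.randn(2, 100, 256)   # (B, N, C) query tokens
+        >>> pix = torch.randn(2, 256, 40, 40)  # (B, C, H, W) pixel features
+        >>> torch.einsum('bqc,bchw->bqhw', query, pix).shape  # mask proposals
+        torch.Size([2, 100, 40, 40])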
+ """ + + def __init__( + self, + in_channels: int = 3, + clip_channels: int = 768, + embed_dims: int = 240, + patch_size: int = 16, + patch_bias: bool = True, + num_queries: int = 100, + fusion_index: list = [0, 1, 2, 3], + cfg_encoder: ConfigType = ..., + cfg_decoder: ConfigType = ..., + norm_cfg: dict = dict(type='LN'), + ): + super().__init__() + + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=patch_size, + padding=0, + input_size=(640, 640), + bias=patch_bias, + norm_cfg=None, + init_cfg=None, + ) + ori_h, ori_w = self.patch_embed.init_out_size + num_patches = ori_h * ori_w + self.pos_embed = nn.Parameter( + torch.randn(1, num_patches, embed_dims) * .02) + self.query_pos_embed = nn.Parameter( + torch.zeros(1, num_queries, embed_dims)) + self.query_embed = nn.Parameter( + torch.zeros(1, num_queries, embed_dims)) + encode_layers = [] + for i in range(cfg_encoder.num_encode_layer): + encode_layers.append( + TransformerEncoderLayer( + embed_dims=embed_dims, + num_heads=cfg_encoder.num_heads, + feedforward_channels=cfg_encoder.mlp_ratio * embed_dims, + norm_cfg=norm_cfg)) + self.encode_layers = nn.ModuleList(encode_layers) + conv_clips = [] + for i in range(len(fusion_index)): + conv_clips.append( + nn.Sequential( + LayerNorm2d(clip_channels), + ConvModule( + clip_channels, + embed_dims, + kernel_size=1, + norm_cfg=None, + act_cfg=None))) + self.conv_clips = nn.ModuleList(conv_clips) + self.fusion_index = fusion_index + self.mask_decoder = MLPMaskDecoder( + in_channels=embed_dims, + total_heads=cfg_decoder.num_heads, + total_layers=cfg_decoder.num_layers, + embed_channels=cfg_decoder.embed_channels, + mlp_channels=cfg_decoder.mlp_channels, + mlp_num_layers=cfg_decoder.num_mlp, + rescale_attn_bias=cfg_decoder.rescale) + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.query_embed, std=0.02) + nn.init.normal_(self.query_pos_embed, std=0.02) + for i in range(len(self.conv_clips)): + caffe2_xavier_init(self.conv_clips[i][1].conv) + + def fuse_clip(self, fused_index: int, x: torch.Tensor, + clip_feature: torch.Tensor, hwshape: Tuple[int, + int], L: int): + """Fuse CLIP feature and visual tokens.""" + fused_clip = (resize( + self.conv_clips[fused_index](clip_feature.contiguous()), + size=hwshape, + mode='bilinear', + align_corners=False)).permute(0, 2, 3, 1).reshape(x[:, -L:, + ...].shape) + x = torch.cat([x[:, :-L, ...], x[:, -L:, ...] 
+                           fused_clip], dim=1)
+        return x
+
+    def encode_feature(self, image: torch.Tensor,
+                       clip_features: List[torch.Tensor],
+                       deep_supervision_idxs: List[int]) -> List[List]:
+        """Encode images by a lightweight vision transformer."""
+        assert len(self.fusion_index) == len(clip_features)
+        x, hwshape = self.patch_embed(image)
+        ori_h, ori_w = self.patch_embed.init_out_size
+        pos_embed = self.pos_embed
+        if self.pos_embed.shape[1] != x.shape[1]:
+            # resize the position embedding
+            pos_embed = (
+                resize(
+                    self.pos_embed.reshape(1, ori_h, ori_w,
+                                           -1).permute(0, 3, 1, 2),
+                    size=hwshape,
+                    mode='bicubic',
+                    align_corners=False,
+                ).flatten(2).permute(0, 2, 1))
+        pos_embed = torch.cat([
+            self.query_pos_embed.expand(pos_embed.shape[0], -1, -1), pos_embed
+        ],
+                              dim=1)
+        x = torch.cat([self.query_embed.expand(x.shape[0], -1, -1), x], dim=1)
+        x = x + pos_embed
+        L = hwshape[0] * hwshape[1]
+        fused_index = 0
+        if self.fusion_index[fused_index] == 0:
+            x = self.fuse_clip(fused_index, x, clip_features[0][0], hwshape, L)
+            fused_index += 1
+        outs = []
+        for index, block in enumerate(self.encode_layers, start=1):
+            x = block(x)
+            if index < len(self.fusion_index
+                           ) and index == self.fusion_index[fused_index]:
+                x = self.fuse_clip(fused_index, x,
+                                   clip_features[fused_index][0], hwshape, L)
+                fused_index += 1
+            x_query = x[:, :-L, ...]
+            x_feat = x[:, -L:, ...].permute(0, 2, 1)\
+                .reshape(x.shape[0], x.shape[-1], hwshape[0], hwshape[1])
+
+            if index in deep_supervision_idxs or index == len(
+                    self.encode_layers):
+                outs.append({'query': x_query, 'x': x_feat})
+
+            if index < len(self.encode_layers):
+                x = x + pos_embed
+        return outs
+
+    def decode_feature(self, features):
+        mask_embeds = []
+        attn_biases = []
+        for feature in features:
+            mask_embed, attn_bias = self.mask_decoder(**feature)
+            mask_embeds.append(mask_embed)
+            attn_biases.append(attn_bias)
+        return mask_embeds, attn_biases
+
+    def forward(
+        self, image: torch.Tensor, clip_features: List[torch.Tensor],
+        deep_supervision_idxs: List[int]
+    ) -> Tuple[List[torch.Tensor], List[List[torch.Tensor]]]:
+        """Forward function."""
+        features = self.encode_feature(image, clip_features,
+                                       deep_supervision_idxs)
+        mask_embeds, attn_biases = self.decode_feature(features)
+        return mask_embeds, attn_biases
+
+
+class RecWithAttnbias(nn.Module):
+    """Mask recognition module that applies the attention biases to the
+    remaining deeper CLIP layers.
+
+    Args:
+        sos_token_format (str): The format of sos token. It should be
+            chosen from ["cls_token", "learnable_token", "pos_embedding"].
+            Default: 'cls_token'.
+        sos_token_num (int): Number of sos tokens. It should be equal to
+            the number of queries. Default: 100.
+        num_layers (int): Number of remaining CLIP layers for mask
+            recognition. Default: 3.
+        cross_attn (bool): Whether use cross attention to update sos token.
+            Default: False.
+        embed_dims (int): The feature dimension of CLIP layers.
+            Default: 768.
+        num_heads (int): Parallel attention heads of CLIP layers.
+            Default: 12.
+        mlp_ratio (int): Ratio of mlp hidden dim to embedding dim.
+            Default: 4.
+        qkv_bias (bool): Whether to use bias in multihead-attention.
+            Default: True.
+        out_dims (int): Number of channels of the output mask proposals.
+            It should be equal to the out_dims of text_encoder.
+            Default: 512.
+        final_norm (bool): Whether use norm layer for sos token.
+            Default: True.
+        act_cfg (dict): The activation config for FFNs.
+            Default: dict(type='GELU').
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='LN').
+        frozen_exclude (List): List of parameters that are not to be frozen.
+    """
+
+    def __init__(self,
+                 sos_token_format: str = 'cls_token',
+                 sos_token_num: int = 100,
+                 num_layers: int = 3,
+                 cross_attn: bool = False,
+                 embed_dims: int = 768,
+                 num_heads: int = 12,
+                 mlp_ratio: int = 4,
+                 num_fcs: int = 2,
+                 qkv_bias: bool = True,
+                 out_dims: int = 512,
+                 final_norm: bool = True,
+                 act_cfg: dict = dict(type='GELU'),
+                 norm_cfg: dict = dict(type='LN'),
+                 frozen_exclude: List = []):
+        super().__init__()
+
+        assert sos_token_format in [
+            'cls_token', 'learnable_token', 'pos_embedding'
+        ]
+        self.sos_token_format = sos_token_format
+        self.sos_token_num = sos_token_num
+        # copy so the (mutable) default argument is never mutated in place
+        self.frozen_exclude = list(frozen_exclude)
+        self.cross_attn = cross_attn
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        if sos_token_format in ['learnable_token', 'pos_embedding']:
+            # the sos tokens live in the CLIP feature space, so their last
+            # dimension is embed_dims (self.proj is only built further down,
+            # so it cannot be referenced here)
+            self.sos_token = nn.Parameter(
+                torch.randn(sos_token_num, 1, embed_dims))
+            self.frozen_exclude.append('sos_token')
+
+        layers = []
+        for i in range(num_layers):
+            layers.append(
+                BaseTransformerLayer(
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=embed_dims,
+                        num_heads=num_heads,
+                        batch_first=False,
+                        bias=qkv_bias),
+                    ffn_cfgs=dict(
+                        type='FFN',
+                        embed_dims=embed_dims,
+                        feedforward_channels=mlp_ratio * embed_dims,
+                        act_cfg=act_cfg),
+                    operation_order=('norm', 'self_attn', 'norm', 'ffn')))
+        self.layers = nn.ModuleList(layers)
+
+        self.ln_post = build_norm_layer(norm_cfg, embed_dims)[1]
+        self.proj = nn.Linear(embed_dims, out_dims, bias=False)
+
+        self.final_norm = final_norm
+        self._freeze()
+
+    def init_weights(self, rec_state_dict):
+        if hasattr(self, 'sos_token'):
+            normal_init(self.sos_token, std=0.02)
+        if rec_state_dict is not None:
+            load_state_dict(self, rec_state_dict, strict=False, logger=None)
+        else:
+            super().init_weights()
+
+    def _freeze(self):
+        if 'all' in self.frozen_exclude:
+            return
+        for name, param in self.named_parameters():
+            if not any([exclude in name for exclude in self.frozen_exclude]):
+                param.requires_grad = False
+
+    def _build_attn_biases(self, attn_biases, target_shape):
+        formatted_attn_biases = []
+        for attn_bias in attn_biases:
+            # convert it to proper format: N*num_head,L,L
+            # attn_bias: [N, num_head/1, num_sos, H, W]
+            n, num_head, num_sos, h, w = attn_bias.shape
+            # reshape and downsample
+            attn_bias = F.adaptive_max_pool2d(
+                attn_bias.reshape(n, num_head * num_sos, h, w),
+                output_size=target_shape)
+            attn_bias = attn_bias.reshape(n, num_head, num_sos, *target_shape)
+
+            true_num_head = self.num_heads
+            assert (num_head == 1 or num_head
+                    == true_num_head), f'num_head={num_head} is not supported.'
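+            # At this point `attn_bias` holds one (H', W') bias map per head
+            # and per sos query; a single head is broadcast to all heads
+            # below.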
+ if num_head == 1: + attn_bias = attn_bias.repeat(1, true_num_head, 1, 1, 1) + attn_bias = attn_bias.reshape(n * true_num_head, num_sos, -1) + L = attn_bias.shape[-1] + if self.cross_attn: + # [n*num_head, num_sos, L] + formatted_attn_biases.append(attn_bias) + else: + # [n*num_head, num_sos+1+L, num_sos+1+L] + new_attn_bias = attn_bias.new_zeros(num_sos + 1 + L, + num_sos + 1 + L) + new_attn_bias[:, :num_sos] = -100 + new_attn_bias[torch.arange(num_sos), torch.arange(num_sos)] = 0 + new_attn_bias[:num_sos, num_sos] = -100 + new_attn_bias = ( + new_attn_bias[None, ...].expand(n * true_num_head, -1, + -1).clone()) + new_attn_bias[..., :num_sos, -L:] = attn_bias + formatted_attn_biases.append(new_attn_bias) + + if len(formatted_attn_biases) == 1: + formatted_attn_biases = [ + formatted_attn_biases[0] for _ in range(self.num_layers) + ] + return formatted_attn_biases + + def forward(self, bias: List[Tensor], feature: List[Tensor]): + """Forward function to recognize the category of masks + Args: + bias (List[Tensor]): Attention bias for transformer layers + feature (List[Tensor]): Output of the image encoder, + including cls_token and img_feature. + """ + cls_token = feature[1].unsqueeze(0) + img_feature = feature[0] + b, c, h, w = img_feature.shape + # construct clip shadow features + x = torch.cat( + [cls_token, + img_feature.reshape(b, c, -1).permute(2, 0, 1)]) + + # construct sos token + if self.sos_token_format == 'cls_token': + sos_token = cls_token.repeat(self.sos_token_num, 1, 1) + elif self.sos_token_format == 'learnable_token': + sos_token = self.sos_token.expand(-1, b, -1) + elif self.sos_token_format == 'pos_embedding': + sos_token = self.sos_token.expand(-1, b, -1) + cls_token + + # construct attn bias + attn_biases = self._build_attn_biases(bias, target_shape=(h, w)) + + if self.cross_attn: + for i, block in enumerate(self.layers): + if self.cross_attn: + sos_token = cross_attn_layer( + block, + sos_token, + x[1:, ], + attn_biases[i], + ) + if i < len(self.layers) - 1: + x = block(x) + else: + x = torch.cat([sos_token, x], dim=0) + for i, block in enumerate(self.layers): + x = block(x, attn_masks=[attn_biases[i]]) + sos_token = x[:self.sos_token_num] + + sos_token = sos_token.permute(1, 0, 2) # LND -> NLD + sos_token = self.ln_post(sos_token) + sos_token = self.proj(sos_token) + if self.final_norm: + sos_token = F.normalize(sos_token, dim=-1) + return sos_token + + +@MODELS.register_module() +class SideAdapterCLIPHead(BaseDecodeHead): + """Side Adapter Network (SAN) for open-vocabulary semantic segmentation + with pre-trained vision-language model. + + This decode head is the implementation of `Side Adapter Network + for Open-Vocabulary Semantic Segmentation` + . + Modified from https://github.com/MendelXu/SAN/blob/main/san/model/side_adapter/side_adapter.py # noqa:E501 + Copyright (c) 2023 MendelXu. + Licensed under the MIT License + + Args: + num_classes (int): the number of classes. 
+ san_cfg (ConfigType): Configs for SideAdapterNetwork module + maskgen_cfg (ConfigType): Configs for RecWithAttnbias module + """ + + def __init__(self, num_classes: int, san_cfg: ConfigType, + maskgen_cfg: ConfigType, deep_supervision_idxs: List[int], + train_cfg: ConfigType, **kwargs): + super().__init__( + in_channels=san_cfg.in_channels, + channels=san_cfg.embed_dims, + num_classes=num_classes, + **kwargs) + assert san_cfg.num_queries == maskgen_cfg.sos_token_num, \ + 'num_queries in san_cfg should be equal to sos_token_num ' \ + 'in maskgen_cfg' + del self.conv_seg + self.side_adapter_network = SideAdapterNetwork(**san_cfg) + self.rec_with_attnbias = RecWithAttnbias(**maskgen_cfg) + self.deep_supervision_idxs = deep_supervision_idxs + self.train_cfg = train_cfg + if train_cfg: + self.match_masks = MatchMasks( + num_points=train_cfg.num_points, + num_queries=san_cfg.num_queries, + num_classes=num_classes, + assigner=train_cfg.assigner) + + def init_weights(self): + + rec_state_dict = None + if isinstance(self.init_cfg, dict) and \ + self.init_cfg.get('type') == 'Pretrained_Part': + checkpoint = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + + rec_state_dict = checkpoint.copy() + para_prefix = 'decode_head.rec_with_attnbias' + prefix_len = len(para_prefix) + 1 + for k, v in checkpoint.items(): + rec_state_dict.pop(k) + if para_prefix in k: + rec_state_dict[k[prefix_len:]] = v + + self.side_adapter_network.init_weights() + self.rec_with_attnbias.init_weights(rec_state_dict) + + def forward(self, inputs: Tuple[Tensor], + deep_supervision_idxs) -> Tuple[List]: + """Forward function. + + Args: + inputs (Tuple[Tensor]): A triplet including images, + list of multi-level visual features from image encoder and + class embeddings from text_encoder. + + Returns: + mask_props (List[Tensor]): Mask proposals predicted by SAN. + mask_logits (List[Tensor]): Class logits of mask proposals. + """ + imgs, clip_feature, class_embeds = inputs + # predict mask proposals and attention bias + mask_props, attn_biases = self.side_adapter_network( + imgs, clip_feature, deep_supervision_idxs) + + # mask recognition with attention bias + mask_embeds = [ + self.rec_with_attnbias(att_bias, clip_feature[-1]) + for att_bias in attn_biases + ] + # Obtain class prediction of masks by comparing the similarity + # between the image token and the text embedding of class names. + mask_logits = [ + torch.einsum('bqc,nc->bqn', mask_embed, class_embeds) + for mask_embed in mask_embeds + ] + return mask_props, mask_logits + + def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tensor: + """Forward function for prediction. + + Args: + inputs (Tuple[Tensor]): Images, visual features from image encoder + and class embedding from text encoder. + batch_img_metas (dict): List Image info where each dict may also + contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Outputs segmentation logits map. + """ + mask_props, mask_logits = self.forward(inputs, []) + + return self.predict_by_feat([mask_props[-1], mask_logits[-1]], + batch_img_metas) + + def predict_by_feat(self, seg_logits: List[Tensor], + batch_img_metas: List[dict]) -> Tensor: + """1. Transform a batch of mask proposals to the input shape. + 2. 
Generate segmentation map with mask proposals and class logits. + """ + mask_pred = seg_logits[0] + cls_score = seg_logits[1] + if isinstance(batch_img_metas[0]['img_shape'], torch.Size): + # slide inference + size = batch_img_metas[0]['img_shape'] + elif 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'][:2] + else: + size = batch_img_metas[0]['img_shape'] + # upsample mask + mask_pred = F.interpolate( + mask_pred, size=size, mode='bilinear', align_corners=False) + + mask_cls = F.softmax(cls_score, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + seg_logits = torch.einsum('bqc,bqhw->bchw', mask_cls, mask_pred) + return seg_logits + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Perform forward propagation and loss calculation of the decoder head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + train_cfg (ConfigType): Training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + # batch SegDataSample to InstanceDataSample + batch_gt_instances = seg_data_to_instance_data(self.ignore_index, + batch_data_samples) + + # forward + all_mask_props, all_mask_logits = self.forward( + x, self.deep_supervision_idxs) + + # loss + losses = self.loss_by_feat(all_mask_logits, all_mask_props, + batch_gt_instances) + + return losses + + def loss_by_feat( + self, all_cls_scores: Tensor, all_mask_preds: Tensor, + batch_gt_instances: List[InstanceData]) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_cls_scores (Tensor): Classification scores for all decoder + layers with shape (num_decoder, batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. + all_mask_preds (Tensor): Mask scores for all decoder layers with + shape (num_decoder, batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
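+
+        Mask losses are computed on sampled points rather than on full maps;
+        a sketch of the sampling primitive (shapes and the number of points
+        are assumed):
+
+            >>> import torch
+            >>> from mmcv.ops import point_sample
+            >>> masks = torch.randn(5, 1, 64, 64)  # (num_gts, 1, h, w)
+            >>> coords = torch.rand(5, 112, 2)     # normalized xy in [0, 1]
+            >>> point_sample(masks, coords).shape
+            torch.Size([5, 1, 112])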
+ """ + num_dec_layers = len(all_cls_scores) + batch_gt_instances_list = [ + batch_gt_instances for _ in range(num_dec_layers) + ] + + losses = [] + for i in range(num_dec_layers): + cls_scores = all_cls_scores[i] + mask_preds = all_mask_preds[i] + # matching N mask predictions to K category labels + (labels, mask_targets, mask_weights, + avg_factor) = self.match_masks.get_targets( + cls_scores, mask_preds, batch_gt_instances_list[i]) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + num_total_masks = cls_scores.new_tensor([avg_factor], + dtype=torch.float) + all_reduce(num_total_masks, op='mean') + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] != 0: + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.unsqueeze(1), None, + self.train_cfg.num_points, + self.train_cfg.oversample_ratio, + self.train_cfg.importance_sample_ratio) + # shape (num_total_gts, h, w) + # -> (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.unsqueeze(1).float(), + points_coords).squeeze(1) + # shape (num_queries, h, w) -> (num_queries, num_points) + mask_point_preds = point_sample( + mask_preds.unsqueeze(1), points_coords).squeeze(1) + + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + loss = dict() + for loss_decode in losses_decode: + if 'loss_cls' in loss_decode.loss_name: + if loss_decode.loss_name == 'loss_cls_ce': + loss[loss_decode.loss_name] = loss_decode( + cls_scores, labels) + else: + assert False, "Only support 'CrossEntropyLoss' in" \ + ' classification loss' + + elif 'loss_mask' in loss_decode.loss_name: + if mask_targets.shape[0] == 0: + loss[loss_decode.loss_name] = mask_preds.sum() + elif loss_decode.loss_name == 'loss_mask_ce': + loss[loss_decode.loss_name] = loss_decode( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * + self.train_cfg.num_points) + elif loss_decode.loss_name == 'loss_mask_dice': + loss[loss_decode.loss_name] = loss_decode( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks) + else: + assert False, "Only support 'CrossEntropyLoss' and" \ + " 'DiceLoss' in mask loss" + else: + assert False, "Only support for 'loss_cls' and 'loss_mask'" + + losses.append(loss) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict.update(losses[-1]) + # loss from other decoder layers + for i, loss in enumerate(losses[:-1]): + for k, v in loss.items(): + loss_dict[f'd{self.deep_supervision_idxs[i]}.{k}'] = v + return loss_dict diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/segformer_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/segformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f9eb0b320b4e7b892e0540cea5ba5ea7054f8008 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/segformer_head.py @@ -0,0 +1,66 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.registry import MODELS +from ..utils import resize + + +@MODELS.register_module() +class SegformerHead(BaseDecodeHead): + """The all mlp Head of segformer. + + This head is the implementation of + `Segformer ` _. 
+ + Args: + interpolate_mode: The interpolate mode of MLP head upsample operation. + Default: 'bilinear'. + """ + + def __init__(self, interpolate_mode='bilinear', **kwargs): + super().__init__(input_transform='multiple_select', **kwargs) + + self.interpolate_mode = interpolate_mode + num_inputs = len(self.in_channels) + + assert num_inputs == len(self.in_index) + + self.convs = nn.ModuleList() + for i in range(num_inputs): + self.convs.append( + ConvModule( + in_channels=self.in_channels[i], + out_channels=self.channels, + kernel_size=1, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + self.fusion_conv = ConvModule( + in_channels=self.channels * num_inputs, + out_channels=self.channels, + kernel_size=1, + norm_cfg=self.norm_cfg) + + def forward(self, inputs): + # Receive 4 stage backbone feature map: 1/4, 1/8, 1/16, 1/32 + inputs = self._transform_inputs(inputs) + outs = [] + for idx in range(len(inputs)): + x = inputs[idx] + conv = self.convs[idx] + outs.append( + resize( + input=conv(x), + size=inputs[0].shape[2:], + mode=self.interpolate_mode, + align_corners=self.align_corners)) + + out = self.fusion_conv(torch.cat(outs, dim=1)) + + out = self.cls_seg(out) + + return out diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/segmenter_mask_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/segmenter_mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..85d27735ba8015772324177716b5e8d5f357295c --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/segmenter_mask_head.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_norm_layer +from mmengine.model import ModuleList +from mmengine.model.weight_init import (constant_init, trunc_normal_, + trunc_normal_init) + +from mmseg.models.backbones.vit import TransformerEncoderLayer +from mmseg.registry import MODELS +from .decode_head import BaseDecodeHead + + +@MODELS.register_module() +class SegmenterMaskTransformerHead(BaseDecodeHead): + """Segmenter: Transformer for Semantic Segmentation. + + This head is the implementation of + `Segmenter: `_. + + Args: + backbone_cfg:(dict): Config of backbone of + Context Path. + in_channels (int): The number of channels of input image. + num_layers (int): The depth of transformer. + num_heads (int): The number of attention heads. + embed_dims (int): The number of embedding dimension. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + drop_path_rate (float): stochastic depth rate. Default 0.1. + drop_rate (float): Probability of an element to be zeroed. + Default 0.0 + attn_drop_rate (float): The drop out rate for attention layer. + Default 0.0 + num_fcs (int): The number of fully-connected layers for FFNs. + Default: 2. + qkv_bias (bool): Enable bias for qkv if True. Default: True. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + init_std (float): The value of std in weight initialization. + Default: 0.02. 
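For intuition, the SegFormer head above boils down to: project every backbone stage to a common width with 1x1 convs, upsample everything to the 1/4-scale map, concatenate, fuse, classify. A self-contained toy version with plain torch modules (norm/activation layers omitted; the class name and sizes are mine):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinySegformerHead(nn.Module):
    """Stripped-down sketch of the all-MLP fusion above (no norm/act)."""

    def __init__(self, in_channels=(32, 64, 160, 256), channels=128,
                 num_classes=19):
        super().__init__()
        self.projs = nn.ModuleList(
            nn.Conv2d(c, channels, 1) for c in in_channels)
        self.fuse = nn.Conv2d(channels * len(in_channels), channels, 1)
        self.cls = nn.Conv2d(channels, num_classes, 1)

    def forward(self, feats):
        target = feats[0].shape[2:]  # 1/4-scale spatial size
        outs = [
            F.interpolate(proj(x), size=target, mode='bilinear',
                          align_corners=False)
            for proj, x in zip(self.projs, feats)
        ]
        return self.cls(self.fuse(torch.cat(outs, dim=1)))

# Dummy 1/4, 1/8, 1/16, 1/32 feature maps for a 256x256 input:
feats = [torch.randn(1, c, 256 // s, 256 // s)
         for c, s in zip((32, 64, 160, 256), (4, 8, 16, 32))]
print(TinySegformerHead()(feats).shape)  # torch.Size([1, 19, 64, 64])
```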
+ """ + + def __init__( + self, + in_channels, + num_layers, + num_heads, + embed_dims, + mlp_ratio=4, + drop_path_rate=0.1, + drop_rate=0.0, + attn_drop_rate=0.0, + num_fcs=2, + qkv_bias=True, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + init_std=0.02, + **kwargs, + ): + super().__init__(in_channels=in_channels, **kwargs) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, num_layers)] + self.layers = ModuleList() + for i in range(num_layers): + self.layers.append( + TransformerEncoderLayer( + embed_dims=embed_dims, + num_heads=num_heads, + feedforward_channels=mlp_ratio * embed_dims, + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate, + drop_path_rate=dpr[i], + num_fcs=num_fcs, + qkv_bias=qkv_bias, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + batch_first=True, + )) + + self.dec_proj = nn.Linear(in_channels, embed_dims) + + self.cls_emb = nn.Parameter( + torch.randn(1, self.num_classes, embed_dims)) + self.patch_proj = nn.Linear(embed_dims, embed_dims, bias=False) + self.classes_proj = nn.Linear(embed_dims, embed_dims, bias=False) + + self.decoder_norm = build_norm_layer( + norm_cfg, embed_dims, postfix=1)[1] + self.mask_norm = build_norm_layer( + norm_cfg, self.num_classes, postfix=2)[1] + + self.init_std = init_std + + delattr(self, 'conv_seg') + + def init_weights(self): + trunc_normal_(self.cls_emb, std=self.init_std) + trunc_normal_init(self.patch_proj, std=self.init_std) + trunc_normal_init(self.classes_proj, std=self.init_std) + for n, m in self.named_modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=self.init_std, bias=0) + elif isinstance(m, nn.LayerNorm): + constant_init(m, val=1.0, bias=0.0) + + def forward(self, inputs): + x = self._transform_inputs(inputs) + b, c, h, w = x.shape + x = x.permute(0, 2, 3, 1).contiguous().view(b, -1, c) + + x = self.dec_proj(x) + cls_emb = self.cls_emb.expand(x.size(0), -1, -1) + x = torch.cat((x, cls_emb), 1) + for layer in self.layers: + x = layer(x) + x = self.decoder_norm(x) + + patches = self.patch_proj(x[:, :-self.num_classes]) + cls_seg_feat = self.classes_proj(x[:, -self.num_classes:]) + + patches = F.normalize(patches, dim=2, p=2) + cls_seg_feat = F.normalize(cls_seg_feat, dim=2, p=2) + + masks = patches @ cls_seg_feat.transpose(1, 2) + masks = self.mask_norm(masks) + masks = masks.permute(0, 2, 1).contiguous().view(b, -1, h, w) + + return masks diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/sep_aspp_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/sep_aspp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9dba68c9ecc6909e47da4f2da6169d529910355d --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/sep_aspp_head.py @@ -0,0 +1,102 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule + +from mmseg.registry import MODELS +from ..utils import resize +from .aspp_head import ASPPHead, ASPPModule + + +class DepthwiseSeparableASPPModule(ASPPModule): + """Atrous Spatial Pyramid Pooling (ASPP) Module with depthwise separable + conv.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + for i, dilation in enumerate(self.dilations): + if dilation > 1: + self[i] = DepthwiseSeparableConvModule( + self.in_channels, + self.channels, + 3, + dilation=dilation, + padding=dilation, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + +@MODELS.register_module() +class DepthwiseSeparableASPPHead(ASPPHead): + """Encoder-Decoder with Atrous Separable Convolution for Semantic Image + Segmentation. + + This head is the implementation of `DeepLabV3+ + `_. + + Args: + c1_in_channels (int): The input channels of c1 decoder. If is 0, + the no decoder will be used. + c1_channels (int): The intermediate channels of c1 decoder. + """ + + def __init__(self, c1_in_channels, c1_channels, **kwargs): + super().__init__(**kwargs) + assert c1_in_channels >= 0 + self.aspp_modules = DepthwiseSeparableASPPModule( + dilations=self.dilations, + in_channels=self.in_channels, + channels=self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if c1_in_channels > 0: + self.c1_bottleneck = ConvModule( + c1_in_channels, + c1_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + self.c1_bottleneck = None + self.sep_bottleneck = nn.Sequential( + DepthwiseSeparableConvModule( + self.channels + c1_channels, + self.channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + DepthwiseSeparableConvModule( + self.channels, + self.channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + aspp_outs = [ + resize( + self.image_pool(x), + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ] + aspp_outs.extend(self.aspp_modules(x)) + aspp_outs = torch.cat(aspp_outs, dim=1) + output = self.bottleneck(aspp_outs) + if self.c1_bottleneck is not None: + c1_output = self.c1_bottleneck(inputs[0]) + output = resize( + input=output, + size=c1_output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + output = torch.cat([output, c1_output], dim=1) + output = self.sep_bottleneck(output) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/sep_fcn_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/sep_fcn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3b15983bceaeff48534bbceedfdf1c434a8d1d1f --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/sep_fcn_head.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import DepthwiseSeparableConvModule + +from mmseg.registry import MODELS +from .fcn_head import FCNHead + + +@MODELS.register_module() +class DepthwiseSeparableFCNHead(FCNHead): + """Depthwise-Separable Fully Convolutional Network for Semantic + Segmentation. + + This head is implemented according to `Fast-SCNN: Fast Semantic + Segmentation Network `_. + + Args: + in_channels(int): Number of output channels of FFM. + channels(int): Number of middle-stage channels in the decode head. 
+        concat_input(bool): Whether to concatenate original decode input into
+            the result of several consecutive convolution layers.
+            Default: True.
+        num_classes(int): Used to determine the dimension of
+            final prediction tensor.
+        in_index(int): Correspond with 'out_indices' in FastSCNN backbone.
+        norm_cfg (dict | None): Config of norm layers.
+        align_corners (bool): align_corners argument of F.interpolate.
+            Default: False.
+        loss_decode(dict): Config of loss type and some
+            relevant additional options.
+        dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
+            'default', it will be the same as `act_cfg`. Default: None.
+    """
+
+    def __init__(self, dw_act_cfg=None, **kwargs):
+        super().__init__(**kwargs)
+        self.convs[0] = DepthwiseSeparableConvModule(
+            self.in_channels,
+            self.channels,
+            kernel_size=self.kernel_size,
+            padding=self.kernel_size // 2,
+            norm_cfg=self.norm_cfg,
+            dw_act_cfg=dw_act_cfg)
+
+        for i in range(1, self.num_convs):
+            self.convs[i] = DepthwiseSeparableConvModule(
+                self.channels,
+                self.channels,
+                kernel_size=self.kernel_size,
+                padding=self.kernel_size // 2,
+                norm_cfg=self.norm_cfg,
+                dw_act_cfg=dw_act_cfg)
+
+        if self.concat_input:
+            self.conv_cat = DepthwiseSeparableConvModule(
+                self.in_channels + self.channels,
+                self.channels,
+                kernel_size=self.kernel_size,
+                padding=self.kernel_size // 2,
+                norm_cfg=self.norm_cfg,
+                dw_act_cfg=dw_act_cfg)
diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/setr_mla_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/setr_mla_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1975991a60cc720650b880060efe10753f213131
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/decode_heads/setr_mla_head.py
@@ -0,0 +1,62 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+
+from mmseg.registry import MODELS
+from ..utils import Upsample
+from .decode_head import BaseDecodeHead
+
+
+@MODELS.register_module()
+class SETRMLAHead(BaseDecodeHead):
+    """Multi level feature aggregation head of SETR.
+
+    MLA head of `SETR <https://arxiv.org/abs/2012.15840>`_.
+
+    Args:
+        mla_channels (int): Channels of conv-conv-4x of multi-level feature
+            aggregation. Default: 128.
+        up_scale (int): The scale factor of interpolate. Default: 4.
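`DepthwiseSeparableConvModule`, used by the two heads above, factors a kxk convolution into a per-channel (grouped) spatial convolution plus a 1x1 pointwise convolution, which is where the DeepLabV3+ decoder saves parameters and FLOPs. A plain-torch sketch of the factorization (helper names are mine):

```python
import torch.nn as nn

def separable_conv(cin, cout, k=3, dilation=1):
    """Depthwise (groups=cin) conv followed by a 1x1 pointwise conv."""
    return nn.Sequential(
        nn.Conv2d(cin, cin, k, padding=dilation * (k // 2),
                  dilation=dilation, groups=cin, bias=False),
        nn.Conv2d(cin, cout, 1, bias=False),
    )

def n_params(m):
    return sum(p.numel() for p in m.parameters())

dense = nn.Conv2d(512, 512, 3, padding=1, bias=False)
sep = separable_conv(512, 512)
print(n_params(dense))  # 2359296
print(n_params(sep))    # 266752 (= 512*9 depthwise + 512*512 pointwise)
```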
+ """ + + def __init__(self, mla_channels=128, up_scale=4, **kwargs): + super().__init__(input_transform='multiple_select', **kwargs) + self.mla_channels = mla_channels + + num_inputs = len(self.in_channels) + + # Refer to self.cls_seg settings of BaseDecodeHead + assert self.channels == num_inputs * mla_channels + + self.up_convs = nn.ModuleList() + for i in range(num_inputs): + self.up_convs.append( + nn.Sequential( + ConvModule( + in_channels=self.in_channels[i], + out_channels=mla_channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + in_channels=mla_channels, + out_channels=mla_channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + Upsample( + scale_factor=up_scale, + mode='bilinear', + align_corners=self.align_corners))) + + def forward(self, inputs): + inputs = self._transform_inputs(inputs) + outs = [] + for x, up_conv in zip(inputs, self.up_convs): + outs.append(up_conv(x)) + out = torch.cat(outs, dim=1) + out = self.cls_seg(out) + return out diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/setr_up_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/setr_up_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9c796d8161088c2d7effe17f5ba71e43ff62e50c --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/setr_up_head.py @@ -0,0 +1,81 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, build_norm_layer + +from mmseg.registry import MODELS +from ..utils import Upsample +from .decode_head import BaseDecodeHead + + +@MODELS.register_module() +class SETRUPHead(BaseDecodeHead): + """Naive upsampling head and Progressive upsampling head of SETR. + + Naive or PUP head of `SETR `_. + + Args: + norm_layer (dict): Config dict for input normalization. + Default: norm_layer=dict(type='LN', eps=1e-6, requires_grad=True). + num_convs (int): Number of decoder convolutions. Default: 1. + up_scale (int): The scale factor of interpolate. Default:4. + kernel_size (int): The kernel size of convolution when decoding + feature information from backbone. Default: 3. + init_cfg (dict | list[dict] | None): Initialization config dict. + Default: dict( + type='Constant', val=1.0, bias=0, layer='LayerNorm'). + """ + + def __init__(self, + norm_layer=dict(type='LN', eps=1e-6, requires_grad=True), + num_convs=1, + up_scale=4, + kernel_size=3, + init_cfg=[ + dict(type='Constant', val=1.0, bias=0, layer='LayerNorm'), + dict( + type='Normal', + std=0.01, + override=dict(name='conv_seg')) + ], + **kwargs): + + assert kernel_size in [1, 3], 'kernel_size must be 1 or 3.' 
+ + super().__init__(init_cfg=init_cfg, **kwargs) + + assert isinstance(self.in_channels, int) + + _, self.norm = build_norm_layer(norm_layer, self.in_channels) + + self.up_convs = nn.ModuleList() + in_channels = self.in_channels + out_channels = self.channels + for _ in range(num_convs): + self.up_convs.append( + nn.Sequential( + ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + padding=int(kernel_size - 1) // 2, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + Upsample( + scale_factor=up_scale, + mode='bilinear', + align_corners=self.align_corners))) + in_channels = out_channels + + def forward(self, x): + x = self._transform_inputs(x) + + n, c, h, w = x.shape + x = x.reshape(n, c, h * w).transpose(2, 1).contiguous() + x = self.norm(x) + x = x.transpose(1, 2).reshape(n, c, h, w).contiguous() + + for up_conv in self.up_convs: + x = up_conv(x) + out = self.cls_seg(x) + return out diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/stdc_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/stdc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..1c1c21e3083fcb5098d2458e44538c0cf5b8f0e4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/stdc_head.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F +from mmengine.structures import PixelData +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures import SegDataSample +from mmseg.utils import SampleList +from .fcn_head import FCNHead + + +@MODELS.register_module() +class STDCHead(FCNHead): + """This head is the implementation of `Rethinking BiSeNet For Real-time + Semantic Segmentation `_. + + Args: + boundary_threshold (float): The threshold of calculating boundary. + Default: 0.1. + """ + + def __init__(self, boundary_threshold=0.1, **kwargs): + super().__init__(**kwargs) + self.boundary_threshold = boundary_threshold + # Using register buffer to make laplacian kernel on the same + # device of `seg_label`. + self.register_buffer( + 'laplacian_kernel', + torch.tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1], + dtype=torch.float32, + requires_grad=False).reshape((1, 1, 3, 3))) + self.fusion_kernel = torch.nn.Parameter( + torch.tensor([[6. / 10], [3. / 10], [1. / 10]], + dtype=torch.float32).reshape(1, 3, 1, 1), + requires_grad=False) + + def loss_by_feat(self, seg_logits: Tensor, + batch_data_samples: SampleList) -> dict: + """Compute Detail Aggregation Loss.""" + # Note: The paper claims `fusion_kernel` is a trainable 1x1 conv + # parameters. However, it is a constant in original repo and other + # codebase because it would not be added into computation graph + # after threshold operation. 
+ seg_label = self._stack_batch_gt(batch_data_samples).to( + self.laplacian_kernel) + boundary_targets = F.conv2d( + seg_label, self.laplacian_kernel, padding=1) + boundary_targets = boundary_targets.clamp(min=0) + boundary_targets[boundary_targets > self.boundary_threshold] = 1 + boundary_targets[boundary_targets <= self.boundary_threshold] = 0 + + boundary_targets_x2 = F.conv2d( + seg_label, self.laplacian_kernel, stride=2, padding=1) + boundary_targets_x2 = boundary_targets_x2.clamp(min=0) + + boundary_targets_x4 = F.conv2d( + seg_label, self.laplacian_kernel, stride=4, padding=1) + boundary_targets_x4 = boundary_targets_x4.clamp(min=0) + + boundary_targets_x4_up = F.interpolate( + boundary_targets_x4, boundary_targets.shape[2:], mode='nearest') + boundary_targets_x2_up = F.interpolate( + boundary_targets_x2, boundary_targets.shape[2:], mode='nearest') + + boundary_targets_x2_up[ + boundary_targets_x2_up > self.boundary_threshold] = 1 + boundary_targets_x2_up[ + boundary_targets_x2_up <= self.boundary_threshold] = 0 + + boundary_targets_x4_up[ + boundary_targets_x4_up > self.boundary_threshold] = 1 + boundary_targets_x4_up[ + boundary_targets_x4_up <= self.boundary_threshold] = 0 + + boundary_targets_pyramids = torch.stack( + (boundary_targets, boundary_targets_x2_up, boundary_targets_x4_up), + dim=1) + + boundary_targets_pyramids = boundary_targets_pyramids.squeeze(2) + boudary_targets_pyramid = F.conv2d(boundary_targets_pyramids, + self.fusion_kernel) + + boudary_targets_pyramid[ + boudary_targets_pyramid > self.boundary_threshold] = 1 + boudary_targets_pyramid[ + boudary_targets_pyramid <= self.boundary_threshold] = 0 + + seg_labels = boudary_targets_pyramid.long() + batch_sample_list = [] + for label in seg_labels: + seg_data_sample = SegDataSample() + seg_data_sample.gt_sem_seg = PixelData(data=label) + batch_sample_list.append(seg_data_sample) + + loss = super().loss_by_feat(seg_logits, batch_sample_list) + return loss diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/uper_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/uper_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b1ccc3173c0f1193e89ad48861aa7b5ee3b329cc --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/uper_head.py @@ -0,0 +1,139 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead +from .psp_head import PPM + + +@MODELS.register_module() +class UPerHead(BaseDecodeHead): + """Unified Perceptual Parsing for Scene Understanding. + + This head is the implementation of `UPerNet + `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module applied on the last feature. Default: (1, 2, 3, 6). 
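The detail-aggregation targets in `STDCHead.loss_by_feat` above come from convolving the label map with a Laplacian kernel: the response is zero inside constant regions and non-zero where labels change, so thresholding yields a binary boundary map. A toy example (note that with zero padding, a non-zero image border also fires, an artifact the code above inherits as well):

```python
import torch
import torch.nn.functional as F

laplacian = torch.tensor([-1, -1, -1, -1, 8, -1, -1, -1, -1],
                         dtype=torch.float32).reshape(1, 1, 3, 3)

# A toy 8x8 label map: left half class 0, right half class 3.
label = torch.zeros(1, 1, 8, 8)
label[..., 4:] = 3.

edge = F.conv2d(label, laplacian, padding=1).clamp(min=0)
boundary = (edge > 0.1).float()  # fires along the class transition
print(boundary[0, 0])
```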
+ """ + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super().__init__(input_transform='multiple_select', **kwargs) + # PSP Module + self.psp_modules = PPM( + pool_scales, + self.in_channels[-1], + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.bottleneck = ConvModule( + self.in_channels[-1] + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + # FPN Module + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + for in_channels in self.in_channels[:-1]: # skip the top layer + l_conv = ConvModule( + in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + inplace=False) + fpn_conv = ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + inplace=False) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + self.fpn_bottleneck = ConvModule( + len(self.in_channels) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def psp_forward(self, inputs): + """Forward function of PSP module.""" + x = inputs[-1] + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = torch.cat(psp_outs, dim=1) + output = self.bottleneck(psp_outs) + + return output + + def _forward_feature(self, inputs): + """Forward function for feature maps before classifying each pixel with + ``self.cls_seg`` fc. + + Args: + inputs (list[Tensor]): List of multi-level img features. + + Returns: + feats (Tensor): A tensor of shape (batch_size, self.channels, + H, W) which is feature map for last layer of decoder head. + """ + inputs = self._transform_inputs(inputs) + + # build laterals + laterals = [ + lateral_conv(inputs[i]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + laterals.append(self.psp_forward(inputs)) + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], + size=prev_shape, + mode='bilinear', + align_corners=self.align_corners) + + # build outputs + fpn_outs = [ + self.fpn_convs[i](laterals[i]) + for i in range(used_backbone_levels - 1) + ] + # append psp feature + fpn_outs.append(laterals[-1]) + + for i in range(used_backbone_levels - 1, 0, -1): + fpn_outs[i] = resize( + fpn_outs[i], + size=fpn_outs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) + fpn_outs = torch.cat(fpn_outs, dim=1) + feats = self.fpn_bottleneck(fpn_outs) + return feats + + def forward(self, inputs): + """Forward function.""" + output = self._forward_feature(inputs) + output = self.cls_seg(output) + return output diff --git a/head_extractor/build/lib/mmseg/models/decode_heads/vpd_depth_head.py b/head_extractor/build/lib/mmseg/models/decode_heads/vpd_depth_head.py new file mode 100644 index 0000000000000000000000000000000000000000..65bdfbd8d9065c158f4f6a147cd8c06ae1dfd961 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/decode_heads/vpd_depth_head.py @@ -0,0 +1,253 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Dict, List, Optional, Sequence, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer +from mmengine.model import BaseModule +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.utils import SampleList +from ..utils import resize +from .decode_head import BaseDecodeHead + + +class VPDDepthDecoder(BaseModule): + """VPD Depth Decoder class. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_deconv_layers (int): Number of deconvolution layers. + num_deconv_filters (List[int]): List of output channels for + deconvolution layers. + init_cfg (Optional[Union[Dict, List[Dict]]], optional): Configuration + for weight initialization. Defaults to Normal for Conv2d and + ConvTranspose2d layers. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_deconv_layers: int, + num_deconv_filters: List[int], + init_cfg: Optional[Union[Dict, List[Dict]]] = dict( + type='Normal', + std=0.001, + layer=['Conv2d', 'ConvTranspose2d'])): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + ) + + conv_layers = [] + conv_layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=num_deconv_filters[-1], + out_channels=out_channels, + kernel_size=3, + stride=1, + padding=1)) + conv_layers.append(build_norm_layer(dict(type='BN'), out_channels)[1]) + conv_layers.append(nn.ReLU(inplace=True)) + self.conv_layers = nn.Sequential(*conv_layers) + + self.up_sample = nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False) + + def forward(self, x): + """Forward pass through the decoder network.""" + out = self.deconv_layers(x) + out = self.conv_layers(out) + + out = self.up_sample(out) + out = self.up_sample(out) + + return out + + def _make_deconv_layer(self, num_layers, num_deconv_filters): + """Make deconv layers.""" + + layers = [] + in_channels = self.in_channels + for i in range(num_layers): + + num_channels = num_deconv_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=in_channels, + out_channels=num_channels, + kernel_size=2, + stride=2, + padding=0, + output_padding=0, + bias=False)) + layers.append(nn.BatchNorm2d(num_channels)) + layers.append(nn.ReLU(inplace=True)) + in_channels = num_channels + + return nn.Sequential(*layers) + + +@MODELS.register_module() +class VPDDepthHead(BaseDecodeHead): + """Depth Prediction Head for VPD. + + .. _`VPD`: https://arxiv.org/abs/2303.02153 + + Args: + max_depth (float): Maximum depth value. Defaults to 10.0. + in_channels (Sequence[int]): Number of input channels for each + convolutional layer. + embed_dim (int): Dimension of embedding. Defaults to 192. + feature_dim (int): Dimension of aggregated feature. Defaults to 1536. + num_deconv_layers (int): Number of deconvolution layers in the + decoder. Defaults to 3. + num_deconv_filters (Sequence[int]): Number of filters for each deconv + layer. Defaults to (32, 32, 32). + fmap_border (Union[int, Sequence[int]]): Feature map border for + cropping. Defaults to 0. + align_corners (bool): Flag for align_corners in interpolation. + Defaults to False. + loss_decode (dict): Configurations for the loss function. Defaults to + dict(type='SiLogLoss'). + init_cfg (dict): Initialization configurations. 
+            Defaults to
+            dict(type='TruncNormal', std=0.02, layer=['Conv2d', 'Linear']).
+    """
+
+    num_classes = 1
+    out_channels = 1
+    input_transform = None
+
+    def __init__(
+        self,
+        max_depth: float = 10.0,
+        in_channels: Sequence[int] = [320, 640, 1280, 1280],
+        embed_dim: int = 192,
+        feature_dim: int = 1536,
+        num_deconv_layers: int = 3,
+        num_deconv_filters: Sequence[int] = (32, 32, 32),
+        fmap_border: Union[int, Sequence[int]] = 0,
+        align_corners: bool = False,
+        loss_decode: dict = dict(type='SiLogLoss'),
+        init_cfg=dict(
+            type='TruncNormal', std=0.02, layer=['Conv2d', 'Linear']),
+    ):
+
+        super(BaseDecodeHead, self).__init__(init_cfg=init_cfg)
+
+        # initialize parameters
+        self.in_channels = in_channels
+        self.max_depth = max_depth
+        self.align_corners = align_corners
+
+        # feature map border
+        if isinstance(fmap_border, int):
+            fmap_border = (fmap_border, fmap_border)
+        self.fmap_border = fmap_border
+
+        # define network layers
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(in_channels[0], in_channels[0], 3, stride=2, padding=1),
+            nn.GroupNorm(16, in_channels[0]),
+            nn.ReLU(),
+            nn.Conv2d(in_channels[0], in_channels[0], 3, stride=2, padding=1),
+        )
+        self.conv2 = nn.Conv2d(
+            in_channels[1], in_channels[1], 3, stride=2, padding=1)
+
+        self.conv_aggregation = nn.Sequential(
+            nn.Conv2d(sum(in_channels), feature_dim, 1),
+            nn.GroupNorm(16, feature_dim),
+            nn.ReLU(),
+        )
+
+        self.decoder = VPDDepthDecoder(
+            in_channels=embed_dim * 8,
+            out_channels=embed_dim,
+            num_deconv_layers=num_deconv_layers,
+            num_deconv_filters=num_deconv_filters)
+
+        self.depth_pred_layer = nn.Sequential(
+            nn.Conv2d(
+                embed_dim, embed_dim, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(inplace=False),
+            nn.Conv2d(embed_dim, 1, kernel_size=3, stride=1, padding=1))
+
+        # build loss
+        if isinstance(loss_decode, dict):
+            self.loss_decode = MODELS.build(loss_decode)
+        elif isinstance(loss_decode, (list, tuple)):
+            self.loss_decode = nn.ModuleList()
+            for loss in loss_decode:
+                self.loss_decode.append(MODELS.build(loss))
+        else:
+            raise TypeError(f'loss_decode must be a dict or sequence of dict,\
+                but got {type(loss_decode)}')
+
+    def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tensor:
+        gt_depth_maps = [
+            data_sample.gt_depth_map.data for data_sample in batch_data_samples
+        ]
+        return torch.stack(gt_depth_maps, dim=0)
+
+    def forward(self, x):
+        x = [
+            x[0], x[1],
+            torch.cat([x[2], F.interpolate(x[3], scale_factor=2)], dim=1)
+        ]
+        x = torch.cat([self.conv1(x[0]), self.conv2(x[1]), x[2]], dim=1)
+        x = self.conv_aggregation(x)
+
+        x = x[:, :, :x.size(2) - self.fmap_border[0], :x.size(3) -
+              self.fmap_border[1]].contiguous()
+        x = self.decoder(x)
+        out = self.depth_pred_layer(x)
+
+        depth = torch.sigmoid(out) * self.max_depth
+
+        return depth
+
+    def loss_by_feat(self, pred_depth_map: Tensor,
+                     batch_data_samples: SampleList) -> dict:
+        """Compute depth estimation loss.
+
+        Args:
+            pred_depth_map (Tensor): The output from decode head forward
+                function.
+            batch_data_samples (List[:obj:`SegDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_depth_map`.
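`loss_decode` above defaults to `dict(type='SiLogLoss')`, built through the registry. For reference, here is a common formulation of the scale-invariant log error (Eigen et al., 2014), which losses of this name are generally based on; this sketch is illustrative and is not mmseg's exact implementation:

```python
import torch

def silog_error(pred, target, lam=0.5, eps=1e-6):
    """Scale-invariant log error; illustrative, not mmseg's SiLogLoss."""
    diff = torch.log(pred.clamp(min=eps)) - torch.log(target.clamp(min=eps))
    var = (diff ** 2).mean() - lam * diff.mean() ** 2
    return torch.sqrt(var.clamp(min=0))

# With lam=1.0 the measure is fully scale-invariant: scaling pred by a
# constant shifts diff by a constant, which the second term cancels.
pred = torch.rand(1, 1, 8, 8) * 10 + 0.1
print(silog_error(2 * pred, pred, lam=1.0))  # ~0
```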
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + gt_depth_map = self._stack_batch_gt(batch_data_samples) + loss = dict() + pred_depth_map = resize( + input=pred_depth_map, + size=gt_depth_map.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + for loss_decode in losses_decode: + if loss_decode.loss_name not in loss: + loss[loss_decode.loss_name] = loss_decode( + pred_depth_map, gt_depth_map) + else: + loss[loss_decode.loss_name] += loss_decode( + pred_depth_map, gt_depth_map) + + return loss diff --git a/head_extractor/build/lib/mmseg/models/losses/__init__.py b/head_extractor/build/lib/mmseg/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0467cb3ad89b8c0c57f7f8eb58cbc2e23f50cdb4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .accuracy import Accuracy, accuracy +from .boundary_loss import BoundaryLoss +from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, + cross_entropy, mask_cross_entropy) +from .dice_loss import DiceLoss +from .focal_loss import FocalLoss +from .huasdorff_distance_loss import HuasdorffDisstanceLoss +from .lovasz_loss import LovaszLoss +from .ohem_cross_entropy_loss import OhemCrossEntropy +from .silog_loss import SiLogLoss +from .tversky_loss import TverskyLoss +from .utils import reduce_loss, weight_reduce_loss, weighted_loss + +__all__ = [ + 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', + 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', + 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss', + 'FocalLoss', 'TverskyLoss', 'OhemCrossEntropy', 'BoundaryLoss', + 'HuasdorffDisstanceLoss', 'SiLogLoss' +] diff --git a/head_extractor/build/lib/mmseg/models/losses/accuracy.py b/head_extractor/build/lib/mmseg/models/losses/accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..1d9e2d7701088adadd5b6bb71c718c986b87a066 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/accuracy.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + + +def accuracy(pred, target, topk=1, thresh=None, ignore_index=None): + """Calculate accuracy according to the prediction and target. + + Args: + pred (torch.Tensor): The model prediction, shape (N, num_class, ...) + target (torch.Tensor): The target of each prediction, shape (N, , ...) + ignore_index (int | None): The label index to be ignored. Default: None + topk (int | tuple[int], optional): If the predictions in ``topk`` + matches the target, the predictions will be regarded as + correct ones. Defaults to 1. + thresh (float, optional): If not None, predictions with scores under + this threshold are considered incorrect. Default to None. + + Returns: + float | tuple[float]: If the input ``topk`` is a single integer, + the function will return a single float as accuracy. If + ``topk`` is a tuple containing multiple integers, the + function will return a tuple containing accuracies of + each ``topk`` number. + """ + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + return_single = True + else: + return_single = False + + maxk = max(topk) + if pred.size(0) == 0: + accu = [pred.new_tensor(0.) 
for i in range(len(topk))] + return accu[0] if return_single else accu + assert pred.ndim == target.ndim + 1 + assert pred.size(0) == target.size(0) + assert maxk <= pred.size(1), \ + f'maxk {maxk} exceeds pred dimension {pred.size(1)}' + pred_value, pred_label = pred.topk(maxk, dim=1) + # transpose to shape (maxk, N, ...) + pred_label = pred_label.transpose(0, 1) + correct = pred_label.eq(target.unsqueeze(0).expand_as(pred_label)) + if thresh is not None: + # Only prediction values larger than thresh are counted as correct + correct = correct & (pred_value > thresh).t() + if ignore_index is not None: + correct = correct[:, target != ignore_index] + res = [] + eps = torch.finfo(torch.float32).eps + for k in topk: + # Avoid causing ZeroDivisionError when all pixels + # of an image are ignored + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + eps + if ignore_index is not None: + total_num = target[target != ignore_index].numel() + eps + else: + total_num = target.numel() + eps + res.append(correct_k.mul_(100.0 / total_num)) + return res[0] if return_single else res + + +class Accuracy(nn.Module): + """Accuracy calculation module.""" + + def __init__(self, topk=(1, ), thresh=None, ignore_index=None): + """Module to calculate the accuracy. + + Args: + topk (tuple, optional): The criterion used to calculate the + accuracy. Defaults to (1,). + thresh (float, optional): If not None, predictions with scores + under this threshold are considered incorrect. Default to None. + """ + super().__init__() + self.topk = topk + self.thresh = thresh + self.ignore_index = ignore_index + + def forward(self, pred, target): + """Forward function to calculate accuracy. + + Args: + pred (torch.Tensor): Prediction of models. + target (torch.Tensor): Target for each prediction. + + Returns: + tuple[float]: The accuracies under different topk criterions. + """ + return accuracy(pred, target, self.topk, self.thresh, + self.ignore_index) diff --git a/head_extractor/build/lib/mmseg/models/losses/boundary_loss.py b/head_extractor/build/lib/mmseg/models/losses/boundary_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e86b850d87e1d26be8cbb700758dae8dead82c58 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/boundary_loss.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmseg.registry import MODELS + + +@MODELS.register_module() +class BoundaryLoss(nn.Module): + """Boundary loss. + + This function is modified from + `PIDNet `_. # noqa + Licensed under the MIT License. + + + Args: + loss_weight (float): Weight of the loss. Defaults to 1.0. + loss_name (str): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_boundary'. + """ + + def __init__(self, + loss_weight: float = 1.0, + loss_name: str = 'loss_boundary'): + super().__init__() + self.loss_weight = loss_weight + self.loss_name_ = loss_name + + def forward(self, bd_pre: Tensor, bd_gt: Tensor) -> Tensor: + """Forward function. + Args: + bd_pre (Tensor): Predictions of the boundary head. + bd_gt (Tensor): Ground truth of the boundary. + + Returns: + Tensor: Loss tensor. 
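Usage of the `accuracy` function above for dense predictions: logits are (N, num_class, H, W), labels are (N, H, W), and `ignore_index` drops pixels from both numerator and denominator. A quick smoke test (assumes `accuracy` from the module above is in scope):

```python
import torch

torch.manual_seed(0)
pred = torch.randn(4, 19, 32, 32)         # (N, num_class, H, W) logits
target = torch.randint(0, 19, (4, 32, 32))
target[0, :8] = 255                       # mark some pixels as ignored

top1, top5 = accuracy(pred, target, topk=(1, 5), ignore_index=255)
print(float(top1), float(top5))           # percentages in [0, 100]
```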
+ """ + log_p = bd_pre.permute(0, 2, 3, 1).contiguous().view(1, -1) + target_t = bd_gt.view(1, -1).float() + + pos_index = (target_t == 1) + neg_index = (target_t == 0) + + weight = torch.zeros_like(log_p) + pos_num = pos_index.sum() + neg_num = neg_index.sum() + sum_num = pos_num + neg_num + weight[pos_index] = neg_num * 1.0 / sum_num + weight[neg_index] = pos_num * 1.0 / sum_num + + loss = F.binary_cross_entropy_with_logits( + log_p, target_t, weight, reduction='mean') + + return self.loss_weight * loss + + @property + def loss_name(self): + return self.loss_name_ diff --git a/head_extractor/build/lib/mmseg/models/losses/cross_entropy_loss.py b/head_extractor/build/lib/mmseg/models/losses/cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..988fb789c11eca9d002b2c02f227450d704aeaef --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/cross_entropy_loss.py @@ -0,0 +1,311 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmseg.registry import MODELS +from .utils import get_class_weight, weight_reduce_loss + + +def cross_entropy(pred, + label, + weight=None, + class_weight=None, + reduction='mean', + avg_factor=None, + ignore_index=-100, + avg_non_ignore=False): + """cross_entropy. The wrapper function for :func:`F.cross_entropy` + + Args: + pred (torch.Tensor): The prediction with shape (N, 1). + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + Default: None. + class_weight (list[float], optional): The weight for each class. + Default: None. + reduction (str, optional): The method used to reduce the loss. + Options are 'none', 'mean' and 'sum'. Default: 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Default: None. + ignore_index (int): Specifies a target value that is ignored and + does not contribute to the input gradients. When + ``avg_non_ignore `` is ``True``, and the ``reduction`` is + ``''mean''``, the loss is averaged over non-ignored targets. + Defaults: -100. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + `New in version 0.23.0.` + """ + + # class_weight is a manual rescaling weight given to each class. 
+ # If given, has to be a Tensor of size C element-wise losses + loss = F.cross_entropy( + pred, + label, + weight=class_weight, + reduction='none', + ignore_index=ignore_index) + + # apply weights and do the reduction + # average loss over non-ignored elements + # pytorch's official cross_entropy average loss over non-ignored elements + # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660 # noqa + if (avg_factor is None) and reduction == 'mean': + if class_weight is None: + if avg_non_ignore: + avg_factor = label.numel() - (label + == ignore_index).sum().item() + else: + avg_factor = label.numel() + + else: + # the average factor should take the class weights into account + label_weights = torch.stack([class_weight[cls] for cls in label + ]).to(device=class_weight.device) + + if avg_non_ignore: + label_weights[label == ignore_index] = 0 + avg_factor = label_weights.sum() + + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def _expand_onehot_labels(labels, label_weights, target_shape, ignore_index): + """Expand onehot labels to match the size of prediction.""" + bin_labels = labels.new_zeros(target_shape) + valid_mask = (labels >= 0) & (labels != ignore_index) + inds = torch.nonzero(valid_mask, as_tuple=True) + + if inds[0].numel() > 0: + if labels.dim() == 3: + bin_labels[inds[0], labels[valid_mask], inds[1], inds[2]] = 1 + else: + bin_labels[inds[0], labels[valid_mask]] = 1 + + valid_mask = valid_mask.unsqueeze(1).expand(target_shape).float() + + if label_weights is None: + bin_label_weights = valid_mask + else: + bin_label_weights = label_weights.unsqueeze(1).expand(target_shape) + bin_label_weights = bin_label_weights * valid_mask + + return bin_labels, bin_label_weights, valid_mask + + +def binary_cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=-100, + avg_non_ignore=False, + **kwargs): + """Calculate the binary CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 1). + label (torch.Tensor): The learning label of the prediction. + Note: In bce loss, label < 0 is invalid. + weight (torch.Tensor, optional): Sample-wise loss weight. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (int): The label index to be ignored. Default: -100. + avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + `New in version 0.23.0.` + + Returns: + torch.Tensor: The calculated loss + """ + if pred.size(1) == 1: + # For binary class segmentation, the shape of pred is + # [N, 1, H, W] and that of label is [N, H, W]. 
+ # As the ignore_index often set as 255, so the + # binary class label check should mask out + # ignore_index + assert label[label != ignore_index].max() <= 1, \ + 'For pred with shape [N, 1, H, W], its label must have at ' \ + 'most 2 classes' + pred = pred.squeeze(1) + if pred.dim() != label.dim(): + assert (pred.dim() == 2 and label.dim() == 1) or ( + pred.dim() == 4 and label.dim() == 3), \ + 'Only pred shape [N, C], label shape [N] or pred shape [N, C, ' \ + 'H, W], label shape [N, H, W] are supported' + # `weight` returned from `_expand_onehot_labels` + # has been treated for valid (non-ignore) pixels + label, weight, valid_mask = _expand_onehot_labels( + label, weight, pred.shape, ignore_index) + else: + # should mask out the ignored elements + valid_mask = ((label >= 0) & (label != ignore_index)).float() + if weight is not None: + weight = weight * valid_mask + else: + weight = valid_mask + # average loss over non-ignored and valid elements + if reduction == 'mean' and avg_factor is None and avg_non_ignore: + avg_factor = valid_mask.sum().item() + + loss = F.binary_cross_entropy_with_logits( + pred, label.float(), pos_weight=class_weight, reduction='none') + # do the reduction for the weighted loss + loss = weight_reduce_loss( + loss, weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def mask_cross_entropy(pred, + target, + label, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=None, + **kwargs): + """Calculate the CrossEntropy loss for masks. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + target (torch.Tensor): The learning label of the prediction. + label (torch.Tensor): ``label`` indicates the class label of the mask' + corresponding object. This will be used to select the mask in the + of the class which the object belongs to when the mask prediction + if not class-agnostic. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (None): Placeholder, to be consistent with other loss. + Default: None. + + Returns: + torch.Tensor: The calculated loss + """ + assert ignore_index is None, 'BCE loss does not support ignore_index' + # TODO: handle these two reserved arguments + assert reduction == 'mean' and avg_factor is None + num_rois = pred.size()[0] + inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) + pred_slice = pred[inds, label].squeeze(1) + return F.binary_cross_entropy_with_logits( + pred_slice, target, weight=class_weight, reduction='mean')[None] + + +@MODELS.register_module() +class CrossEntropyLoss(nn.Module): + """CrossEntropyLoss. + + Args: + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to False. + use_mask (bool, optional): Whether to use mask cross entropy loss. + Defaults to False. + reduction (str, optional): . Defaults to 'mean'. + Options are "none", "mean" and "sum". + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + loss_name (str, optional): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_ce'. 
+ avg_non_ignore (bool): The flag decides to whether the loss is + only averaged over non-ignored targets. Default: False. + `New in version 0.23.0.` + """ + + def __init__(self, + use_sigmoid=False, + use_mask=False, + reduction='mean', + class_weight=None, + loss_weight=1.0, + loss_name='loss_ce', + avg_non_ignore=False): + super().__init__() + assert (use_sigmoid is False) or (use_mask is False) + self.use_sigmoid = use_sigmoid + self.use_mask = use_mask + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = get_class_weight(class_weight) + self.avg_non_ignore = avg_non_ignore + if not self.avg_non_ignore and self.reduction == 'mean': + warnings.warn( + 'Default ``avg_non_ignore`` is False, if you would like to ' + 'ignore the certain label and average loss over non-ignore ' + 'labels, which is the same with PyTorch official ' + 'cross_entropy, set ``avg_non_ignore=True``.') + + if self.use_sigmoid: + self.cls_criterion = binary_cross_entropy + elif self.use_mask: + self.cls_criterion = mask_cross_entropy + else: + self.cls_criterion = cross_entropy + self._loss_name = loss_name + + def extra_repr(self): + """Extra repr.""" + s = f'avg_non_ignore={self.avg_non_ignore}' + return s + + def forward(self, + cls_score, + label, + weight=None, + avg_factor=None, + reduction_override=None, + ignore_index=-100, + **kwargs): + """Forward function.""" + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.class_weight is not None: + class_weight = cls_score.new_tensor(self.class_weight) + else: + class_weight = None + # Note: for BCE loss, label < 0 is invalid. + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + avg_non_ignore=self.avg_non_ignore, + ignore_index=ignore_index, + **kwargs) + return loss_cls + + @property + def loss_name(self): + """Loss Name. + + This function must be implemented and will return the name of this + loss function. This name will be used to combine different loss items + by simple sum operation. In addition, if you want this loss item to be + included into the backward graph, `loss_` must be the prefix of the + name. + + Returns: + str: The name of this loss item. + """ + return self._loss_name diff --git a/head_extractor/build/lib/mmseg/models/losses/dice_loss.py b/head_extractor/build/lib/mmseg/models/losses/dice_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..fb2ffdba8daf867032b6d7b4e0d70a9b7a0c50fe --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/dice_loss.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +import torch.nn as nn + +from mmseg.registry import MODELS +from .utils import weight_reduce_loss + + +def _expand_onehot_labels_dice(pred: torch.Tensor, + target: torch.Tensor) -> torch.Tensor: + """Expand onehot labels to match the size of prediction. + + Args: + pred (torch.Tensor): The prediction, has a shape (N, num_class, H, W). + target (torch.Tensor): The learning label of the prediction, + has a shape (N, H, W). + + Returns: + torch.Tensor: The target after one-hot encoding, + has a shape (N, num_class, H, W). 
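The practical effect of `avg_non_ignore` above: with `reduction='mean'`, the summed per-pixel loss is divided either by every pixel (ignored positions contribute zeros) or only by the non-ignored ones, the latter matching PyTorch's own `F.cross_entropy` mean. A tiny numeric check:

```python
import torch
import torch.nn.functional as F

pred = torch.randn(1, 19, 4, 4)
label = torch.randint(0, 19, (1, 4, 4))
label[0, 0] = 255                      # 4 of 16 pixels ignored

loss = F.cross_entropy(pred, label, ignore_index=255, reduction='none')

# avg_non_ignore=False: divide by every pixel, ignored ones count as zeros.
print(loss.sum() / label.numel())
# avg_non_ignore=True: divide only by the 12 non-ignored pixels
# (the same value PyTorch's own reduction='mean' would give).
print(loss.sum() / (label != 255).sum())
```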
+ """ + num_classes = pred.shape[1] + one_hot_target = torch.clamp(target, min=0, max=num_classes) + one_hot_target = torch.nn.functional.one_hot(one_hot_target, + num_classes + 1) + one_hot_target = one_hot_target[..., :num_classes].permute(0, 3, 1, 2) + return one_hot_target + + +def dice_loss(pred: torch.Tensor, + target: torch.Tensor, + weight: Union[torch.Tensor, None], + eps: float = 1e-3, + reduction: Union[str, None] = 'mean', + naive_dice: Union[bool, None] = False, + avg_factor: Union[int, None] = None, + ignore_index: Union[int, None] = 255) -> float: + """Calculate dice loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. + - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + ignore_index (int, optional): The label index to be ignored. + Defaults to 255. + """ + if ignore_index is not None: + num_classes = pred.shape[1] + pred = pred[:, torch.arange(num_classes) != ignore_index, :, :] + target = target[:, torch.arange(num_classes) != ignore_index, :, :] + assert pred.shape[1] != 0 # if the ignored index is the only class + input = pred.flatten(1) + target = target.flatten(1).float() + a = torch.sum(input * target, 1) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class DiceLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=False, + loss_weight=1.0, + ignore_index=255, + eps=1e-3, + loss_name='loss_dice'): + """Compute dice loss. + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + activate (bool): Whether to activate the predictions inside, + this will disable the inside sigmoid operation. + Defaults to True. + reduction (str, optional): The method used + to reduce the loss. Options are "none", + "mean" and "sum". Defaults to 'mean'. + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power. Defaults to False. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. 
+ ignore_index (int, optional): The label index to be ignored. + Default: 255. + eps (float): Avoid dividing by zero. Defaults to 1e-3. + loss_name (str, optional): Name of the loss item. If you want this + loss item to be included into the backward graph, `loss_` must + be the prefix of the name. Defaults to 'loss_dice'. + """ + + super().__init__() + self.use_sigmoid = use_sigmoid + self.reduction = reduction + self.naive_dice = naive_dice + self.loss_weight = loss_weight + self.eps = eps + self.activate = activate + self.ignore_index = ignore_index + self._loss_name = loss_name + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + ignore_index=255, + **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *). + target (torch.Tensor): The label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + one_hot_target = target + if (pred.shape != target.shape): + one_hot_target = _expand_onehot_labels_dice(pred, target) + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.activate: + if self.use_sigmoid: + pred = pred.sigmoid() + elif pred.shape[1] != 1: + # softmax does not work when there is only 1 class + pred = pred.softmax(dim=1) + loss = self.loss_weight * dice_loss( + pred, + one_hot_target, + weight, + eps=self.eps, + reduction=reduction, + naive_dice=self.naive_dice, + avg_factor=avg_factor, + ignore_index=self.ignore_index) + + return loss + + @property + def loss_name(self): + """Loss Name. + + This function must be implemented and will return the name of this + loss function. This name will be used to combine different loss items + by simple sum operation. In addition, if you want this loss item to be + included into the backward graph, `loss_` must be the prefix of the + name. + Returns: + str: The name of this loss item. + """ + return self._loss_name diff --git a/head_extractor/build/lib/mmseg/models/losses/focal_loss.py b/head_extractor/build/lib/mmseg/models/losses/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..6507ed7a9112993733ac25bc095da0b571e14363 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/focal_loss.py @@ -0,0 +1,337 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Modified from https://github.com/open-mmlab/mmdetection +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss + +from mmseg.registry import MODELS +from .utils import weight_reduce_loss + + +# This method is used when cuda is not available +def py_sigmoid_focal_loss(pred, + target, + one_hot_target=None, + weight=None, + gamma=2.0, + alpha=0.5, + class_weight=None, + valid_mask=None, + reduction='mean', + avg_factor=None): + """PyTorch version of `Focal Loss `_. 
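A quick numeric check of the dice formulation above (the non-naive, squared-denominator variant): identical prediction and target give a loss near 0, disjoint ones a loss of 1. The helper mirrors the math of `dice_loss` but is not the registered module:

```python
import torch

def soft_dice(pred, target, eps=1e-3):
    """Per-sample soft dice loss with squared denominator terms."""
    pred, target = pred.flatten(1), target.flatten(1).float()
    a = (pred * target).sum(1)
    b = (pred * pred).sum(1) + eps
    c = (target * target).sum(1) + eps
    return 1 - (2 * a) / (b + c)

t = torch.zeros(1, 1, 8, 8)
t[..., :4] = 1.
print(soft_dice(t, t))      # ~0: perfect overlap
print(soft_dice(1 - t, t))  # 1.0: no overlap
```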
+ + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the + number of classes + target (torch.Tensor): The learning label of the prediction with + shape (N, C) + one_hot_target (None): Placeholder. It should be None. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float | list[float], optional): A balanced form for Focal Loss. + Defaults to 0.5. + class_weight (list[float], optional): Weight of each class. + Defaults to None. + valid_mask (torch.Tensor, optional): A mask uses 1 to mark the valid + samples and uses 0 to mark the ignored samples. Default: None. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + if isinstance(alpha, list): + alpha = pred.new_tensor(alpha) + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + one_minus_pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) + focal_weight = (alpha * target + (1 - alpha) * + (1 - target)) * one_minus_pt.pow(gamma) + + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + final_weight = torch.ones(1, pred.size(1)).type_as(loss) + if weight is not None: + if weight.shape != loss.shape and weight.size(0) == loss.size(0): + # For most cases, weight is of shape (N, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + assert weight.dim() == loss.dim() + final_weight = final_weight * weight + if class_weight is not None: + final_weight = final_weight * pred.new_tensor(class_weight) + if valid_mask is not None: + final_weight = final_weight * valid_mask + loss = weight_reduce_loss(loss, final_weight, reduction, avg_factor) + return loss + + +def sigmoid_focal_loss(pred, + target, + one_hot_target, + weight=None, + gamma=2.0, + alpha=0.5, + class_weight=None, + valid_mask=None, + reduction='mean', + avg_factor=None): + r"""A wrapper of cuda version `Focal Loss + `_. + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + target (torch.Tensor): The learning label of the prediction. It's shape + should be (N, ) + one_hot_target (torch.Tensor): The learning label with shape (N, C) + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float | list[float], optional): A balanced form for Focal Loss. + Defaults to 0.5. + class_weight (list[float], optional): Weight of each class. + Defaults to None. + valid_mask (torch.Tensor, optional): A mask uses 1 to mark the valid + samples and uses 0 to mark the ignored samples. Default: None. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + # Function.apply does not accept keyword arguments, so the decorator + # "weighted_loss" is not applicable + final_weight = torch.ones(1, pred.size(1)).type_as(pred) + if isinstance(alpha, list): + # _sigmoid_focal_loss doesn't accept alpha of list type. Therefore, if + # a list is given, we set the input alpha as 0.5. This means setting + # equal weight for foreground class and background class. 
By + # multiplying the loss by 2, the effect of setting alpha as 0.5 is + # undone. The alpha of type list is used to regulate the loss in the + # post-processing process. + loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), + gamma, 0.5, None, 'none') * 2 + alpha = pred.new_tensor(alpha) + final_weight = final_weight * ( + alpha * one_hot_target + (1 - alpha) * (1 - one_hot_target)) + else: + loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), + gamma, alpha, None, 'none') + if weight is not None: + if weight.shape != loss.shape and weight.size(0) == loss.size(0): + # For most cases, weight is of shape (N, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + assert weight.dim() == loss.dim() + final_weight = final_weight * weight + if class_weight is not None: + final_weight = final_weight * pred.new_tensor(class_weight) + if valid_mask is not None: + final_weight = final_weight * valid_mask + loss = weight_reduce_loss(loss, final_weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class FocalLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + gamma=2.0, + alpha=0.5, + reduction='mean', + class_weight=None, + loss_weight=1.0, + loss_name='loss_focal'): + """`Focal Loss `_ + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float | list[float], optional): A balanced form for Focal + Loss. Defaults to 0.5. When a list is provided, the length + of the list should be equal to the number of classes. + Please be careful that this parameter is not the + class-wise weight but the weight of a binary classification + problem. This binary classification problem regards the + pixels which belong to one class as the foreground + and the other pixels as the background, each element in + the list is the weight of the corresponding foreground class. + The value of alpha or each element of alpha should be a float + in the interval [0, 1]. If you want to specify the class-wise + weight, please use `class_weight` parameter. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + class_weight (list[float], optional): Weight of each class. + Defaults to None. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + loss_name (str, optional): Name of the loss item. If you want this + loss item to be included into the backward graph, `loss_` must + be the prefix of the name. Defaults to 'loss_focal'. + """ + super().__init__() + assert use_sigmoid is True, \ + 'AssertionError: Only sigmoid focal loss supported now.' 
+ assert reduction in ('none', 'mean', 'sum'), \ + "AssertionError: reduction should be 'none', 'mean' or " \ + "'sum'" + assert isinstance(alpha, (float, list)), \ + 'AssertionError: alpha should be of type float' + assert isinstance(gamma, float), \ + 'AssertionError: gamma should be of type float' + assert isinstance(loss_weight, float), \ + 'AssertionError: loss_weight should be of type float' + assert isinstance(loss_name, str), \ + 'AssertionError: loss_name should be of type str' + assert isinstance(class_weight, list) or class_weight is None, \ + 'AssertionError: class_weight must be None or of type list' + self.use_sigmoid = use_sigmoid + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.class_weight = class_weight + self.loss_weight = loss_weight + self._loss_name = loss_name + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + ignore_index=255, + **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction with shape + (N, C) where C = number of classes, or + (N, C, d_1, d_2, ..., d_K) with K≥1 in the + case of K-dimensional loss. + target (torch.Tensor): The ground truth. If containing class + indices, shape (N) where each value is 0≤targets[i]≤C−1, + or (N, d_1, d_2, ..., d_K) with K≥1 in the case of + K-dimensional loss. If containing class probabilities, + same shape as the input. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used + to override the original reduction method of the loss. + Options are "none", "mean" and "sum". + ignore_index (int, optional): The label index to be ignored. + Default: 255 + Returns: + torch.Tensor: The calculated loss + """ + assert isinstance(ignore_index, int), \ + 'ignore_index must be of type int' + assert reduction_override in (None, 'none', 'mean', 'sum'), \ + "AssertionError: reduction should be 'none', 'mean' or " \ + "'sum'" + assert pred.shape == target.shape or \ + (pred.size(0) == target.size(0) and + pred.shape[2:] == target.shape[1:]), \ + "The shape of pred doesn't match the shape of target" + + original_shape = pred.shape + + # [B, C, d_1, d_2, ..., d_k] -> [C, B, d_1, d_2, ..., d_k] + pred = pred.transpose(0, 1) + # [C, B, d_1, d_2, ..., d_k] -> [C, N] + pred = pred.reshape(pred.size(0), -1) + # [C, N] -> [N, C] + pred = pred.transpose(0, 1).contiguous() + + if original_shape == target.shape: + # target with shape [B, C, d_1, d_2, ...] + # transform it's shape into [N, C] + # [B, C, d_1, d_2, ...] -> [C, B, d_1, d_2, ..., d_k] + target = target.transpose(0, 1) + # [C, B, d_1, d_2, ..., d_k] -> [C, N] + target = target.reshape(target.size(0), -1) + # [C, N] -> [N, C] + target = target.transpose(0, 1).contiguous() + else: + # target with shape [B, d_1, d_2, ...] 
+            # transform its shape into [N, ]
+            target = target.view(-1).contiguous()
+            valid_mask = (target != ignore_index).view(-1, 1)
+            # avoid raising error when using F.one_hot()
+            target = torch.where(target == ignore_index, target.new_tensor(0),
+                                 target)
+
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if self.use_sigmoid:
+            num_classes = pred.size(1)
+            if torch.cuda.is_available() and pred.is_cuda:
+                if target.dim() == 1:
+                    one_hot_target = F.one_hot(
+                        target, num_classes=num_classes + 1)
+                    if num_classes == 1:
+                        one_hot_target = one_hot_target[:, 1]
+                        target = 1 - target
+                    else:
+                        one_hot_target = one_hot_target[:, :num_classes]
+                else:
+                    one_hot_target = target
+                    target = target.argmax(dim=1)
+                    valid_mask = (target != ignore_index).view(-1, 1)
+                calculate_loss_func = sigmoid_focal_loss
+            else:
+                one_hot_target = None
+                if target.dim() == 1:
+                    target = F.one_hot(target, num_classes=num_classes + 1)
+                    if num_classes == 1:
+                        target = target[:, 1]
+                    else:
+                        target = target[:, :num_classes]
+                else:
+                    valid_mask = (target.argmax(dim=1) != ignore_index).view(
+                        -1, 1)
+                calculate_loss_func = py_sigmoid_focal_loss
+
+            loss_cls = self.loss_weight * calculate_loss_func(
+                pred,
+                target,
+                one_hot_target,
+                weight,
+                gamma=self.gamma,
+                alpha=self.alpha,
+                class_weight=self.class_weight,
+                valid_mask=valid_mask,
+                reduction=reduction,
+                avg_factor=avg_factor)
+
+            if reduction == 'none':
+                # [N, C] -> [C, N]
+                loss_cls = loss_cls.transpose(0, 1)
+                # [C, N] -> [C, B, d1, d2, ...]
+                # original_shape: [B, C, d1, d2, ...]
+                loss_cls = loss_cls.reshape(original_shape[1],
+                                            original_shape[0],
+                                            *original_shape[2:])
+                # [C, B, d1, d2, ...] -> [B, C, d1, d2, ...]
+                loss_cls = loss_cls.transpose(0, 1).contiguous()
+        else:
+            raise NotImplementedError
+        return loss_cls
+
+    @property
+    def loss_name(self):
+        """Loss Name.
+
+        This function must be implemented and will return the name of this
+        loss function. This name will be used to combine different loss items
+        by simple sum operation. In addition, if you want this loss item to be
+        included into the backward graph, `loss_` must be the prefix of the
+        name.
+        Returns:
+            str: The name of this loss item.
+        """
+        return self._loss_name
diff --git a/head_extractor/build/lib/mmseg/models/losses/huasdorff_distance_loss.py b/head_extractor/build/lib/mmseg/models/losses/huasdorff_distance_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..d950ba728f8d419ea2b291e2159b926aca44038c
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/losses/huasdorff_distance_loss.py
@@ -0,0 +1,160 @@
+# Copyright (c) OpenMMLab. All rights reserved.
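+# A minimal usage sketch of the ``HuasdorffDisstanceLoss`` registered below
+# (illustrative only, not part of upstream; it assumes ``mmseg`` and
+# ``scipy`` are importable and follows the shapes documented in ``forward``):
+#     >>> import torch
+#     >>> loss_fn = HuasdorffDisstanceLoss(loss_weight=1.0, ignore_index=255)
+#     >>> pred = torch.randn(2, 4, 16, 16)       # (B, C, H, W) logits
+#     >>> gt = torch.randint(0, 4, (2, 16, 16))  # (B, H, W) labels
+#     >>> loss = loss_fn(pred, gt)               # scalar Tensor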
+"""Modified from https://github.com/JunMa11/SegWithDistMap/blob/ +master/code/train_LA_HD.py (Apache-2.0 License)""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from scipy.ndimage import distance_transform_edt as distance +from torch import Tensor + +from mmseg.registry import MODELS +from .utils import get_class_weight, weighted_loss + + +def compute_dtm(img_gt: Tensor, pred: Tensor) -> Tensor: + """ + compute the distance transform map of foreground in mask + Args: + img_gt: Ground truth of the image, (b, h, w) + pred: Predictions of the segmentation head after softmax, (b, c, h, w) + + Returns: + output: the foreground Distance Map (SDM) + dtm(x) = 0; x in segmentation boundary + inf|x-y|; x in segmentation + """ + + fg_dtm = torch.zeros_like(pred) + out_shape = pred.shape + for b in range(out_shape[0]): # batch size + for c in range(1, out_shape[1]): # default 0 channel is background + posmask = img_gt[b].byte() + if posmask.any(): + posdis = distance(posmask) + fg_dtm[b][c] = torch.from_numpy(posdis) + + return fg_dtm + + +@weighted_loss +def hd_loss(seg_soft: Tensor, + gt: Tensor, + seg_dtm: Tensor, + gt_dtm: Tensor, + class_weight=None, + ignore_index=255) -> Tensor: + """ + compute huasdorff distance loss for segmentation + Args: + seg_soft: softmax results, shape=(b,c,x,y) + gt: ground truth, shape=(b,x,y) + seg_dtm: segmentation distance transform map, shape=(b,c,x,y) + gt_dtm: ground truth distance transform map, shape=(b,c,x,y) + + Returns: + output: hd_loss + """ + assert seg_soft.shape[0] == gt.shape[0] + total_loss = 0 + num_class = seg_soft.shape[1] + if class_weight is not None: + assert class_weight.ndim == num_class + for i in range(1, num_class): + if i != ignore_index: + delta_s = (seg_soft[:, i, ...] - gt.float())**2 + s_dtm = seg_dtm[:, i, ...]**2 + g_dtm = gt_dtm[:, i, ...]**2 + dtm = s_dtm + g_dtm + multiplied = torch.einsum('bxy, bxy->bxy', delta_s, dtm) + hd_loss = multiplied.mean() + if class_weight is not None: + hd_loss *= class_weight[i] + total_loss += hd_loss + + return total_loss / num_class + + +@MODELS.register_module() +class HuasdorffDisstanceLoss(nn.Module): + """HuasdorffDisstanceLoss. This loss is proposed in `How Distance Transform + Maps Boost Segmentation CNNs: An Empirical Study. + + `_. + Args: + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_weight (float): Weight of the loss. Defaults to 1.0. + ignore_index (int | None): The label index to be ignored. Default: 255. + loss_name (str): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_boundary'. + """ + + def __init__(self, + reduction='mean', + class_weight=None, + loss_weight=1.0, + ignore_index=255, + loss_name='loss_huasdorff_disstance', + **kwargs): + super().__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = get_class_weight(class_weight) + self._loss_name = loss_name + self.ignore_index = ignore_index + + def forward(self, + pred: Tensor, + target: Tensor, + avg_factor=None, + reduction_override=None, + **kwargs) -> Tensor: + """Forward function. + + Args: + pred (Tensor): Predictions of the segmentation head. (B, C, H, W) + target (Tensor): Ground truth of the image. 
(B, H, W) + avg_factor (int, optional): Average factor that is used to + average the loss. Defaults to None. + reduction_override (str, optional): The reduction method used + to override the original reduction method of the loss. + Options are "none", "mean" and "sum". + Returns: + Tensor: Loss tensor. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.class_weight is not None: + class_weight = pred.new_tensor(self.class_weight) + else: + class_weight = None + + pred_soft = F.softmax(pred, dim=1) + valid_mask = (target != self.ignore_index).long() + target = target * valid_mask + + with torch.no_grad(): + gt_dtm = compute_dtm(target.cpu(), pred_soft) + gt_dtm = gt_dtm.float() + seg_dtm2 = compute_dtm( + pred_soft.argmax(dim=1, keepdim=False).cpu(), pred_soft) + seg_dtm2 = seg_dtm2.float() + + loss_hd = self.loss_weight * hd_loss( + pred_soft, + target, + seg_dtm=seg_dtm2, + gt_dtm=gt_dtm, + reduction=reduction, + avg_factor=avg_factor, + class_weight=class_weight, + ignore_index=self.ignore_index) + return loss_hd + + @property + def loss_name(self): + return self._loss_name diff --git a/head_extractor/build/lib/mmseg/models/losses/kldiv_loss.py b/head_extractor/build/lib/mmseg/models/losses/kldiv_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..496ef9713f085a36d46837ac0b51d4cb9f956fce --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/kldiv_loss.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmseg.registry import MODELS + + +@MODELS.register_module() +class KLDivLoss(nn.Module): + + def __init__(self, + temperature: float = 1.0, + reduction: str = 'mean', + loss_name: str = 'loss_kld'): + """Kullback-Leibler divergence Loss. + + + + Args: + temperature (float, optional): Temperature param + reduction (str, optional): The method to reduce the loss into a + scalar. Default is "mean". Options are "none", "sum", + and "mean" + """ + + assert isinstance(temperature, (float, int)), \ + 'Expected temperature to be' \ + f'float or int, but got {temperature.__class__.__name__} instead' + assert temperature != 0., 'Temperature must not be zero' + + assert reduction in ['mean', 'none', 'sum'], \ + 'Reduction must be one of the options ("mean", ' \ + f'"sum", "none"), but got {reduction}' + + super().__init__() + self.temperature = temperature + self.reduction = reduction + self._loss_name = loss_name + + def forward(self, input: torch.Tensor, target: torch.Tensor): + """Forward function. Calculate KL divergence Loss. + + Args: + input (Tensor): Logit tensor, + the data type is float32 or float64. + The shape is (N, C) where N is batchsize and C is number of + channels. + If there more than 2 dimensions, shape is (N, C, D1, D2, ... + Dk), k>= 1 + target (Tensor): Logit tensor, + the data type is float32 or float64. + input and target must be with the same shape. + + Returns: + (Tensor): Reduced loss. 
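+
+        Example (an illustrative sketch, not from upstream; note that with
+        ``reduction='mean'`` the loss is averaged per instance, giving a
+        shape-(N,) tensor rather than a scalar):
+            >>> import torch
+            >>> loss_fn = KLDivLoss(temperature=2.0, reduction='mean')
+            >>> x = torch.randn(4, 8)  # student logits
+            >>> y = torch.randn(4, 8)  # teacher logits
+            >>> loss_fn(x, y).shape
+            torch.Size([4])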
+ """ + assert isinstance(input, torch.Tensor), 'Expected input to' \ + f'be Tensor, but got {input.__class__.__name__} instead' + assert isinstance(target, torch.Tensor), 'Expected target to' \ + f'be Tensor, but got {target.__class__.__name__} instead' + + assert input.shape == target.shape, 'Input and target ' \ + 'must have same shape,' \ + f'but got shapes {input.shape} and {target.shape}' + + input = F.softmax(input / self.temperature, dim=1) + target = F.softmax(target / self.temperature, dim=1) + + loss = F.kl_div(input, target, reduction='none', log_target=False) + loss = loss * self.temperature**2 + + batch_size = input.shape[0] + + if self.reduction == 'sum': + # Change view to calculate instance-wise sum + loss = loss.view(batch_size, -1) + return torch.sum(loss, dim=1) + + elif self.reduction == 'mean': + # Change view to calculate instance-wise mean + loss = loss.view(batch_size, -1) + return torch.mean(loss, dim=1) + + return loss + + @property + def loss_name(self): + """Loss Name. + + This function must be implemented and will return the name of this + loss function. This name will be used to combine different loss items + by simple sum operation. In addition, if you want this loss item to be + included into the backward graph, `loss_` must be the prefix of the + name. + Returns: + str: The name of this loss item. + """ + return self._loss_name diff --git a/head_extractor/build/lib/mmseg/models/losses/lovasz_loss.py b/head_extractor/build/lib/mmseg/models/losses/lovasz_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..b47f9d8a15330a45d0d2d25f3c18d9386e2b335e --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/lovasz_loss.py @@ -0,0 +1,323 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Modified from https://github.com/bermanmaxim/LovaszSoftmax/blob/master/pytor +ch/lovasz_losses.py Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim +Berman 2018 ESAT-PSI KU Leuven (MIT License)""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.utils import is_list_of + +from mmseg.registry import MODELS +from .utils import get_class_weight, weight_reduce_loss + + +def lovasz_grad(gt_sorted): + """Computes gradient of the Lovasz extension w.r.t sorted errors. + + See Alg. 1 in paper. + """ + p = len(gt_sorted) + gts = gt_sorted.sum() + intersection = gts - gt_sorted.float().cumsum(0) + union = gts + (1 - gt_sorted).float().cumsum(0) + jaccard = 1. 
- intersection / union + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def flatten_binary_logits(logits, labels, ignore_index=None): + """Flattens predictions in the batch (binary case) Remove labels equal to + 'ignore_index'.""" + logits = logits.view(-1) + labels = labels.view(-1) + if ignore_index is None: + return logits, labels + valid = (labels != ignore_index) + vlogits = logits[valid] + vlabels = labels[valid] + return vlogits, vlabels + + +def flatten_probs(probs, labels, ignore_index=None): + """Flattens predictions in the batch.""" + if probs.dim() == 3: + # assumes output of a sigmoid layer + B, H, W = probs.size() + probs = probs.view(B, 1, H, W) + B, C, H, W = probs.size() + probs = probs.permute(0, 2, 3, 1).contiguous().view(-1, C) # B*H*W, C=P,C + labels = labels.view(-1) + if ignore_index is None: + return probs, labels + valid = (labels != ignore_index) + vprobs = probs[valid.nonzero().squeeze()] + vlabels = labels[valid] + return vprobs, vlabels + + +def lovasz_hinge_flat(logits, labels): + """Binary Lovasz hinge loss. + + Args: + logits (torch.Tensor): [P], logits at each prediction + (between -infty and +infty). + labels (torch.Tensor): [P], binary ground truth labels (0 or 1). + + Returns: + torch.Tensor: The calculated loss. + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels.float() - 1. + errors = (1. - logits * signs) + errors_sorted, perm = torch.sort(errors, dim=0, descending=True) + perm = perm.data + gt_sorted = labels[perm] + grad = lovasz_grad(gt_sorted) + loss = torch.dot(F.relu(errors_sorted), grad) + return loss + + +def lovasz_hinge(logits, + labels, + classes='present', + per_image=False, + class_weight=None, + reduction='mean', + avg_factor=None, + ignore_index=255): + """Binary Lovasz hinge loss. + + Args: + logits (torch.Tensor): [B, H, W], logits at each pixel + (between -infty and +infty). + labels (torch.Tensor): [B, H, W], binary ground truth masks (0 or 1). + classes (str | list[int], optional): Placeholder, to be consistent with + other loss. Default: None. + per_image (bool, optional): If per_image is True, compute the loss per + image instead of per batch. Default: False. + class_weight (list[float], optional): Placeholder, to be consistent + with other loss. Default: None. + reduction (str, optional): The method used to reduce the loss. Options + are "none", "mean" and "sum". This parameter only works when + per_image is True. Default: 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. This parameter only works when per_image is True. + Default: None. + ignore_index (int | None): The label index to be ignored. Default: 255. + + Returns: + torch.Tensor: The calculated loss. + """ + if per_image: + loss = [ + lovasz_hinge_flat(*flatten_binary_logits( + logit.unsqueeze(0), label.unsqueeze(0), ignore_index)) + for logit, label in zip(logits, labels) + ] + loss = weight_reduce_loss( + torch.stack(loss), None, reduction, avg_factor) + else: + loss = lovasz_hinge_flat( + *flatten_binary_logits(logits, labels, ignore_index)) + return loss + + +def lovasz_softmax_flat(probs, labels, classes='present', class_weight=None): + """Multi-class Lovasz-Softmax loss. + + Args: + probs (torch.Tensor): [P, C], class probabilities at each prediction + (between 0 and 1). + labels (torch.Tensor): [P], ground truth labels (between 0 and C - 1). 
+ classes (str | list[int], optional): Classes chosen to calculate loss. + 'all' for all classes, 'present' for classes present in labels, or + a list of classes to average. Default: 'present'. + class_weight (list[float], optional): The weight for each class. + Default: None. + + Returns: + torch.Tensor: The calculated loss. + """ + if probs.numel() == 0: + # only void pixels, the gradients should be 0 + return probs * 0. + C = probs.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes + for c in class_to_sum: + fg = (labels == c).float() # foreground for class c + if (classes == 'present' and fg.sum() == 0): + continue + if C == 1: + if len(classes) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probs[:, 0] + else: + class_pred = probs[:, c] + errors = (fg - class_pred).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + perm = perm.data + fg_sorted = fg[perm] + loss = torch.dot(errors_sorted, lovasz_grad(fg_sorted)) + if class_weight is not None: + loss *= class_weight[c] + losses.append(loss) + return torch.stack(losses).mean() + + +def lovasz_softmax(probs, + labels, + classes='present', + per_image=False, + class_weight=None, + reduction='mean', + avg_factor=None, + ignore_index=255): + """Multi-class Lovasz-Softmax loss. + + Args: + probs (torch.Tensor): [B, C, H, W], class probabilities at each + prediction (between 0 and 1). + labels (torch.Tensor): [B, H, W], ground truth labels (between 0 and + C - 1). + classes (str | list[int], optional): Classes chosen to calculate loss. + 'all' for all classes, 'present' for classes present in labels, or + a list of classes to average. Default: 'present'. + per_image (bool, optional): If per_image is True, compute the loss per + image instead of per batch. Default: False. + class_weight (list[float], optional): The weight for each class. + Default: None. + reduction (str, optional): The method used to reduce the loss. Options + are "none", "mean" and "sum". This parameter only works when + per_image is True. Default: 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. This parameter only works when per_image is True. + Default: None. + ignore_index (int | None): The label index to be ignored. Default: 255. + + Returns: + torch.Tensor: The calculated loss. + """ + + if per_image: + loss = [ + lovasz_softmax_flat( + *flatten_probs( + prob.unsqueeze(0), label.unsqueeze(0), ignore_index), + classes=classes, + class_weight=class_weight) + for prob, label in zip(probs, labels) + ] + loss = weight_reduce_loss( + torch.stack(loss), None, reduction, avg_factor) + else: + loss = lovasz_softmax_flat( + *flatten_probs(probs, labels, ignore_index), + classes=classes, + class_weight=class_weight) + return loss + + +@MODELS.register_module() +class LovaszLoss(nn.Module): + """LovaszLoss. + + This loss is proposed in `The Lovasz-Softmax loss: A tractable surrogate + for the optimization of the intersection-over-union measure in neural + networks `_. + + Args: + loss_type (str, optional): Binary or multi-class loss. + Default: 'multi_class'. Options are "binary" and "multi_class". + classes (str | list[int], optional): Classes chosen to calculate loss. + 'all' for all classes, 'present' for classes present in labels, or + a list of classes to average. Default: 'present'. + per_image (bool, optional): If per_image is True, compute the loss per + image instead of per batch. Default: False. 
+ reduction (str, optional): The method used to reduce the loss. Options + are "none", "mean" and "sum". This parameter only works when + per_image is True. Default: 'mean'. + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + loss_name (str, optional): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_lovasz'. + """ + + def __init__(self, + loss_type='multi_class', + classes='present', + per_image=False, + reduction='mean', + class_weight=None, + loss_weight=1.0, + loss_name='loss_lovasz'): + super().__init__() + assert loss_type in ('binary', 'multi_class'), "loss_type should be \ + 'binary' or 'multi_class'." + + if loss_type == 'binary': + self.cls_criterion = lovasz_hinge + else: + self.cls_criterion = lovasz_softmax + assert classes in ('all', 'present') or is_list_of(classes, int) + if not per_image: + assert reduction == 'none', "reduction should be 'none' when \ + per_image is False." + + self.classes = classes + self.per_image = per_image + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = get_class_weight(class_weight) + self._loss_name = loss_name + + def forward(self, + cls_score, + label, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function.""" + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.class_weight is not None: + class_weight = cls_score.new_tensor(self.class_weight) + else: + class_weight = None + + # if multi-class loss, transform logits to probs + if self.cls_criterion == lovasz_softmax: + cls_score = F.softmax(cls_score, dim=1) + + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + self.classes, + self.per_image, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_cls + + @property + def loss_name(self): + """Loss Name. + + This function must be implemented and will return the name of this + loss function. This name will be used to combine different loss items + by simple sum operation. In addition, if you want this loss item to be + included into the backward graph, `loss_` must be the prefix of the + name. + Returns: + str: The name of this loss item. + """ + return self._loss_name diff --git a/head_extractor/build/lib/mmseg/models/losses/ohem_cross_entropy_loss.py b/head_extractor/build/lib/mmseg/models/losses/ohem_cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a519b4d84e1dbf86ebc7ad07372ddbdfb0ff3d13 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/ohem_cross_entropy_loss.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmseg.registry import MODELS + + +@MODELS.register_module() +class OhemCrossEntropy(nn.Module): + """OhemCrossEntropy loss. + + This func is modified from + `PIDNet `_. # noqa + + Licensed under the MIT License. + + Args: + ignore_label (int): Labels to ignore when computing the loss. + Default: 255 + thresh (float, optional): The threshold for hard example selection. + Below which, are prediction with low confidence. 
If not + specified, the hard examples will be pixels of top ``min_kept`` + loss. Default: 0.7. + min_kept (int, optional): The minimum number of predictions to keep. + Default: 100000. + loss_weight (float): Weight of the loss. Defaults to 1.0. + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_name (str): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_boundary'. + """ + + def __init__(self, + ignore_label: int = 255, + thres: float = 0.7, + min_kept: int = 100000, + loss_weight: float = 1.0, + class_weight: Optional[Union[List[float], str]] = None, + loss_name: str = 'loss_ohem'): + super().__init__() + self.thresh = thres + self.min_kept = max(1, min_kept) + self.ignore_label = ignore_label + self.loss_weight = loss_weight + self.loss_name_ = loss_name + self.class_weight = class_weight + + def forward(self, score: Tensor, target: Tensor) -> Tensor: + """Forward function. + Args: + score (Tensor): Predictions of the segmentation head. + target (Tensor): Ground truth of the image. + + Returns: + Tensor: Loss tensor. + """ + # score: (N, C, H, W) + pred = F.softmax(score, dim=1) + if self.class_weight is not None: + class_weight = score.new_tensor(self.class_weight) + else: + class_weight = None + + pixel_losses = F.cross_entropy( + score, + target, + weight=class_weight, + ignore_index=self.ignore_label, + reduction='none').contiguous().view(-1) # (N*H*W) + mask = target.contiguous().view(-1) != self.ignore_label # (N*H*W) + + tmp_target = target.clone() # (N, H, W) + tmp_target[tmp_target == self.ignore_label] = 0 + # pred: (N, C, H, W) -> (N*H*W, C) + pred = pred.gather(1, tmp_target.unsqueeze(1)) + # pred: (N*H*W, C) -> (N*H*W), ind: (N*H*W) + pred, ind = pred.contiguous().view(-1, )[mask].contiguous().sort() + if pred.numel() > 0: + min_value = pred[min(self.min_kept, pred.numel() - 1)] + else: + return score.new_tensor(0.0) + threshold = max(min_value, self.thresh) + + pixel_losses = pixel_losses[mask][ind] + pixel_losses = pixel_losses[pred < threshold] + return self.loss_weight * pixel_losses.mean() + + @property + def loss_name(self): + return self.loss_name_ diff --git a/head_extractor/build/lib/mmseg/models/losses/silog_loss.py b/head_extractor/build/lib/mmseg/models/losses/silog_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..ecc07aac424a9308bce33e00c621369ac555f4ba --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/silog_loss.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmseg.registry import MODELS +from .utils import weight_reduce_loss + + +def silog_loss(pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + eps: float = 1e-4, + reduction: Union[str, None] = 'mean', + avg_factor: Optional[int] = None) -> Tensor: + """Computes the Scale-Invariant Logarithmic (SI-Log) loss between + prediction and target. + + Args: + pred (Tensor): Predicted output. + target (Tensor): Ground truth. + weight (Optional[Tensor]): Optional weight to apply on the loss. + eps (float): Epsilon value to avoid division and log(0). + reduction (Union[str, None]): Specifies the reduction to apply to the + output: 'mean', 'sum' or None. + avg_factor (Optional[int]): Optional average factor for the loss. 
+
+    Returns:
+        Tensor: The calculated SI-Log loss.
+    """
+    pred, target = pred.flatten(1), target.flatten(1)
+    valid_mask = (target > eps).detach().float()
+
+    diff_log = torch.log(target.clamp(min=eps)) - torch.log(
+        pred.clamp(min=eps))
+
+    valid_mask = (target > eps).detach() & (~torch.isnan(diff_log))
+    diff_log[~valid_mask] = 0.0
+    valid_mask = valid_mask.float()
+
+    diff_log_sq_mean = (diff_log.pow(2) * valid_mask).sum(
+        dim=1) / valid_mask.sum(dim=1).clamp(min=eps)
+    diff_log_mean = (diff_log * valid_mask).sum(dim=1) / valid_mask.sum(
+        dim=1).clamp(min=eps)
+
+    loss = torch.sqrt(diff_log_sq_mean - 0.5 * diff_log_mean.pow(2))
+
+    if weight is not None:
+        weight = weight.float()
+
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss
+
+
+@MODELS.register_module()
+class SiLogLoss(nn.Module):
+    """Compute SiLog loss.
+
+    Args:
+        reduction (str, optional): The method used
+            to reduce the loss. Options are "none",
+            "mean" and "sum". Defaults to 'mean'.
+        loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+        eps (float): Avoid dividing by zero. Defaults to 1e-6.
+        loss_name (str, optional): Name of the loss item. If you want this
+            loss item to be included into the backward graph, `loss_` must
+            be the prefix of the name. Defaults to 'loss_silog'.
+    """
+
+    def __init__(self,
+                 reduction='mean',
+                 loss_weight=1.0,
+                 eps=1e-6,
+                 loss_name='loss_silog'):
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.eps = eps
+        self._loss_name = loss_name
+
+    def forward(
+        self,
+        pred,
+        target,
+        weight=None,
+        avg_factor=None,
+        reduction_override=None,
+    ):
+
+        assert pred.shape == target.shape, 'the shapes of pred ' \
+            f'({pred.shape}) and target ({target.shape}) do not match'
+
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+
+        loss = self.loss_weight * silog_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+        )
+
+        return loss
+
+    @property
+    def loss_name(self):
+        """Loss Name.
+
+        This function must be implemented and will return the name of this
+        loss function. This name will be used to combine different loss items
+        by simple sum operation. In addition, if you want this loss item to be
+        included into the backward graph, `loss_` must be the prefix of the
+        name.
+        Returns:
+            str: The name of this loss item.
+        """
+        return self._loss_name
diff --git a/head_extractor/build/lib/mmseg/models/losses/tversky_loss.py b/head_extractor/build/lib/mmseg/models/losses/tversky_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfca1af6669e3ac328492da11758a084999ef906
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/losses/tversky_loss.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
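+# A minimal usage sketch of the ``TverskyLoss`` registered below
+# (illustrative only, not part of upstream; ``alpha`` and ``beta`` must sum
+# to 1, and a larger ``beta`` penalizes false negatives more heavily):
+#     >>> import torch
+#     >>> loss_fn = TverskyLoss(alpha=0.3, beta=0.7, smooth=1)
+#     >>> pred = torch.randn(2, 4, 16, 16)           # (B, C, H, W) logits
+#     >>> target = torch.randint(0, 4, (2, 16, 16))  # (B, H, W) labels
+#     >>> loss = loss_fn(pred, target)               # scalar Tensor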
+"""Modified from +https://github.com/JunMa11/SegLoss/blob/master/losses_pytorch/dice_loss.py#L333 +(Apache-2.0 License)""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from .utils import get_class_weight, weighted_loss + + +@weighted_loss +def tversky_loss(pred, + target, + valid_mask, + alpha=0.3, + beta=0.7, + smooth=1, + class_weight=None, + ignore_index=255): + assert pred.shape[0] == target.shape[0] + total_loss = 0 + num_classes = pred.shape[1] + for i in range(num_classes): + if i != ignore_index: + tversky_loss = binary_tversky_loss( + pred[:, i], + target[..., i], + valid_mask=valid_mask, + alpha=alpha, + beta=beta, + smooth=smooth) + if class_weight is not None: + tversky_loss *= class_weight[i] + total_loss += tversky_loss + return total_loss / num_classes + + +@weighted_loss +def binary_tversky_loss(pred, + target, + valid_mask, + alpha=0.3, + beta=0.7, + smooth=1): + assert pred.shape[0] == target.shape[0] + pred = pred.reshape(pred.shape[0], -1) + target = target.reshape(target.shape[0], -1) + valid_mask = valid_mask.reshape(valid_mask.shape[0], -1) + + TP = torch.sum(torch.mul(pred, target) * valid_mask, dim=1) + FP = torch.sum(torch.mul(pred, 1 - target) * valid_mask, dim=1) + FN = torch.sum(torch.mul(1 - pred, target) * valid_mask, dim=1) + tversky = (TP + smooth) / (TP + alpha * FP + beta * FN + smooth) + + return 1 - tversky + + +@LOSSES.register_module() +class TverskyLoss(nn.Module): + """TverskyLoss. This loss is proposed in `Tversky loss function for image + segmentation using 3D fully convolutional deep networks. + + `_. + Args: + smooth (float): A float number to smooth loss, and avoid NaN error. + Default: 1. + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_weight (float, optional): Weight of the loss. Default to 1.0. + ignore_index (int | None): The label index to be ignored. Default: 255. + alpha(float, in [0, 1]): + The coefficient of false positives. Default: 0.3. + beta (float, in [0, 1]): + The coefficient of false negatives. Default: 0.7. + Note: alpha + beta = 1. + loss_name (str, optional): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_tversky'. + """ + + def __init__(self, + smooth=1, + class_weight=None, + loss_weight=1.0, + ignore_index=255, + alpha=0.3, + beta=0.7, + loss_name='loss_tversky'): + super().__init__() + self.smooth = smooth + self.class_weight = get_class_weight(class_weight) + self.loss_weight = loss_weight + self.ignore_index = ignore_index + assert (alpha + beta == 1.0), 'Sum of alpha and beta but be 1.0!' + self.alpha = alpha + self.beta = beta + self._loss_name = loss_name + + def forward(self, pred, target, **kwargs): + if self.class_weight is not None: + class_weight = pred.new_tensor(self.class_weight) + else: + class_weight = None + + pred = F.softmax(pred, dim=1) + num_classes = pred.shape[1] + one_hot_target = F.one_hot( + torch.clamp(target.long(), 0, num_classes - 1), + num_classes=num_classes) + valid_mask = (target != self.ignore_index).long() + + loss = self.loss_weight * tversky_loss( + pred, + one_hot_target, + valid_mask=valid_mask, + alpha=self.alpha, + beta=self.beta, + smooth=self.smooth, + class_weight=class_weight, + ignore_index=self.ignore_index) + return loss + + @property + def loss_name(self): + """Loss Name. 
+ + This function must be implemented and will return the name of this + loss function. This name will be used to combine different loss items + by simple sum operation. In addition, if you want this loss item to be + included into the backward graph, `loss_` must be the prefix of the + name. + Returns: + str: The name of this loss item. + """ + return self._loss_name diff --git a/head_extractor/build/lib/mmseg/models/losses/utils.py b/head_extractor/build/lib/mmseg/models/losses/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..047803473316ff5fc58de2b8e35ef0087bc3b624 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/losses/utils.py @@ -0,0 +1,129 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools + +import numpy as np +import torch +import torch.nn.functional as F +from mmengine.fileio import load + + +def get_class_weight(class_weight): + """Get class weight for loss function. + + Args: + class_weight (list[float] | str | None): If class_weight is a str, + take it as a file name and read from it. + """ + if isinstance(class_weight, str): + # take it as a file path + if class_weight.endswith('.npy'): + class_weight = np.load(class_weight) + else: + # pkl, json or yaml + class_weight = load(class_weight) + + return class_weight + + +def reduce_loss(loss, reduction) -> torch.Tensor: + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +def weight_reduce_loss(loss, + weight=None, + reduction='mean', + avg_factor=None) -> torch.Tensor: + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + assert weight.dim() == loss.dim() + if weight.dim() > 1: + assert weight.size(1) == 1 or weight.size(1) == loss.size(1) + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. 
+ + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/head_extractor/build/lib/mmseg/models/necks/__init__.py b/head_extractor/build/lib/mmseg/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ff03186a92b78f942e79cff9eec9f5e2784c359a --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/necks/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .featurepyramid import Feature2Pyramid +from .fpn import FPN +from .ic_neck import ICNeck +from .jpu import JPU +from .mla_neck import MLANeck +from .multilevel_neck import MultiLevelNeck + +__all__ = [ + 'FPN', 'MultiLevelNeck', 'MLANeck', 'ICNeck', 'JPU', 'Feature2Pyramid' +] diff --git a/head_extractor/build/lib/mmseg/models/necks/featurepyramid.py b/head_extractor/build/lib/mmseg/models/necks/featurepyramid.py new file mode 100644 index 0000000000000000000000000000000000000000..dc1250d39dafcf78880aa282bcba4215520ad94e --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/necks/featurepyramid.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import build_norm_layer + +from mmseg.registry import MODELS + + +@MODELS.register_module() +class Feature2Pyramid(nn.Module): + """Feature2Pyramid. + + A neck structure connect ViT backbone and decoder_heads. + + Args: + embed_dims (int): Embedding dimension. + rescales (list[float]): Different sampling multiples were + used to obtain pyramid features. Default: [4, 2, 1, 0.5]. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='SyncBN', requires_grad=True). 
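+
+    Example (an illustrative sketch, not from upstream; ``norm_cfg`` is
+    switched to plain ``BN`` so the snippet can run on a single,
+    non-distributed device):
+        >>> import torch
+        >>> neck = Feature2Pyramid(embed_dim=768,
+        ...                        rescales=[4, 2, 1, 0.5],
+        ...                        norm_cfg=dict(type='BN'))
+        >>> inputs = [torch.randn(1, 768, 32, 32) for _ in range(4)]
+        >>> outs = neck(inputs)
+        >>> [out.shape[-1] for out in outs]
+        [128, 64, 32, 16]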
+ """ + + def __init__(self, + embed_dim, + rescales=[4, 2, 1, 0.5], + norm_cfg=dict(type='SyncBN', requires_grad=True)): + super().__init__() + self.rescales = rescales + self.upsample_4x = None + for k in self.rescales: + if k == 4: + self.upsample_4x = nn.Sequential( + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + build_norm_layer(norm_cfg, embed_dim)[1], + nn.GELU(), + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2), + ) + elif k == 2: + self.upsample_2x = nn.Sequential( + nn.ConvTranspose2d( + embed_dim, embed_dim, kernel_size=2, stride=2)) + elif k == 1: + self.identity = nn.Identity() + elif k == 0.5: + self.downsample_2x = nn.MaxPool2d(kernel_size=2, stride=2) + elif k == 0.25: + self.downsample_4x = nn.MaxPool2d(kernel_size=4, stride=4) + else: + raise KeyError(f'invalid {k} for feature2pyramid') + + def forward(self, inputs): + assert len(inputs) == len(self.rescales) + outputs = [] + if self.upsample_4x is not None: + ops = [ + self.upsample_4x, self.upsample_2x, self.identity, + self.downsample_2x + ] + else: + ops = [ + self.upsample_2x, self.identity, self.downsample_2x, + self.downsample_4x + ] + for i in range(len(inputs)): + outputs.append(ops[i](inputs[i])) + return tuple(outputs) diff --git a/head_extractor/build/lib/mmseg/models/necks/fpn.py b/head_extractor/build/lib/mmseg/models/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..ddab74c00a262a89031fda44824c5de0e2e9a362 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/necks/fpn.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmseg.registry import MODELS +from ..utils import resize + + +@MODELS.register_module() +class FPN(BaseModule): + """Feature Pyramid Network. + + This neck is the implementation of `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (list[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs + on the original feature from the backbone. If True, + it is equivalent to `add_extra_convs='on_input'`. If False, it is + equivalent to set `add_extra_convs='on_output'`. Default to True. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. 
+ act_cfg (dict): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: dict(mode='nearest'). + init_cfg (dict or list[dict], optional): Initialization config dict. + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + extra_convs_on_inputs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest'), + init_cfg=dict( + type='Xavier', layer='Conv2d', distribution='uniform')): + super().__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + if extra_convs_on_inputs: + # For compatibility with previous release + # TODO: deprecate `extra_convs_on_inputs` + self.add_extra_convs = 'on_input' + else: + self.add_extra_convs = 'on_output' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + def forward(self, 
inputs): + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + resize( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/necks/ic_neck.py b/head_extractor/build/lib/mmseg/models/necks/ic_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..9763541e0980cb0ec53a342b656e64c99d87ed7e --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/necks/ic_neck.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule + +from mmseg.registry import MODELS +from ..utils import resize + + +class CascadeFeatureFusion(BaseModule): + """Cascade Feature Fusion Unit in ICNet. + + Args: + low_channels (int): The number of input channels for + low resolution feature map. + high_channels (int): The number of input channels for + high resolution feature map. + out_channels (int): The number of output channels. + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN'). + act_cfg (dict): Dictionary to construct and config act layer. + Default: dict(type='ReLU'). + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + + Returns: + x (Tensor): The output tensor of shape (N, out_channels, H, W). + x_low (Tensor): The output tensor of shape (N, out_channels, H, W) + for Cascade Label Guidance in auxiliary heads. 
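+
+    Example (an illustrative sketch, not from upstream;
+    ``CascadeFeatureFusion`` is an internal module, so it is imported from
+    ``mmseg.models.necks.ic_neck``):
+        >>> import torch
+        >>> cff = CascadeFeatureFusion(64, 32, 128)
+        >>> x_low = torch.randn(1, 64, 16, 16)
+        >>> x_high = torch.randn(1, 32, 32, 32)
+        >>> x, x_low_out = cff(x_low, x_high)
+        >>> tuple(x.shape) == tuple(x_low_out.shape) == (1, 128, 32, 32)
+        True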
+ """ + + def __init__(self, + low_channels, + high_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.align_corners = align_corners + self.conv_low = ConvModule( + low_channels, + out_channels, + 3, + padding=2, + dilation=2, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv_high = ConvModule( + high_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, x_low, x_high): + x_low = resize( + x_low, + size=x_high.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + # Note: Different from original paper, `x_low` is underwent + # `self.conv_low` rather than another 1x1 conv classifier + # before being used for auxiliary head. + x_low = self.conv_low(x_low) + x_high = self.conv_high(x_high) + x = x_low + x_high + x = F.relu(x, inplace=True) + return x, x_low + + +@MODELS.register_module() +class ICNeck(BaseModule): + """ICNet for Real-Time Semantic Segmentation on High-Resolution Images. + + This head is the implementation of `ICHead + `_. + + Args: + in_channels (int): The number of input image channels. Default: 3. + out_channels (int): The numbers of output feature channels. + Default: 128. + conv_cfg (dict): Dictionary to construct and config conv layer. + Default: None. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: dict(type='BN'). + act_cfg (dict): Dictionary to construct and config act layer. + Default: dict(type='ReLU'). + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels=(64, 256, 256), + out_channels=128, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + assert len(in_channels) == 3, 'Length of input channels \ + must be 3!' + + self.in_channels = in_channels + self.out_channels = out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + self.cff_24 = CascadeFeatureFusion( + self.in_channels[2], + self.in_channels[1], + self.out_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + + self.cff_12 = CascadeFeatureFusion( + self.out_channels, + self.in_channels[0], + self.out_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + + def forward(self, inputs): + assert len(inputs) == 3, 'Length of input feature \ + maps must be 3!' + + x_sub1, x_sub2, x_sub4 = inputs + x_cff_24, x_24 = self.cff_24(x_sub4, x_sub2) + x_cff_12, x_12 = self.cff_12(x_cff_24, x_sub1) + # Note: `x_cff_12` is used for decode_head, + # `x_24` and `x_12` are used for auxiliary head. + return x_24, x_12, x_cff_12 diff --git a/head_extractor/build/lib/mmseg/models/necks/jpu.py b/head_extractor/build/lib/mmseg/models/necks/jpu.py new file mode 100644 index 0000000000000000000000000000000000000000..3ea0fe2183377d3e3c1a87ca8a0df909b123cdfa --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/necks/jpu.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule +from mmengine.model import BaseModule + +from mmseg.registry import MODELS +from ..utils import resize + + +@MODELS.register_module() +class JPU(BaseModule): + """FastFCN: Rethinking Dilated Convolution in the Backbone + for Semantic Segmentation. + + This Joint Pyramid Upsampling (JPU) neck is the implementation of + `FastFCN `_. + + Args: + in_channels (Tuple[int], optional): The number of input channels + for each convolution operations before upsampling. + Default: (512, 1024, 2048). + mid_channels (int): The number of output channels of JPU. + Default: 512. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + dilations (tuple[int]): Dilation rate of each Depthwise + Separable ConvModule. Default: (1, 2, 4, 8). + align_corners (bool, optional): The align_corners argument of + resize operation. Default: False. + conv_cfg (dict | None): Config of conv layers. + Default: None. + norm_cfg (dict | None): Config of norm layers. + Default: dict(type='BN'). + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU'). + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels=(512, 1024, 2048), + mid_channels=512, + start_level=0, + end_level=-1, + dilations=(1, 2, 4, 8), + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + assert isinstance(in_channels, tuple) + assert isinstance(dilations, tuple) + self.in_channels = in_channels + self.mid_channels = mid_channels + self.start_level = start_level + self.num_ins = len(in_channels) + if end_level == -1: + self.backbone_end_level = self.num_ins + else: + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + + self.dilations = dilations + self.align_corners = align_corners + + self.conv_layers = nn.ModuleList() + self.dilation_layers = nn.ModuleList() + for i in range(self.start_level, self.backbone_end_level): + conv_layer = nn.Sequential( + ConvModule( + self.in_channels[i], + self.mid_channels, + kernel_size=3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.conv_layers.append(conv_layer) + for i in range(len(dilations)): + dilation_layer = nn.Sequential( + DepthwiseSeparableConvModule( + in_channels=(self.backbone_end_level - self.start_level) * + self.mid_channels, + out_channels=self.mid_channels, + kernel_size=3, + stride=1, + padding=dilations[i], + dilation=dilations[i], + dw_norm_cfg=norm_cfg, + dw_act_cfg=None, + pw_norm_cfg=norm_cfg, + pw_act_cfg=act_cfg)) + self.dilation_layers.append(dilation_layer) + + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels), 'Length of inputs must \ + be the same with self.in_channels!' 
+ + feats = [ + self.conv_layers[i - self.start_level](inputs[i]) + for i in range(self.start_level, self.backbone_end_level) + ] + + h, w = feats[0].shape[2:] + for i in range(1, len(feats)): + feats[i] = resize( + feats[i], + size=(h, w), + mode='bilinear', + align_corners=self.align_corners) + + feat = torch.cat(feats, dim=1) + concat_feat = torch.cat([ + self.dilation_layers[i](feat) for i in range(len(self.dilations)) + ], + dim=1) + + outs = [] + + # Default: outs[2] is the output of JPU for decoder head, outs[1] is + # the feature map from backbone for auxiliary head. Additionally, + # outs[0] can also be used for auxiliary head. + for i in range(self.start_level, self.backbone_end_level - 1): + outs.append(inputs[i]) + outs.append(concat_feat) + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/necks/mla_neck.py b/head_extractor/build/lib/mmseg/models/necks/mla_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..db250aefbfa45beaa98855be79ddc7f5e7276cca --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/necks/mla_neck.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, build_norm_layer + +from mmseg.registry import MODELS + + +class MLAModule(nn.Module): + + def __init__(self, + in_channels=[1024, 1024, 1024, 1024], + out_channels=256, + norm_cfg=None, + act_cfg=None): + super().__init__() + self.channel_proj = nn.ModuleList() + for i in range(len(in_channels)): + self.channel_proj.append( + ConvModule( + in_channels=in_channels[i], + out_channels=out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.feat_extract = nn.ModuleList() + for i in range(len(in_channels)): + self.feat_extract.append( + ConvModule( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, inputs): + + # feat_list -> [p2, p3, p4, p5] + feat_list = [] + for x, conv in zip(inputs, self.channel_proj): + feat_list.append(conv(x)) + + # feat_list -> [p5, p4, p3, p2] + # mid_list -> [m5, m4, m3, m2] + feat_list = feat_list[::-1] + mid_list = [] + for feat in feat_list: + if len(mid_list) == 0: + mid_list.append(feat) + else: + mid_list.append(mid_list[-1] + feat) + + # mid_list -> [m5, m4, m3, m2] + # out_list -> [o2, o3, o4, o5] + out_list = [] + for mid, conv in zip(mid_list, self.feat_extract): + out_list.append(conv(mid)) + + return tuple(out_list) + + +@MODELS.register_module() +class MLANeck(nn.Module): + """Multi-level Feature Aggregation. + + This neck is `The Multi-level Feature Aggregation construction of + SETR `_. + + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + norm_layer (dict): Config dict for input normalization. + Default: norm_layer=dict(type='LN', eps=1e-6, requires_grad=True). + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + norm_layer=dict(type='LN', eps=1e-6, requires_grad=True), + norm_cfg=None, + act_cfg=None): + super().__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + + # In order to build general vision transformer backbone, we have to + # move MLA to neck. 
+ self.norm = nn.ModuleList([ + build_norm_layer(norm_layer, in_channels[i])[1] + for i in range(len(in_channels)) + ]) + + self.mla = MLAModule( + in_channels=in_channels, + out_channels=out_channels, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # Convert from nchw to nlc + outs = [] + for i in range(len(inputs)): + x = inputs[i] + n, c, h, w = x.shape + x = x.reshape(n, c, h * w).transpose(2, 1).contiguous() + x = self.norm[i](x) + x = x.transpose(1, 2).reshape(n, c, h, w).contiguous() + outs.append(x) + + outs = self.mla(outs) + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/necks/multilevel_neck.py b/head_extractor/build/lib/mmseg/models/necks/multilevel_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..c997125f24791b1c01248c60a27fa37a986c6c82 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/necks/multilevel_neck.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import xavier_init + +from mmseg.registry import MODELS +from ..utils import resize + + +@MODELS.register_module() +class MultiLevelNeck(nn.Module): + """MultiLevelNeck. + + A neck structure connecting the ViT backbone and decode heads. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale). + scales (List[float]): Scale factors for each input feature map. + Default: [0.5, 1, 2, 4] + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + scales=[0.5, 1, 2, 4], + norm_cfg=None, + act_cfg=None): + super().__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.scales = scales + self.num_outs = len(scales) + self.lateral_convs = nn.ModuleList() + self.convs = nn.ModuleList() + for in_channel in in_channels: + self.lateral_convs.append( + ConvModule( + in_channel, + out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + for _ in range(self.num_outs): + self.convs.append( + ConvModule( + out_channels, + out_channels, + kernel_size=3, + padding=1, + stride=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + inputs = [ + lateral_conv(inputs[i]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + # duplicate the single input when len(inputs) != self.num_outs + if len(inputs) == 1: + inputs = [inputs[0] for _ in range(self.num_outs)] + outs = [] + for i in range(self.num_outs): + x_resize = resize( + inputs[i], scale_factor=self.scales[i], mode='bilinear') + outs.append(self.convs[i](x_resize)) + return tuple(outs) diff --git a/head_extractor/build/lib/mmseg/models/segmentors/__init__.py b/head_extractor/build/lib/mmseg/models/segmentors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59b012f41725d26d099b8f890630d1dc04019ba5 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/segmentors/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved.
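`MultiLevelNeck` turns a set of same-resolution ViT feature maps into a pyramid by bilinear resizing with the configured `scales` before its 3x3 convs. A minimal sketch of that behavior (not part of the diff; the channel count, input size, and `mmseg.models.necks.multilevel_neck` import path are assumptions):

```python
# Hypothetical usage sketch for the MultiLevelNeck defined above.
import torch
from mmseg.models.necks.multilevel_neck import MultiLevelNeck

neck = MultiLevelNeck(in_channels=[768, 768, 768, 768], out_channels=256)
# Four assumed ViT stages, all at the same 32x32 resolution.
feats = [torch.rand(1, 768, 32, 32) for _ in range(4)]

outs = neck(feats)
for out in outs:
    print(out.shape)
# With scales=[0.5, 1, 2, 4], the same-size inputs become a pyramid:
# torch.Size([1, 256, 16, 16])
# torch.Size([1, 256, 32, 32])
# torch.Size([1, 256, 64, 64])
# torch.Size([1, 256, 128, 128])
```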
+from .base import BaseSegmentor +from .cascade_encoder_decoder import CascadeEncoderDecoder +from .depth_estimator import DepthEstimator +from .encoder_decoder import EncoderDecoder +from .multimodal_encoder_decoder import MultimodalEncoderDecoder +from .seg_tta import SegTTAModel + +__all__ = [ + 'BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder', 'SegTTAModel', + 'MultimodalEncoderDecoder', 'DepthEstimator' +] diff --git a/head_extractor/build/lib/mmseg/models/segmentors/base.py b/head_extractor/build/lib/mmseg/models/segmentors/base.py new file mode 100644 index 0000000000000000000000000000000000000000..c0d6ca189e5e4db834974692580d77621f8ff69f --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/segmentors/base.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod +from typing import List, Tuple + +from mmengine.model import BaseModel +from mmengine.structures import PixelData +from torch import Tensor + +from mmseg.structures import SegDataSample +from mmseg.utils import (ForwardResults, OptConfigType, OptMultiConfig, + OptSampleList, SampleList) +from ..utils import resize + + +class BaseSegmentor(BaseModel, metaclass=ABCMeta): + """Base class for segmentors. + + Args: + data_preprocessor (dict, optional): Model preprocessing config + for processing the input data. It usually includes + ``to_rgb``, ``pad_size_divisor``, ``pad_val``, + ``mean`` and ``std``. Defaults to None. + init_cfg (dict, optional): The config to control the + initialization. Defaults to None. + """ + + def __init__(self, + data_preprocessor: OptConfigType = None, + init_cfg: OptMultiConfig = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + + @property + def with_neck(self) -> bool: + """bool: whether the segmentor has neck""" + return hasattr(self, 'neck') and self.neck is not None + + @property + def with_auxiliary_head(self) -> bool: + """bool: whether the segmentor has auxiliary head""" + return hasattr(self, + 'auxiliary_head') and self.auxiliary_head is not None + + @property + def with_decode_head(self) -> bool: + """bool: whether the segmentor has decode head""" + return hasattr(self, 'decode_head') and self.decode_head is not None + + @abstractmethod + def extract_feat(self, inputs: Tensor) -> List[Tensor]: + """Placeholder for extracting features from images.""" + pass + + @abstractmethod + def encode_decode(self, inputs: Tensor, batch_data_samples: SampleList): + """Placeholder for encoding images with backbone and decoding into a + semantic segmentation map of the same size as the input.""" + pass + + def forward(self, + inputs: Tensor, + data_samples: OptSampleList = None, + mode: str = 'tensor') -> ForwardResults: + """The unified entry for a forward process in both training and test. + + The method should accept three modes: "tensor", "predict" and "loss": + + - "tensor": Forward the whole network and return tensor or tuple of + tensor without any post-processing, same as a common nn.Module. + - "predict": Forward and return the predictions, which are fully + processed to a list of :obj:`SegDataSample`. + - "loss": Forward and return a dict of losses according to the given + inputs and data samples. + + Note that this method doesn't handle back propagation or + optimizer updating, which are done in :meth:`train_step`. + + Args: + inputs (torch.Tensor): The input tensor with shape (N, C, ...) in + general. + data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+ It usually includes information such as `metainfo` and + `gt_sem_seg`. Defaults to None. + mode (str): Return what kind of value. Defaults to 'tensor'. + + Returns: + The return type depends on ``mode``. + + - If ``mode="tensor"``, return a tensor or a tuple of tensor. + - If ``mode="predict"``, return a list of :obj:`SegDataSample`. + - If ``mode="loss"``, return a dict of tensor. + """ + if mode == 'loss': + return self.loss(inputs, data_samples) + elif mode == 'predict': + return self.predict(inputs, data_samples) + elif mode == 'tensor': + return self._forward(inputs, data_samples) + else: + raise RuntimeError(f'Invalid mode "{mode}". ' + 'Only supports loss, predict and tensor mode') + + @abstractmethod + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + pass + + @abstractmethod + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass + + @abstractmethod + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + """Network forward process. + + Usually includes backbone, neck and head forward without any post- + processing. + """ + pass + + def postprocess_result(self, + seg_logits: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """ Convert results list to `SegDataSample`. + Args: + seg_logits (Tensor): The segmentation results, seg_logits from + model of each input image. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_sem_seg`. Defaults to None. + Returns: + list[:obj:`SegDataSample`]: Segmentation results of the + input images. Each SegDataSample usually contains: + + - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation. + - ``seg_logits``(PixelData): Predicted logits of semantic + segmentation before normalization.
+ """ + batch_size, C, H, W = seg_logits.shape + + if data_samples is None: + data_samples = [SegDataSample() for _ in range(batch_size)] + only_prediction = True + else: + only_prediction = False + + for i in range(batch_size): + if not only_prediction: + img_meta = data_samples[i].metainfo + # remove padding area + if 'img_padding_size' not in img_meta: + padding_size = img_meta.get('padding_size', [0] * 4) + else: + padding_size = img_meta['img_padding_size'] + padding_left, padding_right, padding_top, padding_bottom =\ + padding_size + # i_seg_logits shape is 1, C, H, W after remove padding + i_seg_logits = seg_logits[i:i + 1, :, + padding_top:H - padding_bottom, + padding_left:W - padding_right] + + flip = img_meta.get('flip', None) + if flip: + flip_direction = img_meta.get('flip_direction', None) + assert flip_direction in ['horizontal', 'vertical'] + if flip_direction == 'horizontal': + i_seg_logits = i_seg_logits.flip(dims=(3, )) + else: + i_seg_logits = i_seg_logits.flip(dims=(2, )) + + # resize as original shape + i_seg_logits = resize( + i_seg_logits, + size=img_meta['ori_shape'], + mode='bilinear', + align_corners=self.align_corners, + warning=False).squeeze(0) + else: + i_seg_logits = seg_logits[i] + + if C > 1: + i_seg_pred = i_seg_logits.argmax(dim=0, keepdim=True) + else: + i_seg_logits = i_seg_logits.sigmoid() + i_seg_pred = (i_seg_logits > + self.decode_head.threshold).to(i_seg_logits) + data_samples[i].set_data({ + 'seg_logits': + PixelData(**{'data': i_seg_logits}), + 'pred_sem_seg': + PixelData(**{'data': i_seg_pred}) + }) + + return data_samples diff --git a/head_extractor/build/lib/mmseg/models/segmentors/cascade_encoder_decoder.py b/head_extractor/build/lib/mmseg/models/segmentors/cascade_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..0184a3533a18cbe96a28bbb645c3e73bbffcdeee --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/segmentors/cascade_encoder_decoder.py @@ -0,0 +1,138 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +from torch import Tensor, nn + +from mmseg.registry import MODELS +from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig, + OptSampleList, SampleList, add_prefix) +from .encoder_decoder import EncoderDecoder + + +@MODELS.register_module() +class CascadeEncoderDecoder(EncoderDecoder): + """Cascade Encoder Decoder segmentors. + + CascadeEncoderDecoder almost the same as EncoderDecoder, while decoders of + CascadeEncoderDecoder are cascaded. The output of previous decoder_head + will be the input of next decoder_head. + + Args: + + num_stages (int): How many stages will be cascaded. + backbone (ConfigType): The config for the backnone of segmentor. + decode_head (ConfigType): The config for the decode head of segmentor. + neck (OptConfigType): The config for the neck of segmentor. + Defaults to None. + auxiliary_head (OptConfigType): The config for the auxiliary head of + segmentor. Defaults to None. + train_cfg (OptConfigType): The config for training. Defaults to None. + test_cfg (OptConfigType): The config for testing. Defaults to None. + data_preprocessor (dict, optional): The pre-process config of + :class:`BaseDataPreprocessor`. + pretrained (str, optional): The path for pretrained model. + Defaults to None. + init_cfg (dict, optional): The weight initialized config for + :class:`BaseModule`. 
+ """ + + def __init__(self, + num_stages: int, + backbone: ConfigType, + decode_head: ConfigType, + neck: OptConfigType = None, + auxiliary_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + pretrained: Optional[str] = None, + init_cfg: OptMultiConfig = None): + self.num_stages = num_stages + super().__init__( + backbone=backbone, + decode_head=decode_head, + neck=neck, + auxiliary_head=auxiliary_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + pretrained=pretrained, + init_cfg=init_cfg) + + def _init_decode_head(self, decode_head: ConfigType) -> None: + """Initialize ``decode_head``""" + assert isinstance(decode_head, list) + assert len(decode_head) == self.num_stages + self.decode_head = nn.ModuleList() + for i in range(self.num_stages): + self.decode_head.append(MODELS.build(decode_head[i])) + self.align_corners = self.decode_head[-1].align_corners + self.num_classes = self.decode_head[-1].num_classes + self.out_channels = self.decode_head[-1].out_channels + + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Encode images with backbone and decode into a semantic segmentation + map of the same size as input.""" + x = self.extract_feat(inputs) + out = self.decode_head[0].forward(x) + for i in range(1, self.num_stages - 1): + out = self.decode_head[i].forward(x, out) + seg_logits_list = self.decode_head[-1].predict(x, out, batch_img_metas, + self.test_cfg) + + return seg_logits_list + + def _decode_head_forward_train(self, inputs: Tensor, + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + + loss_decode = self.decode_head[0].loss(inputs, data_samples, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode_0')) + # get batch_img_metas + batch_size = len(data_samples) + batch_img_metas = [] + for batch_index in range(batch_size): + metainfo = data_samples[batch_index].metainfo + batch_img_metas.append(metainfo) + + for i in range(1, self.num_stages): + # forward test again, maybe unnecessary for most methods. + if i == 1: + prev_outputs = self.decode_head[0].forward(inputs) + else: + prev_outputs = self.decode_head[i - 1].forward( + inputs, prev_outputs) + loss_decode = self.decode_head[i].loss(inputs, prev_outputs, + data_samples, + self.train_cfg) + losses.update(add_prefix(loss_decode, f'decode_{i}')) + + return losses + + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tensor: + """Network forward process. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_semantic_seg`. + + Returns: + Tensor: Forward output of model without any post-processes. + """ + x = self.extract_feat(inputs) + + out = self.decode_head[0].forward(x) + for i in range(1, self.num_stages): + # TODO support PointRend tensor mode + out = self.decode_head[i].forward(x, out) + + return out diff --git a/head_extractor/build/lib/mmseg/models/segmentors/depth_estimator.py b/head_extractor/build/lib/mmseg/models/segmentors/depth_estimator.py new file mode 100644 index 0000000000000000000000000000000000000000..1020637e737a3c72ba6a48f2d1228717470ba862 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/segmentors/depth_estimator.py @@ -0,0 +1,392 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import logging +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.logging import print_log +from mmengine.structures import PixelData +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures import SegDataSample +from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig, + OptSampleList, SampleList, add_prefix) +from ..utils import resize +from .encoder_decoder import EncoderDecoder + + +@MODELS.register_module() +class DepthEstimator(EncoderDecoder): + """Encoder Decoder depth estimator. + + DepthEstimator typically consists of backbone, decode_head, auxiliary_head. + Note that auxiliary_head is only used for deep supervision during training, + which can be discarded during inference. + + 1. The ``loss`` method is used to calculate the loss of the model, + which includes two steps: (1) Extracts features to obtain the feature maps + (2) Call the decode head loss function to forward decode head model and + calculate losses. + + .. code:: text + + loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional) + _decode_head_forward_train(): decode_head.loss() + _auxiliary_head_forward_train(): auxiliary_head.loss (optional) + + 2. The ``predict`` method is used to predict depth estimation results, + which includes two steps: (1) Run inference function to obtain the list of + depth (2) Call post-processing function to obtain list of + ``SegDataSample`` including ``pred_depth_map``. + + .. code:: text + + predict(): inference() -> postprocess_result() + inference(): whole_inference()/slide_inference() + whole_inference()/slide_inference(): encode_decode() + encode_decode(): extract_feat() -> decode_head.predict() + + 3. The ``_forward`` method is used to output the tensor by running the model, + which includes two steps: (1) Extracts features to obtain the feature maps + (2) Call the decode head forward function to forward decode head model. + + .. code:: text + + _forward(): extract_feat() -> _decode_head.forward() + + Args: + + backbone (ConfigType): The config for the backbone of the depth estimator. + decode_head (ConfigType): The config for the decode head of depth estimator. + neck (OptConfigType): The config for the neck of depth estimator. + Defaults to None. + auxiliary_head (OptConfigType): The config for the auxiliary head of + depth estimator. Defaults to None. + train_cfg (OptConfigType): The config for training. Defaults to None. + test_cfg (OptConfigType): The config for testing. Defaults to None. + data_preprocessor (dict, optional): The pre-process config of + :class:`BaseDataPreprocessor`. + pretrained (str, optional): The path for pretrained model. + Defaults to None. + init_cfg (dict, optional): The weight initialized config for + :class:`BaseModule`.
+ """ # noqa: E501 + + def __init__(self, + backbone: ConfigType, + decode_head: ConfigType, + neck: OptConfigType = None, + auxiliary_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + pretrained: Optional[str] = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + decode_head=decode_head, + neck=neck, + auxiliary_head=auxiliary_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + pretrained=pretrained, + init_cfg=init_cfg) + + def extract_feat(self, + inputs: Tensor, + batch_img_metas: Optional[List[dict]] = None) -> Tensor: + """Extract features from images.""" + + if getattr(self.backbone, 'class_embed_select', False) and \ + isinstance(batch_img_metas, list) and \ + 'category_id' in batch_img_metas[0]: + cat_ids = [meta['category_id'] for meta in batch_img_metas] + cat_ids = torch.tensor(cat_ids).to(inputs.device) + inputs = (inputs, cat_ids) + + x = self.backbone(inputs) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Encode images with backbone and decode into a depth map of the same + size as input.""" + x = self.extract_feat(inputs, batch_img_metas) + depth = self.decode_head.predict(x, batch_img_metas, self.test_cfg) + + return depth + + def _decode_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.loss(inputs, data_samples, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def _auxiliary_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.loss(inputs, data_samples, self.train_cfg) + losses.update(add_prefix(loss_aux, f'aux_{idx}')) + else: + loss_aux = self.auxiliary_head.loss(inputs, data_samples, + self.train_cfg) + losses.update(add_prefix(loss_aux, 'aux')) + + return losses + + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): Input images. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_depth_map`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + if data_samples is not None: + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + else: + batch_img_metas = [ + dict( + ori_shape=inputs.shape[2:], + img_shape=inputs.shape[2:], + pad_shape=inputs.shape[2:], + padding_size=[0, 0, 0, 0]) + ] * inputs.shape[0] + + x = self.extract_feat(inputs, batch_img_metas) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, data_samples) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train(x, data_samples) + losses.update(loss_aux) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. 
+ + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`], optional): The seg data + samples. It usually includes information such as `metainfo` + and `gt_depth_map`. + + Returns: + list[:obj:`SegDataSample`]: Depth estimation results of the + input images. Each SegDataSample usually contains: + + - ``pred_depth_map``(PixelData): Prediction of depth estimation. + """ + if data_samples is not None: + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + else: + batch_img_metas = [ + dict( + ori_shape=inputs.shape[2:], + img_shape=inputs.shape[2:], + pad_shape=inputs.shape[2:], + padding_size=[0, 0, 0, 0]) + ] * inputs.shape[0] + + depth = self.inference(inputs, batch_img_metas) + + return self.postprocess_result(depth, data_samples) + + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tensor: + """Network forward process. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_depth_map`. + + Returns: + Tensor: Forward output of model without any post-processes. + """ + x = self.extract_feat(inputs) + return self.decode_head.forward(x) + + def slide_flip_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Inference by sliding-window with overlap and flip. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + + Args: + inputs (tensor): the tensor should have a shape NxCxHxW, + which contains all images in the batch. + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The depth estimation results. + """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = inputs.size() + out_channels = self.out_channels + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img)) + count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = inputs[:, :, y1:y2, x1:x2] + # change the image shape to patch shape + batch_img_metas[0]['img_shape'] = crop_img.shape[2:] + # the output of encode_decode is depth tensor map + # with shape [N, C, H, W] + crop_depth_map = self.encode_decode(crop_img, batch_img_metas) + + # average the original and flipped predictions + crop_depth_map_flip = self.encode_decode( + crop_img.flip(dims=(3, )), batch_img_metas) + crop_depth_map_flip = crop_depth_map_flip.flip(dims=(3, )) + crop_depth_map = (crop_depth_map + crop_depth_map_flip) / 2.0 + + preds += F.pad(crop_depth_map, + (int(x1), int(preds.shape[3] - x2), int(y1), + int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + depth = preds / count_mat + + return depth + + def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor: + """Inference with slide/slide_flip/whole style.
+ + Args: + inputs (Tensor): The input image of shape (N, 3, H, W). + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', 'pad_shape', and 'padding_size'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The depth estimation results. + """ + assert self.test_cfg.get('mode', 'whole') in ['slide', 'whole', + 'slide_flip'], \ + f'Only "slide", "slide_flip" or "whole" test mode are ' \ + f'supported, but got {self.test_cfg["mode"]}.' + ori_shape = batch_img_metas[0]['ori_shape'] + if not all(_['ori_shape'] == ori_shape for _ in batch_img_metas): + print_log( + 'Image shapes are different in the batch.', + logger='current', + level=logging.WARN) + if self.test_cfg.mode == 'slide': + depth_map = self.slide_inference(inputs, batch_img_metas) + elif self.test_cfg.mode == 'slide_flip': + depth_map = self.slide_flip_inference(inputs, batch_img_metas) + else: + depth_map = self.whole_inference(inputs, batch_img_metas) + + return depth_map + + def postprocess_result(self, + depth: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """ Convert results list to `SegDataSample`. + Args: + depth (Tensor): The depth estimation results. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_depth_map`. Defaults to None. + Returns: + list[:obj:`SegDataSample`]: Depth estimation results of the + input images. Each SegDataSample usually contains: + + - ``pred_depth_map``(PixelData): Prediction of depth estimation. + """ + batch_size, C, H, W = depth.shape + + if data_samples is None: + data_samples = [SegDataSample() for _ in range(batch_size)] + only_prediction = True + else: + only_prediction = False + + for i in range(batch_size): + if not only_prediction: + img_meta = data_samples[i].metainfo + # remove padding area + if 'img_padding_size' not in img_meta: + padding_size = img_meta.get('padding_size', [0] * 4) + else: + padding_size = img_meta['img_padding_size'] + padding_left, padding_right, padding_top, padding_bottom =\ + padding_size + # i_depth shape is 1, C, H, W after removing padding + i_depth = depth[i:i + 1, :, padding_top:H - padding_bottom, + padding_left:W - padding_right] + + flip = img_meta.get('flip', None) + if flip: + flip_direction = img_meta.get('flip_direction', None) + assert flip_direction in ['horizontal', 'vertical'] + if flip_direction == 'horizontal': + i_depth = i_depth.flip(dims=(3, )) + else: + i_depth = i_depth.flip(dims=(2, )) + + # resize as original shape + i_depth = resize( + i_depth, + size=img_meta['ori_shape'], + mode='bilinear', + align_corners=self.align_corners, + warning=False).squeeze(0) + else: + i_depth = depth[i] + + data_samples[i].set_data( + {'pred_depth_map': PixelData(**{'data': i_depth})}) + + return data_samples diff --git a/head_extractor/build/lib/mmseg/models/segmentors/encoder_decoder.py b/head_extractor/build/lib/mmseg/models/segmentors/encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..c9c6011f2783f3a738fd105cd1a3850588aefa1b --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/segmentors/encoder_decoder.py @@ -0,0 +1,365 @@ +# Copyright (c) OpenMMLab. All rights reserved.
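Both `slide_inference` and `slide_flip_inference` rely on the same window-grid arithmetic: a ceil division computes how many strided windows are needed along each axis, and each window is clamped back inside the image so every crop keeps the full `crop_size`. A standalone sketch of just that arithmetic (the sizes below are made-up values, not taken from any config in this diff):

```python
# Hypothetical illustration of the sliding-window grid used by
# slide_inference()/slide_flip_inference() above.
h_img, w_img = 512, 683        # assumed padded input size
h_crop, w_crop = 480, 480      # assumed test_cfg.crop_size
h_stride, w_stride = 320, 320  # assumed test_cfg.stride

# Ceil division: enough windows so that strided crops cover the image.
h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
print(h_grids, w_grids)  # 2 2

for h_idx in range(h_grids):
    for w_idx in range(w_grids):
        y2 = min(h_idx * h_stride + h_crop, h_img)
        x2 = min(w_idx * w_stride + w_crop, w_img)
        # Clamp the window back inside the image so every crop is full-size;
        # overlapping predictions are later averaged via count_mat.
        y1, x1 = max(y2 - h_crop, 0), max(x2 - w_crop, 0)
        print((y1, y2), (x1, x2))
# ((0, 480), (0, 480)) ((0, 480), (203, 683))
# ((32, 512), (0, 480)) ((32, 512), (203, 683))
```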
+import logging +from typing import List, Optional + +import torch.nn as nn +import torch.nn.functional as F +from mmengine.logging import print_log +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig, + OptSampleList, SampleList, add_prefix) +from .base import BaseSegmentor + + +@MODELS.register_module() +class EncoderDecoder(BaseSegmentor): + """Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. + Note that auxiliary_head is only used for deep supervision during training, + which can be discarded during inference. + + 1. The ``loss`` method is used to calculate the loss of the model, + which includes two steps: (1) Extracts features to obtain the feature maps + (2) Call the decode head loss function to forward decode head model and + calculate losses. + + .. code:: text + + loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional) + _decode_head_forward_train(): decode_head.loss() + _auxiliary_head_forward_train(): auxiliary_head.loss (optional) + + 2. The ``predict`` method is used to predict segmentation results, + which includes two steps: (1) Run inference function to obtain the list of + seg_logits (2) Call post-processing function to obtain list of + ``SegDataSample`` including ``pred_sem_seg`` and ``seg_logits``. + + .. code:: text + + predict(): inference() -> postprocess_result() + inference(): whole_inference()/slide_inference() + whole_inference()/slide_inference(): encode_decode() + encode_decode(): extract_feat() -> decode_head.predict() + + 3. The ``_forward`` method is used to output the tensor by running the model, + which includes two steps: (1) Extracts features to obtain the feature maps + (2) Call the decode head forward function to forward decode head model. + + .. code:: text + + _forward(): extract_feat() -> _decode_head.forward() + + Args: + + backbone (ConfigType): The config for the backbone of the segmentor. + decode_head (ConfigType): The config for the decode head of segmentor. + neck (OptConfigType): The config for the neck of segmentor. + Defaults to None. + auxiliary_head (OptConfigType): The config for the auxiliary head of + segmentor. Defaults to None. + train_cfg (OptConfigType): The config for training. Defaults to None. + test_cfg (OptConfigType): The config for testing. Defaults to None. + data_preprocessor (dict, optional): The pre-process config of + :class:`BaseDataPreprocessor`. + pretrained (str, optional): The path for pretrained model. + Defaults to None. + init_cfg (dict, optional): The weight initialized config for + :class:`BaseModule`.
+ """ # noqa: E501 + + def __init__(self, + backbone: ConfigType, + decode_head: ConfigType, + neck: OptConfigType = None, + auxiliary_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + pretrained: Optional[str] = None, + init_cfg: OptMultiConfig = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + if pretrained is not None: + assert backbone.get('pretrained') is None, \ + 'both backbone and segmentor set pretrained weight' + backbone.pretrained = pretrained + self.backbone = MODELS.build(backbone) + if neck is not None: + self.neck = MODELS.build(neck) + self._init_decode_head(decode_head) + self._init_auxiliary_head(auxiliary_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + assert self.with_decode_head + + def _init_decode_head(self, decode_head: ConfigType) -> None: + """Initialize ``decode_head``""" + self.decode_head = MODELS.build(decode_head) + self.align_corners = self.decode_head.align_corners + self.num_classes = self.decode_head.num_classes + self.out_channels = self.decode_head.out_channels + + def _init_auxiliary_head(self, auxiliary_head: ConfigType) -> None: + """Initialize ``auxiliary_head``""" + if auxiliary_head is not None: + if isinstance(auxiliary_head, list): + self.auxiliary_head = nn.ModuleList() + for head_cfg in auxiliary_head: + self.auxiliary_head.append(MODELS.build(head_cfg)) + else: + self.auxiliary_head = MODELS.build(auxiliary_head) + + def extract_feat(self, inputs: Tensor) -> List[Tensor]: + """Extract features from images.""" + x = self.backbone(inputs) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Encode images with backbone and decode into a semantic segmentation + map of the same size as input.""" + x = self.extract_feat(inputs) + seg_logits = self.decode_head.predict(x, batch_img_metas, + self.test_cfg) + + return seg_logits + + def _decode_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.loss(inputs, data_samples, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def _auxiliary_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.loss(inputs, data_samples, self.train_cfg) + losses.update(add_prefix(loss_aux, f'aux_{idx}')) + else: + loss_aux = self.auxiliary_head.loss(inputs, data_samples, + self.train_cfg) + losses.update(add_prefix(loss_aux, 'aux')) + + return losses + + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): Input images. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + x = self.extract_feat(inputs) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, data_samples) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train(x, data_samples) + losses.update(loss_aux) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`], optional): The seg data + samples. It usually includes information such as `metainfo` + and `gt_sem_seg`. + + Returns: + list[:obj:`SegDataSample`]: Segmentation results of the + input images. Each SegDataSample usually contains: + + - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation. + - ``seg_logits``(PixelData): Predicted logits of semantic + segmentation before normalization. + """ + if data_samples is not None: + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + else: + batch_img_metas = [ + dict( + ori_shape=inputs.shape[2:], + img_shape=inputs.shape[2:], + pad_shape=inputs.shape[2:], + padding_size=[0, 0, 0, 0]) + ] * inputs.shape[0] + + seg_logits = self.inference(inputs, batch_img_metas) + + return self.postprocess_result(seg_logits, data_samples) + + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tensor: + """Network forward process. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_sem_seg`. + + Returns: + Tensor: Forward output of model without any post-processes. + """ + x = self.extract_feat(inputs) + return self.decode_head.forward(x, data_samples) + + def slide_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + + Args: + inputs (tensor): the tensor should have a shape NxCxHxW, + which contains all images in the batch. + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image.
+ """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = inputs.size() + out_channels = self.out_channels + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img)) + count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = inputs[:, :, y1:y2, x1:x2] + # change the image shape to patch shape + batch_img_metas[0]['img_shape'] = crop_img.shape[2:] + # the output of encode_decode is seg logits tensor map + # with shape [N, C, H, W] + crop_seg_logit = self.encode_decode(crop_img, batch_img_metas) + preds += F.pad(crop_seg_logit, + (int(x1), int(preds.shape[3] - x2), int(y1), + int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + seg_logits = preds / count_mat + + return seg_logits + + def whole_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Inference with full image. + + Args: + inputs (Tensor): The tensor should have a shape NxCxHxW, which + contains all images in the batch. + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. + """ + + seg_logits = self.encode_decode(inputs, batch_img_metas) + + return seg_logits + + def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor: + """Inference with slide/whole style. + + Args: + inputs (Tensor): The input image of shape (N, 3, H, W). + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', 'pad_shape', and 'padding_size'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. + """ + assert self.test_cfg.get('mode', 'whole') in ['slide', 'whole'], \ + f'Only "slide" or "whole" test mode are supported, but got ' \ + f'{self.test_cfg["mode"]}.' + ori_shape = batch_img_metas[0]['ori_shape'] + if not all(_['ori_shape'] == ori_shape for _ in batch_img_metas): + print_log( + 'Image shapes are different in the batch.', + logger='current', + level=logging.WARN) + if self.test_cfg.mode == 'slide': + seg_logit = self.slide_inference(inputs, batch_img_metas) + else: + seg_logit = self.whole_inference(inputs, batch_img_metas) + + return seg_logit + + def aug_test(self, inputs, batch_img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. 
+ """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented seg logit inplace + seg_logit = self.inference(inputs[0], batch_img_metas[0], rescale) + for i in range(1, len(inputs)): + cur_seg_logit = self.inference(inputs[i], batch_img_metas[i], + rescale) + seg_logit += cur_seg_logit + seg_logit /= len(inputs) + seg_pred = seg_logit.argmax(dim=1) + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred diff --git a/head_extractor/build/lib/mmseg/models/segmentors/multimodal_encoder_decoder.py b/head_extractor/build/lib/mmseg/models/segmentors/multimodal_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..75aa8b9b17688cb5f54da08f9300af82b3339967 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/segmentors/multimodal_encoder_decoder.py @@ -0,0 +1,350 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional + +import torch.nn.functional as F +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig, + OptSampleList, SampleList, add_prefix) +from .base import BaseSegmentor + + +@MODELS.register_module() +class MultimodalEncoderDecoder(BaseSegmentor): + """Multimodal Encoder-Decoder segmentors. + + Multimodal segmentation architecture is used for open-vocabulary + semantic segmentation with combining the visual and language + pretrain models. It consists of a image_encoder (backbone) to extract + visual feature, a text encoder to extract text feature, and a decode + head to generate semantic maps. + Note that the deep supervision during training is implemented in decode head. + + 1. The ``loss`` method is used to calculate the loss of model, + which includes two steps: (1) Extracts features to obtain the feature maps + (2) Call the decode head loss function to forward decode head model and + calculate losses. + + .. code:: text + + loss(): extract_feat() -> _decode_head_forward_train() + _decode_head_forward_train(): decode_head.loss() + + 2. The ``predict`` method is used to predict segmentation results, + which includes two steps: (1) Run inference function to obtain the list of + seg_logits (2) Call post-processing function to obtain list of + ``SegDataSampel`` including ``pred_sem_seg`` and ``seg_logits``. + + .. code:: text + + predict(): inference() -> postprocess_result() + inference(): whole_inference()/slide_inference() + whole_inference()/slide_inference(): encoder_decoder() + encoder_decoder(): extract_feat() -> decode_head.predict() + + 3. The ``_forward`` method is used to output the tensor by running the model, + which includes two steps: (1) Extracts features to obtain the feature maps + (2)Call the decode head forward function to forward decode head model. + + .. code:: text + + _forward(): extract_feat() -> _decode_head.forward() + + Args: + + image_encoder (ConfigType): The config for the visual encoder of segmentor. + text_encoder ((ConfigType): The config for the text encoder of segmentor. + decode_head (ConfigType): The config for the decode head of segmentor. + train_cfg (OptConfigType): The config for training. Defaults to None. + test_cfg (OptConfigType): The config for testing. Defaults to None. + data_preprocessor (dict, optional): The pre-process config of + :class:`BaseDataPreprocessor`. + pretrained (str, optional): The path for pretrained model. + Defaults to None. 
+ asymetric_input (bool): whether to use different size of input for image encoder + and decode head. Defaults to False. + encoder_resolution (float): resize scale of input images for image encoder. + Defaults to None. + init_cfg (dict, optional): The weight initialized config for + :class:`BaseModule`. + """ # noqa: E501 + + def __init__(self, + image_encoder: ConfigType, + text_encoder: ConfigType, + decode_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + pretrained: Optional[str] = None, + asymetric_input: bool = True, + encoder_resolution: float = None, + init_cfg: OptMultiConfig = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + if pretrained is not None: + image_encoder.init_cfg = dict( + type='Pretrained_Part', checkpoint=pretrained) + text_encoder.init_cfg = dict( + type='Pretrained_Part', checkpoint=pretrained) + decode_head.init_cfg = dict( + type='Pretrained_Part', checkpoint=pretrained) + + if asymetric_input: + assert encoder_resolution is not None, \ + 'if asymetric_input set True, ' \ + 'clip_resolution must be a certain value' + self.asymetric_input = asymetric_input + self.encoder_resolution = encoder_resolution + self.image_encoder = MODELS.build(image_encoder) + self.text_encoder = MODELS.build(text_encoder) + self._init_decode_head(decode_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + assert self.with_decode_head + + def _init_decode_head(self, decode_head: ConfigType) -> None: + """Initialize ``decode_head``""" + self.decode_head = MODELS.build(decode_head) + self.align_corners = self.decode_head.align_corners + self.num_classes = self.decode_head.num_classes + self.out_channels = self.decode_head.out_channels + + def extract_feat(self, inputs: Tensor) -> List[Tensor]: + """Extract visual features from images.""" + x = self.image_encoder(inputs) + return x + + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Encode the name of classes with text_encoder and encode images with + image_encoder. + + Then decode the class embedding and visual feature into a semantic + segmentation map of the same size as input. + """ + classifier_embeds = self.text_encoder() + clip_inputs = inputs + if self.asymetric_input: + clip_inputs = F.interpolate( + inputs, scale_factor=self.encoder_resolution, mode='bilinear') + x = self.image_encoder(clip_inputs) + seg_logits = self.decode_head.predict([inputs, x, classifier_embeds], + batch_img_metas, self.test_cfg) + + return seg_logits + + def _decode_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.loss(inputs, data_samples, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): Input images. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + classifier_embeds = self.text_encoder() + clip_inputs = inputs + if self.asymetric_input: + clip_inputs = F.interpolate( + inputs, scale_factor=self.encoder_resolution, mode='bilinear') + x = self.image_encoder(clip_inputs) + + losses = dict() + + loss_decode = self._decode_head_forward_train( + [inputs, x, classifier_embeds], data_samples) + losses.update(loss_decode) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`], optional): The seg data + samples. It usually includes information such as `metainfo` + and `gt_sem_seg`. + + Returns: + list[:obj:`SegDataSample`]: Segmentation results of the + input images. Each SegDataSample usually contain: + + - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation. + - ``seg_logits``(PixelData): Predicted logits of semantic + segmentation before normalization. + """ + if data_samples is not None: + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + else: + batch_img_metas = [ + dict( + ori_shape=inputs.shape[2:], + img_shape=inputs.shape[2:], + pad_shape=inputs.shape[2:], + padding_size=[0, 0, 0, 0]) + ] * inputs.shape[0] + + seg_logits = self.inference(inputs, batch_img_metas) + + return self.postprocess_result(seg_logits, data_samples) + + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tensor: + """Network forward process. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_sem_seg`. + + Returns: + Tensor: Forward output of model without any post-processes. + """ + x = self.extract_feat(inputs) + return self.decode_head.forward(x) + + def slide_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + + Args: + inputs (tensor): the tensor should have a shape NxCxHxW, + which contains all images in the batch. + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. 
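The overlap arithmetic described here is easy to check in isolation. Below is a minimal standalone sketch (`slide_windows` is a hypothetical helper with illustrative sizes, not part of this file) that reproduces the grid loop from the method body that follows and verifies every pixel is covered at least once:

```python
import torch

def slide_windows(h_img, w_img, h_crop, w_crop, h_stride, w_stride):
    """Yield (y1, y2, x1, x2) crops using the same grid arithmetic as
    slide_inference: windows are clamped to the image border, then shifted
    back so each crop keeps its full h_crop x w_crop size."""
    h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
    w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
    for h_idx in range(h_grids):
        for w_idx in range(w_grids):
            y2 = min(h_idx * h_stride + h_crop, h_img)
            x2 = min(w_idx * w_stride + w_crop, w_img)
            y1 = max(y2 - h_crop, 0)
            x1 = max(x2 - w_crop, 0)
            yield y1, y2, x1, x2

count = torch.zeros(512, 512)
for y1, y2, x1, x2 in slide_windows(512, 512, 256, 256, 170, 170):
    count[y1:y2, x1:x2] += 1
assert (count == 0).sum() == 0  # same invariant the method asserts
```

The final `preds / count_mat` division in the method then averages the logits wherever windows overlap.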
+ """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = inputs.size() + out_channels = self.out_channels + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img)) + count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = inputs[:, :, y1:y2, x1:x2] + # change the image shape to patch shape + batch_img_metas[0]['img_shape'] = crop_img.shape[2:] + # the output of encode_decode is seg logits tensor map + # with shape [N, C, H, W] + crop_seg_logit = self.encode_decode(crop_img, batch_img_metas) + preds += F.pad(crop_seg_logit, + (int(x1), int(preds.shape[3] - x2), int(y1), + int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + seg_logits = preds / count_mat + + return seg_logits + + def whole_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Inference with full image. + + Args: + inputs (Tensor): The tensor should have a shape NxCxHxW, which + contains all images in the batch. + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. + """ + + seg_logits = self.encode_decode(inputs, batch_img_metas) + + return seg_logits + + def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor: + """Inference with slide/whole style. + + Args: + inputs (Tensor): The input image of shape (N, 3, H, W). + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', 'pad_shape', and 'padding_size'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. + """ + + assert self.test_cfg.mode in ['slide', 'whole'] + ori_shape = batch_img_metas[0]['ori_shape'] + assert all(_['ori_shape'] == ori_shape for _ in batch_img_metas) + if self.test_cfg.mode == 'slide': + seg_logit = self.slide_inference(inputs, batch_img_metas) + else: + seg_logit = self.whole_inference(inputs, batch_img_metas) + + return seg_logit + + def aug_test(self, inputs, batch_img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. 
+ """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented seg logit inplace + seg_logit = self.inference(inputs[0], batch_img_metas[0], rescale) + for i in range(1, len(inputs)): + cur_seg_logit = self.inference(inputs[i], batch_img_metas[i], + rescale) + seg_logit += cur_seg_logit + seg_logit /= len(inputs) + seg_pred = seg_logit.argmax(dim=1) + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred diff --git a/head_extractor/build/lib/mmseg/models/segmentors/seg_tta.py b/head_extractor/build/lib/mmseg/models/segmentors/seg_tta.py new file mode 100644 index 0000000000000000000000000000000000000000..63ef61d223a572dec4fc3e43e1550b98cd2e7302 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/segmentors/seg_tta.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from mmengine.model import BaseTTAModel +from mmengine.structures import PixelData + +from mmseg.registry import MODELS +from mmseg.utils import SampleList + + +@MODELS.register_module() +class SegTTAModel(BaseTTAModel): + + def merge_preds(self, data_samples_list: List[SampleList]) -> SampleList: + """Merge predictions of enhanced data to one prediction. + + Args: + data_samples_list (List[SampleList]): List of predictions + of all enhanced data. + + Returns: + SampleList: Merged prediction. + """ + predictions = [] + for data_samples in data_samples_list: + seg_logits = data_samples[0].seg_logits.data + logits = torch.zeros(seg_logits.shape).to(seg_logits) + for data_sample in data_samples: + seg_logit = data_sample.seg_logits.data + if self.module.out_channels > 1: + logits += seg_logit.softmax(dim=0) + else: + logits += seg_logit.sigmoid() + logits /= len(data_samples) + if self.module.out_channels == 1: + seg_pred = (logits > self.module.decode_head.threshold + ).to(logits).squeeze(1) + else: + seg_pred = logits.argmax(dim=0) + data_sample.set_data({'pred_sem_seg': PixelData(data=seg_pred)}) + if hasattr(data_samples[0], 'gt_sem_seg'): + data_sample.set_data( + {'gt_sem_seg': data_samples[0].gt_sem_seg}) + data_sample.set_metainfo({'img_path': data_samples[0].img_path}) + predictions.append(data_sample) + return predictions diff --git a/head_extractor/build/lib/mmseg/models/text_encoder/__init__.py b/head_extractor/build/lib/mmseg/models/text_encoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..199856d9d79800cbcd9aa7b77223a6528c6b7e0a --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/text_encoder/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .clip_text_encoder import CLIPTextEncoder + +__all__ = ['CLIPTextEncoder'] diff --git a/head_extractor/build/lib/mmseg/models/text_encoder/clip_text_encoder.py b/head_extractor/build/lib/mmseg/models/text_encoder/clip_text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..1a18b86395ebcf0443e9aab05f4454acada98990 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/text_encoder/clip_text_encoder.py @@ -0,0 +1,229 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import List + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import BaseTransformerLayer +from mmengine.model import BaseModule, ModuleList +from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict +from torch.nn import functional as F + +from mmseg.registry import MODELS +from mmseg.utils import get_classes, get_predefined_templates, tokenizer + + +@MODELS.register_module() +class CLIPTextEncoder(BaseModule): + """A text encoder with transformer architecture to encode the label text. + + Modified from https://github.com/MendelXu/SAN/blob/main/san/model/clip_utils/classifier.py # noqa:E501 + Copyright (c) 2023 MendelXu. + Licensed under the MIT License + + Args: + dataset_name: (str|None): The name of the dataset to which + the data belongs. + vocabulary: (List[str]|None): The list of class names. Default: None. + templates: (List[str]|None): The prompt template used for labels. + Default: None. + total_vocab_size: (int): Number of all words used by the pre-trained + model. Default: 49408 (CLIP). + context_length: (int): The max length of prompt text. + Default: 77 (CLIP). + embed_dims: (int): Width of transformer model. Default: 512. + num_layers: (int): Depth of transformer. Default: 12, + num_heads: (int): Number of attention heads in transformer. + Default: 8, + mlp_ratio: (int) Ratio of mlp hidden dim to embedding dim in + transformer. Default: 4, + output_dims: (int) Dim of output text embeddings. Default: 512, + cache_feature: (bool) Whether to save class embeddings in cache. + Default: True, + cat_bg: (bool) Whether to add background embedding. Default: True. + norm_cfg (dict|None): Config for norm layer. Default: dict(type='LN') + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. 
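The prompt-template mechanism described above (implemented by `template_encode` further down) reduces to: format every class name into every template, encode each batch of prompts, then average and re-normalize over templates. A toy sketch with a stand-in encoder (`template_ensemble` and `toy_encode` are illustrative names only; the real class uses its CLIP transformer for encoding):

```python
import torch
import torch.nn.functional as F

def template_ensemble(encode_fn, templates, class_names):
    # One (num_classes, dim) embedding per template, then mean + renorm,
    # mirroring CLIPTextEncoder.template_encode below.
    per_template = [encode_fn([t.format(c) for c in class_names])
                    for t in templates]
    embed = torch.stack(per_template).mean(dim=0)
    return embed / embed.norm(dim=-1, keepdim=True)

def toy_encode(prompts, dim=512):
    # Deterministic pseudo-embeddings so the sketch runs without CLIP.
    gens = [torch.Generator().manual_seed(abs(hash(p)) % (2**31))
            for p in prompts]
    return F.normalize(
        torch.stack([torch.randn(dim, generator=g) for g in gens]), dim=-1)

embeds = template_ensemble(
    toy_encode, ['a photo of a {}.', 'art of the {}.'], ['cat', 'dog'])
print(embeds.shape)  # torch.Size([2, 512])
```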
+ """ + + def __init__(self, + dataset_name: str = None, + vocabulary: List[str] = None, + templates: str = 'vild', + total_vocab_size: int = 49408, + context_length: int = 77, + embed_dims: int = 512, + num_layers: int = 12, + num_heads: int = 8, + mlp_ratio: int = 4, + output_dims: int = 512, + cache_feature: bool = True, + cat_bg: bool = True, + norm_cfg: dict = dict(type='LN'), + init_cfg: dict = None): + super().__init__(init_cfg) + if isinstance(templates, List): + self.templates = templates + else: + self.templates = get_predefined_templates(templates) + + assert dataset_name is not None or vocabulary is not None, \ + "text_encoder required either 'dataset_name' or 'vocabulary'" + assert dataset_name is None or vocabulary is None, \ + "there is conflict between 'dataset_name' and 'vocabulary'" + self.dataset_name = dataset_name + self.vocabulary = vocabulary + self.num_pos = context_length + self.token_embedding = nn.Embedding(total_vocab_size, embed_dims) + self.positional_embedding = nn.Parameter( + torch.empty(context_length, embed_dims)) + self.text_projection = nn.Parameter( + torch.empty(embed_dims, output_dims)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + self.transformer = ModuleList() + self.register_buffer( + 'attn_mask', self.build_attention_mask(), persistent=False) + for i in range(num_layers): + self.transformer.append( + BaseTransformerLayer( + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=embed_dims, + num_heads=num_heads, + batch_first=False, + bias=True), + ffn_cfgs=dict( + type='FFN', + embed_dims=embed_dims, + feedforward_channels=mlp_ratio * embed_dims, + act_cfg=dict(type='QuickGELU')), + operation_order=('norm', 'self_attn', 'norm', 'ffn'))) + self.ln_final = build_norm_layer( + norm_cfg, embed_dims, postfix='_final')[1] + + self.cache_feature = cache_feature + if self.cache_feature: + self.cache = {} + + self._freeze() + + self.cat_bg = cat_bg + if self.cat_bg: + self.bg_embed = nn.Parameter( + torch.randn(1, self.text_projection.shape[1])) + + @property + def ln_final(self): + return getattr(self, self.final_name) + + def build_attention_mask(self): + """lazily create causal attention mask, with full attention between the + tokens. 
+ + pytorch uses additive attention mask; fill with -inf + """ + mask = torch.empty(self.num_pos, self.num_pos) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + def _freeze(self): + for param in self.parameters(): + param.requires_grad = False + + def init_weights(self): + if self.cat_bg: + nn.init.normal_( + self.bg_embed, + std=self.bg_embed.shape[1]**-0.5, + ) + if isinstance(self.init_cfg, dict) and \ + self.init_cfg.get('type') == 'Pretrained_Part': + checkpoint = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + + state_dict = checkpoint.copy() + para_prefix = 'text_encoder' + prefix_len = len(para_prefix) + 1 + for k, v in checkpoint.items(): + state_dict.pop(k) + if para_prefix in k: + state_dict[k[prefix_len:]] = v + + load_state_dict(self, state_dict, strict=False, logger=None) + + else: + super().init_weights() + + @torch.no_grad() + def encode_text(self, text, normalize=False): + """encode class token.""" + + embed_device = self.token_embedding.weight.device + x = self.token_embedding( + text.to(embed_device)) # [batch_size, n_ctx, d_model] + x = x + self.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + for block in self.transformer: + x = block(query=x, attn_masks=self.attn_mask) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x) # [batch_size, n_ctx, transformer.width] + # take features from the eot embedding + # (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + return F.normalize(x, dim=-1) if normalize else x + + def template_encode(self, vocabulary): + """Prompt engineering.""" + text_embed_bucket = [] + for template in self.templates: + text_inputs = tokenizer.tokenize( + [template.format(noun) for noun in vocabulary]) + text_embed = self.encode_text(text_inputs, normalize=True) + text_embed_bucket.append(text_embed) + text_embed = torch.stack(text_embed_bucket).mean(dim=0) + text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True) + return text_embed + + def forward(self): + """Forward function.""" + if self.dataset_name is None: # encoding vocabulary directly + class_names = self.vocabulary + if self.cache_feature: + new_classes = [ + word for word in class_names if word not in self.cache + ] + if len(new_classes) > 0: + class_embeds = self.template_encode(new_classes) + self.cache.update(dict(zip(new_classes, class_embeds))) + class_embeds = torch.stack( + [self.cache[word] for word in class_names]) + else: + class_embeds = self.template_encode(class_names) + + else: # encoding the classes of the dataset + class_names = get_classes(self.dataset_name) + if class_names[0] == 'background': + class_names = class_names[1:] + if self.cache_feature: + if self.dataset_name not in self.cache: + class_embeds = self.template_encode(class_names) + self.cache[self.dataset_name] = class_embeds + else: + class_embeds = self.cache[self.dataset_name] + else: + class_embeds = self.template_encode(class_names) + + if self.cat_bg: + class_embeds = torch.cat([class_embeds, self.bg_embed]) + class_embeds = F.normalize(class_embeds, p=2, dim=-1) + return self.logit_scale.exp() * class_embeds + + +@MODELS.register_module() +class QuickGELU(nn.Module): + # From https://github.com/openai/CLIP/blob/main/clip/model.py + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) diff --git a/head_extractor/build/lib/mmseg/models/utils/__init__.py 
b/head_extractor/build/lib/mmseg/models/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0751b17c02de14e9bf1bfc02230d507a143e9c0
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/utils/__init__.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .basic_block import BasicBlock, Bottleneck
+from .embed import PatchEmbed
+from .encoding import Encoding
+from .inverted_residual import InvertedResidual, InvertedResidualV3
+from .make_divisible import make_divisible
+from .point_sample import get_uncertain_point_coords_with_randomness
+from .ppm import DAPPM, PAPPM
+from .res_layer import ResLayer
+from .se_layer import SELayer
+from .self_attention_block import SelfAttentionBlock
+from .shape_convert import (nchw2nlc2nchw, nchw_to_nlc, nlc2nchw2nlc,
+                            nlc_to_nchw)
+from .up_conv_block import UpConvBlock
+
+# isort: off
+from .wrappers import Upsample, resize
+from .san_layers import MLP, LayerNorm2d, cross_attn_layer
+
+__all__ = [
+    'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual',
+    'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'PatchEmbed',
+    'nchw_to_nlc', 'nlc_to_nchw', 'nchw2nlc2nchw', 'nlc2nchw2nlc', 'Encoding',
+    'Upsample', 'resize', 'DAPPM', 'PAPPM', 'BasicBlock', 'Bottleneck',
+    'cross_attn_layer', 'LayerNorm2d', 'MLP',
+    'get_uncertain_point_coords_with_randomness'
+]
diff --git a/head_extractor/build/lib/mmseg/models/utils/basic_block.py b/head_extractor/build/lib/mmseg/models/utils/basic_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e1ad8146dd200c5f1e543adf22ada654ee196a4
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/utils/basic_block.py
@@ -0,0 +1,143 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Optional
+
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.utils import OptConfigType
+
+
+class BasicBlock(BaseModule):
+    """Basic block from `ResNet <https://arxiv.org/abs/1512.03385>`_.
+
+    Args:
+        in_channels (int): Input channels.
+        channels (int): Output channels.
+        stride (int): Stride of the first block. Default: 1.
+        downsample (nn.Module, optional): Downsample operation on identity.
+            Default: None.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict, optional): Config dict for activation layer in
+            ConvModule. Default: dict(type='ReLU', inplace=True).
+        act_cfg_out (dict, optional): Config dict for activation layer at the
+            last of the block. Default: dict(type='ReLU', inplace=True).
+        init_cfg (dict, optional): Initialization config dict. Default: None.
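A quick usage sketch (assuming mmseg and its mmcv/mmengine dependencies are installed; the sizes are illustrative): when `stride != 1` or the channel count changes, the identity branch needs a `downsample` module so the residual addition's shapes match:

```python
import torch
from mmcv.cnn import ConvModule
from mmseg.models.utils import BasicBlock

# 1x1 stride-2 projection so the identity matches the residual branch.
downsample = ConvModule(64, 128, kernel_size=1, stride=2,
                        norm_cfg=dict(type='BN'), act_cfg=None)
block = BasicBlock(64, 128, stride=2, downsample=downsample)
print(block(torch.randn(2, 64, 32, 32)).shape)  # torch.Size([2, 128, 16, 16])
```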
+ """ + + expansion = 1 + + def __init__(self, + in_channels: int, + channels: int, + stride: int = 1, + downsample: nn.Module = None, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + act_cfg_out: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + self.conv1 = ConvModule( + in_channels, + channels, + kernel_size=3, + stride=stride, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + channels, + channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None) + self.downsample = downsample + if act_cfg_out: + self.act = MODELS.build(act_cfg_out) + + def forward(self, x: Tensor) -> Tensor: + residual = x + out = self.conv1(x) + out = self.conv2(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + + if hasattr(self, 'act'): + out = self.act(out) + + return out + + +class Bottleneck(BaseModule): + """Bottleneck block from `ResNet `_. + + Args: + in_channels (int): Input channels. + channels (int): Output channels. + stride (int): Stride of the first block. Default: 1. + downsample (nn.Module, optional): Downsample operation on identity. + Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict, optional): Config dict for activation layer in + ConvModule. Default: dict(type='ReLU', inplace=True). + act_cfg_out (dict, optional): Config dict for activation layer at + the last of the block. Default: None. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ + + expansion = 2 + + def __init__(self, + in_channels: int, + channels: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + act_cfg_out: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + self.conv1 = ConvModule( + in_channels, channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.conv2 = ConvModule( + channels, + channels, + 3, + stride, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + channels, + channels * self.expansion, + 1, + norm_cfg=norm_cfg, + act_cfg=None) + if act_cfg_out: + self.act = MODELS.build(act_cfg_out) + self.downsample = downsample + + def forward(self, x: Tensor) -> Tensor: + residual = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + + if hasattr(self, 'act'): + out = self.act(out) + + return out diff --git a/head_extractor/build/lib/mmseg/models/utils/embed.py b/head_extractor/build/lib/mmseg/models/utils/embed.py new file mode 100644 index 0000000000000000000000000000000000000000..aef0a40b0a87bb6616db96fe2c72c19cc6f5b366 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/embed.py @@ -0,0 +1,330 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math +from typing import Sequence + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import BaseModule +from mmengine.utils import to_2tuple + + +class AdaptivePadding(nn.Module): + """Applies padding to input (if needed) so that input can get fully covered + by filter you specified. It support two modes "same" and "corner". 
The + "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around + input. The "corner" mode would pad zero to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel: + stride (int | tuple): Stride of the filter. Default: 1: + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + + super().__init__() + + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The config dict for embedding + conv layer type selection. Default: "Conv2d". + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int, optional): The slide stride of embedding conv. + Default: None (Would be set as `kernel_size`). + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only work when `dynamic_size` + is False. Default: None. + init_cfg (`mmengine.ConfigDict`, optional): The Config for + initialization. Default: None. 
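Shape-wise, a 224x224 image with 16x16 patches (stride defaulting to the kernel size) produces 14*14 = 196 tokens. A short sketch, assuming mmseg is installed:

```python
import torch
from mmseg.models.utils import PatchEmbed

patch_embed = PatchEmbed(in_channels=3, embed_dims=768, kernel_size=16)
tokens, out_size = patch_embed(torch.randn(1, 3, 224, 224))
print(tokens.shape, out_size)  # torch.Size([1, 196, 768]) (14, 14)
```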
+ """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv2d', + kernel_size=16, + stride=None, + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adap_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adap_padding: + pad_h, pad_w = self.adap_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adap_padding: + x = self.adap_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map. Our implementation uses `nn.Unfold` to + merge patch, which is about 25% faster than original implementation. + Instead, we need to modify pretrained models for compatibility. + + Args: + in_channels (int): The num of input channels. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). 
+ init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=2, + stride=None, + padding='corner', + dilation=1, + bias=False, + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adap_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of unfold + padding = 0 + else: + self.adap_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold( + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, input_size): + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). + """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + + if self.adap_padding: + x = self.adap_padding(x) + H, W = x.shape[-2:] + + x = self.sampler(x) + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size diff --git a/head_extractor/build/lib/mmseg/models/utils/encoding.py b/head_extractor/build/lib/mmseg/models/utils/encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4f0574fbc1957cf8da591a0e4befd6d8a125d3 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/encoding.py @@ -0,0 +1,75 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn +from torch.nn import functional as F + + +class Encoding(nn.Module): + """Encoding Layer: a learnable residual encoder. + + Input is of shape (batch_size, channels, height, width). + Output is of shape (batch_size, num_codes, channels). + + Args: + channels: dimension of the features or feature channels + num_codes: number of code words + """ + + def __init__(self, channels, num_codes): + super().__init__() + # init codewords and smoothing factor + self.channels, self.num_codes = channels, num_codes + std = 1. 
/ ((num_codes * channels)**0.5) + # [num_codes, channels] + self.codewords = nn.Parameter( + torch.empty(num_codes, channels, + dtype=torch.float).uniform_(-std, std), + requires_grad=True) + # [num_codes] + self.scale = nn.Parameter( + torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0), + requires_grad=True) + + @staticmethod + def scaled_l2(x, codewords, scale): + num_codes, channels = codewords.size() + batch_size = x.size(0) + reshaped_scale = scale.view((1, 1, num_codes)) + expanded_x = x.unsqueeze(2).expand( + (batch_size, x.size(1), num_codes, channels)) + reshaped_codewords = codewords.view((1, 1, num_codes, channels)) + + scaled_l2_norm = reshaped_scale * ( + expanded_x - reshaped_codewords).pow(2).sum(dim=3) + return scaled_l2_norm + + @staticmethod + def aggregate(assignment_weights, x, codewords): + num_codes, channels = codewords.size() + reshaped_codewords = codewords.view((1, 1, num_codes, channels)) + batch_size = x.size(0) + + expanded_x = x.unsqueeze(2).expand( + (batch_size, x.size(1), num_codes, channels)) + encoded_feat = (assignment_weights.unsqueeze(3) * + (expanded_x - reshaped_codewords)).sum(dim=1) + return encoded_feat + + def forward(self, x): + assert x.dim() == 4 and x.size(1) == self.channels + # [batch_size, channels, height, width] + batch_size = x.size(0) + # [batch_size, height x width, channels] + x = x.view(batch_size, self.channels, -1).transpose(1, 2).contiguous() + # assignment_weights: [batch_size, channels, num_codes] + assignment_weights = F.softmax( + self.scaled_l2(x, self.codewords, self.scale), dim=2) + # aggregate + encoded_feat = self.aggregate(assignment_weights, x, self.codewords) + return encoded_feat + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(Nx{self.channels}xHxW =>Nx{self.num_codes}' \ + f'x{self.channels})' + return repr_str diff --git a/head_extractor/build/lib/mmseg/models/utils/inverted_residual.py b/head_extractor/build/lib/mmseg/models/utils/inverted_residual.py new file mode 100644 index 0000000000000000000000000000000000000000..56190b3bfe7cc8fe98bf34c3812db18dd34a8f02 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/inverted_residual.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from torch import nn +from torch.utils import checkpoint as cp + +from .se_layer import SELayer + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): Adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + dilation (int): Dilation rate of depthwise conv. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. 
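A usage sketch (assuming mmseg is installed): with `stride=1` and matching in/out channels the block uses the residual connection, and the hidden width is `in_channels * expand_ratio`:

```python
import torch
from mmseg.models.utils import InvertedResidual

block = InvertedResidual(32, 32, stride=1, expand_ratio=6)  # hidden dim 192
print(block(torch.randn(1, 32, 56, 56)).shape)  # torch.Size([1, 32, 56, 56])
```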
+ """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + dilation=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False, + **kwargs): + super().__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' + self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None, + **kwargs) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class InvertedResidualV3(nn.Module): + """Inverted Residual Block for MobileNetV3. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + stride (int): The stride of the depthwise convolution. Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. 
+ """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super().__init__() + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2] + self.with_cp = with_cp + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=mid_channels, + conv_cfg=dict( + type='Conv2dAdaptivePadding') if stride == 2 else conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + out + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/head_extractor/build/lib/mmseg/models/utils/make_divisible.py b/head_extractor/build/lib/mmseg/models/utils/make_divisible.py new file mode 100644 index 0000000000000000000000000000000000000000..ed42c2eeea2a6aed03a0be5516b8d1ef1139e486 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/make_divisible.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number to the nearest value that can be + divisible by the divisor. It is taken from the original tf repo. It ensures + that all layers have a channel number that is divisible by divisor. It can + be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. + min_ratio (float): The minimum ratio of the rounded channel number to + the original channel number. Default: 0.9. + + Returns: + int: The modified output channel number. + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). 
+    if new_value < min_ratio * value:
+        new_value += divisor
+    return new_value
diff --git a/head_extractor/build/lib/mmseg/models/utils/point_sample.py b/head_extractor/build/lib/mmseg/models/utils/point_sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..1afc957f3da7d1dc030c21d40311c768c6952ea4
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/utils/point_sample.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import point_sample
+from torch import Tensor
+
+
+def get_uncertainty(mask_preds: Tensor, labels: Tensor) -> Tensor:
+    """Estimate uncertainty based on pred logits.
+
+    We estimate uncertainty as L1 distance between 0.0 and the logits
+    prediction in 'mask_preds' for the foreground class in `classes`.
+
+    Args:
+        mask_preds (Tensor): mask prediction logits, shape (num_rois,
+            num_classes, mask_height, mask_width).
+
+        labels (Tensor): Either predicted or ground truth label for
+            each predicted mask, of length num_rois.
+
+    Returns:
+        scores (Tensor): Uncertainty scores with the most uncertain
+            locations having the highest uncertainty score,
+            shape (num_rois, 1, mask_height, mask_width)
+    """
+    if mask_preds.shape[1] == 1:
+        gt_class_logits = mask_preds.clone()
+    else:
+        inds = torch.arange(mask_preds.shape[0], device=mask_preds.device)
+        gt_class_logits = mask_preds[inds, labels].unsqueeze(1)
+    return -torch.abs(gt_class_logits)
+
+
+def get_uncertain_point_coords_with_randomness(
+        mask_preds: Tensor, labels: Tensor, num_points: int,
+        oversample_ratio: float, importance_sample_ratio: float) -> Tensor:
+    """Get ``num_points`` most uncertain points with random points during
+    train.
+
+    Sample points in [0, 1] x [0, 1] coordinate space based on their
+    uncertainty. The uncertainties are calculated for each point using
+    the 'get_uncertainty()' function, which takes a point's logit
+    prediction as input.
+
+    Args:
+        mask_preds (Tensor): A tensor of shape (num_rois, num_classes,
+            mask_height, mask_width) for class-specific or class-agnostic
+            prediction.
+        labels (Tensor): The ground truth class for each instance.
+        num_points (int): The number of points to sample.
+        oversample_ratio (float): Oversampling parameter.
+        importance_sample_ratio (float): Ratio of points that are sampled
+            via importance sampling.
+
+    Returns:
+        point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+            that contains the coordinates of the sampled points.
+    """
+    assert oversample_ratio >= 1
+    assert 0 <= importance_sample_ratio <= 1
+    batch_size = mask_preds.shape[0]
+    num_sampled = int(num_points * oversample_ratio)
+    point_coords = torch.rand(
+        batch_size, num_sampled, 2, device=mask_preds.device)
+    point_logits = point_sample(mask_preds, point_coords)
+    # It is crucial to calculate uncertainty based on the sampled
+    # prediction value for the points. Calculating uncertainties of the
+    # coarse predictions first and sampling them for points leads to
+    # incorrect results. To illustrate this: assume uncertainty func(
+    # logits)=-abs(logits), a sampled point between two coarse
+    # predictions with -1 and 1 logits has 0 logits, and therefore 0
+    # uncertainty value. However, if we calculate uncertainties for the
+    # coarse predictions first, both will have -1 uncertainty,
+    # and the sampled point will get -1 uncertainty.
+ point_uncertainties = get_uncertainty(point_logits, labels) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=mask_preds.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_roi_coords = torch.rand( + batch_size, num_random_points, 2, device=mask_preds.device) + point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) + return point_coords diff --git a/head_extractor/build/lib/mmseg/models/utils/ppm.py b/head_extractor/build/lib/mmseg/models/utils/ppm.py new file mode 100644 index 0000000000000000000000000000000000000000..5fe6ff26fae6869b989cecde96af3ceff1a37b38 --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/ppm.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList, Sequential +from torch import Tensor + + +class DAPPM(BaseModule): + """DAPPM module in `DDRNet `_. + + Args: + in_channels (int): Input channels. + branch_channels (int): Branch channels. + out_channels (int): Output channels. + num_scales (int): Number of scales. + kernel_sizes (list[int]): Kernel sizes of each scale. + strides (list[int]): Strides of each scale. + paddings (list[int]): Paddings of each scale. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU', inplace=True). + conv_cfg (dict): Config dict for convolution layer in ConvModule. + Default: dict(order=('norm', 'act', 'conv'), bias=False). + upsample_mode (str): Upsample mode. Default: 'bilinear'. 
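A shape sketch for the module whose docstring ends here (assuming mmseg is installed; sizes illustrative). With `num_scales=5`, as in DDRNet (arXiv:2101.06085), scale 0 is a 1x1 conv, scales 1 to 3 are the strided average pools from the default `kernel_sizes`/`strides`/`paddings`, and the last scale is global average pooling; everything is fused back at the input resolution:

```python
import torch
from mmseg.models.utils import DAPPM

ppm = DAPPM(in_channels=256, branch_channels=96, out_channels=128,
            num_scales=5)
# eval mode: the global branch yields 1x1 maps, and BatchNorm in train
# mode needs more than one element per channel.
ppm.eval()
print(ppm(torch.randn(1, 256, 8, 8)).shape)  # torch.Size([1, 128, 8, 8])
```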
+ """ + + def __init__(self, + in_channels: int, + branch_channels: int, + out_channels: int, + num_scales: int, + kernel_sizes: List[int] = [5, 9, 17], + strides: List[int] = [2, 4, 8], + paddings: List[int] = [2, 4, 8], + norm_cfg: Dict = dict(type='BN', momentum=0.1), + act_cfg: Dict = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict( + order=('norm', 'act', 'conv'), bias=False), + upsample_mode: str = 'bilinear'): + super().__init__() + + self.num_scales = num_scales + self.unsample_mode = upsample_mode + self.in_channels = in_channels + self.branch_channels = branch_channels + self.out_channels = out_channels + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.conv_cfg = conv_cfg + + self.scales = ModuleList([ + ConvModule( + in_channels, + branch_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + ]) + for i in range(1, num_scales - 1): + self.scales.append( + Sequential(*[ + nn.AvgPool2d( + kernel_size=kernel_sizes[i - 1], + stride=strides[i - 1], + padding=paddings[i - 1]), + ConvModule( + in_channels, + branch_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + ])) + self.scales.append( + Sequential(*[ + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels, + branch_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + ])) + self.processes = ModuleList() + for i in range(num_scales - 1): + self.processes.append( + ConvModule( + branch_channels, + branch_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg)) + + self.compression = ConvModule( + branch_channels * num_scales, + out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + + self.shortcut = ConvModule( + in_channels, + out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + + def forward(self, inputs: Tensor): + feats = [] + feats.append(self.scales[0](inputs)) + + for i in range(1, self.num_scales): + feat_up = F.interpolate( + self.scales[i](inputs), + size=inputs.shape[2:], + mode=self.unsample_mode) + feats.append(self.processes[i - 1](feat_up + feats[i - 1])) + + return self.compression(torch.cat(feats, + dim=1)) + self.shortcut(inputs) + + +class PAPPM(DAPPM): + """PAPPM module in `PIDNet `_. + + Args: + in_channels (int): Input channels. + branch_channels (int): Branch channels. + out_channels (int): Output channels. + num_scales (int): Number of scales. + kernel_sizes (list[int]): Kernel sizes of each scale. + strides (list[int]): Strides of each scale. + paddings (list[int]): Paddings of each scale. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', momentum=0.1). + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU', inplace=True). + conv_cfg (dict): Config dict for convolution layer in ConvModule. + Default: dict(order=('norm', 'act', 'conv'), bias=False). + upsample_mode (str): Upsample mode. Default: 'bilinear'. 
+ """ + + def __init__(self, + in_channels: int, + branch_channels: int, + out_channels: int, + num_scales: int, + kernel_sizes: List[int] = [5, 9, 17], + strides: List[int] = [2, 4, 8], + paddings: List[int] = [2, 4, 8], + norm_cfg: Dict = dict(type='BN', momentum=0.1), + act_cfg: Dict = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict( + order=('norm', 'act', 'conv'), bias=False), + upsample_mode: str = 'bilinear'): + super().__init__(in_channels, branch_channels, out_channels, + num_scales, kernel_sizes, strides, paddings, norm_cfg, + act_cfg, conv_cfg, upsample_mode) + + self.processes = ConvModule( + self.branch_channels * (self.num_scales - 1), + self.branch_channels * (self.num_scales - 1), + kernel_size=3, + padding=1, + groups=self.num_scales - 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + **self.conv_cfg) + + def forward(self, inputs: Tensor): + x_ = self.scales[0](inputs) + feats = [] + for i in range(1, self.num_scales): + feat_up = F.interpolate( + self.scales[i](inputs), + size=inputs.shape[2:], + mode=self.unsample_mode, + align_corners=False) + feats.append(feat_up + x_) + scale_out = self.processes(torch.cat(feats, dim=1)) + return self.compression(torch.cat([x_, scale_out], + dim=1)) + self.shortcut(inputs) diff --git a/head_extractor/build/lib/mmseg/models/utils/res_layer.py b/head_extractor/build/lib/mmseg/models/utils/res_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..3dd7a6f75a168f2f7e3c61f82d309b1cf0d502bc --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/res_layer.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmengine.model import Sequential +from torch import nn as nn + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + multi_grid (int | None): Multi grid dilation rates of last + stage. 
Default: None + contract_dilation (bool): Whether contract first dilation of each layer + Default: False + """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + dilation=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + multi_grid=None, + contract_dilation=False, + **kwargs): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if multi_grid is None: + if dilation > 1 and contract_dilation: + first_dilation = dilation // 2 + else: + first_dilation = dilation + else: + first_dilation = multi_grid[0] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + dilation=first_dilation, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + dilation=dilation if multi_grid is None else multi_grid[i], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + super().__init__(*layers) diff --git a/head_extractor/build/lib/mmseg/models/utils/san_layers.py b/head_extractor/build/lib/mmseg/models/utils/san_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..2267686daf62658c5dc81408e0a399c43aee83aa --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/san_layers.py @@ -0,0 +1,418 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Modified from https://github.com/MendelXu/SAN/blob/main/san/model/attn_helper.py # noqa: E501 +# Copyright (c) 2023 MendelXu. +# Licensed under the MIT License + +import warnings +from typing import Optional + +import torch +from mmcv.cnn.bricks.transformer import BaseTransformerLayer +from torch import Tensor, nn +from torch.nn import functional as F + + +def cross_attn_with_self_bias( + query: Tensor, + key: Tensor, + value: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, +): + """Forward function of multi-head attention. Modified from + multi_head_attention_forward in + https://github.com/pytorch/pytorch/blob/main/torch/nn/functional.py. + + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. + in_proj_weight, in_proj_bias: input projection weight and bias. + bias_k, bias_v: bias of the key and value sequences to be added at dim=0. 
+ add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + Default: `True` + Note: `needs_weight` defaults to `True`, but should be set to `False` + For best performance when attention weights are not needed. + *Setting needs_weights to `True` + leads to a significant performance degradation.* + attn_mask: 2D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + use_separate_proj_weight: the function accept the proj. weights for query, key, + and value in different forms. If false, in_proj_weight will be used, which is + a combination of q_proj_weight, k_proj_weight, v_proj_weight. + q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. + static_k, static_v: static key and value used for attention operators. + """ # noqa: E501 + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == embed_dim_to_check + # allow MHA to have different sizes for the feature dimension + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + assert head_dim * num_heads == embed_dim, \ + 'embed_dim must be divisible by num_heads' + scaling = float(head_dim)**-0.5 + + if not use_separate_proj_weight: + if (query is key or torch.equal( + query, key)) and (key is value or torch.equal(key, value)): + # self-attention + raise NotImplementedError('self-attention is not implemented') + + elif key is value or torch.equal(key, value): + # encoder-decoder attention + # This is inline in_proj function + # with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = F.linear(query, _w, _b) + + if key is None: + assert value is None + k = None + v = None + q_k = None + q_v = None + else: + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + k, v = F.linear(key, _w, _b).chunk(2, dim=-1) + q_k, q_v = F.linear(query, _w, _b).chunk(2, dim=-1) + else: + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = F.linear(query, _w, _b) + + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = F.linear(key, _w, _b) + q_k = F.linear(query, _w, _b) + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 2 + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v = F.linear(value, _w, _b) + q_v = F.linear(query, _w, _b) + else: + q_proj_weight_non_opt = \ + 
torch.jit._unwrap_optional(q_proj_weight) + len1, len2 = q_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == query.size(-1) + + k_proj_weight_non_opt = \ + torch.jit._unwrap_optional(k_proj_weight) + len1, len2 = k_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == key.size(-1) + + v_proj_weight_non_opt = \ + torch.jit._unwrap_optional(v_proj_weight) + len1, len2 = v_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == value.size(-1) + + if in_proj_bias is not None: + q = F.linear(query, q_proj_weight_non_opt, + in_proj_bias[0:embed_dim]) + k = F.linear(key, k_proj_weight_non_opt, + in_proj_bias[embed_dim:(embed_dim * 2)]) + v = F.linear(value, v_proj_weight_non_opt, + in_proj_bias[(embed_dim * 2):]) + else: + q = F.linear(query, q_proj_weight_non_opt, in_proj_bias) + k = F.linear(key, k_proj_weight_non_opt, in_proj_bias) + v = F.linear(value, v_proj_weight_non_opt, in_proj_bias) + q = q * scaling + + if attn_mask is not None: + assert ( + attn_mask.dtype == torch.float32 + or attn_mask.dtype == torch.float64 + or attn_mask.dtype == torch.float16 + or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool + ), 'Only float, byte, and bool types are supported for ' \ + 'attn_mask, not {}'.format(attn_mask.dtype) + if attn_mask.dtype == torch.uint8: + warnings.warn('Byte tensor for attn_mask in nn.MultiheadAttention ' + 'is deprecated. Use bool tensor instead.') + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError( + 'The size of the 2D attn_mask is not correct.') + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bsz * num_heads, + query.size(0), key.size(0) + ]: + raise RuntimeError( + 'The size of the 3D attn_mask is not correct.') + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format( + attn_mask.dim())) + # attn_mask's dim is 3 now. + + # convert ByteTensor key_padding_mask to bool + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn( + 'Byte tensor for key_padding_mask in nn.MultiheadAttention ' + 'is deprecated. Use bool tensor instead.') + key_padding_mask = key_padding_mask.to(torch.bool) + + if bias_k is not None and bias_v is not None: + if static_k is None and static_v is None: + k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = F.pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, 1)) + else: + assert static_k is None, 'bias cannot be added to static key.' + assert static_v is None, 'bias cannot be added to static value.' 
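+            # With learned bias_k/bias_v, one extra bias token is appended to
+            # the key/value sequences (attn_mask and key_padding_mask are
+            # padded by one column to match); static_k/static_v replace the
+            # projected key/value outright, so the two options are mutually
+            # exclusive.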
+ else: + assert bias_k is None + assert bias_v is None + + q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + q_k = q_k.contiguous().view(tgt_len, bsz * num_heads, + head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + q_v = q_v.contiguous().view(tgt_len, bsz * num_heads, + head_dim).transpose(0, 1) + + if static_k is not None: + assert static_k.size(0) == bsz * num_heads + assert static_k.size(2) == head_dim + k = static_k + + if static_v is not None: + assert static_v.size(0) == bsz * num_heads + assert static_v.size(2) == head_dim + v = static_v + + src_len = k.size(1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if add_zero_attn: + src_len += 1 + k = torch.cat( + [ + k, + torch.zeros( + (k.size(0), 1) + k.size()[2:], + dtype=k.dtype, + device=k.device), + ], + dim=1, + ) + v = torch.cat( + [ + v, + torch.zeros( + (v.size(0), 1) + v.size()[2:], + dtype=v.dtype, + device=v.device), + ], + dim=1, + ) + if attn_mask is not None: + attn_mask = F.pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, 1)) + + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list( + attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float('-inf')) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, + src_len) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) + attn_output_weights = attn_output_weights.view(bsz * num_heads, + tgt_len, src_len) + # attn_out_weights: [bsz * num_heads, tgt_len, src_len] + # ->[bsz * num_heads, tgt_len, src_len+1] + self_weight = (q * q_k).sum( + dim=-1, keepdim=True) # [bsz * num_heads, tgt_len, 1] + total_attn_output_weights = torch.cat([attn_output_weights, self_weight], + dim=-1) + total_attn_output_weights = F.softmax(total_attn_output_weights, dim=-1) + total_attn_output_weights = F.dropout( + total_attn_output_weights, p=dropout_p, training=training) + attn_output_weights = \ + total_attn_output_weights[:, :, : -1] + # [bsz * num_heads, tgt_len, src_len] + self_weight = \ + total_attn_output_weights[:, :, -1:] # [bsz * num_heads, tgt_len, 1] + + attn_output = torch.bmm(attn_output_weights, + v) # [bsz * num_heads, tgt_len, head_dim] + attn_output = (attn_output + self_weight * q_v + ) # [bsz * num_heads, tgt_len, head_dim] + assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim] + attn_output = attn_output.transpose(0, 1).contiguous().view( + tgt_len, bsz, embed_dim) + attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights: + # average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, + src_len) + return attn_output, attn_output_weights # .sum(dim=1) / num_heads + else: + return attn_output, None + + +def cross_attn_layer(tf_layer: BaseTransformerLayer, x, mem, attn_bias): + """Implementation of transformer layer with cross attention. The cross + attention shares the embedding weights with self-attention of tf_layer. 
+ Args: + tf_layer: (TransformerEncoderLayer): The Module of transformer layer. + x (Tensor): query [K,N,C] + mem (Tensor): key and value [L,N,C] + attn_bias (Tensor): attention bias [N*num_head,K,L] + + Return: + x (Tensor): cross attention output [K,N,C] + """ + self_attn_layer = tf_layer.attentions[0].attn + attn_layer_paras = { + 'embed_dim_to_check': self_attn_layer.embed_dim, + 'num_heads': self_attn_layer.num_heads, + 'in_proj_weight': self_attn_layer.in_proj_weight, + 'in_proj_bias': self_attn_layer.in_proj_bias, + 'bias_k': self_attn_layer.bias_k, + 'bias_v': self_attn_layer.bias_v, + 'add_zero_attn': self_attn_layer.add_zero_attn, + 'dropout_p': self_attn_layer.dropout, + 'out_proj_weight': self_attn_layer.out_proj.weight, + 'out_proj_bias': self_attn_layer.out_proj.bias, + 'training': self_attn_layer.training + } + + q_x = tf_layer.norms[0](x) + k_x = v_x = tf_layer.norms[0](mem) + x = x + cross_attn_with_self_bias( + q_x, + k_x, + v_x, + attn_mask=attn_bias, + need_weights=False, + **attn_layer_paras)[0] + x = tf_layer.ffns[0](tf_layer.norms[1](x), identity=x) + return x + + +class LayerNorm2d(nn.Module): + """A LayerNorm variant, popularized by Transformers, that performs point- + wise mean and variance normalization over the channel dimension for inputs + that have shape (batch_size, channels, height, width). + + https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950 + """ + + def __init__(self, normalized_shape, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.normalized_shape = (normalized_shape, ) + + def forward(self, x: torch.Tensor): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, + input_dim, + hidden_dim, + output_dim, + num_layers, + affine_func=nn.Linear): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + affine_func(n, k) + for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x: torch.Tensor): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x diff --git a/head_extractor/build/lib/mmseg/models/utils/se_layer.py b/head_extractor/build/lib/mmseg/models/utils/se_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..0ff632cfea728a7ffd99f1578c828c588d78f3db --- /dev/null +++ b/head_extractor/build/lib/mmseg/models/utils/se_layer.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.utils import is_tuple_of + +from .make_divisible import make_divisible + + +class SELayer(nn.Module): + """Squeeze-and-Excitation Module. + + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Default: 16. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configured + by this dict. 
If act_cfg is a sequence of dicts, the first
+            activation layer will be configured by the first dict and the
+            second activation layer will be configured by the second dict.
+            Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0,
+            divisor=6.0)).
+    """
+
+    def __init__(self,
+                 channels,
+                 ratio=16,
+                 conv_cfg=None,
+                 act_cfg=(dict(type='ReLU'),
+                          dict(type='HSigmoid', bias=3.0, divisor=6.0))):
+        super().__init__()
+        if isinstance(act_cfg, dict):
+            act_cfg = (act_cfg, act_cfg)
+        assert len(act_cfg) == 2
+        assert is_tuple_of(act_cfg, dict)
+        self.global_avgpool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = ConvModule(
+            in_channels=channels,
+            out_channels=make_divisible(channels // ratio, 8),
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[0])
+        self.conv2 = ConvModule(
+            in_channels=make_divisible(channels // ratio, 8),
+            out_channels=channels,
+            kernel_size=1,
+            stride=1,
+            conv_cfg=conv_cfg,
+            act_cfg=act_cfg[1])
+
+    def forward(self, x):
+        out = self.global_avgpool(x)
+        out = self.conv1(out)
+        out = self.conv2(out)
+        return x * out
diff --git a/head_extractor/build/lib/mmseg/models/utils/self_attention_block.py b/head_extractor/build/lib/mmseg/models/utils/self_attention_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bb6e8284e599637c12553e27199338a820709e3
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/utils/self_attention_block.py
@@ -0,0 +1,161 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.cnn import ConvModule
+from mmengine.model.weight_init import constant_init
+from torch import nn as nn
+from torch.nn import functional as F
+
+
+class SelfAttentionBlock(nn.Module):
+    """General self-attention block/non-local block.
+
+    Please refer to https://arxiv.org/abs/1706.03762 for details about key,
+    query and value.
+
+    Args:
+        key_in_channels (int): Input channels of key feature.
+        query_in_channels (int): Input channels of query feature.
+        channels (int): Output channels of key/query transform.
+        out_channels (int): Output channels.
+        share_key_query (bool): Whether share projection weight between key
+            and query projection.
+        query_downsample (nn.Module): Query downsample module.
+        key_downsample (nn.Module): Key downsample module.
+        key_query_num_convs (int): Number of convs for key/query projection.
+        value_out_num_convs (int): Number of convs for value/out projection.
+        key_query_norm (bool): Whether to use norm in key/query projection.
+        value_out_norm (bool): Whether to use norm in value/out projection.
+        matmul_norm (bool): Whether to normalize the attention map with the
+            square root of channels.
+        with_out (bool): Whether to use the out projection.
+        conv_cfg (dict|None): Config of conv layers.
+        norm_cfg (dict|None): Config of norm layers.
+        act_cfg (dict|None): Config of activation layers.
+ """ + + def __init__(self, key_in_channels, query_in_channels, channels, + out_channels, share_key_query, query_downsample, + key_downsample, key_query_num_convs, value_out_num_convs, + key_query_norm, value_out_norm, matmul_norm, with_out, + conv_cfg, norm_cfg, act_cfg): + super().__init__() + if share_key_query: + assert key_in_channels == query_in_channels + self.key_in_channels = key_in_channels + self.query_in_channels = query_in_channels + self.out_channels = out_channels + self.channels = channels + self.share_key_query = share_key_query + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.key_project = self.build_project( + key_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if share_key_query: + self.query_project = self.key_project + else: + self.query_project = self.build_project( + query_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.value_project = self.build_project( + key_in_channels, + channels if with_out else out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if with_out: + self.out_project = self.build_project( + channels, + out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.out_project = None + + self.query_downsample = query_downsample + self.key_downsample = key_downsample + self.matmul_norm = matmul_norm + + self.init_weights() + + def init_weights(self): + """Initialize weight of later layer.""" + if self.out_project is not None: + if not isinstance(self.out_project, ConvModule): + constant_init(self.out_project, 0) + + def build_project(self, in_channels, channels, num_convs, use_conv_module, + conv_cfg, norm_cfg, act_cfg): + """Build projection layer for key/query/value/out.""" + if use_conv_module: + convs = [ + ConvModule( + in_channels, + channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + ] + for _ in range(num_convs - 1): + convs.append( + ConvModule( + channels, + channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + convs = [nn.Conv2d(in_channels, channels, 1)] + for _ in range(num_convs - 1): + convs.append(nn.Conv2d(channels, channels, 1)) + if len(convs) > 1: + convs = nn.Sequential(*convs) + else: + convs = convs[0] + return convs + + def forward(self, query_feats, key_feats): + """Forward function.""" + batch_size = query_feats.size(0) + query = self.query_project(query_feats) + if self.query_downsample is not None: + query = self.query_downsample(query) + query = query.reshape(*query.shape[:2], -1) + query = query.permute(0, 2, 1).contiguous() + + key = self.key_project(key_feats) + value = self.value_project(key_feats) + if self.key_downsample is not None: + key = self.key_downsample(key) + value = self.key_downsample(value) + key = key.reshape(*key.shape[:2], -1) + value = value.reshape(*value.shape[:2], -1) + value = value.permute(0, 2, 1).contiguous() + + sim_map = torch.matmul(query, key) + if self.matmul_norm: + sim_map = (self.channels**-.5) * sim_map + sim_map = F.softmax(sim_map, dim=-1) + + context = torch.matmul(sim_map, value) + context = context.permute(0, 2, 1).contiguous() + context = context.reshape(batch_size, -1, 
                                  *query_feats.shape[2:])
+        if self.out_project is not None:
+            context = self.out_project(context)
+        return context
diff --git a/head_extractor/build/lib/mmseg/models/utils/shape_convert.py b/head_extractor/build/lib/mmseg/models/utils/shape_convert.py
new file mode 100644
index 0000000000000000000000000000000000000000..cce1e220b645d4b02df1ec2d9ed3137c8acba707
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/utils/shape_convert.py
@@ -0,0 +1,107 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+def nlc_to_nchw(x, hw_shape):
+    """Convert [N, L, C] shape tensor to [N, C, H, W] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, L, C] before conversion.
+        hw_shape (Sequence[int]): The height and width of output feature map.
+
+    Returns:
+        Tensor: The output tensor of shape [N, C, H, W] after conversion.
+    """
+    H, W = hw_shape
+    assert len(x.shape) == 3
+    B, L, C = x.shape
+    assert L == H * W, 'The seq_len doesn\'t match H, W'
+    return x.transpose(1, 2).reshape(B, C, H, W)
+
+
+def nchw_to_nlc(x):
+    """Flatten [N, C, H, W] shape tensor to [N, L, C] shape tensor.
+
+    Args:
+        x (Tensor): The input tensor of shape [N, C, H, W] before conversion.
+
+    Returns:
+        Tensor: The output tensor of shape [N, L, C] after conversion.
+    """
+    assert len(x.shape) == 4
+    return x.flatten(2).transpose(1, 2).contiguous()
+
+
+def nchw2nlc2nchw(module, x, contiguous=False, **kwargs):
+    """Flatten [N, C, H, W] shape tensor `x` to [N, L, C] shape tensor, use
+    the reshaped tensor as the input of `module`, and convert the output of
+    `module`, whose shape is [N, L, C], back to [N, C, H, W].
+
+    Args:
+        module (Callable): A callable object that takes a tensor
+            with shape [N, L, C] as input.
+        x (Tensor): The input tensor of shape [N, C, H, W].
+        contiguous (Bool): Whether to make the tensor contiguous
+            after each shape transform.
+
+    Returns:
+        Tensor: The output tensor of shape [N, C, H, W].
+
+    Example:
+        >>> import torch
+        >>> import torch.nn as nn
+        >>> norm = nn.LayerNorm(4)
+        >>> feature_map = torch.rand(4, 4, 5, 5)
+        >>> output = nchw2nlc2nchw(norm, feature_map)
+    """
+    B, C, H, W = x.shape
+    if not contiguous:
+        x = x.flatten(2).transpose(1, 2)
+        x = module(x, **kwargs)
+        x = x.transpose(1, 2).reshape(B, C, H, W)
+    else:
+        x = x.flatten(2).transpose(1, 2).contiguous()
+        x = module(x, **kwargs)
+        x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
+    return x
+
+
+def nlc2nchw2nlc(module, x, hw_shape, contiguous=False, **kwargs):
+    """Convert [N, L, C] shape tensor `x` to [N, C, H, W] shape tensor, use
+    the reshaped tensor as the input of `module`, and convert the output of
+    `module`, whose shape is [N, C, H, W], back to [N, L, C].
+
+    Args:
+        module (Callable): A callable object that takes a tensor
+            with shape [N, C, H, W] as input.
+        x (Tensor): The input tensor of shape [N, L, C].
+        hw_shape (Sequence[int]): The height and width of the
+            feature map with shape [N, C, H, W].
+        contiguous (Bool): Whether to make the tensor contiguous
+            after each shape transform.
+
+    Returns:
+        Tensor: The output tensor of shape [N, L, C].
+
+    Example:
+        >>> import torch
+        >>> import torch.nn as nn
+        >>> conv = nn.Conv2d(16, 16, 3, 1, 1)
+        >>> feature_map = torch.rand(4, 25, 16)
+        >>> output = nlc2nchw2nlc(conv, feature_map, (5, 5))
+    """
+    H, W = hw_shape
+    assert len(x.shape) == 3
+    B, L, C = x.shape
+    assert L == H * W, 'The seq_len doesn\'t match H, W'
+    if not contiguous:
+        x = x.transpose(1, 2).reshape(B, C, H, W)
+        x = module(x, **kwargs)
+        x = x.flatten(2).transpose(1, 2)
+    else:
+        x = x.transpose(1, 2).reshape(B, C, H, W).contiguous()
+        x = module(x, **kwargs)
+        x = x.flatten(2).transpose(1, 2).contiguous()
+    return x
diff --git a/head_extractor/build/lib/mmseg/models/utils/up_conv_block.py b/head_extractor/build/lib/mmseg/models/utils/up_conv_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..4fa3b598de96d53c169232d9c89ac458f6921e8d
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/utils/up_conv_block.py
@@ -0,0 +1,102 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule, build_upsample_layer
+
+
+class UpConvBlock(nn.Module):
+    """Upsample convolution block in decoder for UNet.
+
+    This upsample convolution block consists of one upsample module
+    followed by one convolution block. The upsample module expands the
+    high-level low-resolution feature map and the convolution block fuses
+    the upsampled high-level low-resolution feature map and the low-level
+    high-resolution feature map from encoder.
+
+    Args:
+        conv_block (nn.Sequential): Sequential of convolutional layers.
+        in_channels (int): Number of input channels of the high-level
+            low-resolution feature map.
+        skip_channels (int): Number of input channels of the low-level
+            high-resolution feature map from encoder.
+        out_channels (int): Number of output channels.
+        num_convs (int): Number of convolutional layers in the conv_block.
+            Default: 2.
+        stride (int): Stride of convolutional layer in conv_block. Default: 1.
+        dilation (int): Dilation rate of convolutional layer in conv_block.
+            Default: 1.
+        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+            memory while slowing down the training speed. Default: False.
+        conv_cfg (dict | None): Config dict for convolution layer.
+            Default: None.
+        norm_cfg (dict | None): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict | None): Config dict for activation layer in ConvModule.
+            Default: dict(type='ReLU').
+        upsample_cfg (dict): The upsample config of the upsample module in
+            decoder. Default: dict(type='InterpConv'). If the size of the
+            high-level feature map is the same as that of the skip feature map
+            (low-level feature map from encoder), the high-level feature map
+            does not need to be upsampled and upsample_cfg should be None.
+        dcn (bool): Use deformable convolution in convolutional layer or not.
+            Default: None.
+        plugins (dict): plugins for convolutional layers. Default: None.
+    """
+
+    def __init__(self,
+                 conv_block,
+                 in_channels,
+                 skip_channels,
+                 out_channels,
+                 num_convs=2,
+                 stride=1,
+                 dilation=1,
+                 with_cp=False,
+                 conv_cfg=None,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='ReLU'),
+                 upsample_cfg=dict(type='InterpConv'),
+                 dcn=None,
+                 plugins=None):
+        super().__init__()
+        assert dcn is None, 'Not implemented yet.'
+        assert plugins is None, 'Not implemented yet.'
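+
+        # The upsample module maps the high-level feature map from
+        # `in_channels` to `skip_channels` channels; `forward` then
+        # concatenates it with the skip feature map, so the fused conv
+        # block receives 2 * skip_channels input channels.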
+        self.conv_block = conv_block(
+            in_channels=2 * skip_channels,
+            out_channels=out_channels,
+            num_convs=num_convs,
+            stride=stride,
+            dilation=dilation,
+            with_cp=with_cp,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            dcn=None,
+            plugins=None)
+        if upsample_cfg is not None:
+            self.upsample = build_upsample_layer(
+                cfg=upsample_cfg,
+                in_channels=in_channels,
+                out_channels=skip_channels,
+                with_cp=with_cp,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+        else:
+            self.upsample = ConvModule(
+                in_channels,
+                skip_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                conv_cfg=conv_cfg,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg)
+
+    def forward(self, skip, x):
+        """Forward function."""
+
+        x = self.upsample(x)
+        out = torch.cat([skip, x], dim=1)
+        out = self.conv_block(out)
+
+        return out
diff --git a/head_extractor/build/lib/mmseg/models/utils/wrappers.py b/head_extractor/build/lib/mmseg/models/utils/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..abbd0c029623b4f480a067e4b78adfec234ef8d0
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/models/utils/wrappers.py
@@ -0,0 +1,51 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def resize(input,
+           size=None,
+           scale_factor=None,
+           mode='nearest',
+           align_corners=None,
+           warning=True):
+    if warning:
+        if size is not None and align_corners:
+            input_h, input_w = tuple(int(x) for x in input.shape[2:])
+            output_h, output_w = tuple(int(x) for x in size)
+            if output_h > input_h or output_w > input_w:
+                if ((output_h > 1 and output_w > 1 and input_h > 1
+                     and input_w > 1) and (output_h - 1) % (input_h - 1)
+                        and (output_w - 1) % (input_w - 1)):
+                    warnings.warn(
+                        f'When align_corners={align_corners}, '
+                        'the output would be more aligned if '
+                        f'input size {(input_h, input_w)} is `x+1` and '
+                        f'out size {(output_h, output_w)} is `nx+1`')
+    return F.interpolate(input, size, scale_factor, mode, align_corners)
+
+
+class Upsample(nn.Module):
+
+    def __init__(self,
+                 size=None,
+                 scale_factor=None,
+                 mode='nearest',
+                 align_corners=None):
+        super().__init__()
+        self.size = size
+        if isinstance(scale_factor, tuple):
+            self.scale_factor = tuple(float(factor) for factor in scale_factor)
+        else:
+            self.scale_factor = float(scale_factor) if scale_factor else None
+        self.mode = mode
+        self.align_corners = align_corners
+
+    def forward(self, x):
+        if not self.size:
+            size = [int(t * self.scale_factor) for t in x.shape[-2:]]
+        else:
+            size = self.size
+        return resize(x, size, None, self.mode, self.align_corners)
diff --git a/head_extractor/build/lib/mmseg/registry/__init__.py b/head_extractor/build/lib/mmseg/registry/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee514d1a2a2bdd54a0a9b017ec227160ee502be5
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/registry/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
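+
+# The registries re-exported here are thin wrappers over mmengine's root
+# registries; a module registered in them is typically built from a config
+# dict. Minimal usage sketch (the config value is illustrative, not part of
+# this package):
+#
+#     from mmseg.registry import MODELS
+#     model = MODELS.build(dict(type='SegDataPreProcessor'))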
+from .registry import (DATA_SAMPLERS, DATASETS, EVALUATOR, HOOKS, INFERENCERS, + LOG_PROCESSORS, LOOPS, METRICS, MODEL_WRAPPERS, MODELS, + OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, OPTIMIZERS, + PARAM_SCHEDULERS, RUNNER_CONSTRUCTORS, RUNNERS, + TASK_UTILS, TRANSFORMS, VISBACKENDS, VISUALIZERS, + WEIGHT_INITIALIZERS) + +__all__ = [ + 'HOOKS', 'DATASETS', 'DATA_SAMPLERS', 'TRANSFORMS', 'MODELS', + 'WEIGHT_INITIALIZERS', 'OPTIMIZERS', 'OPTIM_WRAPPER_CONSTRUCTORS', + 'TASK_UTILS', 'PARAM_SCHEDULERS', 'METRICS', 'MODEL_WRAPPERS', + 'VISBACKENDS', 'VISUALIZERS', 'RUNNERS', 'RUNNER_CONSTRUCTORS', 'LOOPS', + 'EVALUATOR', 'LOG_PROCESSORS', 'OPTIM_WRAPPERS', 'INFERENCERS' +] diff --git a/head_extractor/build/lib/mmseg/registry/registry.py b/head_extractor/build/lib/mmseg/registry/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..37b6a776095856c2fab0101b5b0ec8ed7e8fa8f2 --- /dev/null +++ b/head_extractor/build/lib/mmseg/registry/registry.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""MMSegmentation provides 21 registry nodes to support using modules across +projects. Each node is a child of the root registry in MMEngine. + +More details can be found at +https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html. +""" + +from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS +from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR +from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS +from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS +from mmengine.registry import LOOPS as MMENGINE_LOOPS +from mmengine.registry import METRICS as MMENGINE_METRICS +from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS +from mmengine.registry import MODELS as MMENGINE_MODELS +from mmengine.registry import \ + OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS +from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS +from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS +from mmengine.registry import \ + RUNNER_CONSTRUCTORS as MMENGINE_RUNNER_CONSTRUCTORS +from mmengine.registry import RUNNERS as MMENGINE_RUNNERS +from mmengine.registry import TASK_UTILS as MMENGINE_TASK_UTILS +from mmengine.registry import TRANSFORMS as MMENGINE_TRANSFORMS +from mmengine.registry import VISBACKENDS as MMENGINE_VISBACKENDS +from mmengine.registry import VISUALIZERS as MMENGINE_VISUALIZERS +from mmengine.registry import \ + WEIGHT_INITIALIZERS as MMENGINE_WEIGHT_INITIALIZERS +from mmengine.registry import Registry + +# manage all kinds of runners like `EpochBasedRunner` and `IterBasedRunner` +RUNNERS = Registry('runner', parent=MMENGINE_RUNNERS) +# manage runner constructors that define how to initialize runners +RUNNER_CONSTRUCTORS = Registry( + 'runner constructor', parent=MMENGINE_RUNNER_CONSTRUCTORS) +# manage all kinds of loops like `EpochBasedTrainLoop` +LOOPS = Registry('loop', parent=MMENGINE_LOOPS) +# manage all kinds of hooks like `CheckpointHook` +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmseg.engine.hooks']) + +# manage data-related modules +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmseg.datasets']) +DATA_SAMPLERS = Registry('data sampler', parent=MMENGINE_DATA_SAMPLERS) +TRANSFORMS = Registry( + 
    'transform',
+    parent=MMENGINE_TRANSFORMS,
+    locations=['mmseg.datasets.transforms'])
+
+# manage all kinds of modules inheriting `nn.Module`
+MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmseg.models'])
+# manage all kinds of model wrappers like 'MMDistributedDataParallel'
+MODEL_WRAPPERS = Registry(
+    'model_wrapper',
+    parent=MMENGINE_MODEL_WRAPPERS,
+    locations=['mmseg.models'])
+# manage all kinds of weight initialization modules like `Uniform`
+WEIGHT_INITIALIZERS = Registry(
+    'weight initializer',
+    parent=MMENGINE_WEIGHT_INITIALIZERS,
+    locations=['mmseg.models'])
+
+# manage all kinds of optimizers like `SGD` and `Adam`
+OPTIMIZERS = Registry(
+    'optimizer',
+    parent=MMENGINE_OPTIMIZERS,
+    locations=['mmseg.engine.optimizers'])
+# manage optimizer wrapper
+OPTIM_WRAPPERS = Registry(
+    'optim_wrapper',
+    parent=MMENGINE_OPTIM_WRAPPERS,
+    locations=['mmseg.engine.optimizers'])
+# manage constructors that customize the optimization hyperparameters.
+OPTIM_WRAPPER_CONSTRUCTORS = Registry(
+    'optimizer wrapper constructor',
+    parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS,
+    locations=['mmseg.engine.optimizers'])
+# manage all kinds of parameter schedulers like `MultiStepLR`
+PARAM_SCHEDULERS = Registry(
+    'parameter scheduler',
+    parent=MMENGINE_PARAM_SCHEDULERS,
+    locations=['mmseg.engine.schedulers'])
+
+# manage all kinds of metrics
+METRICS = Registry(
+    'metric', parent=MMENGINE_METRICS, locations=['mmseg.evaluation'])
+# manage evaluator
+EVALUATOR = Registry(
+    'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmseg.evaluation'])
+
+# manage task-specific modules like ohem pixel sampler
+TASK_UTILS = Registry(
+    'task util', parent=MMENGINE_TASK_UTILS, locations=['mmseg.models'])
+
+# manage visualizer
+VISUALIZERS = Registry(
+    'visualizer',
+    parent=MMENGINE_VISUALIZERS,
+    locations=['mmseg.visualization'])
+# manage visualizer backend
+VISBACKENDS = Registry(
+    'vis_backend',
+    parent=MMENGINE_VISBACKENDS,
+    locations=['mmseg.visualization'])
+
+# manage log processor
+LOG_PROCESSORS = Registry(
+    'log_processor',
+    parent=MMENGINE_LOG_PROCESSORS,
+    locations=['mmseg.visualization'])
+
+# manage inferencer
+INFERENCERS = Registry('inferencer', parent=MMENGINE_INFERENCERS)
diff --git a/head_extractor/build/lib/mmseg/structures/__init__.py b/head_extractor/build/lib/mmseg/structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..63d118dca3ebcff30ca241f9378475bcce072627
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/structures/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .sampler import BasePixelSampler, OHEMPixelSampler, build_pixel_sampler
+from .seg_data_sample import SegDataSample
+
+__all__ = [
+    'SegDataSample', 'BasePixelSampler', 'OHEMPixelSampler',
+    'build_pixel_sampler'
+]
diff --git a/head_extractor/build/lib/mmseg/structures/sampler/__init__.py b/head_extractor/build/lib/mmseg/structures/sampler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d762d1b4552b391ece046fa3d094409011bcec
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/structures/sampler/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
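+
+# A pixel sampler is usually built from a config dict. Minimal usage sketch
+# (values are illustrative; `decode_head` stands in for the decode head that
+# owns the sampler):
+#
+#     from mmseg.structures.sampler import build_pixel_sampler
+#     sampler = build_pixel_sampler(
+#         dict(type='OHEMPixelSampler', thresh=0.7, min_kept=100000),
+#         context=decode_head)
+#     seg_weight = sampler.sample(seg_logit, seg_label)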
+from .base_pixel_sampler import BasePixelSampler
+from .builder import build_pixel_sampler
+from .ohem_pixel_sampler import OHEMPixelSampler
+
+__all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler']
diff --git a/head_extractor/build/lib/mmseg/structures/sampler/base_pixel_sampler.py b/head_extractor/build/lib/mmseg/structures/sampler/base_pixel_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..03672cd478a2e464cc734ae92686c86f219da0a9
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/structures/sampler/base_pixel_sampler.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BasePixelSampler(metaclass=ABCMeta):
+    """Base class of pixel sampler."""
+
+    def __init__(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def sample(self, seg_logit, seg_label):
+        """Placeholder for sample function."""
diff --git a/head_extractor/build/lib/mmseg/structures/sampler/builder.py b/head_extractor/build/lib/mmseg/structures/sampler/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..48e14790264a3d4c4ff54d84e5bab67b1623a1df
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/structures/sampler/builder.py
@@ -0,0 +1,14 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+from mmseg.registry import TASK_UTILS
+
+PIXEL_SAMPLERS = TASK_UTILS
+
+
+def build_pixel_sampler(cfg, **default_args):
+    """Build pixel sampler for segmentation map."""
+    warnings.warn(
+        '``build_pixel_sampler`` would be deprecated soon, please use '
+        '``mmseg.registry.TASK_UTILS.build()`` ')
+    return TASK_UTILS.build(cfg, default_args=default_args)
diff --git a/head_extractor/build/lib/mmseg/structures/sampler/ohem_pixel_sampler.py b/head_extractor/build/lib/mmseg/structures/sampler/ohem_pixel_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..a974273cab504be269e7f391e23a521b97bd8588
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/structures/sampler/ohem_pixel_sampler.py
@@ -0,0 +1,85 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .base_pixel_sampler import BasePixelSampler
+from .builder import PIXEL_SAMPLERS
+
+
+@PIXEL_SAMPLERS.register_module()
+class OHEMPixelSampler(BasePixelSampler):
+    """Online Hard Example Mining Sampler for segmentation.
+
+    Args:
+        context (nn.Module): The context of sampler, subclass of
+            :obj:`BaseDecodeHead`.
+        thresh (float, optional): The threshold for hard example selection;
+            predictions below it are regarded as having low confidence. If
+            not specified, the hard examples will be the pixels of top
+            ``min_kept`` loss. Default: None.
+        min_kept (int, optional): The minimum number of predictions to keep.
+            Default: 100000.
+    """
+
+    def __init__(self, context, thresh=None, min_kept=100000):
+        super().__init__()
+        self.context = context
+        assert min_kept > 1
+        self.thresh = thresh
+        self.min_kept = min_kept
+
+    def sample(self, seg_logit, seg_label):
+        """Sample pixels that have high loss or low prediction confidence.
+ + Args: + seg_logit (torch.Tensor): segmentation logits, shape (N, C, H, W) + seg_label (torch.Tensor): segmentation label, shape (N, 1, H, W) + + Returns: + torch.Tensor: segmentation weight, shape (N, H, W) + """ + with torch.no_grad(): + assert seg_logit.shape[2:] == seg_label.shape[2:] + assert seg_label.shape[1] == 1 + seg_label = seg_label.squeeze(1).long() + batch_kept = self.min_kept * seg_label.size(0) + valid_mask = seg_label != self.context.ignore_index + seg_weight = seg_logit.new_zeros(size=seg_label.size()) + valid_seg_weight = seg_weight[valid_mask] + if self.thresh is not None: + seg_prob = F.softmax(seg_logit, dim=1) + + tmp_seg_label = seg_label.clone().unsqueeze(1) + tmp_seg_label[tmp_seg_label == self.context.ignore_index] = 0 + seg_prob = seg_prob.gather(1, tmp_seg_label).squeeze(1) + sort_prob, sort_indices = seg_prob[valid_mask].sort() + + if sort_prob.numel() > 0: + min_threshold = sort_prob[min(batch_kept, + sort_prob.numel() - 1)] + else: + min_threshold = 0.0 + threshold = max(min_threshold, self.thresh) + valid_seg_weight[seg_prob[valid_mask] < threshold] = 1. + else: + if not isinstance(self.context.loss_decode, nn.ModuleList): + losses_decode = [self.context.loss_decode] + else: + losses_decode = self.context.loss_decode + losses = 0.0 + for loss_module in losses_decode: + losses += loss_module( + seg_logit, + seg_label, + weight=None, + ignore_index=self.context.ignore_index, + reduction_override='none') + + # faster than topk according to https://github.com/pytorch/pytorch/issues/22812 # noqa + _, sort_indices = losses[valid_mask].sort(descending=True) + valid_seg_weight[sort_indices[:batch_kept]] = 1. + + seg_weight[valid_mask] = valid_seg_weight + + return seg_weight diff --git a/head_extractor/build/lib/mmseg/structures/seg_data_sample.py b/head_extractor/build/lib/mmseg/structures/seg_data_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..ce68b5474330e2149d7d1c4de2d2406ae5b0345e --- /dev/null +++ b/head_extractor/build/lib/mmseg/structures/seg_data_sample.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.structures import BaseDataElement, PixelData + + +class SegDataSample(BaseDataElement): + """A data structure interface of MMSegmentation. They are used as + interfaces between different components. + + The attributes in ``SegDataSample`` are divided into several parts: + + - ``gt_sem_seg``(PixelData): Ground truth of semantic segmentation. + - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation. + - ``seg_logits``(PixelData): Predicted logits of semantic segmentation. + + Examples: + >>> import torch + >>> import numpy as np + >>> from mmengine.structures import PixelData + >>> from mmseg.structures import SegDataSample + + >>> data_sample = SegDataSample() + >>> img_meta = dict(img_shape=(4, 4, 3), + ... 
pad_shape=(4, 4, 3))
+        >>> gt_segmentations = PixelData(metainfo=img_meta)
+        >>> gt_segmentations.data = torch.randint(0, 2, (1, 4, 4))
+        >>> data_sample.gt_sem_seg = gt_segmentations
+        >>> assert 'img_shape' in data_sample.gt_sem_seg.metainfo_keys()
+        >>> data_sample.gt_sem_seg.shape
+        (4, 4)
+        >>> print(data_sample)
+        <SegDataSample(
+            ...
+        ) at 0x1c2aae44d60>
+
+        >>> data_sample = SegDataSample()
+        >>> gt_sem_seg_data = dict(sem_seg=torch.rand(1, 4, 4))
+        >>> gt_sem_seg = PixelData(**gt_sem_seg_data)
+        >>> data_sample.gt_sem_seg = gt_sem_seg
+        >>> assert 'gt_sem_seg' in data_sample
+        >>> assert 'sem_seg' in data_sample.gt_sem_seg
+    """
+
+    @property
+    def gt_sem_seg(self) -> PixelData:
+        return self._gt_sem_seg
+
+    @gt_sem_seg.setter
+    def gt_sem_seg(self, value: PixelData) -> None:
+        self.set_field(value, '_gt_sem_seg', dtype=PixelData)
+
+    @gt_sem_seg.deleter
+    def gt_sem_seg(self) -> None:
+        del self._gt_sem_seg
+
+    @property
+    def pred_sem_seg(self) -> PixelData:
+        return self._pred_sem_seg
+
+    @pred_sem_seg.setter
+    def pred_sem_seg(self, value: PixelData) -> None:
+        self.set_field(value, '_pred_sem_seg', dtype=PixelData)
+
+    @pred_sem_seg.deleter
+    def pred_sem_seg(self) -> None:
+        del self._pred_sem_seg
+
+    @property
+    def seg_logits(self) -> PixelData:
+        return self._seg_logits
+
+    @seg_logits.setter
+    def seg_logits(self, value: PixelData) -> None:
+        self.set_field(value, '_seg_logits', dtype=PixelData)
+
+    @seg_logits.deleter
+    def seg_logits(self) -> None:
+        del self._seg_logits
diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b4afb514783786adf76744f9b97f3e1db1d6081
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+__version__ = "0.0.1"
diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..033c35660450afec6612adb342c7c30e1ccd15ee
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/configs/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
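+
+# The helpers below resolve config_name + ".yaml" relative to this module
+# and merge the result over the SSL defaults. Usage sketch (the config name
+# is illustrative and assumed to exist next to this file):
+#
+#     from dinov2.configs import load_and_merge_config
+#     cfg = load_and_merge_config("train/vitl16_short")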
+ +import pathlib + +from omegaconf import OmegaConf + + +def load_config(config_name: str): + config_filename = config_name + ".yaml" + return OmegaConf.load(pathlib.Path(__file__).parent.resolve() / config_filename) + + +dinov2_default_config = load_config("ssl_default_config") + + +def load_and_merge_config(config_name: str): + default_config = OmegaConf.create(dinov2_default_config) + loaded_config = load_config(config_name) + return OmegaConf.merge(default_config, loaded_config) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..357db5c542c5810391ba2bd45a60c13c01c3737a --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .adapters import DatasetWithEnumeratedTargets +from .loaders import make_data_loader, make_dataset, SamplerType +from .collate import collate_data_and_cast +from .masking import MaskingGenerator +from .augmentations import DataAugmentationDINO diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py new file mode 100644 index 0000000000000000000000000000000000000000..7dcbc68e046f03460d5867f1388d5380d9c6f603 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/adapters.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Tuple + +from torch.utils.data import Dataset + + +class DatasetWithEnumeratedTargets(Dataset): + def __init__(self, dataset): + self._dataset = dataset + + def get_image_data(self, index: int) -> bytes: + return self._dataset.get_image_data(index) + + def get_target(self, index: int) -> Tuple[Any, int]: + target = self._dataset.get_target(index) + return (index, target) + + def __getitem__(self, index: int) -> Tuple[Any, Tuple[Any, int]]: + image, target = self._dataset[index] + target = index if target is None else target + return image, (index, target) + + def __len__(self) -> int: + return len(self._dataset) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py new file mode 100644 index 0000000000000000000000000000000000000000..7ca28cb59a4de2566a6c9ef9c301cbbb4e54b5ee --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/augmentations.py @@ -0,0 +1,119 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
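+
+# DataAugmentationDINO (below) maps one PIL image to the multi-crop views
+# used for DINO-style training. Minimal usage sketch (the parameter values
+# are illustrative, not defaults of this class):
+#
+#     transform = DataAugmentationDINO(
+#         global_crops_scale=(0.32, 1.0),
+#         local_crops_scale=(0.05, 0.32),
+#         local_crops_number=8)
+#     views = transform(image)
+#     # keys: "global_crops", "global_crops_teacher", "local_crops", "offsets"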
+ +import logging + +from torchvision import transforms + +from .transforms import ( + GaussianBlur, + make_normalize_transform, +) + + +logger = logging.getLogger("dinov2") + + +class DataAugmentationDINO(object): + def __init__( + self, + global_crops_scale, + local_crops_scale, + local_crops_number, + global_crops_size=224, + local_crops_size=96, + ): + self.global_crops_scale = global_crops_scale + self.local_crops_scale = local_crops_scale + self.local_crops_number = local_crops_number + self.global_crops_size = global_crops_size + self.local_crops_size = local_crops_size + + logger.info("###################################") + logger.info("Using data augmentation parameters:") + logger.info(f"global_crops_scale: {global_crops_scale}") + logger.info(f"local_crops_scale: {local_crops_scale}") + logger.info(f"local_crops_number: {local_crops_number}") + logger.info(f"global_crops_size: {global_crops_size}") + logger.info(f"local_crops_size: {local_crops_size}") + logger.info("###################################") + + # random resized crop and flip + self.geometric_augmentation_global = transforms.Compose( + [ + transforms.RandomResizedCrop( + global_crops_size, scale=global_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.RandomHorizontalFlip(p=0.5), + ] + ) + + self.geometric_augmentation_local = transforms.Compose( + [ + transforms.RandomResizedCrop( + local_crops_size, scale=local_crops_scale, interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.RandomHorizontalFlip(p=0.5), + ] + ) + + # color distorsions / blurring + color_jittering = transforms.Compose( + [ + transforms.RandomApply( + [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)], + p=0.8, + ), + transforms.RandomGrayscale(p=0.2), + ] + ) + + global_transfo1_extra = GaussianBlur(p=1.0) + + global_transfo2_extra = transforms.Compose( + [ + GaussianBlur(p=0.1), + transforms.RandomSolarize(threshold=128, p=0.2), + ] + ) + + local_transfo_extra = GaussianBlur(p=0.5) + + # normalization + self.normalize = transforms.Compose( + [ + transforms.ToTensor(), + make_normalize_transform(), + ] + ) + + self.global_transfo1 = transforms.Compose([color_jittering, global_transfo1_extra, self.normalize]) + self.global_transfo2 = transforms.Compose([color_jittering, global_transfo2_extra, self.normalize]) + self.local_transfo = transforms.Compose([color_jittering, local_transfo_extra, self.normalize]) + + def __call__(self, image): + output = {} + + # global crops: + im1_base = self.geometric_augmentation_global(image) + global_crop_1 = self.global_transfo1(im1_base) + + im2_base = self.geometric_augmentation_global(image) + global_crop_2 = self.global_transfo2(im2_base) + + output["global_crops"] = [global_crop_1, global_crop_2] + + # global crops for teacher: + output["global_crops_teacher"] = [global_crop_1, global_crop_2] + + # local crops: + local_crops = [ + self.local_transfo(self.geometric_augmentation_local(image)) for _ in range(self.local_crops_number) + ] + output["local_crops"] = local_crops + output["offsets"] = () + + return output diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py new file mode 100644 index 0000000000000000000000000000000000000000..9f0d98906808ed326dff4486d95b3ec04f8a5e75 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/collate.py @@ -0,0 
+1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import random + + +def collate_data_and_cast(samples_list, mask_ratio_tuple, mask_probability, dtype, n_tokens=None, mask_generator=None): + # dtype = torch.half # TODO: Remove + + n_global_crops = len(samples_list[0][0]["global_crops"]) + n_local_crops = len(samples_list[0][0]["local_crops"]) + + collated_global_crops = torch.stack([s[0]["global_crops"][i] for i in range(n_global_crops) for s in samples_list]) + + collated_local_crops = torch.stack([s[0]["local_crops"][i] for i in range(n_local_crops) for s in samples_list]) + + B = len(collated_global_crops) + N = n_tokens + n_samples_masked = int(B * mask_probability) + probs = torch.linspace(*mask_ratio_tuple, n_samples_masked + 1) + upperbound = 0 + masks_list = [] + for i in range(0, n_samples_masked): + prob_min = probs[i] + prob_max = probs[i + 1] + masks_list.append(torch.BoolTensor(mask_generator(int(N * random.uniform(prob_min, prob_max))))) + upperbound += int(N * prob_max) + for i in range(n_samples_masked, B): + masks_list.append(torch.BoolTensor(mask_generator(0))) + + random.shuffle(masks_list) + + collated_masks = torch.stack(masks_list).flatten(1) + mask_indices_list = collated_masks.flatten().nonzero().flatten() + + masks_weight = (1 / collated_masks.sum(-1).clamp(min=1.0)).unsqueeze(-1).expand_as(collated_masks)[collated_masks] + + return { + "collated_global_crops": collated_global_crops.to(dtype), + "collated_local_crops": collated_local_crops.to(dtype), + "collated_masks": collated_masks, + "mask_indices_list": mask_indices_list, + "masks_weight": masks_weight, + "upperbound": upperbound, + "n_masked_patches": torch.full((1,), fill_value=mask_indices_list.shape[0], dtype=torch.long), + } diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7b537aee8fe31d7e0fa06713d2cfe9233ff0ef60 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .image_net import ImageNet +from .image_net_22k import ImageNet22k diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py new file mode 100644 index 0000000000000000000000000000000000000000..548720b3b9959b4949f71fb2dd5cf6af3d184066 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/decoders.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
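+
+# The decoders below turn raw dataset records into usable objects; sketch
+# (`raw_bytes` stands in for the bytes of an encoded image file):
+#
+#     image = ImageDataDecoder(raw_bytes).decode()  # PIL image in RGB mode
+#     label = TargetDecoder(target).decode()        # returns target as-is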
+ +from io import BytesIO +from typing import Any + +from PIL import Image + + +class Decoder: + def decode(self) -> Any: + raise NotImplementedError + + +class ImageDataDecoder(Decoder): + def __init__(self, image_data: bytes) -> None: + self._image_data = image_data + + def decode(self) -> Image: + f = BytesIO(self._image_data) + return Image.open(f).convert(mode="RGB") + + +class TargetDecoder(Decoder): + def __init__(self, target: Any): + self._target = target + + def decode(self) -> Any: + return self._target diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py new file mode 100644 index 0000000000000000000000000000000000000000..4da831e6ad275025ed55eaa490f780ecf6083f2c --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/extended.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Tuple + +from torchvision.datasets import VisionDataset + +from .decoders import TargetDecoder, ImageDataDecoder + + +class ExtendedVisionDataset(VisionDataset): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) # type: ignore + + def get_image_data(self, index: int) -> bytes: + raise NotImplementedError + + def get_target(self, index: int) -> Any: + raise NotImplementedError + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + try: + image_data = self.get_image_data(index) + image = ImageDataDecoder(image_data).decode() + except Exception as e: + raise RuntimeError(f"can not read image for sample {index}") from e + target = self.get_target(index) + target = TargetDecoder(target).decode() + + if self.transforms is not None: + image, target = self.transforms(image, target) + + return image, target + + def __len__(self) -> int: + raise NotImplementedError diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1c384cc96ceb6afeb3e555d9b3e2a2c008c5d4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net.py @@ -0,0 +1,291 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
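+
+# The ImageNet dataset below reads images from `root` and memory-mapped
+# numpy "extra" arrays (entries, class ids, class names) from `extra`,
+# which are produced once via `dataset.dump_extra()`. Usage sketch (paths
+# are illustrative):
+#
+#     dataset = ImageNet(
+#         split=ImageNet.Split.VAL,
+#         root="/data/imagenet",
+#         extra="/data/imagenet-extra")
+#     image, target = dataset[0]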
+ +import csv +from enum import Enum +import logging +import os +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np + +from .extended import ExtendedVisionDataset + + +logger = logging.getLogger("dinov2") +_Target = int + + +class _Split(Enum): + TRAIN = "train" + VAL = "val" + TEST = "test" # NOTE: torchvision does not support the test split + + @property + def length(self) -> int: + split_lengths = { + _Split.TRAIN: 1_281_167, + _Split.VAL: 50_000, + _Split.TEST: 100_000, + } + return split_lengths[self] + + def get_dirname(self, class_id: Optional[str] = None) -> str: + return self.value if class_id is None else os.path.join(self.value, class_id) + + def get_image_relpath(self, actual_index: int, class_id: Optional[str] = None) -> str: + dirname = self.get_dirname(class_id) + if self == _Split.TRAIN: + basename = f"{class_id}_{actual_index}" + else: # self in (_Split.VAL, _Split.TEST): + basename = f"ILSVRC2012_{self.value}_{actual_index:08d}" + return os.path.join(dirname, basename + ".JPEG") + + def parse_image_relpath(self, image_relpath: str) -> Tuple[str, int]: + assert self != _Split.TEST + dirname, filename = os.path.split(image_relpath) + class_id = os.path.split(dirname)[-1] + basename, _ = os.path.splitext(filename) + actual_index = int(basename.split("_")[-1]) + return class_id, actual_index + + +class ImageNet(ExtendedVisionDataset): + Target = Union[_Target] + Split = Union[_Split] + + def __init__( + self, + *, + split: "ImageNet.Split", + root: str, + extra: str, + transforms: Optional[Callable] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + ) -> None: + super().__init__(root, transforms, transform, target_transform) + self._extra_root = extra + self._split = split + + self._entries = None + self._class_ids = None + self._class_names = None + + @property + def split(self) -> "ImageNet.Split": + return self._split + + def _get_extra_full_path(self, extra_path: str) -> str: + return os.path.join(self._extra_root, extra_path) + + def _load_extra(self, extra_path: str) -> np.ndarray: + extra_full_path = self._get_extra_full_path(extra_path) + return np.load(extra_full_path, mmap_mode="r") + + def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None: + extra_full_path = self._get_extra_full_path(extra_path) + os.makedirs(self._extra_root, exist_ok=True) + np.save(extra_full_path, extra_array) + + @property + def _entries_path(self) -> str: + return f"entries-{self._split.value.upper()}.npy" + + @property + def _class_ids_path(self) -> str: + return f"class-ids-{self._split.value.upper()}.npy" + + @property + def _class_names_path(self) -> str: + return f"class-names-{self._split.value.upper()}.npy" + + def _get_entries(self) -> np.ndarray: + if self._entries is None: + self._entries = self._load_extra(self._entries_path) + assert self._entries is not None + return self._entries + + def _get_class_ids(self) -> np.ndarray: + if self._split == _Split.TEST: + assert False, "Class IDs are not available in TEST split" + if self._class_ids is None: + self._class_ids = self._load_extra(self._class_ids_path) + assert self._class_ids is not None + return self._class_ids + + def _get_class_names(self) -> np.ndarray: + if self._split == _Split.TEST: + assert False, "Class names are not available in TEST split" + if self._class_names is None: + self._class_names = self._load_extra(self._class_names_path) + assert self._class_names is not None + return self._class_names + + def find_class_id(self, 
class_index: int) -> str:
+        class_ids = self._get_class_ids()
+        return str(class_ids[class_index])
+
+    def find_class_name(self, class_index: int) -> str:
+        class_names = self._get_class_names()
+        return str(class_names[class_index])
+
+    def get_image_data(self, index: int) -> bytes:
+        entries = self._get_entries()
+        actual_index = entries[index]["actual_index"]
+
+        class_id = self.get_class_id(index)
+
+        image_relpath = self.split.get_image_relpath(actual_index, class_id)
+        image_full_path = os.path.join(self.root, image_relpath)
+        with open(image_full_path, mode="rb") as f:
+            image_data = f.read()
+        return image_data
+
+    def get_target(self, index: int) -> Optional[Target]:
+        entries = self._get_entries()
+        class_index = entries[index]["class_index"]
+        return None if self.split == _Split.TEST else int(class_index)
+
+    def get_targets(self) -> Optional[np.ndarray]:
+        entries = self._get_entries()
+        return None if self.split == _Split.TEST else entries["class_index"]
+
+    def get_class_id(self, index: int) -> Optional[str]:
+        entries = self._get_entries()
+        class_id = entries[index]["class_id"]
+        return None if self.split == _Split.TEST else str(class_id)
+
+    def get_class_name(self, index: int) -> Optional[str]:
+        entries = self._get_entries()
+        class_name = entries[index]["class_name"]
+        return None if self.split == _Split.TEST else str(class_name)
+
+    def __len__(self) -> int:
+        entries = self._get_entries()
+        assert len(entries) == self.split.length
+        return len(entries)
+
+    def _load_labels(self, labels_path: str) -> List[Tuple[str, str]]:
+        labels_full_path = os.path.join(self.root, labels_path)
+        labels = []
+
+        try:
+            with open(labels_full_path, "r") as f:
+                reader = csv.reader(f)
+                for row in reader:
+                    class_id, class_name = row
+                    labels.append((class_id, class_name))
+        except OSError as e:
+            raise RuntimeError(f'can not read labels file "{labels_full_path}"') from e
+
+        return labels
+
+    def _dump_entries(self) -> None:
+        split = self.split
+        if split == ImageNet.Split.TEST:
+            dataset = None
+            sample_count = split.length
+            max_class_id_length, max_class_name_length = 0, 0
+        else:
+            labels_path = "labels.txt"
+            logger.info(f'loading labels from "{labels_path}"')
+            labels = self._load_labels(labels_path)
+
+            # NOTE: Using torchvision ImageFolder for consistency
+            from torchvision.datasets import ImageFolder
+
+            dataset_root = os.path.join(self.root, split.get_dirname())
+            dataset = ImageFolder(dataset_root)
+            sample_count = len(dataset)
+            max_class_id_length, max_class_name_length = -1, -1
+            for sample in dataset.samples:
+                _, class_index = sample
+                class_id, class_name = labels[class_index]
+                max_class_id_length = max(len(class_id), max_class_id_length)
+                max_class_name_length = max(len(class_name), max_class_name_length)
+
+        dtype = np.dtype(
+            [
+                ("actual_index", "<u4"),
+                ("class_index", "<u4"),
+                ("class_id", f"U{max_class_id_length}"),
+                ("class_name", f"U{max_class_name_length}"),
+            ]
+        )
+        entries_array = np.empty(sample_count, dtype=dtype)
+
+        if split == ImageNet.Split.TEST:
+            old_percent = -1
+            for index in range(sample_count):
+                percent = 100 * (index + 1) // sample_count
+                if percent > old_percent:
+                    logger.info(f"creating entries: {percent}%")
+                    old_percent = percent
+
+                actual_index = index + 1
+                class_index = np.uint32(-1)
+                class_id, class_name = "", ""
+                entries_array[index] = (actual_index, class_index, class_id, class_name)
+        else:
+            class_names = {class_id: class_name for class_id, class_name in labels}
+
+            assert dataset
+            old_percent = -1
+            for index in range(sample_count):
+                percent = 100 * (index + 1) // sample_count
+                if percent > old_percent:
+                    logger.info(f"creating entries: {percent}%")
+                    old_percent = percent
+
+                image_full_path, class_index = dataset.samples[index]
+                image_relpath = os.path.relpath(image_full_path, self.root)
+                class_id, actual_index = split.parse_image_relpath(image_relpath)
+                class_name = class_names[class_id]
+                entries_array[index] = (actual_index, class_index, class_id, class_name)
+
+        logger.info(f'saving entries to "{self._entries_path}"')
+        self._save_extra(entries_array, self._entries_path)
+
+    def _dump_class_ids_and_names(self) -> None:
+        split = self.split
+        if split == ImageNet.Split.TEST:
+            return
+
+        entries_array = self._load_extra(self._entries_path)
+
+        max_class_id_length, max_class_name_length, max_class_index = -1, -1, -1
+        for entry in entries_array:
+            class_index, class_id, class_name = (
+                entry["class_index"],
+                entry["class_id"],
+                entry["class_name"],
+            )
+            max_class_index = max(int(class_index), max_class_index)
+            max_class_id_length = max(len(str(class_id)), max_class_id_length)
+            max_class_name_length = max(len(str(class_name)), max_class_name_length)
+
+        class_count = max_class_index + 1
+        class_ids_array = np.empty(class_count, dtype=f"U{max_class_id_length}")
+        class_names_array = np.empty(class_count, dtype=f"U{max_class_name_length}")
+        for entry in entries_array:
+            class_index, class_id, class_name = (
+                entry["class_index"],
+                entry["class_id"],
+                entry["class_name"],
+            )
+            class_ids_array[class_index] = class_id
+            class_names_array[class_index] = class_name
+
+        logger.info(f'saving class IDs to "{self._class_ids_path}"')
+        self._save_extra(class_ids_array, self._class_ids_path)
+
+        logger.info(f'saving class names to "{self._class_names_path}"')
+        self._save_extra(class_names_array, self._class_names_path)
+
+    def dump_extra(self) -> None:
+        self._dump_entries()
+        self._dump_class_ids_and_names()
diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net_22k.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net_22k.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c0bfd335a68b67e02c241f39b1ae06f9fbe1dd0
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/datasets/image_net_22k.py
@@ -0,0 +1,303 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
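+
+# Illustrative usage (a hedged sketch, not upstream documentation): it assumes
+# the per-class tarballs (`<class_id>.tar` plus their block logs) and the
+# pre-dumped metadata arrays (`entries.npy`, `class-ids.npy`) already exist;
+# the paths below are hypothetical:
+#
+#     dataset = ImageNet22k(root="/data/imagenet22k", extra="/data/imagenet22k-extra")
+#     image_bytes = dataset.get_image_data(0)  # raw JPEG bytes, read via mmap
+#     label = dataset.get_target(0)            # integer class index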
+ +from dataclasses import dataclass +from enum import Enum +from functools import lru_cache +from gzip import GzipFile +from io import BytesIO +from mmap import ACCESS_READ, mmap +import os +from typing import Any, Callable, List, Optional, Set, Tuple +import warnings + +import numpy as np + +from .extended import ExtendedVisionDataset + + +_Labels = int + +_DEFAULT_MMAP_CACHE_SIZE = 16 # Warning: This can exhaust file descriptors + + +@dataclass +class _ClassEntry: + block_offset: int + maybe_filename: Optional[str] = None + + +@dataclass +class _Entry: + class_index: int # noqa: E701 + start_offset: int + end_offset: int + filename: str + + +class _Split(Enum): + TRAIN = "train" + VAL = "val" + + @property + def length(self) -> int: + return { + _Split.TRAIN: 11_797_647, + _Split.VAL: 561_050, + }[self] + + def entries_path(self): + return f"imagenet21kp_{self.value}.txt" + + +def _get_tarball_path(class_id: str) -> str: + return f"{class_id}.tar" + + +def _make_mmap_tarball(tarballs_root: str, mmap_cache_size: int): + @lru_cache(maxsize=mmap_cache_size) + def _mmap_tarball(class_id: str) -> mmap: + tarball_path = _get_tarball_path(class_id) + tarball_full_path = os.path.join(tarballs_root, tarball_path) + with open(tarball_full_path) as f: + return mmap(fileno=f.fileno(), length=0, access=ACCESS_READ) + + return _mmap_tarball + + +class ImageNet22k(ExtendedVisionDataset): + _GZIPPED_INDICES: Set[int] = { + 841_545, + 1_304_131, + 2_437_921, + 2_672_079, + 2_795_676, + 2_969_786, + 6_902_965, + 6_903_550, + 6_903_628, + 7_432_557, + 7_432_589, + 7_813_809, + 8_329_633, + 10_296_990, + 10_417_652, + 10_492_265, + 10_598_078, + 10_782_398, + 10_902_612, + 11_203_736, + 11_342_890, + 11_397_596, + 11_589_762, + 11_705_103, + 12_936_875, + 13_289_782, + } + Labels = _Labels + + def __init__( + self, + *, + root: str, + extra: str, + transforms: Optional[Callable] = None, + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + mmap_cache_size: int = _DEFAULT_MMAP_CACHE_SIZE, + ) -> None: + super().__init__(root, transforms, transform, target_transform) + self._extra_root = extra + + entries_path = self._get_entries_path(root) + self._entries = self._load_extra(entries_path) + + class_ids_path = self._get_class_ids_path(root) + self._class_ids = self._load_extra(class_ids_path) + + self._gzipped_indices = ImageNet22k._GZIPPED_INDICES + self._mmap_tarball = _make_mmap_tarball(self._tarballs_root, mmap_cache_size) + + def _get_entries_path(self, root: Optional[str] = None) -> str: + return "entries.npy" + + def _get_class_ids_path(self, root: Optional[str] = None) -> str: + return "class-ids.npy" + + def _find_class_ids(self, path: str) -> List[str]: + class_ids = [] + + with os.scandir(path) as entries: + for entry in entries: + root, ext = os.path.splitext(entry.name) + if ext != ".tar": + continue + class_ids.append(root) + + return sorted(class_ids) + + def _load_entries_class_ids(self, root: Optional[str] = None) -> Tuple[List[_Entry], List[str]]: + root = self.get_root(root) + entries: List[_Entry] = [] + class_ids = self._find_class_ids(root) + + for class_index, class_id in enumerate(class_ids): + path = os.path.join(root, "blocks", f"{class_id}.log") + class_entries = [] + + try: + with open(path) as f: + for line in f: + line = line.rstrip() + block, filename = line.split(":") + block_offset = int(block[6:]) + filename = filename[1:] + + maybe_filename = None + if filename != "** Block of NULs **": + maybe_filename = filename + _, ext = 
os.path.splitext(filename)
+                            # assert ext == ".JPEG"
+
+                        class_entry = _ClassEntry(block_offset, maybe_filename)
+                        class_entries.append(class_entry)
+            except OSError as e:
+                raise RuntimeError(f'can not read blocks file "{path}"') from e
+
+            assert class_entries[-1].maybe_filename is None
+
+            for class_entry1, class_entry2 in zip(class_entries, class_entries[1:]):
+                assert class_entry1.block_offset <= class_entry2.block_offset
+                start_offset = 512 * class_entry1.block_offset
+                end_offset = 512 * class_entry2.block_offset
+                assert class_entry1.maybe_filename is not None
+                filename = class_entry1.maybe_filename
+                entry = _Entry(class_index, start_offset, end_offset, filename)
+                # Skip invalid image files (PIL throws UnidentifiedImageError)
+                if filename == "n06470073_47249.JPEG":
+                    continue
+                entries.append(entry)
+
+        return entries, class_ids
+
+    def _load_extra(self, extra_path: str) -> np.ndarray:
+        extra_root = self._extra_root
+        extra_full_path = os.path.join(extra_root, extra_path)
+        return np.load(extra_full_path, mmap_mode="r")
+
+    def _save_extra(self, extra_array: np.ndarray, extra_path: str) -> None:
+        extra_root = self._extra_root
+        extra_full_path = os.path.join(extra_root, extra_path)
+        os.makedirs(extra_root, exist_ok=True)
+        np.save(extra_full_path, extra_array)
+
+    @property
+    def _tarballs_root(self) -> str:
+        return self.root
+
+    def find_class_id(self, class_index: int) -> str:
+        return str(self._class_ids[class_index])
+
+    def get_image_data(self, index: int) -> bytes:
+        entry = self._entries[index]
+        class_id = entry["class_id"]
+        class_mmap = self._mmap_tarball(class_id)
+
+        start_offset, end_offset = entry["start_offset"], entry["end_offset"]
+        try:
+            mapped_data = class_mmap[start_offset:end_offset]
+            data = mapped_data[512:]  # Skip entry header block
+
+            if len(data) >= 2 and tuple(data[:2]) == (0x1F, 0x8B):
+                assert index in self._gzipped_indices, f"unexpected gzip header for sample {index}"
+                with GzipFile(fileobj=BytesIO(data)) as g:
+                    data = g.read()
+        except Exception as e:
+            raise RuntimeError(f"can not retrieve image data for sample {index} " f'from "{class_id}" tarball') from e
+
+        return data
+
+    def get_target(self, index: int) -> Any:
+        return int(self._entries[index]["class_index"])
+
+    def get_targets(self) -> np.ndarray:
+        return self._entries["class_index"]
+
+    def get_class_id(self, index: int) -> str:
+        return str(self._entries[index]["class_id"])
+
+    def get_class_ids(self) -> np.ndarray:
+        return self._entries["class_id"]
+
+    def __getitem__(self, index: int) -> Tuple[Any, Any]:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            return super().__getitem__(index)
+
+    def __len__(self) -> int:
+        return len(self._entries)
+
+    def _dump_entries(self, *args, **kwargs) -> None:
+        entries, class_ids = self._load_entries_class_ids(*args, **kwargs)
+
+        max_class_id_length, max_filename_length, max_class_index = -1, -1, -1
+        for entry in entries:
+            class_id = class_ids[entry.class_index]
+            max_class_index = max(entry.class_index, max_class_index)
+            max_class_id_length = max(len(class_id), max_class_id_length)
+            max_filename_length = max(len(entry.filename), max_filename_length)
+
+        dtype = np.dtype(
+            [
+                ("class_index", "<u4"),
+                ("class_id", f"U{max_class_id_length}"),
+                ("start_offset", "<u4"),
+                ("end_offset", "<u4"),
+                ("filename", f"U{max_filename_length}"),
+            ]
+        )
+        sample_count = len(entries)
+        entries_array = np.empty(sample_count, dtype=dtype)
+        for i, entry in enumerate(entries):
+            class_index = entry.class_index
+            class_id = class_ids[class_index]
+            start_offset = entry.start_offset
+            end_offset = entry.end_offset
+            filename = entry.filename
+            entries_array[i] = (class_index, class_id, start_offset, end_offset, filename)
+
+        entries_path = self._get_entries_path(*args, **kwargs)
+        self._save_extra(entries_array, entries_path)
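+    # The dumped `entries.npy` is a structured array with one record per image:
+    # (class_index, class_id, start_offset, end_offset, filename). The two
+    # offsets delimit the image's byte span inside its class tarball; the first
+    # 512 bytes of that span are the tar header block that `get_image_data`
+    # skips before decoding.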
+    def _dump_class_ids(self, *args, **kwargs) -> None:
+        entries_path = self._get_entries_path(*args, **kwargs)
+        entries_array = self._load_extra(entries_path)
+
+        max_class_id_length, max_class_index = -1, -1
+        for entry in entries_array:
+            class_index, class_id = entry["class_index"], entry["class_id"]
+            max_class_index = max(int(class_index), max_class_index)
+            max_class_id_length = max(len(str(class_id)), max_class_id_length)
+
+        class_ids_array = np.empty(max_class_index + 1, dtype=f"U{max_class_id_length}")
+        for entry in entries_array:
+            class_index, class_id = entry["class_index"], entry["class_id"]
+            class_ids_array[class_index] = class_id
+        class_ids_path = self._get_class_ids_path(*args, **kwargs)
+        self._save_extra(class_ids_array, class_ids_path)
+
+    def _dump_extra(self, *args, **kwargs) -> None:
+        self._dump_entries(*args, **kwargs)
+        self._dump_class_ids(*args, **kwargs)
+
+    def dump_extra(self, root: Optional[str] = None) -> None:
+        return self._dump_extra(root)
diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fb6f25a0a3c3251b803f48d0a515aa0b9591226
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/loaders.py
@@ -0,0 +1,223 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from enum import Enum
+from typing import Any, Callable, List, Optional, TypeVar
+
+import torch
+from torch.utils.data import Sampler
+
+from .datasets import ImageNet, ImageNet22k
+from .samplers import EpochSampler, InfiniteSampler, ShardedInfiniteSampler
+
+
+logger = logging.getLogger("dinov2")
+
+
+class SamplerType(Enum):
+    DISTRIBUTED = 0
+    EPOCH = 1
+    INFINITE = 2
+    SHARDED_INFINITE = 3
+    SHARDED_INFINITE_NEW = 4
+
+
+def _make_bool_str(b: bool) -> str:
+    return "yes" if b else "no"
+
+
+def _make_sample_transform(image_transform: Optional[Callable] = None, target_transform: Optional[Callable] = None):
+    def transform(sample):
+        image, target = sample
+        if image_transform is not None:
+            image = image_transform(image)
+        if target_transform is not None:
+            target = target_transform(target)
+        return image, target
+
+    return transform
+
+
+def _parse_dataset_str(dataset_str: str):
+    # Parses colon-separated descriptions such as "ImageNet:split=TRAIN" or
+    # "ImageNet22k:root=<root>:extra=<extra>".
+    tokens = dataset_str.split(":")
+
+    name = tokens[0]
+    kwargs = {}
+
+    for token in tokens[1:]:
+        key, value = token.split("=")
+        assert key in ("root", "extra", "split")
+        kwargs[key] = value
+
+    if name == "ImageNet":
+        class_ = ImageNet
+        if "split" in kwargs:
+            kwargs["split"] = ImageNet.Split[kwargs["split"]]
+    elif name == "ImageNet22k":
+        class_ = ImageNet22k
+    else:
+        raise ValueError(f'Unsupported dataset "{name}"')
+
+    return class_, kwargs
+
+
+def make_dataset(
+    *,
+    dataset_str: str,
+    transform: Optional[Callable] = None,
+    target_transform: Optional[Callable] = None,
+):
+    """
+    Creates a dataset with the specified parameters.
+
+    Args:
+        dataset_str: A dataset string description (e.g. ImageNet:split=TRAIN).
+        transform: A transform to apply to images.
+        target_transform: A transform to apply to targets.
+
+    Returns:
+        The created dataset.
+    """
+    logger.info(f'using dataset: "{dataset_str}"')
+
+    class_, kwargs = _parse_dataset_str(dataset_str)
+    dataset = class_(transform=transform, target_transform=target_transform, **kwargs)
+
+    logger.info(f"# of dataset samples: {len(dataset):,d}")
+
+    # Aggregated datasets do not expose (yet) these attributes, so add them.
+ if not hasattr(dataset, "transform"): + setattr(dataset, "transform", transform) + if not hasattr(dataset, "target_transform"): + setattr(dataset, "target_transform", target_transform) + + return dataset + + +def _make_sampler( + *, + dataset, + type: Optional[SamplerType] = None, + shuffle: bool = False, + seed: int = 0, + size: int = -1, + advance: int = 0, +) -> Optional[Sampler]: + sample_count = len(dataset) + + if type == SamplerType.INFINITE: + logger.info("sampler: infinite") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + return InfiniteSampler( + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + advance=advance, + ) + elif type in (SamplerType.SHARDED_INFINITE, SamplerType.SHARDED_INFINITE_NEW): + logger.info("sampler: sharded infinite") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + # TODO: Remove support for old shuffling + use_new_shuffle_tensor_slice = type == SamplerType.SHARDED_INFINITE_NEW + return ShardedInfiniteSampler( + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + advance=advance, + use_new_shuffle_tensor_slice=use_new_shuffle_tensor_slice, + ) + elif type == SamplerType.EPOCH: + logger.info("sampler: epoch") + if advance > 0: + raise NotImplementedError("sampler advance > 0 is not supported") + size = size if size > 0 else sample_count + logger.info(f"# of samples / epoch: {size:,d}") + return EpochSampler( + size=size, + sample_count=sample_count, + shuffle=shuffle, + seed=seed, + ) + elif type == SamplerType.DISTRIBUTED: + logger.info("sampler: distributed") + if size > 0: + raise ValueError("sampler size > 0 is invalid") + if advance > 0: + raise ValueError("sampler advance > 0 is invalid") + return torch.utils.data.DistributedSampler( + dataset=dataset, + shuffle=shuffle, + seed=seed, + drop_last=False, + ) + + logger.info("sampler: none") + return None + + +T = TypeVar("T") + + +def make_data_loader( + *, + dataset, + batch_size: int, + num_workers: int, + shuffle: bool = True, + seed: int = 0, + sampler_type: Optional[SamplerType] = SamplerType.INFINITE, + sampler_size: int = -1, + sampler_advance: int = 0, + drop_last: bool = True, + persistent_workers: bool = False, + collate_fn: Optional[Callable[[List[T]], Any]] = None, +): + """ + Creates a data loader with the specified parameters. + + Args: + dataset: A dataset (third party, LaViDa or WebDataset). + batch_size: The size of batches to generate. + num_workers: The number of workers to use. + shuffle: Whether to shuffle samples. + seed: The random seed to use. + sampler_type: Which sampler to use: EPOCH, INFINITE, SHARDED_INFINITE, SHARDED_INFINITE_NEW, DISTRIBUTED or None. + sampler_size: The number of images per epoch (when applicable) or -1 for the entire dataset. + sampler_advance: How many samples to skip (when applicable). + drop_last: Whether the last non-full batch of data should be dropped. + persistent_workers: maintain the workers Dataset instances alive after a dataset has been consumed once. 
+ collate_fn: Function that performs batch collation + """ + + sampler = _make_sampler( + dataset=dataset, + type=sampler_type, + shuffle=shuffle, + seed=seed, + size=sampler_size, + advance=sampler_advance, + ) + + logger.info("using PyTorch data loader") + data_loader = torch.utils.data.DataLoader( + dataset, + sampler=sampler, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=True, + drop_last=drop_last, + persistent_workers=persistent_workers, + collate_fn=collate_fn, + ) + + try: + logger.info(f"# of batches: {len(data_loader):,d}") + except TypeError: # data loader has no length + logger.info("infinite data loader") + return data_loader diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py new file mode 100644 index 0000000000000000000000000000000000000000..dc3c72648c3e440dcdb284366b98d2df12ad1272 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/masking.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import random +import math +import numpy as np + + +class MaskingGenerator: + def __init__( + self, + input_size, + num_masking_patches=None, + min_num_patches=4, + max_num_patches=None, + min_aspect=0.3, + max_aspect=None, + ): + if not isinstance(input_size, tuple): + input_size = (input_size,) * 2 + self.height, self.width = input_size + + self.num_patches = self.height * self.width + self.num_masking_patches = num_masking_patches + + self.min_num_patches = min_num_patches + self.max_num_patches = num_masking_patches if max_num_patches is None else max_num_patches + + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + + def __repr__(self): + repr_str = "Generator(%d, %d -> [%d ~ %d], max = %d, %.3f ~ %.3f)" % ( + self.height, + self.width, + self.min_num_patches, + self.max_num_patches, + self.num_masking_patches, + self.log_aspect_ratio[0], + self.log_aspect_ratio[1], + ) + return repr_str + + def get_shape(self): + return self.height, self.width + + def _mask(self, mask, max_mask_patches): + delta = 0 + for _ in range(10): + target_area = random.uniform(self.min_num_patches, max_mask_patches) + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < self.width and h < self.height: + top = random.randint(0, self.height - h) + left = random.randint(0, self.width - w) + + num_masked = mask[top : top + h, left : left + w].sum() + # Overlap + if 0 < h * w - num_masked <= max_mask_patches: + for i in range(top, top + h): + for j in range(left, left + w): + if mask[i, j] == 0: + mask[i, j] = 1 + delta += 1 + + if delta > 0: + break + return delta + + def __call__(self, num_masking_patches=0): + mask = np.zeros(shape=self.get_shape(), dtype=bool) + mask_count = 0 + while mask_count < num_masking_patches: + max_mask_patches = num_masking_patches - mask_count + max_mask_patches = min(max_mask_patches, self.max_num_patches) + + delta = self._mask(mask, max_mask_patches) + if delta == 0: + break + else: + mask_count += delta + + return mask diff --git 
a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..e356edf603a33ce2d18a388fd799694e22d1980f --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/samplers.py @@ -0,0 +1,230 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import itertools +from typing import Any, Optional +import warnings + +import numpy as np +import torch +from torch.utils.data.sampler import Sampler + +import dinov2.distributed as distributed + + +class EpochSampler(Sampler): + def __init__( + self, + *, + size: int, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + ): + self._size = size + self._sample_count = sample_count + self._shuffle = shuffle + self._seed = seed + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._epoch = 0 + + def __iter__(self): + count = (self._size + self._sample_count - 1) // self._sample_count + tiled_indices = np.tile(np.arange(self._sample_count), count) + if self._shuffle: + seed = self._seed * self._epoch if self._seed != 0 else self._epoch + rng = np.random.default_rng(seed) + iterable = rng.choice(tiled_indices, self._size, replace=False) + else: + iterable = tiled_indices[: self._size] + + yield from itertools.islice(iterable, self._start, None, self._step) + + def __len__(self): + return (self._size - self._start + self._step - 1) // self._step + + def set_epoch(self, epoch): + self._epoch = epoch + + +def _get_numpy_dtype(size: int) -> Any: + return np.int32 if size <= 2**31 else np.int64 + + +def _get_torch_dtype(size: int) -> Any: + return torch.int32 if size <= 2**31 else torch.int64 + + +def _generate_randperm_indices(*, size: int, generator: torch.Generator): + """Generate the indices of a random permutation.""" + dtype = _get_torch_dtype(size) + # This is actually matching PyTorch's CPU implementation, see: https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorFactories.cpp#L900-L921 + perm = torch.arange(size, dtype=dtype) + for i in range(size): + j = torch.randint(i, size, size=(1,), generator=generator).item() + + # Always swap even if no-op + value = perm[j].item() + perm[j] = perm[i].item() + perm[i] = value + yield value + + +class InfiniteSampler(Sampler): + def __init__( + self, + *, + sample_count: int, + shuffle: bool = False, + seed: int = 0, + start: Optional[int] = None, + step: Optional[int] = None, + advance: int = 0, + ): + self._sample_count = sample_count + self._seed = seed + self._shuffle = shuffle + self._start = distributed.get_global_rank() if start is None else start + self._step = distributed.get_global_size() if step is None else step + self._advance = advance + + def __iter__(self): + if self._shuffle: + iterator = self._shuffled_iterator() + else: + iterator = self._iterator() + + yield from itertools.islice(iterator, self._advance, None) + + def _iterator(self): + assert not self._shuffle + + while True: + iterable = range(self._sample_count) + yield from itertools.islice(iterable, self._start, None, self._step) + + def _shuffled_iterator(self): + 
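+        # Yields an endless stream of dataset indices: each pass draws a fresh
+        # random permutation, and this rank keeps every `self._step`-th index
+        # starting at `self._start`, so ranks iterate over disjoint slices.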
assert self._shuffle
+
+        # Instantiate a generator here (rather than in the ctor) to keep the class
+        # picklable (requirement of mp.spawn)
+        generator = torch.Generator().manual_seed(self._seed)
+
+        while True:
+            iterable = _generate_randperm_indices(size=self._sample_count, generator=generator)
+            yield from itertools.islice(iterable, self._start, None, self._step)
+
+
+# The following function is somewhat equivalent to _new_shuffle_tensor_slice below,
+# but avoids a full in-place random permutation generation.
+def _shuffle_tensor_slice(
+    *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator
+) -> np.ndarray:
+    stop = len(tensor)
+    count = stop // step
+    drop_count = stop - step * count
+    if drop_count:
+        warnings.warn(f"# of dropped samples: {drop_count}")
+
+    dtype = _get_numpy_dtype(stop)
+    result = np.empty(count, dtype=dtype)
+
+    for i in range(count):
+        j = torch.randint(0, i + 1, size=(1,), generator=generator).item() if i > 0 else 0
+
+        result[i] = result[j]
+        result[j] = tensor[start + i * step].item()
+
+    return result
+
+
+def _new_shuffle_tensor_slice(
+    *, tensor: torch.Tensor, start: int = 0, step: int = 1, generator: torch.Generator
+) -> np.ndarray:
+    stop = len(tensor)
+    dtype = torch.int64  # Needed for using randperm result as indices
+    count = stop // step
+    drop_count = stop - step * count
+    if drop_count:
+        warnings.warn(f"# of dropped samples: {drop_count}")
+    indices = torch.randperm(count, dtype=dtype, generator=generator)
+    return tensor[start::step][indices].numpy()
+
+
+def _make_seed(seed: int, start: int, iter_count: int) -> int:
+    # NOTE: Tried a few variants (including iter_count << 32), this one worked best.
+    return seed + start + (iter_count << 24)
+
+
+class ShardedInfiniteSampler(Sampler):
+    def __init__(
+        self,
+        *,
+        sample_count: int,
+        shuffle: bool = False,
+        seed: int = 0,
+        start: Optional[int] = None,
+        step: Optional[int] = None,
+        advance: int = 0,
+        use_new_shuffle_tensor_slice: bool = False,
+    ):
+        self._sample_count = sample_count
+        self._seed = seed
+        self._shuffle = shuffle
+        self._start = distributed.get_global_rank() if start is None else start
+        self._step = distributed.get_global_size() if step is None else step
+        self._advance = advance
+        self._iter_count = 0
+        self._shuffle_tensor_slice_fn = (
+            _new_shuffle_tensor_slice if use_new_shuffle_tensor_slice else _shuffle_tensor_slice
+        )
+
+    def __iter__(self):
+        iter_count = self._advance // self._sample_count
+        if iter_count > 0:
+            self._advance -= iter_count * self._sample_count
+            self._iter_count += iter_count
+
+        if self._shuffle:
+            iterator = self._shuffled_iterator()
+        else:
+            iterator = self._iterator()
+
+        yield from itertools.islice(iterator, self._advance, None)
+
+    def _iterator(self):
+        assert not self._shuffle
+
+        while True:
+            iterable = range(self._sample_count)
+            yield from itertools.islice(iterable, self._start, None, self._step)
+
+    def _shuffled_iterator(self):
+        assert self._shuffle
+
+        # Instantiate a generator here (rather than in the ctor) to keep the class
+        # picklable (requirement of mp.spawn)
+        generator = torch.Generator()
+
+        # Always shuffle everything first
+        generator.manual_seed(self._seed)
+        dtype = _get_torch_dtype(self._sample_count)
+        perm = torch.randperm(self._sample_count, dtype=dtype, generator=generator)
+
+        while True:
+            # Re-seed on each iteration to allow skipping whole permutations
+            seed = _make_seed(self._seed, self._start, self._iter_count)
+            generator.manual_seed(seed)
+
iterable = self._shuffle_tensor_slice_fn( + tensor=perm, start=self._start, step=self._step, generator=generator + ) + yield from iterable + self._iter_count += 1 diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..f1bc4cbd1a459a9f44314806cf9ccedea112ab14 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/data/transforms.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Sequence + +import torch +from torchvision import transforms + + +class GaussianBlur(transforms.RandomApply): + """ + Apply Gaussian Blur to the PIL image. + """ + + def __init__(self, *, p: float = 0.5, radius_min: float = 0.1, radius_max: float = 2.0): + # NOTE: torchvision is applying 1 - probability to return the original image + keep_p = 1 - p + transform = transforms.GaussianBlur(kernel_size=9, sigma=(radius_min, radius_max)) + super().__init__(transforms=[transform], p=keep_p) + + +class MaybeToTensor(transforms.ToTensor): + """ + Convert a ``PIL Image`` or ``numpy.ndarray`` to tensor, or keep as is if already a tensor. + """ + + def __call__(self, pic): + """ + Args: + pic (PIL Image, numpy.ndarray or torch.tensor): Image to be converted to tensor. + Returns: + Tensor: Converted image. + """ + if isinstance(pic, torch.Tensor): + return pic + return super().__call__(pic) + + +# Use timm's names +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + + +def make_normalize_transform( + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +) -> transforms.Normalize: + return transforms.Normalize(mean=mean, std=std) + + +# This roughly matches torchvision's preset for classification training: +# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L6-L44 +def make_classification_train_transform( + *, + crop_size: int = 224, + interpolation=transforms.InterpolationMode.BICUBIC, + hflip_prob: float = 0.5, + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +): + transforms_list = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + if hflip_prob > 0.0: + transforms_list.append(transforms.RandomHorizontalFlip(hflip_prob)) + transforms_list.extend( + [ + MaybeToTensor(), + make_normalize_transform(mean=mean, std=std), + ] + ) + return transforms.Compose(transforms_list) + + +# This matches (roughly) torchvision's preset for classification evaluation: +# https://github.com/pytorch/vision/blob/main/references/classification/presets.py#L47-L69 +def make_classification_eval_transform( + *, + resize_size: int = 256, + interpolation=transforms.InterpolationMode.BICUBIC, + crop_size: int = 224, + mean: Sequence[float] = IMAGENET_DEFAULT_MEAN, + std: Sequence[float] = IMAGENET_DEFAULT_STD, +) -> transforms.Compose: + transforms_list = [ + transforms.Resize(resize_size, interpolation=interpolation), + transforms.CenterCrop(crop_size), + MaybeToTensor(), + make_normalize_transform(mean=mean, std=std), + ] + return transforms.Compose(transforms_list) diff --git 
a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ccd663f33d5a21ad1f9d25db7bd378ec52aeac2 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/distributed/__init__.py @@ -0,0 +1,271 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +import random +import re +import socket +from typing import Dict, List + +import torch +import torch.distributed as dist + +_LOCAL_RANK = -1 +_LOCAL_WORLD_SIZE = -1 + + +def is_enabled() -> bool: + """ + Returns: + True if distributed training is enabled + """ + return dist.is_available() and dist.is_initialized() + + +def get_global_size() -> int: + """ + Returns: + The number of processes in the process group + """ + return dist.get_world_size() if is_enabled() else 1 + + +def get_global_rank() -> int: + """ + Returns: + The rank of the current process within the global process group. + """ + return dist.get_rank() if is_enabled() else 0 + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not is_enabled(): + return 0 + assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE + return _LOCAL_RANK + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, + i.e. the number of processes per machine. + """ + if not is_enabled(): + return 1 + assert 0 <= _LOCAL_RANK < _LOCAL_WORLD_SIZE + return _LOCAL_WORLD_SIZE + + +def is_main_process() -> bool: + """ + Returns: + True if the current process is the main one. + """ + return get_global_rank() == 0 + + +def _restrict_print_to_main_process() -> None: + """ + This function disables printing when not in the main process + """ + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if is_main_process() or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def _get_master_port(seed: int = 0) -> int: + MIN_MASTER_PORT, MAX_MASTER_PORT = (20_000, 60_000) + + master_port_str = os.environ.get("MASTER_PORT") + if master_port_str is None: + rng = random.Random(seed) + return rng.randint(MIN_MASTER_PORT, MAX_MASTER_PORT) + + return int(master_port_str) + + +def _get_available_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + # A "" host address means INADDR_ANY i.e. binding to all interfaces. + # Note this is not compatible with IPv6. 
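+        # Binding to port 0 asks the kernel for a free ephemeral port. There is
+        # an unavoidable race: the port may be taken again between closing this
+        # probe socket and the actual rendezvous bind.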
+ s.bind(("", 0)) + port = s.getsockname()[1] + return port + + +_TORCH_DISTRIBUTED_ENV_VARS = ( + "MASTER_ADDR", + "MASTER_PORT", + "RANK", + "WORLD_SIZE", + "LOCAL_RANK", + "LOCAL_WORLD_SIZE", +) + + +def _collect_env_vars() -> Dict[str, str]: + return {env_var: os.environ[env_var] for env_var in _TORCH_DISTRIBUTED_ENV_VARS if env_var in os.environ} + + +def _is_slurm_job_process() -> bool: + return "SLURM_JOB_ID" in os.environ + + +def _parse_slurm_node_list(s: str) -> List[str]: + nodes = [] + # Extract "hostname", "hostname[1-2,3,4-5]," substrings + p = re.compile(r"(([^\[]+)(?:\[([^\]]+)\])?),?") + for m in p.finditer(s): + prefix, suffixes = s[m.start(2) : m.end(2)], s[m.start(3) : m.end(3)] + for suffix in suffixes.split(","): + span = suffix.split("-") + if len(span) == 1: + nodes.append(prefix + suffix) + else: + width = len(span[0]) + start, end = int(span[0]), int(span[1]) + 1 + nodes.extend([prefix + f"{i:0{width}}" for i in range(start, end)]) + return nodes + + +def _check_env_variable(key: str, new_value: str): + # Only check for difference with preset environment variables + if key in os.environ and os.environ[key] != new_value: + raise RuntimeError(f"Cannot export environment variables as {key} is already set") + + +class _TorchDistributedEnvironment: + def __init__(self): + self.master_addr = "127.0.0.1" + self.master_port = 0 + self.rank = -1 + self.world_size = -1 + self.local_rank = -1 + self.local_world_size = -1 + + if _is_slurm_job_process(): + return self._set_from_slurm_env() + + env_vars = _collect_env_vars() + if not env_vars: + # Environment is not set + pass + elif len(env_vars) == len(_TORCH_DISTRIBUTED_ENV_VARS): + # Environment is fully set + return self._set_from_preset_env() + else: + # Environment is partially set + collected_env_vars = ", ".join(env_vars.keys()) + raise RuntimeError(f"Partially set environment: {collected_env_vars}") + + if torch.cuda.device_count() > 0: + return self._set_from_local() + + raise RuntimeError("Can't initialize PyTorch distributed environment") + + # Slurm job created with sbatch, submitit, etc... + def _set_from_slurm_env(self): + # logger.info("Initialization from Slurm environment") + job_id = int(os.environ["SLURM_JOB_ID"]) + node_count = int(os.environ["SLURM_JOB_NUM_NODES"]) + nodes = _parse_slurm_node_list(os.environ["SLURM_JOB_NODELIST"]) + assert len(nodes) == node_count + + self.master_addr = nodes[0] + self.master_port = _get_master_port(seed=job_id) + self.rank = int(os.environ["SLURM_PROCID"]) + self.world_size = int(os.environ["SLURM_NTASKS"]) + assert self.rank < self.world_size + self.local_rank = int(os.environ["SLURM_LOCALID"]) + self.local_world_size = self.world_size // node_count + assert self.local_rank < self.local_world_size + + # Single node job with preset environment (i.e. torchrun) + def _set_from_preset_env(self): + # logger.info("Initialization from preset environment") + self.master_addr = os.environ["MASTER_ADDR"] + self.master_port = os.environ["MASTER_PORT"] + self.rank = int(os.environ["RANK"]) + self.world_size = int(os.environ["WORLD_SIZE"]) + assert self.rank < self.world_size + self.local_rank = int(os.environ["LOCAL_RANK"]) + self.local_world_size = int(os.environ["LOCAL_WORLD_SIZE"]) + assert self.local_rank < self.local_world_size + + # Single node and GPU job (i.e. 
local script run) + def _set_from_local(self): + # logger.info("Initialization from local") + self.master_addr = "127.0.0.1" + self.master_port = _get_available_port() + self.rank = 0 + self.world_size = 1 + self.local_rank = 0 + self.local_world_size = 1 + + def export(self, *, overwrite: bool) -> "_TorchDistributedEnvironment": + # See the "Environment variable initialization" section from + # https://pytorch.org/docs/stable/distributed.html for the complete list of + # environment variables required for the env:// initialization method. + env_vars = { + "MASTER_ADDR": self.master_addr, + "MASTER_PORT": str(self.master_port), + "RANK": str(self.rank), + "WORLD_SIZE": str(self.world_size), + "LOCAL_RANK": str(self.local_rank), + "LOCAL_WORLD_SIZE": str(self.local_world_size), + } + if not overwrite: + for k, v in env_vars.items(): + _check_env_variable(k, v) + + os.environ.update(env_vars) + return self + + +def enable(*, set_cuda_current_device: bool = True, overwrite: bool = False, allow_nccl_timeout: bool = False): + """Enable distributed mode + + Args: + set_cuda_current_device: If True, call torch.cuda.set_device() to set the + current PyTorch CUDA device to the one matching the local rank. + overwrite: If True, overwrites already set variables. Else fails. + """ + + global _LOCAL_RANK, _LOCAL_WORLD_SIZE + if _LOCAL_RANK >= 0 or _LOCAL_WORLD_SIZE >= 0: + raise RuntimeError("Distributed mode has already been enabled") + torch_env = _TorchDistributedEnvironment() + torch_env.export(overwrite=overwrite) + + if set_cuda_current_device: + torch.cuda.set_device(torch_env.local_rank) + + if allow_nccl_timeout: + # This allows to use torch distributed timeout in a NCCL backend + key, value = "NCCL_ASYNC_ERROR_HANDLING", "1" + if not overwrite: + _check_env_variable(key, value) + os.environ[key] = value + + dist.init_process_group(backend="nccl") + dist.barrier() + + # Finalize setup + _LOCAL_RANK = torch_env.local_rank + _LOCAL_WORLD_SIZE = torch_env.local_world_size + _restrict_print_to_main_process() diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0952fcc3f57e34b3747962e9ebd6fc57aeea63fa --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py new file mode 100644 index 0000000000000000000000000000000000000000..02ee261348e9871b10bfc40b7283b4f6205cba18 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/knn.py @@ -0,0 +1,405 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
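+
+# Illustrative entry point (a hedged sketch; `model` is assumed to be a frozen
+# feature extractor and "./out" a writable output directory):
+#
+#     results = eval_knn_with_model(model=model, output_dir="./out",
+#                                   nb_knn=(10, 20), temperature=0.07)
+#
+# Train-split features are extracted once, then k-NN top-1/top-5 accuracy is
+# reported for every k in `nb_knn` on the validation split.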
+
+import argparse
+from functools import partial
+import json
+import logging
+import os
+import sys
+from typing import List, Optional
+
+import torch
+from torch.nn.functional import one_hot, softmax
+
+import dinov2.distributed as distributed
+from dinov2.data import SamplerType, make_data_loader, make_dataset
+from dinov2.data.transforms import make_classification_eval_transform
+from dinov2.eval.metrics import AccuracyAveraging, build_topk_accuracy_metric
+from dinov2.eval.setup import get_args_parser as get_setup_args_parser
+from dinov2.eval.setup import setup_and_build_model
+from dinov2.eval.utils import ModelWithNormalize, evaluate, extract_features
+
+
+logger = logging.getLogger("dinov2")
+
+
+def get_args_parser(
+    description: Optional[str] = None,
+    parents: Optional[List[argparse.ArgumentParser]] = None,
+    add_help: bool = True,
+):
+    parents = parents or []
+    setup_args_parser = get_setup_args_parser(parents=parents, add_help=False)
+    parents = [setup_args_parser]
+    parser = argparse.ArgumentParser(
+        description=description,
+        parents=parents,
+        add_help=add_help,
+    )
+    parser.add_argument(
+        "--train-dataset",
+        dest="train_dataset_str",
+        type=str,
+        help="Training dataset",
+    )
+    parser.add_argument(
+        "--val-dataset",
+        dest="val_dataset_str",
+        type=str,
+        help="Validation dataset",
+    )
+    parser.add_argument(
+        "--nb_knn",
+        nargs="+",
+        type=int,
+        help="Number of nearest neighbors to use; 20 usually works best.",
+    )
+    parser.add_argument(
+        "--temperature",
+        type=float,
+        help="Temperature used in the voting coefficient",
+    )
+    parser.add_argument(
+        "--gather-on-cpu",
+        action="store_true",
+        help="Whether to gather the train features on CPU: slower, "
+        "but useful to avoid OOM for large datasets (e.g. ImageNet22k).",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        help="Batch size.",
+    )
+    parser.add_argument(
+        "--n-per-class-list",
+        nargs="+",
+        type=int,
+        help="Number of samples to take per class",
+    )
+    parser.add_argument(
+        "--n-tries",
+        type=int,
+        help="Number of tries",
+    )
+    parser.set_defaults(
+        train_dataset_str="ImageNet:split=TRAIN",
+        val_dataset_str="ImageNet:split=VAL",
+        nb_knn=[10, 20, 100, 200],
+        temperature=0.07,
+        batch_size=256,
+        n_per_class_list=[-1],
+        n_tries=1,
+    )
+    return parser
+
+
+class KnnModule(torch.nn.Module):
+    """
+    Gets the k-NN of test features from all processes on a chunk of the train features
+
+    Each rank gets a chunk of the train features as well as a chunk of the test features.
+    In `compute_neighbors`, for each rank one after the other, its chunk of test features
+    is sent to all devices, partial knns are computed with each chunk of train features
+    then collated back on the original device.
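+
+    The vote itself (see `forward` below): the same `max_k = max(nb_knn)`
+    neighbors are reused for every k, weighting one-hot neighbor labels by
+    softmax(topk_sims / T) and summing over the first k neighbors.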
+ """ + + def __init__(self, train_features, train_labels, nb_knn, T, device, num_classes=1000): + super().__init__() + + self.global_rank = distributed.get_global_rank() + self.global_size = distributed.get_global_size() + + self.device = device + self.train_features_rank_T = train_features.chunk(self.global_size)[self.global_rank].T.to(self.device) + self.candidates = train_labels.chunk(self.global_size)[self.global_rank].view(1, -1).to(self.device) + + self.nb_knn = nb_knn + self.max_k = max(self.nb_knn) + self.T = T + self.num_classes = num_classes + + def _get_knn_sims_and_labels(self, similarity, train_labels): + topk_sims, indices = similarity.topk(self.max_k, largest=True, sorted=True) + neighbors_labels = torch.gather(train_labels, 1, indices) + return topk_sims, neighbors_labels + + def _similarity_for_rank(self, features_rank, source_rank): + # Send the features from `source_rank` to all ranks + broadcast_shape = torch.tensor(features_rank.shape).to(self.device) + torch.distributed.broadcast(broadcast_shape, source_rank) + + broadcasted = features_rank + if self.global_rank != source_rank: + broadcasted = torch.zeros(*broadcast_shape, dtype=features_rank.dtype, device=self.device) + torch.distributed.broadcast(broadcasted, source_rank) + + # Compute the neighbors for `source_rank` among `train_features_rank_T` + similarity_rank = torch.mm(broadcasted, self.train_features_rank_T) + candidate_labels = self.candidates.expand(len(similarity_rank), -1) + return self._get_knn_sims_and_labels(similarity_rank, candidate_labels) + + def _gather_all_knn_for_rank(self, topk_sims, neighbors_labels, target_rank): + # Gather all neighbors for `target_rank` + topk_sims_rank = retrieved_rank = None + if self.global_rank == target_rank: + topk_sims_rank = [torch.zeros_like(topk_sims) for _ in range(self.global_size)] + retrieved_rank = [torch.zeros_like(neighbors_labels) for _ in range(self.global_size)] + + torch.distributed.gather(topk_sims, topk_sims_rank, dst=target_rank) + torch.distributed.gather(neighbors_labels, retrieved_rank, dst=target_rank) + + if self.global_rank == target_rank: + # Perform a second top-k on the k * global_size retrieved neighbors + topk_sims_rank = torch.cat(topk_sims_rank, dim=1) + retrieved_rank = torch.cat(retrieved_rank, dim=1) + results = self._get_knn_sims_and_labels(topk_sims_rank, retrieved_rank) + return results + return None + + def compute_neighbors(self, features_rank): + for rank in range(self.global_size): + topk_sims, neighbors_labels = self._similarity_for_rank(features_rank, rank) + results = self._gather_all_knn_for_rank(topk_sims, neighbors_labels, rank) + if results is not None: + topk_sims_rank, neighbors_labels_rank = results + return topk_sims_rank, neighbors_labels_rank + + def forward(self, features_rank): + """ + Compute the results on all values of `self.nb_knn` neighbors from the full `self.max_k` + """ + assert all(k <= self.max_k for k in self.nb_knn) + + topk_sims, neighbors_labels = self.compute_neighbors(features_rank) + batch_size = neighbors_labels.shape[0] + topk_sims_transform = softmax(topk_sims / self.T, 1) + matmul = torch.mul( + one_hot(neighbors_labels, num_classes=self.num_classes), + topk_sims_transform.view(batch_size, -1, 1), + ) + probas_for_k = {k: torch.sum(matmul[:, :k, :], 1) for k in self.nb_knn} + return probas_for_k + + +class DictKeysModule(torch.nn.Module): + def __init__(self, keys): + super().__init__() + self.keys = keys + + def forward(self, features_dict, targets): + for k in self.keys: + features_dict = 
features_dict[k] + return {"preds": features_dict, "target": targets} + + +def create_module_dict(*, module, n_per_class_list, n_tries, nb_knn, train_features, train_labels): + modules = {} + mapping = create_class_indices_mapping(train_labels) + for npc in n_per_class_list: + if npc < 0: # Only one try needed when using the full data + full_module = module( + train_features=train_features, + train_labels=train_labels, + nb_knn=nb_knn, + ) + modules["full"] = ModuleDictWithForward({"1": full_module}) + continue + all_tries = {} + for t in range(n_tries): + final_indices = filter_train(mapping, npc, seed=t) + k_list = list(set(nb_knn + [npc])) + k_list = sorted([el for el in k_list if el <= npc]) + all_tries[str(t)] = module( + train_features=train_features[final_indices], + train_labels=train_labels[final_indices], + nb_knn=k_list, + ) + modules[f"{npc} per class"] = ModuleDictWithForward(all_tries) + + return ModuleDictWithForward(modules) + + +def filter_train(mapping, n_per_class, seed): + torch.manual_seed(seed) + final_indices = [] + for k in mapping.keys(): + index = torch.randperm(len(mapping[k]))[:n_per_class] + final_indices.append(mapping[k][index]) + return torch.cat(final_indices).squeeze() + + +def create_class_indices_mapping(labels): + unique_labels, inverse = torch.unique(labels, return_inverse=True) + mapping = {unique_labels[i]: (inverse == i).nonzero() for i in range(len(unique_labels))} + return mapping + + +class ModuleDictWithForward(torch.nn.ModuleDict): + def forward(self, *args, **kwargs): + return {k: module(*args, **kwargs) for k, module in self._modules.items()} + + +def eval_knn( + model, + train_dataset, + val_dataset, + accuracy_averaging, + nb_knn, + temperature, + batch_size, + num_workers, + gather_on_cpu, + n_per_class_list=[-1], + n_tries=1, +): + model = ModelWithNormalize(model) + + logger.info("Extracting features for train set...") + train_features, train_labels = extract_features( + model, train_dataset, batch_size, num_workers, gather_on_cpu=gather_on_cpu + ) + logger.info(f"Train features created, shape {train_features.shape}.") + + val_dataloader = make_data_loader( + dataset=val_dataset, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + persistent_workers=True, + ) + num_classes = train_labels.max() + 1 + metric_collection = build_topk_accuracy_metric(accuracy_averaging, num_classes=num_classes) + + device = torch.cuda.current_device() + partial_module = partial(KnnModule, T=temperature, device=device, num_classes=num_classes) + knn_module_dict = create_module_dict( + module=partial_module, + n_per_class_list=n_per_class_list, + n_tries=n_tries, + nb_knn=nb_knn, + train_features=train_features, + train_labels=train_labels, + ) + postprocessors, metrics = {}, {} + for n_per_class, knn_module in knn_module_dict.items(): + for t, knn_try in knn_module.items(): + postprocessors = { + **postprocessors, + **{(n_per_class, t, k): DictKeysModule([n_per_class, t, k]) for k in knn_try.nb_knn}, + } + metrics = {**metrics, **{(n_per_class, t, k): metric_collection.clone() for k in knn_try.nb_knn}} + model_with_knn = torch.nn.Sequential(model, knn_module_dict) + + # ============ evaluation ... 
============ + logger.info("Start the k-NN classification.") + _, results_dict = evaluate(model_with_knn, val_dataloader, postprocessors, metrics, device) + + # Averaging the results over the n tries for each value of n_per_class + for n_per_class, knn_module in knn_module_dict.items(): + first_try = list(knn_module.keys())[0] + k_list = knn_module[first_try].nb_knn + for k in k_list: + keys = results_dict[(n_per_class, first_try, k)].keys() # keys are e.g. `top-1` and `top-5` + results_dict[(n_per_class, k)] = { + key: torch.mean(torch.stack([results_dict[(n_per_class, t, k)][key] for t in knn_module.keys()])) + for key in keys + } + for t in knn_module.keys(): + del results_dict[(n_per_class, t, k)] + + return results_dict + + +def eval_knn_with_model( + model, + output_dir, + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + nb_knn=(10, 20, 100, 200), + temperature=0.07, + autocast_dtype=torch.float, + accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY, + transform=None, + gather_on_cpu=False, + batch_size=256, + num_workers=5, + n_per_class_list=[-1], + n_tries=1, +): + transform = transform or make_classification_eval_transform() + + train_dataset = make_dataset( + dataset_str=train_dataset_str, + transform=transform, + ) + val_dataset = make_dataset( + dataset_str=val_dataset_str, + transform=transform, + ) + + with torch.cuda.amp.autocast(dtype=autocast_dtype): + results_dict_knn = eval_knn( + model=model, + train_dataset=train_dataset, + val_dataset=val_dataset, + accuracy_averaging=accuracy_averaging, + nb_knn=nb_knn, + temperature=temperature, + batch_size=batch_size, + num_workers=num_workers, + gather_on_cpu=gather_on_cpu, + n_per_class_list=n_per_class_list, + n_tries=n_tries, + ) + + results_dict = {} + if distributed.is_main_process(): + for knn_ in results_dict_knn.keys(): + top1 = results_dict_knn[knn_]["top-1"].item() * 100.0 + top5 = results_dict_knn[knn_]["top-5"].item() * 100.0 + results_dict[f"{knn_} Top 1"] = top1 + results_dict[f"{knn_} Top 5"] = top5 + logger.info(f"{knn_} classifier result: Top1: {top1:.2f} Top5: {top5:.2f}") + + metrics_file_path = os.path.join(output_dir, "results_eval_knn.json") + with open(metrics_file_path, "a") as f: + for k, v in results_dict.items(): + f.write(json.dumps({k: v}) + "\n") + + if distributed.is_enabled(): + torch.distributed.barrier() + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + eval_knn_with_model( + model=model, + output_dir=args.output_dir, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + nb_knn=args.nb_knn, + temperature=args.temperature, + autocast_dtype=autocast_dtype, + accuracy_averaging=AccuracyAveraging.MEAN_ACCURACY, + transform=None, + gather_on_cpu=args.gather_on_cpu, + batch_size=args.batch_size, + num_workers=5, + n_per_class_list=args.n_per_class_list, + n_tries=args.n_tries, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 k-NN evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..3d8202606999c0c01353904d8b02d2ff3509fef9 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/linear.py @@ 
-0,0 +1,626 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+from functools import partial
+import json
+import logging
+import os
+import sys
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.nn.parallel import DistributedDataParallel
+from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer
+
+from dinov2.data import SamplerType, make_data_loader, make_dataset
+from dinov2.data.transforms import make_classification_eval_transform, make_classification_train_transform
+import dinov2.distributed as distributed
+from dinov2.eval.metrics import MetricType, build_metric
+from dinov2.eval.setup import get_args_parser as get_setup_args_parser
+from dinov2.eval.setup import setup_and_build_model
+from dinov2.eval.utils import ModelWithIntermediateLayers, evaluate
+from dinov2.logging import MetricLogger
+
+
+logger = logging.getLogger("dinov2")
+
+
+def get_args_parser(
+    description: Optional[str] = None,
+    parents: Optional[List[argparse.ArgumentParser]] = None,
+    add_help: bool = True,
+):
+    parents = parents or []
+    setup_args_parser = get_setup_args_parser(parents=parents, add_help=False)
+    parents = [setup_args_parser]
+    parser = argparse.ArgumentParser(
+        description=description,
+        parents=parents,
+        add_help=add_help,
+    )
+    parser.add_argument(
+        "--train-dataset",
+        dest="train_dataset_str",
+        type=str,
+        help="Training dataset",
+    )
+    parser.add_argument(
+        "--val-dataset",
+        dest="val_dataset_str",
+        type=str,
+        help="Validation dataset",
+    )
+    parser.add_argument(
+        "--test-datasets",
+        dest="test_dataset_strs",
+        type=str,
+        nargs="+",
+        help="Test datasets; if unset, the validation dataset is reused",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        help="Number of training epochs",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        help="Batch size (per GPU)",
+    )
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        help="Number of workers",
+    )
+    parser.add_argument(
+        "--epoch-length",
+        type=int,
+        help="Length of an epoch in number of iterations",
+    )
+    parser.add_argument(
+        "--save-checkpoint-frequency",
+        type=int,
+        help="Number of epochs between two named checkpoint saves.",
+    )
+    parser.add_argument(
+        "--eval-period-iterations",
+        type=int,
+        help="Number of iterations between two evaluations.",
+    )
+    parser.add_argument(
+        "--learning-rates",
+        nargs="+",
+        type=float,
+        help="Learning rates to grid search.",
+    )
+    parser.add_argument(
+        "--no-resume",
+        action="store_true",
+        help="Do not resume from existing checkpoints",
+    )
+    parser.add_argument(
+        "--val-metric-type",
+        type=MetricType,
+        choices=list(MetricType),
+        help="Validation metric",
+    )
+    parser.add_argument(
+        "--test-metric-types",
+        type=MetricType,
+        choices=list(MetricType),
+        nargs="+",
+        help="Evaluation metric",
+    )
+    parser.add_argument(
+        "--classifier-fpath",
+        type=str,
+        help="Path to a file containing pretrained linear classifiers",
+    )
+    parser.add_argument(
+        "--val-class-mapping-fpath",
+        type=str,
+        help="Path to a file containing a mapping to adjust classifier outputs",
+    )
+    parser.add_argument(
+        "--test-class-mapping-fpaths",
+        nargs="+",
+        type=str,
+        help="Path to a file containing a mapping to adjust classifier outputs",
+    )
+    parser.set_defaults(
+        train_dataset_str="ImageNet:split=TRAIN",
val_dataset_str="ImageNet:split=VAL", + test_dataset_strs=None, + epochs=10, + batch_size=128, + num_workers=8, + epoch_length=1250, + save_checkpoint_frequency=20, + eval_period_iterations=1250, + learning_rates=[1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2, 0.1], + val_metric_type=MetricType.MEAN_ACCURACY, + test_metric_types=None, + classifier_fpath=None, + val_class_mapping_fpath=None, + test_class_mapping_fpaths=[None], + ) + return parser + + +def has_ddp_wrapper(m: nn.Module) -> bool: + return isinstance(m, DistributedDataParallel) + + +def remove_ddp_wrapper(m: nn.Module) -> nn.Module: + return m.module if has_ddp_wrapper(m) else m + + +def _pad_and_collate(batch): + maxlen = max(len(targets) for image, targets in batch) + padded_batch = [ + (image, np.pad(targets, (0, maxlen - len(targets)), constant_values=-1)) for image, targets in batch + ] + return torch.utils.data.default_collate(padded_batch) + + +def create_linear_input(x_tokens_list, use_n_blocks, use_avgpool): + intermediate_output = x_tokens_list[-use_n_blocks:] + output = torch.cat([class_token for _, class_token in intermediate_output], dim=-1) + if use_avgpool: + output = torch.cat( + ( + output, + torch.mean(intermediate_output[-1][0], dim=1), # patch tokens + ), + dim=-1, + ) + output = output.reshape(output.shape[0], -1) + return output.float() + + +class LinearClassifier(nn.Module): + """Linear layer to train on top of frozen features""" + + def __init__(self, out_dim, use_n_blocks, use_avgpool, num_classes=1000): + super().__init__() + self.out_dim = out_dim + self.use_n_blocks = use_n_blocks + self.use_avgpool = use_avgpool + self.num_classes = num_classes + self.linear = nn.Linear(out_dim, num_classes) + self.linear.weight.data.normal_(mean=0.0, std=0.01) + self.linear.bias.data.zero_() + + def forward(self, x_tokens_list): + output = create_linear_input(x_tokens_list, self.use_n_blocks, self.use_avgpool) + return self.linear(output) + + +class AllClassifiers(nn.Module): + def __init__(self, classifiers_dict): + super().__init__() + self.classifiers_dict = nn.ModuleDict() + self.classifiers_dict.update(classifiers_dict) + + def forward(self, inputs): + return {k: v.forward(inputs) for k, v in self.classifiers_dict.items()} + + def __len__(self): + return len(self.classifiers_dict) + + +class LinearPostprocessor(nn.Module): + def __init__(self, linear_classifier, class_mapping=None): + super().__init__() + self.linear_classifier = linear_classifier + self.register_buffer("class_mapping", None if class_mapping is None else torch.LongTensor(class_mapping)) + + def forward(self, samples, targets): + preds = self.linear_classifier(samples) + return { + "preds": preds[:, self.class_mapping] if self.class_mapping is not None else preds, + "target": targets, + } + + +def scale_lr(learning_rates, batch_size): + return learning_rates * (batch_size * distributed.get_global_size()) / 256.0 + + +def setup_linear_classifiers(sample_output, n_last_blocks_list, learning_rates, batch_size, num_classes=1000): + linear_classifiers_dict = nn.ModuleDict() + optim_param_groups = [] + for n in n_last_blocks_list: + for avgpool in [False, True]: + for _lr in learning_rates: + lr = scale_lr(_lr, batch_size) + out_dim = create_linear_input(sample_output, use_n_blocks=n, use_avgpool=avgpool).shape[1] + linear_classifier = LinearClassifier( + out_dim, use_n_blocks=n, use_avgpool=avgpool, num_classes=num_classes + ) + linear_classifier = linear_classifier.cuda() + linear_classifiers_dict[ + 
f"classifier_{n}_blocks_avgpool_{avgpool}_lr_{lr:.5f}".replace(".", "_") + ] = linear_classifier + optim_param_groups.append({"params": linear_classifier.parameters(), "lr": lr}) + + linear_classifiers = AllClassifiers(linear_classifiers_dict) + if distributed.is_enabled(): + linear_classifiers = nn.parallel.DistributedDataParallel(linear_classifiers) + + return linear_classifiers, optim_param_groups + + +@torch.no_grad() +def evaluate_linear_classifiers( + feature_model, + linear_classifiers, + data_loader, + metric_type, + metrics_file_path, + training_num_classes, + iteration, + prefixstring="", + class_mapping=None, + best_classifier_on_val=None, +): + logger.info("running validation !") + + num_classes = len(class_mapping) if class_mapping is not None else training_num_classes + metric = build_metric(metric_type, num_classes=num_classes) + postprocessors = {k: LinearPostprocessor(v, class_mapping) for k, v in linear_classifiers.classifiers_dict.items()} + metrics = {k: metric.clone() for k in linear_classifiers.classifiers_dict} + + _, results_dict_temp = evaluate( + feature_model, + data_loader, + postprocessors, + metrics, + torch.cuda.current_device(), + ) + + logger.info("") + results_dict = {} + max_accuracy = 0 + best_classifier = "" + for i, (classifier_string, metric) in enumerate(results_dict_temp.items()): + logger.info(f"{prefixstring} -- Classifier: {classifier_string} * {metric}") + if ( + best_classifier_on_val is None and metric["top-1"].item() > max_accuracy + ) or classifier_string == best_classifier_on_val: + max_accuracy = metric["top-1"].item() + best_classifier = classifier_string + + results_dict["best_classifier"] = {"name": best_classifier, "accuracy": max_accuracy} + + logger.info(f"best classifier: {results_dict['best_classifier']}") + + if distributed.is_main_process(): + with open(metrics_file_path, "a") as f: + f.write(f"iter: {iteration}\n") + for k, v in results_dict.items(): + f.write(json.dumps({k: v}) + "\n") + f.write("\n") + + return results_dict + + +def eval_linear( + *, + feature_model, + linear_classifiers, + train_data_loader, + val_data_loader, + metrics_file_path, + optimizer, + scheduler, + output_dir, + max_iter, + checkpoint_period, # In number of iter, creates a new file every period + running_checkpoint_period, # Period to update main checkpoint file + eval_period, + metric_type, + training_num_classes, + resume=True, + classifier_fpath=None, + val_class_mapping=None, +): + checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler) + start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1 + + periodic_checkpointer = PeriodicCheckpointer(checkpointer, checkpoint_period, max_iter=max_iter) + iteration = start_iter + logger.info("Starting training from iteration {}".format(start_iter)) + metric_logger = MetricLogger(delimiter=" ") + header = "Training" + + for data, labels in metric_logger.log_every( + train_data_loader, + 10, + header, + max_iter, + start_iter, + ): + data = data.cuda(non_blocking=True) + labels = labels.cuda(non_blocking=True) + + features = feature_model(data) + outputs = linear_classifiers(features) + + losses = {f"loss_{k}": nn.CrossEntropyLoss()(v, labels) for k, v in outputs.items()} + loss = sum(losses.values()) + + # compute the gradients + optimizer.zero_grad() + loss.backward() + + # step + optimizer.step() + scheduler.step() + + # log + if iteration % 10 == 0: + torch.cuda.synchronize() + 
metric_logger.update(loss=loss.item()) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + if iteration - start_iter > 5: + if iteration % running_checkpoint_period == 0: + torch.cuda.synchronize() + if distributed.is_main_process(): + logger.info("Checkpointing running_checkpoint") + periodic_checkpointer.save("running_checkpoint_linear_eval", iteration=iteration) + torch.cuda.synchronize() + periodic_checkpointer.step(iteration) + + if eval_period > 0 and (iteration + 1) % eval_period == 0 and iteration != max_iter - 1: + _ = evaluate_linear_classifiers( + feature_model=feature_model, + linear_classifiers=remove_ddp_wrapper(linear_classifiers), + data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + prefixstring=f"ITER: {iteration}", + metric_type=metric_type, + training_num_classes=training_num_classes, + iteration=iteration, + class_mapping=val_class_mapping, + ) + torch.cuda.synchronize() + + iteration = iteration + 1 + + val_results_dict = evaluate_linear_classifiers( + feature_model=feature_model, + linear_classifiers=remove_ddp_wrapper(linear_classifiers), + data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + metric_type=metric_type, + training_num_classes=training_num_classes, + iteration=iteration, + class_mapping=val_class_mapping, + ) + return val_results_dict, feature_model, linear_classifiers, iteration + + +def make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type): + test_dataset = make_dataset( + dataset_str=test_dataset_str, + transform=make_classification_eval_transform(), + ) + test_data_loader = make_data_loader( + dataset=test_dataset, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + persistent_workers=False, + collate_fn=_pad_and_collate if metric_type == MetricType.IMAGENET_REAL_ACCURACY else None, + ) + return test_data_loader + + +def test_on_datasets( + feature_model, + linear_classifiers, + test_dataset_strs, + batch_size, + num_workers, + test_metric_types, + metrics_file_path, + training_num_classes, + iteration, + best_classifier_on_val, + prefixstring="", + test_class_mappings=[None], +): + results_dict = {} + for test_dataset_str, class_mapping, metric_type in zip(test_dataset_strs, test_class_mappings, test_metric_types): + logger.info(f"Testing on {test_dataset_str}") + test_data_loader = make_eval_data_loader(test_dataset_str, batch_size, num_workers, metric_type) + dataset_results_dict = evaluate_linear_classifiers( + feature_model, + remove_ddp_wrapper(linear_classifiers), + test_data_loader, + metric_type, + metrics_file_path, + training_num_classes, + iteration, + prefixstring="", + class_mapping=class_mapping, + best_classifier_on_val=best_classifier_on_val, + ) + results_dict[f"{test_dataset_str}_accuracy"] = 100.0 * dataset_results_dict["best_classifier"]["accuracy"] + return results_dict + + +def run_eval_linear( + model, + output_dir, + train_dataset_str, + val_dataset_str, + batch_size, + epochs, + epoch_length, + num_workers, + save_checkpoint_frequency, + eval_period_iterations, + learning_rates, + autocast_dtype, + test_dataset_strs=None, + resume=True, + classifier_fpath=None, + val_class_mapping_fpath=None, + test_class_mapping_fpaths=[None], + val_metric_type=MetricType.MEAN_ACCURACY, + test_metric_types=None, +): + seed = 0 + + if test_dataset_strs is None: + test_dataset_strs = [val_dataset_str] + if test_metric_types is None: +
test_metric_types = [val_metric_type] * len(test_dataset_strs) + else: + assert len(test_metric_types) == len(test_dataset_strs) + assert len(test_dataset_strs) == len(test_class_mapping_fpaths) + + train_transform = make_classification_train_transform() + train_dataset = make_dataset( + dataset_str=train_dataset_str, + transform=train_transform, + ) + training_num_classes = len(torch.unique(torch.Tensor(train_dataset.get_targets().astype(int)))) + sampler_type = SamplerType.SHARDED_INFINITE + # sampler_type = SamplerType.INFINITE + + n_last_blocks_list = [1, 4] + n_last_blocks = max(n_last_blocks_list) + autocast_ctx = partial(torch.cuda.amp.autocast, enabled=True, dtype=autocast_dtype) + feature_model = ModelWithIntermediateLayers(model, n_last_blocks, autocast_ctx) + sample_output = feature_model(train_dataset[0][0].unsqueeze(0).cuda()) + + linear_classifiers, optim_param_groups = setup_linear_classifiers( + sample_output, + n_last_blocks_list, + learning_rates, + batch_size, + training_num_classes, + ) + + optimizer = torch.optim.SGD(optim_param_groups, momentum=0.9, weight_decay=0) + max_iter = epochs * epoch_length + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_iter, eta_min=0) + checkpointer = Checkpointer(linear_classifiers, output_dir, optimizer=optimizer, scheduler=scheduler) + start_iter = checkpointer.resume_or_load(classifier_fpath or "", resume=resume).get("iteration", -1) + 1 + train_data_loader = make_data_loader( + dataset=train_dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=True, + seed=seed, + sampler_type=sampler_type, + sampler_advance=start_iter, + drop_last=True, + persistent_workers=True, + ) + val_data_loader = make_eval_data_loader(val_dataset_str, batch_size, num_workers, val_metric_type) + + checkpoint_period = save_checkpoint_frequency * epoch_length + + if val_class_mapping_fpath is not None: + logger.info(f"Using class mapping from {val_class_mapping_fpath}") + val_class_mapping = np.load(val_class_mapping_fpath) + else: + val_class_mapping = None + + test_class_mappings = [] + for class_mapping_fpath in test_class_mapping_fpaths: + if class_mapping_fpath is not None and class_mapping_fpath != "None": + logger.info(f"Using class mapping from {class_mapping_fpath}") + class_mapping = np.load(class_mapping_fpath) + else: + class_mapping = None + test_class_mappings.append(class_mapping) + + metrics_file_path = os.path.join(output_dir, "results_eval_linear.json") + val_results_dict, feature_model, linear_classifiers, iteration = eval_linear( + feature_model=feature_model, + linear_classifiers=linear_classifiers, + train_data_loader=train_data_loader, + val_data_loader=val_data_loader, + metrics_file_path=metrics_file_path, + optimizer=optimizer, + scheduler=scheduler, + output_dir=output_dir, + max_iter=max_iter, + checkpoint_period=checkpoint_period, + running_checkpoint_period=epoch_length, + eval_period=eval_period_iterations, + metric_type=val_metric_type, + training_num_classes=training_num_classes, + resume=resume, + val_class_mapping=val_class_mapping, + classifier_fpath=classifier_fpath, + ) + results_dict = {} + if len(test_dataset_strs) > 1 or test_dataset_strs[0] != val_dataset_str: + results_dict = test_on_datasets( + feature_model, + linear_classifiers, + test_dataset_strs, + batch_size, + 0, # num_workers, + test_metric_types, + metrics_file_path, + training_num_classes, + iteration, + val_results_dict["best_classifier"]["name"], + prefixstring="", + test_class_mappings=test_class_mappings, + ) + 
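+    # report the classifier selected on val alongside any test-set accuracies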
results_dict["best_classifier"] = val_results_dict["best_classifier"]["name"] + results_dict[f"{val_dataset_str}_accuracy"] = 100.0 * val_results_dict["best_classifier"]["accuracy"] + logger.info("Test Results Dict " + str(results_dict)) + + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + run_eval_linear( + model=model, + output_dir=args.output_dir, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + test_dataset_strs=args.test_dataset_strs, + batch_size=args.batch_size, + epochs=args.epochs, + epoch_length=args.epoch_length, + num_workers=args.num_workers, + save_checkpoint_frequency=args.save_checkpoint_frequency, + eval_period_iterations=args.eval_period_iterations, + learning_rates=args.learning_rates, + autocast_dtype=autocast_dtype, + resume=not args.no_resume, + classifier_fpath=args.classifier_fpath, + val_metric_type=args.val_metric_type, + test_metric_types=args.test_metric_types, + val_class_mapping_fpath=args.val_class_mapping_fpath, + test_class_mapping_fpaths=args.test_class_mapping_fpaths, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 linear evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..2e6ede2b616208cb49c7af67d58c8e6e4afb60e1 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/log_regression.py @@ -0,0 +1,445 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
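+# Note: sweep_C_values() below fits one cuML LogisticRegression per value of C
+# in 10**linspace(-6, 5, 45); ranks take grid points round-robin (rank r fits
+# indices r, r + world_size, ...) and exchange the fitted estimators with
+# all_gather_object before evaluation.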
+ +import argparse +import gc +import logging +import sys +import time +from typing import List, Optional + +from cuml.linear_model import LogisticRegression +import torch +import torch.backends.cudnn as cudnn +import torch.distributed +from torch import nn +from torch.utils.data import TensorDataset +from torchmetrics import MetricTracker + +from dinov2.data import make_dataset +from dinov2.data.transforms import make_classification_eval_transform +from dinov2.distributed import get_global_rank, get_global_size +from dinov2.eval.metrics import MetricType, build_metric +from dinov2.eval.setup import get_args_parser as get_setup_args_parser +from dinov2.eval.setup import setup_and_build_model +from dinov2.eval.utils import evaluate, extract_features +from dinov2.utils.dtype import as_torch_dtype + + +logger = logging.getLogger("dinov2") + +DEFAULT_MAX_ITER = 1_000 +C_POWER_RANGE = torch.linspace(-6, 5, 45) +_CPU_DEVICE = torch.device("cpu") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parents = parents or [] + setup_args_parser = get_setup_args_parser(parents=parents, add_help=False) + parents = [setup_args_parser] + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--train-dataset", + dest="train_dataset_str", + type=str, + help="Training dataset", + ) + parser.add_argument( + "--val-dataset", + dest="val_dataset_str", + type=str, + help="Validation dataset", + ) + parser.add_argument( + "--finetune-dataset-str", + dest="finetune_dataset_str", + type=str, + help="Fine-tuning dataset", + ) + parser.add_argument( + "--finetune-on-val", + action="store_true", + help="If there is no finetune dataset, whether to choose the " + "hyperparameters on the val set instead of 10%% of the train dataset", + ) + parser.add_argument( + "--metric-type", + type=MetricType, + choices=list(MetricType), + help="Metric type", + ) + parser.add_argument( + "--train-features-device", + type=str, + help="Device to gather train features (cpu, cuda, cuda:0, etc.), default: %(default)s", + ) + parser.add_argument( + "--train-dtype", + type=str, + help="Data type to convert the train features to (default: %(default)s)", + ) + parser.add_argument( + "--max-train-iters", + type=int, + help="Maximum number of train iterations (default: %(default)s)", + ) + parser.set_defaults( + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + finetune_dataset_str=None, + metric_type=MetricType.MEAN_ACCURACY, + train_features_device="cpu", + train_dtype="float64", + max_train_iters=DEFAULT_MAX_ITER, + finetune_on_val=False, + ) + return parser + + +class LogRegModule(nn.Module): + def __init__( + self, + C, + max_iter=DEFAULT_MAX_ITER, + dtype=torch.float64, + device=_CPU_DEVICE, + ): + super().__init__() + self.dtype = dtype + self.device = device + self.estimator = LogisticRegression( + penalty="l2", + C=C, + max_iter=max_iter, + output_type="numpy", + tol=1e-12, + linesearch_max_iter=50, + ) + + def forward(self, samples, targets): + samples_device = samples.device + samples = samples.to(dtype=self.dtype, device=self.device) + if self.device == _CPU_DEVICE: + samples = samples.numpy() + probas = self.estimator.predict_proba(samples) + return {"preds": torch.from_numpy(probas).to(samples_device), "target": targets} + + def fit(self, train_features, train_labels): + train_features = train_features.to(dtype=self.dtype, 
device=self.device) + train_labels = train_labels.to(dtype=self.dtype, device=self.device) + if self.device == _CPU_DEVICE: + # both cuML and sklearn only work with numpy arrays on CPU + train_features = train_features.numpy() + train_labels = train_labels.numpy() + self.estimator.fit(train_features, train_labels) + + +def evaluate_model(*, logreg_model, logreg_metric, test_data_loader, device): + postprocessors = {"metrics": logreg_model} + metrics = {"metrics": logreg_metric} + return evaluate(nn.Identity(), test_data_loader, postprocessors, metrics, device) + + +def train_for_C(*, C, max_iter, train_features, train_labels, dtype=torch.float64, device=_CPU_DEVICE): + logreg_model = LogRegModule(C, max_iter=max_iter, dtype=dtype, device=device) + logreg_model.fit(train_features, train_labels) + return logreg_model + + +def train_and_evaluate( + *, + C, + max_iter, + train_features, + train_labels, + logreg_metric, + test_data_loader, + train_dtype=torch.float64, + train_features_device, + eval_device, +): + logreg_model = train_for_C( + C=C, + max_iter=max_iter, + train_features=train_features, + train_labels=train_labels, + dtype=train_dtype, + device=train_features_device, + ) + return evaluate_model( + logreg_model=logreg_model, + logreg_metric=logreg_metric, + test_data_loader=test_data_loader, + device=eval_device, + ) + + +def sweep_C_values( + *, + train_features, + train_labels, + test_data_loader, + metric_type, + num_classes, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + if metric_type == MetricType.PER_CLASS_ACCURACY: + # If we want to output per-class accuracy, we select the hyperparameters with mean per class + metric_type = MetricType.MEAN_PER_CLASS_ACCURACY + logreg_metric = build_metric(metric_type, num_classes=num_classes) + metric_tracker = MetricTracker(logreg_metric, maximize=True) + ALL_C = 10**C_POWER_RANGE + logreg_models = {} + + train_features = train_features.to(dtype=train_dtype, device=train_features_device) + train_labels = train_labels.to(device=train_features_device) + + for i in range(get_global_rank(), len(ALL_C), get_global_size()): + C = ALL_C[i].item() + logger.info( + f"Training for C = {C:.5f}, dtype={train_dtype}, " + f"features: {train_features.shape}, {train_features.dtype}, " + f"labels: {train_labels.shape}, {train_labels.dtype}" + ) + logreg_models[C] = train_for_C( + C=C, + max_iter=max_train_iters, + train_features=train_features, + train_labels=train_labels, + dtype=train_dtype, + device=train_features_device, + ) + + gather_list = [None for _ in range(get_global_size())] + torch.distributed.all_gather_object(gather_list, logreg_models) + + logreg_models_gathered = {} + for logreg_dict in gather_list: + logreg_models_gathered.update(logreg_dict) + + for i in range(len(ALL_C)): + metric_tracker.increment() + C = ALL_C[i].item() + evals = evaluate_model( + logreg_model=logreg_models_gathered[C], + logreg_metric=metric_tracker, + test_data_loader=test_data_loader, + device=torch.cuda.current_device(), + ) + logger.info(f"Trained for C = {C:.5f}, accuracies = {evals}") + + best_stats, which_epoch = metric_tracker.best_metric(return_step=True) + best_stats_100 = {k: 100.0 * v for k, v in best_stats.items()} + if which_epoch["top-1"] == i: + best_C = C + logger.info(f"Sweep best {best_stats_100}, best C = {best_C:.6f}") + + return best_stats, best_C + + +def eval_log_regression( + *, + model, + train_dataset, + val_dataset, + finetune_dataset, + metric_type, + batch_size, + num_workers, 
+ finetune_on_val=False, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + """ + Implements the "standard" process for log regression evaluation: + The value of C is chosen by training on train_dataset and evaluating on + finetune_dataset. Then, the final model is trained on a concatenation of + train_dataset and finetune_dataset, and is evaluated on val_dataset. + If there is no finetune_dataset, the value of C is the one that yields + the best results on a random 10% subset of the train dataset + """ + + start = time.time() + + train_features, train_labels = extract_features( + model, train_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + val_features, val_labels = extract_features( + model, val_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + val_data_loader = torch.utils.data.DataLoader( + TensorDataset(val_features, val_labels), + batch_size=batch_size, + drop_last=False, + num_workers=0, + persistent_workers=False, + ) + + if finetune_dataset is None and finetune_on_val: + logger.info("Choosing hyperparameters on the val dataset") + finetune_features, finetune_labels = val_features, val_labels + elif finetune_dataset is None and not finetune_on_val: + logger.info("Choosing hyperparameters on 10% of the train dataset") + torch.manual_seed(0) + indices = torch.randperm(len(train_features), device=train_features.device) + finetune_index = indices[: len(train_features) // 10] + train_index = indices[len(train_features) // 10 :] + finetune_features, finetune_labels = train_features[finetune_index], train_labels[finetune_index] + train_features, train_labels = train_features[train_index], train_labels[train_index] + else: + logger.info("Choosing hyperparameters on the finetune dataset") + finetune_features, finetune_labels = extract_features( + model, finetune_dataset, batch_size, num_workers, gather_on_cpu=(train_features_device == _CPU_DEVICE) + ) + # release the model - free GPU memory + del model + gc.collect() + torch.cuda.empty_cache() + finetune_data_loader = torch.utils.data.DataLoader( + TensorDataset(finetune_features, finetune_labels), + batch_size=batch_size, + drop_last=False, + ) + + if len(train_labels.shape) > 1: + num_classes = train_labels.shape[1] + else: + num_classes = train_labels.max() + 1 + + logger.info("Using cuML for logistic regression") + + best_stats, best_C = sweep_C_values( + train_features=train_features, + train_labels=train_labels, + test_data_loader=finetune_data_loader, + metric_type=metric_type, + num_classes=num_classes, + train_dtype=train_dtype, + train_features_device=train_features_device, + max_train_iters=max_train_iters, + ) + + if not finetune_on_val: + logger.info("Best parameter found, concatenating features") + train_features = torch.cat((train_features, finetune_features)) + train_labels = torch.cat((train_labels, finetune_labels)) + + logger.info("Training final model") + logreg_metric = build_metric(metric_type, num_classes=num_classes) + evals = train_and_evaluate( + C=best_C, + max_iter=max_train_iters, + train_features=train_features, + train_labels=train_labels, + logreg_metric=logreg_metric.clone(), + test_data_loader=val_data_loader, + eval_device=torch.cuda.current_device(), + train_dtype=train_dtype, + train_features_device=train_features_device, + ) + + best_stats = evals[1]["metrics"] + + best_stats["best_C"] = best_C + + logger.info(f"Log regression evaluation done in 
{int(time.time() - start)}s") + return best_stats + + +def eval_log_regression_with_model( + model, + train_dataset_str="ImageNet:split=TRAIN", + val_dataset_str="ImageNet:split=VAL", + finetune_dataset_str=None, + autocast_dtype=torch.float, + finetune_on_val=False, + metric_type=MetricType.MEAN_ACCURACY, + train_dtype=torch.float64, + train_features_device=_CPU_DEVICE, + max_train_iters=DEFAULT_MAX_ITER, +): + cudnn.benchmark = True + + transform = make_classification_eval_transform(resize_size=224) + target_transform = None + + train_dataset = make_dataset(dataset_str=train_dataset_str, transform=transform, target_transform=target_transform) + val_dataset = make_dataset(dataset_str=val_dataset_str, transform=transform, target_transform=target_transform) + if finetune_dataset_str is not None: + finetune_dataset = make_dataset( + dataset_str=finetune_dataset_str, transform=transform, target_transform=target_transform + ) + else: + finetune_dataset = None + + with torch.cuda.amp.autocast(dtype=autocast_dtype): + results_dict_logreg = eval_log_regression( + model=model, + train_dataset=train_dataset, + val_dataset=val_dataset, + finetune_dataset=finetune_dataset, + metric_type=metric_type, + batch_size=256, + num_workers=0, # 5, + finetune_on_val=finetune_on_val, + train_dtype=train_dtype, + train_features_device=train_features_device, + max_train_iters=max_train_iters, + ) + + results_dict = { + "top-1": results_dict_logreg["top-1"].cpu().numpy() * 100.0, + "top-5": results_dict_logreg.get("top-5", torch.tensor(0.0)).cpu().numpy() * 100.0, + "best_C": results_dict_logreg["best_C"], + } + logger.info( + "\n".join( + [ + "Training of the supervised logistic regression on frozen features completed.\n" + "Top-1 test accuracy: {acc:.1f}".format(acc=results_dict["top-1"]), + "Top-5 test accuracy: {acc:.1f}".format(acc=results_dict["top-5"]), + "obtained for C = {c:.6f}".format(c=results_dict["best_C"]), + ] + ) + ) + + torch.distributed.barrier() + return results_dict + + +def main(args): + model, autocast_dtype = setup_and_build_model(args) + eval_log_regression_with_model( + model=model, + train_dataset_str=args.train_dataset_str, + val_dataset_str=args.val_dataset_str, + finetune_dataset_str=args.finetune_dataset_str, + autocast_dtype=autocast_dtype, + finetune_on_val=args.finetune_on_val, + metric_type=args.metric_type, + train_dtype=as_torch_dtype(args.train_dtype), + train_features_device=torch.device(args.train_features_device), + max_train_iters=args.max_train_iters, + ) + return 0 + + +if __name__ == "__main__": + description = "DINOv2 logistic regression evaluation" + args_parser = get_args_parser(description=description) + args = args_parser.parse_args() + sys.exit(main(args)) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..80bf88da224e749dd6b3dd4b2bd90ec99eaec34e --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/metrics.py @@ -0,0 +1,114 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
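+# Usage sketch (illustrative, not part of the original file):
+#   metric = build_metric(MetricType.MEAN_ACCURACY, num_classes=1000)
+#   metric.update(preds=logits, target=labels)  # logits: [B, 1000], labels: [B]
+#   metric.compute()  # -> {"top-1": tensor(...), "top-5": tensor(...)}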
+ +from enum import Enum +import logging +from typing import Any, Dict, Optional + +import torch +from torch import Tensor +from torchmetrics import Metric, MetricCollection +from torchmetrics.classification import MulticlassAccuracy +from torchmetrics.utilities.data import dim_zero_cat, select_topk + + +logger = logging.getLogger("dinov2") + + +class MetricType(Enum): + MEAN_ACCURACY = "mean_accuracy" + MEAN_PER_CLASS_ACCURACY = "mean_per_class_accuracy" + PER_CLASS_ACCURACY = "per_class_accuracy" + IMAGENET_REAL_ACCURACY = "imagenet_real_accuracy" + + @property + def accuracy_averaging(self): + return getattr(AccuracyAveraging, self.name, None) + + def __str__(self): + return self.value + + +class AccuracyAveraging(Enum): + MEAN_ACCURACY = "micro" + MEAN_PER_CLASS_ACCURACY = "macro" + PER_CLASS_ACCURACY = "none" + + def __str__(self): + return self.value + + +def build_metric(metric_type: MetricType, *, num_classes: int, ks: Optional[tuple] = None): + if metric_type.accuracy_averaging is not None: + return build_topk_accuracy_metric( + average_type=metric_type.accuracy_averaging, + num_classes=num_classes, + ks=(1, 5) if ks is None else ks, + ) + elif metric_type == MetricType.IMAGENET_REAL_ACCURACY: + return build_topk_imagenet_real_accuracy_metric( + num_classes=num_classes, + ks=(1, 5) if ks is None else ks, + ) + + raise ValueError(f"Unknown metric type {metric_type}") + + +def build_topk_accuracy_metric(average_type: AccuracyAveraging, num_classes: int, ks: tuple = (1, 5)): + metrics: Dict[str, Metric] = { + f"top-{k}": MulticlassAccuracy(top_k=k, num_classes=int(num_classes), average=average_type.value) for k in ks + } + return MetricCollection(metrics) + + +def build_topk_imagenet_real_accuracy_metric(num_classes: int, ks: tuple = (1, 5)): + metrics: Dict[str, Metric] = {f"top-{k}": ImageNetReaLAccuracy(top_k=k, num_classes=int(num_classes)) for k in ks} + return MetricCollection(metrics) + + +class ImageNetReaLAccuracy(Metric): + is_differentiable: bool = False + higher_is_better: Optional[bool] = None + full_state_update: bool = False + + def __init__( + self, + num_classes: int, + top_k: int = 1, + **kwargs: Any, + ) -> None: + super().__init__(**kwargs) + self.num_classes = num_classes + self.top_k = top_k + self.add_state("tp", [], dist_reduce_fx="cat") + + def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore + # preds [B, D] + # target [B, A] + # preds_oh [B, D] with 0 and 1 + # select top K highest probabilities, use one hot representation + preds_oh = select_topk(preds, self.top_k) + # target_oh [B, D + 1] with 0 and 1 + target_oh = torch.zeros((preds_oh.shape[0], preds_oh.shape[1] + 1), device=target.device, dtype=torch.int32) + target = target.long() + # for undefined targets (-1) use a fake value `num_classes` + target[target == -1] = self.num_classes + # fill targets, use one hot representation + target_oh.scatter_(1, target, 1) + # target_oh [B, D] (remove the fake target at index `num_classes`) + target_oh = target_oh[:, :-1] + # tp [B] with 0 and 1 + tp = (preds_oh * target_oh == 1).sum(dim=1) + # at least one match between prediction and target + tp.clip_(max=1) + # ignore instances where no targets are defined + mask = target_oh.sum(dim=1) > 0 + tp = tp[mask] + self.tp.append(tp) # type: ignore + + def compute(self) -> Tensor: + tp = dim_zero_cat(self.tp) # type: ignore + return tp.float().mean() diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py 
b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..e7fadc2b63b994f569c8def82a43ed08ccd15b33 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/setup.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +from typing import Any, List, Optional, Tuple + +import torch +import torch.backends.cudnn as cudnn + +from dinov2.models import build_model_from_cfg +from dinov2.utils.config import setup +import dinov2.utils.utils as dinov2_utils + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +): + parser = argparse.ArgumentParser( + description=description, + parents=parents or [], + add_help=add_help, + ) + parser.add_argument( + "--config-file", + type=str, + help="Model configuration file", + ) + parser.add_argument( + "--pretrained-weights", + type=str, + help="Pretrained model weights", + ) + parser.add_argument( + "--output-dir", + default="", + type=str, + help="Output directory to write results and logs", + ) + parser.add_argument( + "--opts", + help="Extra configuration options", + default=[], + nargs="+", + ) + return parser + + +def get_autocast_dtype(config): + teacher_dtype_str = config.compute_precision.teacher.backbone.mixed_precision.param_dtype + if teacher_dtype_str == "fp16": + return torch.half + elif teacher_dtype_str == "bf16": + return torch.bfloat16 + else: + return torch.float + + +def build_model_for_eval(config, pretrained_weights): + model, _ = build_model_from_cfg(config, only_teacher=True) + dinov2_utils.load_pretrained_weights(model, pretrained_weights, "teacher") + model.eval() + model.cuda() + return model + + +def setup_and_build_model(args) -> Tuple[Any, torch.dtype]: + cudnn.benchmark = True + config = setup(args) + model = build_model_for_eval(config, args.pretrained_weights) + autocast_dtype = get_autocast_dtype(config) + return model, autocast_dtype diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b2f7e34f41ba6a0b911023e0c5375eef21f426fa --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/eval/utils.py @@ -0,0 +1,147 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
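+# Usage sketch (illustrative, not part of the original file):
+#   autocast_ctx = partial(torch.cuda.amp.autocast, enabled=True, dtype=torch.half)
+#   feature_model = ModelWithIntermediateLayers(backbone, n_last_blocks=4, autocast_ctx=autocast_ctx)
+#   features = feature_model(images)  # tuple of (patch_tokens, class_token) pairs, one per block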
+ +import logging +from typing import Dict, Optional + +import torch +from torch import nn +from torchmetrics import MetricCollection + +from dinov2.data import DatasetWithEnumeratedTargets, SamplerType, make_data_loader +import dinov2.distributed as distributed +from dinov2.logging import MetricLogger + + +logger = logging.getLogger("dinov2") + + +class ModelWithNormalize(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, samples): + return nn.functional.normalize(self.model(samples), dim=1, p=2) + + +class ModelWithIntermediateLayers(nn.Module): + def __init__(self, feature_model, n_last_blocks, autocast_ctx): + super().__init__() + self.feature_model = feature_model + self.feature_model.eval() + self.n_last_blocks = n_last_blocks + self.autocast_ctx = autocast_ctx + + def forward(self, images): + with torch.inference_mode(): + with self.autocast_ctx(): + features = self.feature_model.get_intermediate_layers( + images, self.n_last_blocks, return_class_token=True + ) + return features + + +@torch.inference_mode() +def evaluate( + model: nn.Module, + data_loader, + postprocessors: Dict[str, nn.Module], + metrics: Dict[str, MetricCollection], + device: torch.device, + criterion: Optional[nn.Module] = None, +): + model.eval() + if criterion is not None: + criterion.eval() + + for metric in metrics.values(): + metric = metric.to(device) + + metric_logger = MetricLogger(delimiter=" ") + header = "Test:" + + for samples, targets, *_ in metric_logger.log_every(data_loader, 10, header): + outputs = model(samples.to(device)) + targets = targets.to(device) + + if criterion is not None: + loss = criterion(outputs, targets) + metric_logger.update(loss=loss.item()) + + for k, metric in metrics.items(): + metric_inputs = postprocessors[k](outputs, targets) + metric.update(**metric_inputs) + + metric_logger.synchronize_between_processes() + logger.info(f"Averaged stats: {metric_logger}") + + stats = {k: metric.compute() for k, metric in metrics.items()} + metric_logger_stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} + return metric_logger_stats, stats + + +def all_gather_and_flatten(tensor_rank): + tensor_all_ranks = torch.empty( + distributed.get_global_size(), + *tensor_rank.shape, + dtype=tensor_rank.dtype, + device=tensor_rank.device, + ) + tensor_list = list(tensor_all_ranks.unbind(0)) + torch.distributed.all_gather(tensor_list, tensor_rank.contiguous()) + return tensor_all_ranks.flatten(end_dim=1) + + +def extract_features(model, dataset, batch_size, num_workers, gather_on_cpu=False): + dataset_with_enumerated_targets = DatasetWithEnumeratedTargets(dataset) + sample_count = len(dataset_with_enumerated_targets) + data_loader = make_data_loader( + dataset=dataset_with_enumerated_targets, + batch_size=batch_size, + num_workers=num_workers, + sampler_type=SamplerType.DISTRIBUTED, + drop_last=False, + shuffle=False, + ) + return extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu) + + +@torch.inference_mode() +def extract_features_with_dataloader(model, data_loader, sample_count, gather_on_cpu=False): + gather_device = torch.device("cpu") if gather_on_cpu else torch.device("cuda") + metric_logger = MetricLogger(delimiter=" ") + features, all_labels = None, None + for samples, (index, labels_rank) in metric_logger.log_every(data_loader, 10): + samples = samples.cuda(non_blocking=True) + labels_rank = labels_rank.cuda(non_blocking=True) + index = index.cuda(non_blocking=True) + 
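+        # each rank featurizes its shard; the enumerated global `index` lets the
+        # gathered results be scattered into the right rows of the full matrix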
features_rank = model(samples).float() + + # init storage feature matrix + if features is None: + features = torch.zeros(sample_count, features_rank.shape[-1], device=gather_device) + labels_shape = list(labels_rank.shape) + labels_shape[0] = sample_count + all_labels = torch.full(labels_shape, fill_value=-1, device=gather_device) + logger.info(f"Storing features into tensor of shape {features.shape}") + + # share indexes, features and labels between processes + index_all = all_gather_and_flatten(index).to(gather_device) + features_all_ranks = all_gather_and_flatten(features_rank).to(gather_device) + labels_all_ranks = all_gather_and_flatten(labels_rank).to(gather_device) + + # update storage feature matrix + if len(index_all) > 0: + features.index_copy_(0, index_all, features_all_ranks) + all_labels.index_copy_(0, index_all, labels_all_ranks) + + logger.info(f"Features shape: {tuple(features.shape)}") + logger.info(f"Labels shape: {tuple(all_labels.shape)}") + + assert torch.all(all_labels > -1) + + return features, all_labels diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..71d20397611619e6a02ea07f5305d650ffef2a51 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/fsdp/__init__.py @@ -0,0 +1,158 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +from typing import Any + +import torch +import dinov2.distributed as distributed +from functools import partial +from fvcore.common.checkpoint import Checkpointer +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import ShardingStrategy +from torch.distributed.fsdp import MixedPrecision +from torch.distributed.fsdp import StateDictType +from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +from torch.distributed.fsdp.wrap import ModuleWrapPolicy +from torch.distributed.fsdp._runtime_utils import _reshard + + +def get_fsdp_wrapper(model_cfg, modules_to_wrap=set()): + sharding_strategy_dict = { + "NO_SHARD": ShardingStrategy.NO_SHARD, + "SHARD_GRAD_OP": ShardingStrategy.SHARD_GRAD_OP, + "FULL_SHARD": ShardingStrategy.FULL_SHARD, + } + + dtype_dict = { + "fp32": torch.float32, + "fp16": torch.float16, + "bf16": torch.bfloat16, + } + + mixed_precision_config = MixedPrecision( + param_dtype=dtype_dict[model_cfg.mixed_precision.param_dtype], + reduce_dtype=dtype_dict[model_cfg.mixed_precision.reduce_dtype], + buffer_dtype=dtype_dict[model_cfg.mixed_precision.buffer_dtype], + ) + + sharding_strategy_config = sharding_strategy_dict[model_cfg.sharding_strategy] + + local_rank = distributed.get_local_rank() + + fsdp_wrapper = partial( + FSDP, + sharding_strategy=sharding_strategy_config, + mixed_precision=mixed_precision_config, + device_id=local_rank, + sync_module_states=True, + use_orig_params=True, + auto_wrap_policy=ModuleWrapPolicy(modules_to_wrap), + ) + return fsdp_wrapper + + +def is_fsdp(x): + return isinstance(x, FSDP) + + +def is_sharded_fsdp(x): + return is_fsdp(x) and x.sharding_strategy is not ShardingStrategy.NO_SHARD + + +def free_if_fsdp(x): + if is_sharded_fsdp(x): + handles = x._handles + true_list = [True for h in handles] + _reshard(x, 
handles, true_list) + + +def get_fsdp_modules(x): + return FSDP.fsdp_modules(x) + + +def reshard_fsdp_model(x): + for m in get_fsdp_modules(x): + free_if_fsdp(m) + + +def rankstr(): + return f"rank_{distributed.get_global_rank()}" + + +class FSDPCheckpointer(Checkpointer): + def save(self, name: str, **kwargs: Any) -> None: + """ + Dump model and checkpointables to a file. + + Args: + name (str): name of the file. + kwargs (dict): extra arbitrary data to save. + """ + if not self.save_dir or not self.save_to_disk: + return + + data = {} + with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): + data["model"] = self.model.state_dict() + + # data["model"] = self.model.state_dict() + for key, obj in self.checkpointables.items(): + data[key] = obj.state_dict() + data.update(kwargs) + + basename = f"{name}.{rankstr()}.pth" + save_file = os.path.join(self.save_dir, basename) + assert os.path.basename(save_file) == basename, basename + self.logger.info("Saving checkpoint to {}".format(save_file)) + with self.path_manager.open(save_file, "wb") as f: + torch.save(data, f) + self.tag_last_checkpoint(basename) + + def load(self, *args, **kwargs): + with FSDP.state_dict_type(self.model, StateDictType.LOCAL_STATE_DICT): + return super().load(*args, **kwargs) + + def has_checkpoint(self) -> bool: + """ + Returns: + bool: whether a checkpoint exists in the target directory. + """ + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + return self.path_manager.exists(save_file) + + def get_checkpoint_file(self) -> str: + """ + Returns: + str: The latest checkpoint file in target directory. + """ + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + try: + with self.path_manager.open(save_file, "r") as f: + last_saved = f.read().strip() + except IOError: + # if file doesn't exist, maybe because it has just been + # deleted by a separate process + return "" + # pyre-fixme[6]: For 2nd param expected `Union[PathLike[str], str]` but got + # `Union[bytes, str]`. + return os.path.join(self.save_dir, last_saved) + + def tag_last_checkpoint(self, last_filename_basename: str) -> None: + """ + Tag the last checkpoint. + + Args: + last_filename_basename (str): the basename of the last filename. + """ + if distributed.is_enabled(): + torch.distributed.barrier() + save_file = os.path.join(self.save_dir, f"last_checkpoint.{rankstr()}") + with self.path_manager.open(save_file, "w") as f: + f.write(last_filename_basename) # pyre-ignore + + +ShardedGradScaler = ShardedGradScaler diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..31f196aacac5be8a7c537a3dfa8f97084671b466 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
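+# Re-exports of the transformer building blocks used by the DINOv2 ViT.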
+ +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused +from .block import NestedTensorBlock +from .attention import MemEffAttention diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..1f9b0c94b40967dfdff4f261c127cbd21328c905 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/attention.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging + +from torch import Tensor +from torch import nn + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import memory_efficient_attention, unbind, fmha + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning("xFormers not available") + XFORMERS_AVAILABLE = False + + +class Attention(nn.Module): + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + assert attn_bias is None, "xFormers is required for nested tensors usage" + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + x = memory_efficient_attention(q, k, v, attn_bias=attn_bias) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py new file mode 100644 index 0000000000000000000000000000000000000000..25488f57cc0ad3c692f86b62555f6668e2a66db1 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/block.py @@ -0,0 +1,252 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
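+# Shape note for the batch-subset stochastic depth below (illustrative numbers):
+# with x of shape [b=8, n=197, d=384] and sample_drop_ratio=0.25, a random subset
+# of 6 samples runs through the residual branch and is index_add-ed back into x
+# with alpha = b / subset_size = 8/6, preserving the residual's expected scale.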
+
+# References:
+# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+
+import logging
+from typing import Callable, List, Any, Tuple, Dict
+
+import torch
+from torch import nn, Tensor
+
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+
+
+logger = logging.getLogger("dinov2")
+
+
+try:
+    from xformers.ops import fmha
+    from xformers.ops import scaled_index_add, index_select_cat
+
+    XFORMERS_AVAILABLE = True
+except ImportError:
+    logger.warning("xFormers not available")
+    XFORMERS_AVAILABLE = False
+
+
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values=None,
+        drop_path: float = 0.0,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_class: Callable[..., nn.Module] = Attention,
+        ffn_layer: Callable[..., nn.Module] = Mlp,
+    ) -> None:
+        super().__init__()
+        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_class(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = ffn_layer(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+            bias=ffn_bias,
+        )
+        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+        self.sample_drop_ratio = drop_path
+
+    def forward(self, x: Tensor) -> Tensor:
+        def attn_residual_func(x: Tensor) -> Tensor:
+            return self.ls1(self.attn(self.norm1(x)))
+
+        def ffn_residual_func(x: Tensor) -> Tensor:
+            return self.ls2(self.mlp(self.norm2(x)))
+
+        if self.training and self.sample_drop_ratio > 0.1:
+            # the overhead is compensated only for a drop path rate larger than 0.1
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+        elif self.training and self.sample_drop_ratio > 0.0:
+            x = x + self.drop_path1(attn_residual_func(x))
+            x = x + self.drop_path2(ffn_residual_func(x))
+        else:
+            x = x + attn_residual_func(x)
+            x = x + ffn_residual_func(x)
+        return x
+
+
+def drop_add_residual_stochastic_depth(
+    x: Tensor,
+    residual_func: Callable[[Tensor], Tensor],
+    sample_drop_ratio: float = 0.0,
+) -> Tensor:
+    # 1) extract subset using permutation
+    b, n, d = x.shape
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    x_subset = x[brange]
+
+    # 2) apply residual_func to get residual
+    residual = residual_func(x_subset)
+
+    x_flat = x.flatten(1)
+    residual = residual.flatten(1)
+
+    residual_scale_factor = b / sample_subset_size
+
+    # 3) add the residual
+    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+    return x_plus_residual.view_as(x)
+
+
+def get_branges_scales(x, sample_drop_ratio=0.0):
+    b, n, d = x.shape
+    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
+    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
+    residual_scale_factor = b / sample_subset_size
+    return brange, residual_scale_factor
+
+
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+    if scaling_vector is None:
+        x_flat = x.flatten(1)
+        residual = residual.flatten(1)
+        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
+    else:
+        x_plus_residual = scaled_index_add(
+            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
+        )
+    return x_plus_residual
+
+
+attn_bias_cache: Dict[Tuple, Any] = {}
+
+
+def get_attn_bias_and_cat(x_list, branges=None):
+    """
+    this will perform the index select, cat the tensors, and provide the attn_bias from cache
+    """
+    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    if all_shapes not in attn_bias_cache.keys():
+        seqlens = []
+        for b, x in zip(batch_sizes, x_list):
+            for _ in range(b):
+                seqlens.append(x.shape[1])
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+        attn_bias._batch_sizes = batch_sizes
+        attn_bias_cache[all_shapes] = attn_bias
+
+    if branges is not None:
+        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
+    else:
+        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+        cat_tensors = torch.cat(tensors_bs1, dim=1)
+
+    return attn_bias_cache[all_shapes], cat_tensors
+
+
+def drop_add_residual_stochastic_depth_list(
+    x_list: List[Tensor],
+    residual_func: Callable[[Tensor, Any], Tensor],
+    sample_drop_ratio: float = 0.0,
+    scaling_vector=None,
+) -> Tensor:
+    # 1) generate random set of indices for dropping samples in the batch
+    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+    branges = [s[0] for s in branges_scales]
+    residual_scale_factors = [s[1] for s in branges_scales]
+
+    # 2) get attention bias and index+concat the tensors
+    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+
+    # 3) apply residual_func to get residual, and split the result
+    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+
+    outputs = []
+    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
+        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
+    return outputs
+
+
+class NestedTensorBlock(Block):
+    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+        """
+        x_list contains a list of tensors to nest together and run
+        """
+        assert isinstance(self.attn, MemEffAttention)
+
+        if self.training and self.sample_drop_ratio > 0.0:
+
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.attn(self.norm1(x), attn_bias=attn_bias)
+
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.mlp(self.norm2(x))
+
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
+            )
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
+            )
+            return x_list
+        else:
+
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls2(self.mlp(self.norm2(x)))
+
+            attn_bias, x = get_attn_bias_and_cat(x_list)
+            x = x + attn_residual_func(x, attn_bias=attn_bias)
+            x = x + ffn_residual_func(x)
+            return attn_bias.split(x)
+
+    def forward(self, x_or_x_list):
+        if isinstance(x_or_x_list, Tensor):
+            return super().forward(x_or_x_list)
+        elif isinstance(x_or_x_list, list):
+            assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
+            return self.forward_nested(x_or_x_list)
+        else:
+            raise AssertionError
diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..7212db92a4fd8d4c7230e284e551a0234e9d8623
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/dino_head.py
@@ -0,0 +1,59 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from torch.nn.init import trunc_normal_
+from torch.nn.utils import weight_norm
+
+
+class DINOHead(nn.Module):
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        use_bn=False,
+        nlayers=3,
+        hidden_dim=2048,
+        bottleneck_dim=256,
+        mlp_bias=True,
+    ):
+        super().__init__()
+        nlayers = max(nlayers, 1)
+        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
+        self.apply(self._init_weights)
+        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
+        self.last_layer.weight_g.data.fill_(1)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=0.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        x = self.mlp(x)
+        eps = 1e-6 if x.dtype == torch.float16 else 1e-12
+        x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
+        x = self.last_layer(x)
+        return x
+
+
+def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
+    if nlayers == 1:
+        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
+    else:
+        layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
+        if use_bn:
+            layers.append(nn.BatchNorm1d(hidden_dim))
+        layers.append(nn.GELU())
+        for _ in range(nlayers - 2):
+            layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
+            if use_bn:
+                layers.append(nn.BatchNorm1d(hidden_dim))
+            layers.append(nn.GELU())
+        layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
+        return nn.Sequential(*layers)
diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py
new file mode 100644
index 0000000000000000000000000000000000000000..af05625984dd14682cc96a63bf0c97bab1f123b1
--- /dev/null
+++ 
b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/drop_path.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py new file mode 100644 index 0000000000000000000000000000000000000000..ca5daa52bd81d3581adeb2198ea5b7dba2a3aea1 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/layer_scale.py @@ -0,0 +1,28 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 + +from typing import Union + +import torch +from torch import Tensor +from torch import nn + + +class LayerScale(nn.Module): + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..5e4b315f972f9a9f54aef1e4ef4e81b52976f018 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/mlp.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
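+# Illustrative sketch (not part of the library code): how the DropPath and LayerScale
+# modules above compose in a residual branch. Shapes and values below are made up.
+# At train time DropPath zeroes the branch for a random subset of samples and rescales
+# the survivors by 1/keep_prob, so the expected output matches eval mode; LayerScale
+# then multiplies the branch by a learned per-channel gamma.
+#
+#   import torch
+#   x = torch.randn(8, 16, 32)                  # (batch, tokens, dim)
+#   keep_prob = 0.75                            # i.e. drop_prob = 0.25
+#   mask = x.new_empty(8, 1, 1).bernoulli_(keep_prob).div_(keep_prob)
+#   gamma = torch.full((32,), 1e-5)             # LayerScale init_values
+#   out = x + mask * (gamma * x)                # kept samples get a scaled branch
+#   assert out.shape == x.shape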
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..574abe41175568d700a389b8b96d1ba554914779 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/patch_embed.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +import torch.nn as nn + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}" + assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}" + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py new file mode 100644 index 0000000000000000000000000000000000000000..b3324b266fb0a50ccf8c3a0ede2ae10ac4dfa03e --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/layers/swiglu_ffn.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Callable, Optional + +from torch import Tensor, nn +import torch.nn.functional as F + + +class SwiGLUFFN(nn.Module): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +try: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + +class SwiGLUFFNFused(SwiGLU): + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e80dadb2d57056e9f6f4989cd24a3c7e26fee23f --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/logging/__init__.py @@ -0,0 +1,103 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import functools +import logging +import os +import sys +from typing import Optional + +import dinov2.distributed as distributed +from .helpers import MetricLogger, SmoothedValue + + +# So that calling _configure_logger multiple times won't add many handlers +@functools.lru_cache() +def _configure_logger( + name: Optional[str] = None, + *, + level: int = logging.DEBUG, + output: Optional[str] = None, +): + """ + Configure a logger. + + Adapted from Detectron2. + + Args: + name: The name of the logger to configure. + level: The logging level to use. + output: A file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + + Returns: + The configured logger. 
+ """ + + logger = logging.getLogger(name) + logger.setLevel(level) + logger.propagate = False + + # Loosely match Google glog format: + # [IWEF]yyyymmdd hh:mm:ss.uuuuuu threadid file:line] msg + # but use a shorter timestamp and include the logger name: + # [IWEF]yyyymmdd hh:mm:ss logger threadid file:line] msg + fmt_prefix = "%(levelname).1s%(asctime)s %(process)s %(name)s %(filename)s:%(lineno)s] " + fmt_message = "%(message)s" + fmt = fmt_prefix + fmt_message + datefmt = "%Y%m%d %H:%M:%S" + formatter = logging.Formatter(fmt=fmt, datefmt=datefmt) + + # stdout logging for main worker only + if distributed.is_main_process(): + handler = logging.StreamHandler(stream=sys.stdout) + handler.setLevel(logging.DEBUG) + handler.setFormatter(formatter) + logger.addHandler(handler) + + # file logging for all workers + if output: + if os.path.splitext(output)[-1] in (".txt", ".log"): + filename = output + else: + filename = os.path.join(output, "logs", "log.txt") + + if not distributed.is_main_process(): + global_rank = distributed.get_global_rank() + filename = filename + ".rank{}".format(global_rank) + + os.makedirs(os.path.dirname(filename), exist_ok=True) + + handler = logging.StreamHandler(open(filename, "a")) + handler.setLevel(logging.DEBUG) + handler.setFormatter(formatter) + logger.addHandler(handler) + + return logger + + +def setup_logging( + output: Optional[str] = None, + *, + name: Optional[str] = None, + level: int = logging.DEBUG, + capture_warnings: bool = True, +) -> None: + """ + Setup logging. + + Args: + output: A file name or a directory to save log files. If None, log + files will not be saved. If output ends with ".txt" or ".log", it + is assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + name: The name of the logger to configure, by default the root logger. + level: The logging level to use. + capture_warnings: Whether warnings should be captured as logs. + """ + logging.captureWarnings(capture_warnings) + _configure_logger(name, level=level, output=output) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..16d643500d2ee10ffea5916aad07f9b9d7c0af6d --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/logging/helpers.py @@ -0,0 +1,195 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from collections import defaultdict, deque +import datetime +import json +import logging +import time + +import torch + +import dinov2.distributed as distributed + + +logger = logging.getLogger("dinov2") + + +class MetricLogger(object): + def __init__(self, delimiter="\t", output_file=None): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + self.output_file = output_file + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def dump_in_output_file(self, iteration, iter_time, data_time): + if self.output_file is None or not distributed.is_main_process(): + return + dict_to_dump = dict( + iteration=iteration, + iter_time=iter_time, + data_time=data_time, + ) + dict_to_dump.update({k: v.median for k, v in self.meters.items()}) + with open(self.output_file, "a") as f: + f.write(json.dumps(dict_to_dump) + "\n") + pass + + def log_every(self, iterable, print_freq, header=None, n_iterations=None, start_iteration=0): + i = start_iteration + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.6f}") + data_time = SmoothedValue(fmt="{avg:.6f}") + + if n_iterations is None: + n_iterations = len(iterable) + + space_fmt = ":" + str(len(str(n_iterations))) + "d" + + log_list = [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ] + if torch.cuda.is_available(): + log_list += ["max mem: {memory:.0f}"] + + log_msg = self.delimiter.join(log_list) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == n_iterations - 1: + self.dump_in_output_file(iteration=i, iter_time=iter_time.avg, data_time=data_time.avg) + eta_seconds = iter_time.global_avg * (n_iterations - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + logger.info( + log_msg.format( + i, + n_iterations, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + logger.info( + log_msg.format( + i, + n_iterations, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + ) + ) + i += 1 + end = time.time() + if i >= n_iterations: + break + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + logger.info("{} Total time: {} ({:.6f} s / it)".format(header, total_time_str, total_time / n_iterations)) + + +class SmoothedValue: + """Track a series of values and provide access to smoothed values over a + window or the global series average. 
+ """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, num=1): + self.deque.append(value) + self.count += num + self.total += value * num + + def synchronize_between_processes(self): + """ + Distributed synchronization of the metric + Warning: does not synchronize the deque! + """ + if not distributed.is_enabled(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") + torch.distributed.barrier() + torch.distributed.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..477b71b28259bf97b806df3f3d2f392dded866d6 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .dino_clstoken_loss import DINOLoss +from .ibot_patch_loss import iBOTPatchLoss +from .koleo_loss import KoLeoLoss diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..2f33897efb1084e6c1c14ae00bc93ab332c61074 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/dino_clstoken_loss.py @@ -0,0 +1,100 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import nn + + +class DINOLoss(nn.Module): + def __init__( + self, + out_dim, + student_temp=0.1, + center_momentum=0.9, + ): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.register_buffer("center", torch.zeros(1, out_dim)) + self.updated = True + self.reduce_handle = None + self.len_teacher_output = None + self.async_batch_center = None + + @torch.no_grad() + def softmax_center_teacher(self, teacher_output, teacher_temp): + self.apply_center_update() + # teacher centering and sharpening + return F.softmax((teacher_output - self.center) / teacher_temp, dim=-1) + + @torch.no_grad() + def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_iterations=3): + teacher_output = teacher_output.float() + world_size = dist.get_world_size() if dist.is_initialized() else 1 + Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper + B = Q.shape[1] * world_size # number of samples to assign + K = Q.shape[0] # how many prototypes + + # make the matrix sums to 1 + sum_Q = torch.sum(Q) + if dist.is_initialized(): + dist.all_reduce(sum_Q) + Q /= sum_Q + + for it in range(n_iterations): + # normalize each row: total weight per prototype must be 1/K + sum_of_rows = torch.sum(Q, dim=1, keepdim=True) + if dist.is_initialized(): + dist.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + + # normalize each column: total weight per sample must be 1/B + Q /= torch.sum(Q, dim=0, keepdim=True) + Q /= B + + Q *= B # the columns must sum to 1 so that Q is an assignment + return Q.t() + + def forward(self, student_output_list, teacher_out_softmaxed_centered_list): + """ + Cross-entropy between softmax outputs of the teacher and student networks. + """ + # TODO: Use cross_entropy_distribution here + total_loss = 0 + for s in student_output_list: + lsm = F.log_softmax(s / self.student_temp, dim=-1) + for t in teacher_out_softmaxed_centered_list: + loss = torch.sum(t * lsm, dim=-1) + total_loss -= loss.mean() + return total_loss + + @torch.no_grad() + def update_center(self, teacher_output): + self.reduce_center_update(teacher_output) + + @torch.no_grad() + def reduce_center_update(self, teacher_output): + self.updated = False + self.len_teacher_output = len(teacher_output) + self.async_batch_center = torch.sum(teacher_output, dim=0, keepdim=True) + if dist.is_initialized(): + self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) + + @torch.no_grad() + def apply_center_update(self): + if self.updated is False: + world_size = dist.get_world_size() if dist.is_initialized() else 1 + + if self.reduce_handle is not None: + self.reduce_handle.wait() + _t = self.async_batch_center / (self.len_teacher_output * world_size) + + self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) + + self.updated = True diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..16bc5cf634d661f1fa337304273f60dcd43c79c3 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/ibot_patch_loss.py @@ -0,0 +1,152 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import nn + +import logging + + +logger = logging.getLogger("dinov2") + + +try: + from xformers.ops import cross_entropy + + def lossfunc(t, s, temp): + s = s.float() + t = t.float() + if s.ndim == 2: + return -cross_entropy(s.unsqueeze(0), t.unsqueeze(0), temp, bw_inplace=True).squeeze(0) + elif s.ndim == 3: + return -cross_entropy(s, t, temp, bw_inplace=True) + +except ImportError: + + def lossfunc(t, s, temp): + return torch.sum(t * F.log_softmax(s / temp, dim=-1), dim=-1) + + +class iBOTPatchLoss(nn.Module): + def __init__(self, patch_out_dim, student_temp=0.1, center_momentum=0.9): + super().__init__() + self.student_temp = student_temp + self.center_momentum = center_momentum + self.register_buffer("center", torch.zeros(1, 1, patch_out_dim)) + self.updated = True + self.reduce_handle = None + self.len_teacher_patch_tokens = None + self.async_batch_center = None + + @torch.no_grad() + def softmax_center_teacher(self, teacher_patch_tokens, teacher_temp): + self.apply_center_update() + # teacher centering and sharpening + # + # WARNING: + # as self.center is a float32, everything gets casted to float32 afterwards + # + # teacher_patch_tokens = teacher_patch_tokens.float() + # return F.softmax((teacher_patch_tokens.sub_(self.center.to(teacher_patch_tokens.dtype))).mul_(1 / teacher_temp), dim=-1) + + return F.softmax((teacher_patch_tokens - self.center) / teacher_temp, dim=-1) + + # this is experimental, keep everything in float16 and let's see what happens: + # return F.softmax((teacher_patch_tokens.sub_(self.center)) / teacher_temp, dim=-1) + + @torch.no_grad() + def sinkhorn_knopp_teacher(self, teacher_output, teacher_temp, n_masked_patches_tensor, n_iterations=3): + teacher_output = teacher_output.float() + # world_size = dist.get_world_size() if dist.is_initialized() else 1 + Q = torch.exp(teacher_output / teacher_temp).t() # Q is K-by-B for consistency with notations from our paper + # B = Q.shape[1] * world_size # number of samples to assign + B = n_masked_patches_tensor + dist.all_reduce(B) + K = Q.shape[0] # how many prototypes + + # make the matrix sums to 1 + sum_Q = torch.sum(Q) + if dist.is_initialized(): + dist.all_reduce(sum_Q) + Q /= sum_Q + + for it in range(n_iterations): + # normalize each row: total weight per prototype must be 1/K + sum_of_rows = torch.sum(Q, dim=1, keepdim=True) + if dist.is_initialized(): + dist.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + + # normalize each column: total weight per sample must be 1/B + Q /= torch.sum(Q, dim=0, keepdim=True) + Q /= B + + Q *= B # the columns must sum to 1 so that Q is an assignment + return Q.t() + + def forward(self, student_patch_tokens, teacher_patch_tokens, student_masks_flat): + """ + Cross-entropy between softmax outputs of the teacher and student networks. 
+ student_patch_tokens: (B, N, D) tensor + teacher_patch_tokens: (B, N, D) tensor + student_masks_flat: (B, N) tensor + """ + t = teacher_patch_tokens + s = student_patch_tokens + loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) + loss = torch.sum(loss * student_masks_flat.float(), dim=-1) / student_masks_flat.sum(dim=-1).clamp(min=1.0) + return -loss.mean() + + def forward_masked( + self, + student_patch_tokens_masked, + teacher_patch_tokens_masked, + student_masks_flat, + n_masked_patches=None, + masks_weight=None, + ): + t = teacher_patch_tokens_masked + s = student_patch_tokens_masked + # loss = torch.sum(t * F.log_softmax(s / self.student_temp, dim=-1), dim=-1) + loss = lossfunc(t, s, self.student_temp) + if masks_weight is None: + masks_weight = ( + (1 / student_masks_flat.sum(-1).clamp(min=1.0)) + .unsqueeze(-1) + .expand_as(student_masks_flat)[student_masks_flat] + ) + if n_masked_patches is not None: + loss = loss[:n_masked_patches] + loss = loss * masks_weight + return -loss.sum() / student_masks_flat.shape[0] + + @torch.no_grad() + def update_center(self, teacher_patch_tokens): + self.reduce_center_update(teacher_patch_tokens) + + @torch.no_grad() + def reduce_center_update(self, teacher_patch_tokens): + self.updated = False + self.len_teacher_patch_tokens = len(teacher_patch_tokens) + self.async_batch_center = torch.sum(teacher_patch_tokens.mean(1), dim=0, keepdim=True) + if dist.is_initialized(): + self.reduce_handle = dist.all_reduce(self.async_batch_center, async_op=True) + + @torch.no_grad() + def apply_center_update(self): + if self.updated is False: + world_size = dist.get_world_size() if dist.is_initialized() else 1 + + if self.reduce_handle is not None: + self.reduce_handle.wait() + _t = self.async_batch_center / (self.len_teacher_patch_tokens * world_size) + + self.center = self.center * self.center_momentum + _t * (1 - self.center_momentum) + + self.updated = True diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e776d0426bb029cf48f25b0c94077720bc8421c4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/loss/koleo_loss.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# import torch.distributed as dist + + +logger = logging.getLogger("dinov2") + + +class KoLeoLoss(nn.Module): + """Kozachenko-Leonenko entropic loss regularizer from Sablayrolles et al. - 2018 - Spreading vectors for similarity search""" + + def __init__(self): + super().__init__() + self.pdist = nn.PairwiseDistance(2, eps=1e-8) + + def pairwise_NNs_inner(self, x): + """ + Pairwise nearest neighbors for L2-normalized vectors. + Uses Torch rather than Faiss to remain on GPU. 
+ """ + # parwise dot products (= inverse distance) + dots = torch.mm(x, x.t()) + n = x.shape[0] + dots.view(-1)[:: (n + 1)].fill_(-1) # Trick to fill diagonal with -1 + # max inner prod -> min distance + _, I = torch.max(dots, dim=1) # noqa: E741 + return I + + def forward(self, student_output, eps=1e-8): + """ + Args: + student_output (BxD): backbone output of student + """ + with torch.cuda.amp.autocast(enabled=False): + student_output = F.normalize(student_output, eps=eps, p=2, dim=-1) + I = self.pairwise_NNs_inner(student_output) # noqa: E741 + distances = self.pdist(student_output, student_output[I]) # BxD, BxD -> B + loss = -torch.log(distances + eps).mean() + return loss diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5e5a1f3832464f898752e57e865760e9864613cb --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/models/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +from . import vision_transformer as vits + + +logger = logging.getLogger("dinov2") + + +def build_model(args, only_teacher=False, img_size=224): + args.arch = args.arch.removesuffix("_memeff") + if "vit" in args.arch: + vit_kwargs = dict( + img_size=img_size, + patch_size=args.patch_size, + init_values=args.layerscale, + ffn_layer=args.ffn_layer, + block_chunks=args.block_chunks, + qkv_bias=args.qkv_bias, + proj_bias=args.proj_bias, + ffn_bias=args.ffn_bias, + ) + teacher = vits.__dict__[args.arch](**vit_kwargs) + if only_teacher: + return teacher, teacher.embed_dim + student = vits.__dict__[args.arch]( + **vit_kwargs, + drop_path_rate=args.drop_path_rate, + drop_path_uniform=args.drop_path_uniform, + ) + embed_dim = student.embed_dim + return student, teacher, embed_dim + + +def build_model_from_cfg(cfg, only_teacher=False): + return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..18e159a986336af813c8f0e505b946f42cd83e47 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/models/vision_transformer.py @@ -0,0 +1,358 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = 
SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode="bicubic", + ) + + assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_patchtokens": x_norm[:, 1:], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_patchtokens": x_norm[:, 1:], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is 
an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_base(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_large(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py 
b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0952fcc3f57e34b3747962e9ebd6fc57aeea63fa --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py new file mode 100644 index 0000000000000000000000000000000000000000..15d674b78b0629aa0f041c2426c894925469a0e8 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/knn.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.eval.knn import get_args_parser as get_knn_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.knn import main as knn_main + + self._setup_args() + knn_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 k-NN evaluation" + knn_args_parser = get_knn_args_parser(add_help=False) + parents = [knn_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" + submit_jobs(Evaluator, args, name="dinov2:knn") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c264762ac6bb82a3622c74e1e683ea5c6be437 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/linear.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
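+# Note on the Evaluator pattern above: it follows submitit's checkpointing protocol.
+# When SLURM preempts or times out a job, submitit invokes .checkpoint(), whose
+# DelayedSubmission requeues a fresh instance with the same args; the "%j" token in
+# output_dir is only resolved to the real job id inside _setup_args, on the worker.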
+ +import logging +import os +import sys + +from dinov2.eval.linear import get_args_parser as get_linear_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.linear import main as linear_main + + self._setup_args() + linear_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 linear evaluation" + linear_args_parser = get_linear_args_parser(add_help=False) + parents = [linear_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" + submit_jobs(Evaluator, args, name="dinov2:linear") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..9d3d5a5742792fc8d4ca3b39c15c47e8aa349bc7 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/eval/log_regression.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.eval.log_regression import get_args_parser as get_log_regression_args_parser +from dinov2.logging import setup_logging +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Evaluator: + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.eval.log_regression import main as log_regression_main + + self._setup_args() + log_regression_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 logistic evaluation" + log_regression_args_parser = get_log_regression_args_parser(add_help=False) + parents = [log_regression_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" 
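+    # submit_jobs (defined in dinov2/run/submit.py later in this diff) wraps the
+    # parsed SLURM flags in a submitit AutoExecutor and queues Evaluator(args).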
+ submit_jobs(Evaluator, args, name="dinov2:logreg") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py new file mode 100644 index 0000000000000000000000000000000000000000..68140f3d6d93dc67ccd7c45fe712eb15483d1ad6 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/submit.py @@ -0,0 +1,123 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import logging +import os +from pathlib import Path +from typing import List, Optional + +import submitit + +from dinov2.utils.cluster import ( + get_slurm_executor_parameters, + get_slurm_partition, + get_user_checkpoint_path, +) + + +logger = logging.getLogger("dinov2") + + +def get_args_parser( + description: Optional[str] = None, + parents: Optional[List[argparse.ArgumentParser]] = None, + add_help: bool = True, +) -> argparse.ArgumentParser: + parents = parents or [] + slurm_partition = get_slurm_partition() + parser = argparse.ArgumentParser( + description=description, + parents=parents, + add_help=add_help, + ) + parser.add_argument( + "--ngpus", + "--gpus", + "--gpus-per-node", + default=8, + type=int, + help="Number of GPUs to request on each node", + ) + parser.add_argument( + "--nodes", + "--nnodes", + default=2, + type=int, + help="Number of nodes to request", + ) + parser.add_argument( + "--timeout", + default=2800, + type=int, + help="Duration of the job", + ) + parser.add_argument( + "--partition", + default=slurm_partition, + type=str, + help="Partition where to submit", + ) + parser.add_argument( + "--use-volta32", + action="store_true", + help="Request V100-32GB GPUs", + ) + parser.add_argument( + "--comment", + default="", + type=str, + help="Comment to pass to scheduler, e.g. 
priority message", + ) + parser.add_argument( + "--exclude", + default="", + type=str, + help="Nodes to exclude", + ) + return parser + + +def get_shared_folder() -> Path: + user_checkpoint_path = get_user_checkpoint_path() + if user_checkpoint_path is None: + raise RuntimeError("Path to user checkpoint cannot be determined") + path = user_checkpoint_path / "experiments" + path.mkdir(exist_ok=True) + return path + + +def submit_jobs(task_class, args, name: str): + if not args.output_dir: + args.output_dir = str(get_shared_folder() / "%j") + + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + executor = submitit.AutoExecutor(folder=args.output_dir, slurm_max_num_timeout=30) + + kwargs = {} + if args.use_volta32: + kwargs["slurm_constraint"] = "volta32gb" + if args.comment: + kwargs["slurm_comment"] = args.comment + if args.exclude: + kwargs["slurm_exclude"] = args.exclude + + executor_params = get_slurm_executor_parameters( + nodes=args.nodes, + num_gpus_per_node=args.ngpus, + timeout_min=args.timeout, # max is 60 * 72 + slurm_signal_delay_s=120, + slurm_partition=args.partition, + **kwargs, + ) + executor.update_parameters(name=name, **executor_params) + + task = task_class(args) + job = executor.submit(task) + + logger.info(f"Submitted job_id: {job.job_id}") + str_output_dir = os.path.abspath(args.output_dir).replace("%j", str(job.job_id)) + logger.info(f"Logs and checkpoints will be saved at: {str_output_dir}") diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..24716f2a314820a4cc15289fe0cb13ad52cf343c --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/run/train/train.py @@ -0,0 +1,60 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import sys + +from dinov2.logging import setup_logging +from dinov2.train import get_args_parser as get_train_args_parser +from dinov2.run.submit import get_args_parser, submit_jobs + + +logger = logging.getLogger("dinov2") + + +class Trainer(object): + def __init__(self, args): + self.args = args + + def __call__(self): + from dinov2.train import main as train_main + + self._setup_args() + train_main(self.args) + + def checkpoint(self): + import submitit + + logger.info(f"Requeuing {self.args}") + empty = type(self)(self.args) + return submitit.helpers.DelayedSubmission(empty) + + def _setup_args(self): + import submitit + + job_env = submitit.JobEnvironment() + self.args.output_dir = self.args.output_dir.replace("%j", str(job_env.job_id)) + logger.info(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") + logger.info(f"Args: {self.args}") + + +def main(): + description = "Submitit launcher for DINOv2 training" + train_args_parser = get_train_args_parser(add_help=False) + parents = [train_args_parser] + args_parser = get_args_parser(description=description, parents=parents) + args = args_parser.parse_args() + + setup_logging() + + assert os.path.exists(args.config_file), "Configuration file does not exist!" 
+ submit_jobs(Trainer, args, name="dinov2:train") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b0b66d17aa547ed5560e75a03f5c1587da2d4fd7 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .train import get_args_parser, main +from .ssl_meta_arch import SSLMetaArch diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..86d0c2413f9abc61953d0e12b43a5a843d97d244 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/ssl_meta_arch.py @@ -0,0 +1,403 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +import logging + +import torch +from torch import nn + +from dinov2.loss import DINOLoss, iBOTPatchLoss, KoLeoLoss +from dinov2.models import build_model_from_cfg +from dinov2.layers import DINOHead +from dinov2.utils.utils import has_batchnorms +from dinov2.utils.param_groups import get_params_groups_with_decay, fuse_params_groups +from dinov2.fsdp import get_fsdp_wrapper, ShardedGradScaler, get_fsdp_modules, reshard_fsdp_model + +from dinov2.models.vision_transformer import BlockChunk + +try: + from xformers.ops import fmha + + XFORMERS_AVAILABLE = True +except ImportError: + XFORMERS_AVAILABLE = False +assert XFORMERS_AVAILABLE, "xFormers is required for DINOv2 training" + + +logger = logging.getLogger("dinov2") + + +class SSLMetaArch(nn.Module): + def __init__(self, cfg): + super().__init__() + self.cfg = cfg + self.fp16_scaler = ShardedGradScaler() if cfg.compute_precision.grad_scaler else None + + student_model_dict = dict() + teacher_model_dict = dict() + + student_backbone, teacher_backbone, embed_dim = build_model_from_cfg(cfg) + student_model_dict["backbone"] = student_backbone + teacher_model_dict["backbone"] = teacher_backbone + logger.info(f"OPTIONS -- architecture : embed_dim: {embed_dim}") + + if cfg.student.pretrained_weights: + chkpt = torch.load(cfg.student.pretrained_weights) + logger.info(f"OPTIONS -- pretrained weights: loading from {cfg.student.pretrained_weights}") + student_backbone.load_state_dict(chkpt["model"], strict=False) + + self.embed_dim = embed_dim + self.dino_out_dim = cfg.dino.head_n_prototypes + + self.do_dino = cfg.dino.loss_weight > 0 + self.do_koleo = cfg.dino.koleo_loss_weight > 0 + self.do_ibot = cfg.ibot.loss_weight > 0 + self.ibot_separate_head = cfg.ibot.separate_head + + logger.info("OPTIONS -- DINO") + if self.do_dino: + logger.info(f"OPTIONS -- DINO -- loss_weight: {cfg.dino.loss_weight}") + logger.info(f"OPTIONS -- DINO -- head_n_prototypes: {cfg.dino.head_n_prototypes}") + logger.info(f"OPTIONS -- DINO -- head_bottleneck_dim: 
{cfg.dino.head_bottleneck_dim}") + logger.info(f"OPTIONS -- DINO -- head_hidden_dim: {cfg.dino.head_hidden_dim}") + self.dino_loss_weight = cfg.dino.loss_weight + dino_head = partial( + DINOHead, + in_dim=embed_dim, + out_dim=cfg.dino.head_n_prototypes, + hidden_dim=cfg.dino.head_hidden_dim, + bottleneck_dim=cfg.dino.head_bottleneck_dim, + nlayers=cfg.dino.head_nlayers, + ) + self.dino_loss = DINOLoss(self.dino_out_dim) + if self.do_koleo: + logger.info("OPTIONS -- DINO -- applying KOLEO regularization") + self.koleo_loss = KoLeoLoss() + + else: + logger.info("OPTIONS -- DINO -- not using DINO") + + if self.do_dino or self.do_ibot: + student_model_dict["dino_head"] = dino_head() + teacher_model_dict["dino_head"] = dino_head() + + logger.info("OPTIONS -- IBOT") + logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}") + logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_ratio_tuple: {cfg.ibot.mask_ratio_min_max}") + logger.info(f"OPTIONS -- IBOT masking -- ibot_mask_sample_probability: {cfg.ibot.mask_sample_probability}") + if self.do_ibot: + self.ibot_loss_weight = cfg.ibot.loss_weight + assert max(cfg.ibot.mask_ratio_min_max) > 0, "please provide a positive mask ratio tuple for ibot" + assert cfg.ibot.mask_sample_probability > 0, "please provide a positive mask probability for ibot" + self.ibot_out_dim = cfg.ibot.head_n_prototypes if self.ibot_separate_head else cfg.dino.head_n_prototypes + self.ibot_patch_loss = iBOTPatchLoss(self.ibot_out_dim) + if self.ibot_separate_head: + logger.info(f"OPTIONS -- IBOT -- loss_weight: {cfg.ibot.loss_weight}") + logger.info(f"OPTIONS -- IBOT -- head_n_prototypes: {cfg.ibot.head_n_prototypes}") + logger.info(f"OPTIONS -- IBOT -- head_bottleneck_dim: {cfg.ibot.head_bottleneck_dim}") + logger.info(f"OPTIONS -- IBOT -- head_hidden_dim: {cfg.ibot.head_hidden_dim}") + ibot_head = partial( + DINOHead, + in_dim=embed_dim, + out_dim=cfg.ibot.head_n_prototypes, + hidden_dim=cfg.ibot.head_hidden_dim, + bottleneck_dim=cfg.ibot.head_bottleneck_dim, + nlayers=cfg.ibot.head_nlayers, + ) + student_model_dict["ibot_head"] = ibot_head() + teacher_model_dict["ibot_head"] = ibot_head() + else: + logger.info("OPTIONS -- IBOT -- head shared with DINO") + + self.need_to_synchronize_fsdp_streams = True + + self.student = nn.ModuleDict(student_model_dict) + self.teacher = nn.ModuleDict(teacher_model_dict) + + # there is no backpropagation through the teacher, so no need for gradients + for p in self.teacher.parameters(): + p.requires_grad = False + logger.info(f"Student and Teacher are built: they are both {cfg.student.arch} network.") + + def forward(self, inputs): + raise NotImplementedError + + def backprop_loss(self, loss): + if self.fp16_scaler is not None: + self.fp16_scaler.scale(loss).backward() + else: + loss.backward() + + def forward_backward(self, images, teacher_temp): + n_global_crops = 2 + assert n_global_crops == 2 + n_local_crops = self.cfg.crops.local_crops_number + + global_crops = images["collated_global_crops"].cuda(non_blocking=True) + local_crops = images["collated_local_crops"].cuda(non_blocking=True) + + masks = images["collated_masks"].cuda(non_blocking=True) + mask_indices_list = images["mask_indices_list"].cuda(non_blocking=True) + n_masked_patches_tensor = images["n_masked_patches"].cuda(non_blocking=True) + n_masked_patches = mask_indices_list.shape[0] + upperbound = images["upperbound"] + masks_weight = images["masks_weight"].cuda(non_blocking=True) + + n_local_crops_loss_terms = max(n_local_crops * n_global_crops, 1) + 
n_global_crops_loss_terms = (n_global_crops - 1) * n_global_crops + + do_dino = self.do_dino + do_ibot = self.do_ibot + + # loss scales + ibot_loss_scale = 1.0 / n_global_crops + + # teacher output + @torch.no_grad() + def get_teacher_output(): + x, n_global_crops_teacher = global_crops, n_global_crops + teacher_backbone_output_dict = self.teacher.backbone(x, is_training=True) + teacher_cls_tokens = teacher_backbone_output_dict["x_norm_clstoken"] + teacher_cls_tokens = teacher_cls_tokens.chunk(n_global_crops_teacher) + # watch out: these are chunked and cat'd in reverse so A is matched to B in the global crops dino loss + teacher_cls_tokens = torch.cat((teacher_cls_tokens[1], teacher_cls_tokens[0])) + ibot_teacher_patch_tokens = teacher_backbone_output_dict["x_norm_patchtokens"] + _dim = ibot_teacher_patch_tokens.shape[-1] + n_cls_tokens = teacher_cls_tokens.shape[0] + + if do_ibot and not self.ibot_separate_head: + buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound + n_cls_tokens, _dim) + buffer_tensor_teacher[:n_cls_tokens].copy_(teacher_cls_tokens) + torch.index_select( + ibot_teacher_patch_tokens.flatten(0, 1), + dim=0, + index=mask_indices_list, + out=buffer_tensor_teacher[n_cls_tokens : n_cls_tokens + n_masked_patches], + ) + tokens_after_head = self.teacher.dino_head(buffer_tensor_teacher) + teacher_cls_tokens_after_head = tokens_after_head[:n_cls_tokens] + masked_teacher_patch_tokens_after_head = tokens_after_head[ + n_cls_tokens : n_cls_tokens + n_masked_patches + ] + elif do_ibot and self.ibot_separate_head: + buffer_tensor_teacher = ibot_teacher_patch_tokens.new_zeros(upperbound, _dim) + torch.index_select( + ibot_teacher_patch_tokens.flatten(0, 1), + dim=0, + index=mask_indices_list, + out=buffer_tensor_teacher[:n_masked_patches], + ) + teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens) + masked_teacher_patch_tokens_after_head = self.teacher.ibot_head(buffer_tensor_teacher)[ + :n_masked_patches + ] + else: + teacher_cls_tokens_after_head = self.teacher.dino_head(teacher_cls_tokens) + masked_teacher_ibot_softmaxed_centered = None + + if self.cfg.train.centering == "centering": + teacher_dino_softmaxed_centered_list = self.dino_loss.softmax_center_teacher( + teacher_cls_tokens_after_head, teacher_temp=teacher_temp + ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:]) + self.dino_loss.update_center(teacher_cls_tokens_after_head) + if do_ibot: + masked_teacher_patch_tokens_after_head = masked_teacher_patch_tokens_after_head.unsqueeze(0) + masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.softmax_center_teacher( + masked_teacher_patch_tokens_after_head[:, :n_masked_patches], teacher_temp=teacher_temp + ) + masked_teacher_ibot_softmaxed_centered = masked_teacher_ibot_softmaxed_centered.squeeze(0) + self.ibot_patch_loss.update_center(masked_teacher_patch_tokens_after_head[:n_masked_patches]) + + elif self.cfg.train.centering == "sinkhorn_knopp": + teacher_dino_softmaxed_centered_list = self.dino_loss.sinkhorn_knopp_teacher( + teacher_cls_tokens_after_head, teacher_temp=teacher_temp + ).view(n_global_crops_teacher, -1, *teacher_cls_tokens_after_head.shape[1:]) + + if do_ibot: + masked_teacher_ibot_softmaxed_centered = self.ibot_patch_loss.sinkhorn_knopp_teacher( + masked_teacher_patch_tokens_after_head, + teacher_temp=teacher_temp, + n_masked_patches_tensor=n_masked_patches_tensor, + ) + + else: + raise NotImplementedError + + return teacher_dino_softmaxed_centered_list, 
masked_teacher_ibot_softmaxed_centered + + teacher_dino_softmaxed_centered_list, masked_teacher_ibot_softmaxed_centered = get_teacher_output() + reshard_fsdp_model(self.teacher) + + loss_dict = {} + + loss_accumulator = 0 # for backprop + student_global_backbone_output_dict, student_local_backbone_output_dict = self.student.backbone( + [global_crops, local_crops], masks=[masks, None], is_training=True + ) + + inputs_for_student_head_list = [] + + # 1a: local crops cls tokens + student_local_cls_tokens = student_local_backbone_output_dict["x_norm_clstoken"] + inputs_for_student_head_list.append(student_local_cls_tokens.unsqueeze(0)) + + # 1b: global crops cls tokens + student_global_cls_tokens = student_global_backbone_output_dict["x_norm_clstoken"] + inputs_for_student_head_list.append(student_global_cls_tokens.unsqueeze(0)) + + # 1c: global crops patch tokens + if do_ibot: + _dim = student_global_backbone_output_dict["x_norm_clstoken"].shape[-1] + ibot_student_patch_tokens = student_global_backbone_output_dict["x_norm_patchtokens"] + buffer_tensor_patch_tokens = ibot_student_patch_tokens.new_zeros(upperbound, _dim) + buffer_tensor_patch_tokens[:n_masked_patches].copy_( + torch.index_select(ibot_student_patch_tokens.flatten(0, 1), dim=0, index=mask_indices_list) + ) + if not self.ibot_separate_head: + inputs_for_student_head_list.append(buffer_tensor_patch_tokens.unsqueeze(0)) + else: + student_global_masked_patch_tokens_after_head = self.student.ibot_head(buffer_tensor_patch_tokens)[ + :n_masked_patches + ] + + # 2: run + _attn_bias, cat_inputs = fmha.BlockDiagonalMask.from_tensor_list(inputs_for_student_head_list) + outputs_list = _attn_bias.split(self.student.dino_head(cat_inputs)) + + # 3a: local crops cls tokens + student_local_cls_tokens_after_head = outputs_list.pop(0).squeeze(0) + + # 3b: global crops cls tokens + student_global_cls_tokens_after_head = outputs_list.pop(0).squeeze(0) + + # 3c: global crops patch tokens + if do_ibot and not self.ibot_separate_head: + student_global_masked_patch_tokens_after_head = outputs_list.pop(0).squeeze(0)[:n_masked_patches] + + if n_local_crops > 0: + dino_local_crops_loss = self.dino_loss( + student_output_list=student_local_cls_tokens_after_head.chunk(n_local_crops), + teacher_out_softmaxed_centered_list=teacher_dino_softmaxed_centered_list, + ) / (n_global_crops_loss_terms + n_local_crops_loss_terms) + + # store for display + loss_dict["dino_local_crops_loss"] = dino_local_crops_loss + + # accumulate loss + loss_accumulator += self.dino_loss_weight * dino_local_crops_loss + + # process global crops + loss_scales = 2 # this is here since we process global crops together + + if do_dino: + # compute loss + dino_global_crops_loss = ( + self.dino_loss( + student_output_list=[student_global_cls_tokens_after_head], + teacher_out_softmaxed_centered_list=[ + teacher_dino_softmaxed_centered_list.flatten(0, 1) + ], # these were chunked and stacked in reverse so A is matched to B + ) + * loss_scales + / (n_global_crops_loss_terms + n_local_crops_loss_terms) + ) + + loss_dict["dino_global_crops_loss"] = dino_global_crops_loss + + # accumulate loss + loss_accumulator += self.dino_loss_weight * dino_global_crops_loss + + student_cls_tokens = student_global_cls_tokens + + if self.do_koleo: + koleo_loss = self.cfg.dino.koleo_loss_weight * sum( + self.koleo_loss(p) for p in student_cls_tokens.chunk(2) + ) # we don't apply koleo loss between cls tokens of a same image + loss_accumulator += koleo_loss + loss_dict["koleo_loss"] = ( + koleo_loss / loss_scales + ) 
# this is to display the same losses as before but we can remove eventually + + if do_ibot: + # compute loss + ibot_patch_loss = ( + self.ibot_patch_loss.forward_masked( + student_global_masked_patch_tokens_after_head, + masked_teacher_ibot_softmaxed_centered, + student_masks_flat=masks, + n_masked_patches=n_masked_patches, + masks_weight=masks_weight, + ) + * loss_scales + * ibot_loss_scale + ) + + # store for display + loss_dict["ibot_loss"] = ibot_patch_loss / 2 + + # accumulate loss + loss_accumulator += self.ibot_loss_weight * ibot_patch_loss + + self.backprop_loss(loss_accumulator) + + self.fsdp_synchronize_streams() + + return loss_dict + + def fsdp_synchronize_streams(self): + if self.need_to_synchronize_fsdp_streams: + torch.cuda.synchronize() + self.student.dino_head._streams = ( + self.teacher.dino_head._streams + ) = self.student.backbone._streams = self.teacher.backbone._streams + self.need_to_synchronize_fsdp_streams = False + + def update_teacher(self, m): + student_param_list = [] + teacher_param_list = [] + with torch.no_grad(): + for k in self.student.keys(): + for ms, mt in zip(get_fsdp_modules(self.student[k]), get_fsdp_modules(self.teacher[k])): + student_param_list += ms.params + teacher_param_list += mt.params + torch._foreach_mul_(teacher_param_list, m) + torch._foreach_add_(teacher_param_list, student_param_list, alpha=1 - m) + + def train(self): + super().train() + self.teacher.eval() + + def get_maybe_fused_params_for_submodel(self, m): + params_groups = get_params_groups_with_decay( + model=m, + lr_decay_rate=self.cfg.optim.layerwise_decay, + patch_embed_lr_mult=self.cfg.optim.patch_embed_lr_mult, + ) + fused_params_groups = fuse_params_groups(params_groups) + logger.info("fusing param groups") + + for g in fused_params_groups: + g["foreach"] = True + return fused_params_groups + + def get_params_groups(self): + all_params_groups = [] + for m in self.student.values(): + all_params_groups += self.get_maybe_fused_params_for_submodel(m) + return all_params_groups + + def prepare_for_distributed_training(self): + logger.info("DISTRIBUTED FSDP -- preparing model for distributed training") + if has_batchnorms(self.student): + raise NotImplementedError + # below will synchronize all student subnetworks across gpus: + for k, v in self.student.items(): + self.teacher[k].load_state_dict(self.student[k].state_dict()) + student_model_cfg = self.cfg.compute_precision.student[k] + self.student[k] = get_fsdp_wrapper(student_model_cfg, modules_to_wrap={BlockChunk})(self.student[k]) + teacher_model_cfg = self.cfg.compute_precision.teacher[k] + self.teacher[k] = get_fsdp_wrapper(teacher_model_cfg, modules_to_wrap={BlockChunk})(self.teacher[k]) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5279b9c4317e56b5c0a9c39f7bf9bf56b04a1f8b --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/train/train.py @@ -0,0 +1,319 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
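update_teacher() above performs the DINO-style exponential moving average in one fused pass: every teacher tensor is scaled by the momentum m and the matching student tensor is added with weight 1 - m, i.e. teacher <- m * teacher + (1 - m) * student. A tiny sketch of the same arithmetic with the fused torch._foreach_* ops (the momentum value is illustrative; in training it comes from the cosine momentum schedule):

import torch


@torch.no_grad()
def ema_update(teacher_params, student_params, m: float = 0.992):
    # teacher <- m * teacher + (1 - m) * student, fused over all tensors
    torch._foreach_mul_(teacher_params, m)
    torch._foreach_add_(teacher_params, student_params, alpha=1 - m)


teacher = [torch.ones(3)]
student = [torch.zeros(3)]
ema_update(teacher, student, m=0.9)
print(teacher[0])  # tensor([0.9000, 0.9000, 0.9000])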
+ +import argparse +import logging +import math +import os +from functools import partial + +from fvcore.common.checkpoint import PeriodicCheckpointer +import torch + +from dinov2.data import SamplerType, make_data_loader, make_dataset +from dinov2.data import collate_data_and_cast, DataAugmentationDINO, MaskingGenerator +import dinov2.distributed as distributed +from dinov2.fsdp import FSDPCheckpointer +from dinov2.logging import MetricLogger +from dinov2.utils.config import setup +from dinov2.utils.utils import CosineScheduler + +from dinov2.train.ssl_meta_arch import SSLMetaArch + + +torch.backends.cuda.matmul.allow_tf32 = True # PyTorch 1.12 sets this to False by default +logger = logging.getLogger("dinov2") + + +def get_args_parser(add_help: bool = True): + parser = argparse.ArgumentParser("DINOv2 training", add_help=add_help) + parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") + parser.add_argument( + "--no-resume", + action="store_true", + help="Whether to not attempt to resume from the checkpoint directory. ", + ) + parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") + parser.add_argument("--eval", type=str, default="", help="Eval type to perform") + parser.add_argument( + "opts", + help=""" +Modify config options at the end of the command. For Yacs configs, use +space-separated "PATH.KEY VALUE" pairs. +For python-based LazyConfig, use "path.key=value". + """.strip(), + default=None, + nargs=argparse.REMAINDER, + ) + parser.add_argument( + "--output-dir", + "--output_dir", + default="", + type=str, + help="Output directory to save logs and checkpoints", + ) + + return parser + + +def build_optimizer(cfg, params_groups): + return torch.optim.AdamW(params_groups, betas=(cfg.optim.adamw_beta1, cfg.optim.adamw_beta2)) + + +def build_schedulers(cfg): + OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH + lr = dict( + base_value=cfg.optim["lr"], + final_value=cfg.optim["min_lr"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + warmup_iters=cfg.optim["warmup_epochs"] * OFFICIAL_EPOCH_LENGTH, + start_warmup_value=0, + ) + wd = dict( + base_value=cfg.optim["weight_decay"], + final_value=cfg.optim["weight_decay_end"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + ) + momentum = dict( + base_value=cfg.teacher["momentum_teacher"], + final_value=cfg.teacher["final_momentum_teacher"], + total_iters=cfg.optim["epochs"] * OFFICIAL_EPOCH_LENGTH, + ) + teacher_temp = dict( + base_value=cfg.teacher["teacher_temp"], + final_value=cfg.teacher["teacher_temp"], + total_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH, + warmup_iters=cfg.teacher["warmup_teacher_temp_epochs"] * OFFICIAL_EPOCH_LENGTH, + start_warmup_value=cfg.teacher["warmup_teacher_temp"], + ) + + lr_schedule = CosineScheduler(**lr) + wd_schedule = CosineScheduler(**wd) + momentum_schedule = CosineScheduler(**momentum) + teacher_temp_schedule = CosineScheduler(**teacher_temp) + last_layer_lr_schedule = CosineScheduler(**lr) + + last_layer_lr_schedule.schedule[ + : cfg.optim["freeze_last_layer_epochs"] * OFFICIAL_EPOCH_LENGTH + ] = 0 # mimicking the original schedules + + logger.info("Schedulers ready.") + + return ( + lr_schedule, + wd_schedule, + momentum_schedule, + teacher_temp_schedule, + last_layer_lr_schedule, + ) + + +def apply_optim_scheduler(optimizer, lr, wd, last_layer_lr): + for param_group in optimizer.param_groups: + is_last_layer = param_group["is_last_layer"] + lr_multiplier = 
param_group["lr_multiplier"] + wd_multiplier = param_group["wd_multiplier"] + param_group["weight_decay"] = wd * wd_multiplier + param_group["lr"] = (last_layer_lr if is_last_layer else lr) * lr_multiplier + + +def do_test(cfg, model, iteration): + new_state_dict = model.teacher.state_dict() + + if distributed.is_main_process(): + iterstring = str(iteration) + eval_dir = os.path.join(cfg.train.output_dir, "eval", iterstring) + os.makedirs(eval_dir, exist_ok=True) + # save teacher checkpoint + teacher_ckp_path = os.path.join(eval_dir, "teacher_checkpoint.pth") + torch.save({"teacher": new_state_dict}, teacher_ckp_path) + + +def do_train(cfg, model, resume=False): + model.train() + inputs_dtype = torch.half + fp16_scaler = model.fp16_scaler # for mixed precision training + + # setup optimizer + + optimizer = build_optimizer(cfg, model.get_params_groups()) + ( + lr_schedule, + wd_schedule, + momentum_schedule, + teacher_temp_schedule, + last_layer_lr_schedule, + ) = build_schedulers(cfg) + + # checkpointer + checkpointer = FSDPCheckpointer(model, cfg.train.output_dir, optimizer=optimizer, save_to_disk=True) + + start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + + OFFICIAL_EPOCH_LENGTH = cfg.train.OFFICIAL_EPOCH_LENGTH + max_iter = cfg.optim.epochs * OFFICIAL_EPOCH_LENGTH + + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, + period=3 * OFFICIAL_EPOCH_LENGTH, + max_iter=max_iter, + max_to_keep=3, + ) + + # setup data preprocessing + + img_size = cfg.crops.global_crops_size + patch_size = cfg.student.patch_size + n_tokens = (img_size // patch_size) ** 2 + mask_generator = MaskingGenerator( + input_size=(img_size // patch_size, img_size // patch_size), + max_num_patches=0.5 * img_size // patch_size * img_size // patch_size, + ) + + data_transform = DataAugmentationDINO( + cfg.crops.global_crops_scale, + cfg.crops.local_crops_scale, + cfg.crops.local_crops_number, + global_crops_size=cfg.crops.global_crops_size, + local_crops_size=cfg.crops.local_crops_size, + ) + + collate_fn = partial( + collate_data_and_cast, + mask_ratio_tuple=cfg.ibot.mask_ratio_min_max, + mask_probability=cfg.ibot.mask_sample_probability, + n_tokens=n_tokens, + mask_generator=mask_generator, + dtype=inputs_dtype, + ) + + # setup data loader + + dataset = make_dataset( + dataset_str=cfg.train.dataset_path, + transform=data_transform, + target_transform=lambda _: (), + ) + # sampler_type = SamplerType.INFINITE + sampler_type = SamplerType.SHARDED_INFINITE + data_loader = make_data_loader( + dataset=dataset, + batch_size=cfg.train.batch_size_per_gpu, + num_workers=cfg.train.num_workers, + shuffle=True, + seed=start_iter, # TODO: Fix this -- cfg.train.seed + sampler_type=sampler_type, + sampler_advance=0, # TODO(qas): fix this -- start_iter * cfg.train.batch_size_per_gpu, + drop_last=True, + collate_fn=collate_fn, + ) + + # training loop + + iteration = start_iter + + logger.info("Starting training from iteration {}".format(start_iter)) + metrics_file = os.path.join(cfg.train.output_dir, "training_metrics.json") + metric_logger = MetricLogger(delimiter=" ", output_file=metrics_file) + header = "Training" + + for data in metric_logger.log_every( + data_loader, + 10, + header, + max_iter, + start_iter, + ): + current_batch_size = data["collated_global_crops"].shape[0] / 2 + if iteration > max_iter: + return + + # apply schedules + + lr = lr_schedule[iteration] + wd = wd_schedule[iteration] + mom = momentum_schedule[iteration] + teacher_temp = 
teacher_temp_schedule[iteration] + last_layer_lr = last_layer_lr_schedule[iteration] + apply_optim_scheduler(optimizer, lr, wd, last_layer_lr) + + # compute losses + + optimizer.zero_grad(set_to_none=True) + loss_dict = model.forward_backward(data, teacher_temp=teacher_temp) + + # clip gradients + + if fp16_scaler is not None: + if cfg.optim.clip_grad: + fp16_scaler.unscale_(optimizer) + for v in model.student.values(): + v.clip_grad_norm_(cfg.optim.clip_grad) + fp16_scaler.step(optimizer) + fp16_scaler.update() + else: + if cfg.optim.clip_grad: + for v in model.student.values(): + v.clip_grad_norm_(cfg.optim.clip_grad) + optimizer.step() + + # perform teacher EMA update + + model.update_teacher(mom) + + # logging + + if distributed.get_global_size() > 1: + for v in loss_dict.values(): + torch.distributed.all_reduce(v) + loss_dict_reduced = {k: v.item() / distributed.get_global_size() for k, v in loss_dict.items()} + + if math.isnan(sum(loss_dict_reduced.values())): + logger.info("NaN detected") + raise AssertionError + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + + metric_logger.update(lr=lr) + metric_logger.update(wd=wd) + metric_logger.update(mom=mom) + metric_logger.update(last_layer_lr=last_layer_lr) + metric_logger.update(current_batch_size=current_batch_size) + metric_logger.update(total_loss=losses_reduced, **loss_dict_reduced) + + # checkpointing and testing + + if cfg.evaluation.eval_period_iterations > 0 and (iteration + 1) % cfg.evaluation.eval_period_iterations == 0: + do_test(cfg, model, f"training_{iteration}") + torch.cuda.synchronize() + periodic_checkpointer.step(iteration) + + iteration = iteration + 1 + metric_logger.synchronize_between_processes() + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +def main(args): + cfg = setup(args) + + model = SSLMetaArch(cfg).to(torch.device("cuda")) + model.prepare_for_distributed_training() + + logger.info("Model:\n{}".format(model)) + if args.eval_only: + iteration = ( + FSDPCheckpointer(model, save_dir=cfg.train.output_dir) + .resume_or_load(cfg.MODEL.WEIGHTS, resume=not args.no_resume) + .get("iteration", -1) + + 1 + ) + return do_test(cfg, model, f"manual_{iteration}") + + do_train(cfg, model, resume=not args.no_resume) + + +if __name__ == "__main__": + args = get_args_parser(add_help=True).parse_args() + main(args) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0952fcc3f57e34b3747962e9ebd6fc57aeea63fa --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py new file mode 100644 index 0000000000000000000000000000000000000000..8d98c05d68aa6e9dc165df3db06bd70d999b3fda --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/cluster.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
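The logging block in do_train() above averages each loss across ranks before recording it: all_reduce sums the per-rank scalars, and dividing by the world size turns that sum into a global mean. A sketch of that step in isolation (the helper name is illustrative, and it assumes an initialized torch.distributed process group):

import torch
import torch.distributed as dist


def reduce_loss_dict(loss_dict: dict, world_size: int) -> dict:
    # Sum each scalar loss across ranks, then divide to get the mean,
    # mirroring the reduction performed in do_train().
    if world_size > 1:
        for v in loss_dict.values():
            dist.all_reduce(v)  # default op is SUM
    return {k: v.item() / world_size for k, v in loss_dict.items()}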
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from enum import Enum +import os +from pathlib import Path +from typing import Any, Dict, Optional + + +class ClusterType(Enum): + AWS = "aws" + FAIR = "fair" + RSC = "rsc" + + +def _guess_cluster_type() -> ClusterType: + uname = os.uname() + if uname.sysname == "Linux": + if uname.release.endswith("-aws"): + # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws" + return ClusterType.AWS + elif uname.nodename.startswith("rsc"): + # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc" + return ClusterType.RSC + + return ClusterType.FAIR + + +def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]: + if cluster_type is None: + return _guess_cluster_type() + + return cluster_type + + +def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: + cluster_type = get_cluster_type(cluster_type) + if cluster_type is None: + return None + + CHECKPOINT_DIRNAMES = { + ClusterType.AWS: "checkpoints", + ClusterType.FAIR: "checkpoint", + ClusterType.RSC: "checkpoint/dino", + } + return Path("/") / CHECKPOINT_DIRNAMES[cluster_type] + + +def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]: + checkpoint_path = get_checkpoint_path(cluster_type) + if checkpoint_path is None: + return None + + username = os.environ.get("USER") + assert username is not None + return checkpoint_path / username + + +def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]: + cluster_type = get_cluster_type(cluster_type) + if cluster_type is None: + return None + + SLURM_PARTITIONS = { + ClusterType.AWS: "learnlab", + ClusterType.FAIR: "learnlab", + ClusterType.RSC: "learn", + } + return SLURM_PARTITIONS[cluster_type] + + +def get_slurm_executor_parameters( + nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs +) -> Dict[str, Any]: + # create default parameters + params = { + "mem_gb": 0, # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html + "gpus_per_node": num_gpus_per_node, + "tasks_per_node": num_gpus_per_node, # one task per GPU + "cpus_per_task": 10, + "nodes": nodes, + "slurm_partition": get_slurm_partition(cluster_type), + } + # apply cluster-specific adjustments + cluster_type = get_cluster_type(cluster_type) + if cluster_type == ClusterType.AWS: + params["cpus_per_task"] = 12 + del params["mem_gb"] + elif cluster_type == ClusterType.RSC: + params["cpus_per_task"] = 12 + # set additional parameters / apply overrides + params.update(kwargs) + return params diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c3763a8b0808ad45cbbfc1dcb00d52b00113f9ad --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/config.py @@ -0,0 +1,73 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
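For reference, get_slurm_executor_parameters() above merges the defaults, a cluster-specific adjustment, and any caller overrides, in that order. A small usage sketch (the constraint value mirrors the --use-volta32 path in run/submit.py; everything else is a plain call into the function defined above):

from dinov2.utils.cluster import ClusterType, get_slurm_executor_parameters

params = get_slurm_executor_parameters(
    nodes=2,
    num_gpus_per_node=8,
    cluster_type=ClusterType.AWS,
    slurm_constraint="volta32gb",  # extra kwargs pass straight through
)
# On the AWS profile, cpus_per_task is bumped to 12 and the mem_gb key removed,
# so params contains e.g. gpus_per_node=8, tasks_per_node=8, cpus_per_task=12.
print(params)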
+ +import math +import logging +import os + +from omegaconf import OmegaConf + +import dinov2.distributed as distributed +from dinov2.logging import setup_logging +from dinov2.utils import utils +from dinov2.configs import dinov2_default_config + + +logger = logging.getLogger("dinov2") + + +def apply_scaling_rules_to_cfg(cfg): # to fix + if cfg.optim.scaling_rule == "sqrt_wrt_1024": + base_lr = cfg.optim.base_lr + cfg.optim.lr = base_lr + cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0) + logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}") + else: + raise NotImplementedError + return cfg + + +def write_config(cfg, output_dir, name="config.yaml"): + logger.info(OmegaConf.to_yaml(cfg)) + saved_cfg_path = os.path.join(output_dir, name) + with open(saved_cfg_path, "w") as f: + OmegaConf.save(config=cfg, f=f) + return saved_cfg_path + + +def get_cfg_from_args(args): + args.output_dir = os.path.abspath(args.output_dir) + args.opts += [f"train.output_dir={args.output_dir}"] + default_cfg = OmegaConf.create(dinov2_default_config) + cfg = OmegaConf.load(args.config_file) + cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts)) + return cfg + + +def default_setup(args): + distributed.enable(overwrite=True) + seed = getattr(args, "seed", 0) + rank = distributed.get_global_rank() + + global logger + setup_logging(output=args.output_dir, level=logging.INFO) + logger = logging.getLogger("dinov2") + + utils.fix_random_seeds(seed + rank) + logger.info("git:\n {}\n".format(utils.get_sha())) + logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items()))) + + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg_from_args(args) + os.makedirs(args.output_dir, exist_ok=True) + default_setup(args) + apply_scaling_rules_to_cfg(cfg) + write_config(cfg, args.output_dir) + return cfg diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py new file mode 100644 index 0000000000000000000000000000000000000000..cef122b25ff3533e004799a1d977f63eb213fee0 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/dtype.py @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
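apply_scaling_rules_to_cfg() above implements the square-root scaling rule: the configured base learning rate is multiplied by sqrt(global_batch_size / 1024), where the global batch size is batch_size_per_gpu times the world size. A worked example with illustrative numbers (the base_lr value is not from this diff):

import math

base_lr = 0.004
batch_size_per_gpu, world_size = 32, 64          # global batch = 2048
lr = base_lr * math.sqrt(batch_size_per_gpu * world_size / 1024.0)
print(lr)  # 0.004 * sqrt(2) ~= 0.00566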
+ + +from typing import Dict, Union + +import numpy as np +import torch + + +TypeSpec = Union[str, np.dtype, torch.dtype] + + +_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = { + np.dtype("bool"): torch.bool, + np.dtype("uint8"): torch.uint8, + np.dtype("int8"): torch.int8, + np.dtype("int16"): torch.int16, + np.dtype("int32"): torch.int32, + np.dtype("int64"): torch.int64, + np.dtype("float16"): torch.float16, + np.dtype("float32"): torch.float32, + np.dtype("float64"): torch.float64, + np.dtype("complex64"): torch.complex64, + np.dtype("complex128"): torch.complex128, +} + + +def as_torch_dtype(dtype: TypeSpec) -> torch.dtype: + if isinstance(dtype, torch.dtype): + return dtype + if isinstance(dtype, str): + dtype = np.dtype(dtype) + assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}" + return _NUMPY_TO_TORCH_DTYPE[dtype] diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py new file mode 100644 index 0000000000000000000000000000000000000000..d707e70cc11591858d4166410d6ed80621cd49ff --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/param_groups.py @@ -0,0 +1,94 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from collections import defaultdict +import logging + + +logger = logging.getLogger("dinov2") + + +def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False): + """ + Calculate lr decay rate for different ViT blocks. + Args: + name (string): parameter name. + lr_decay_rate (float): base lr decay rate. + num_layers (int): number of ViT blocks. + Returns: + lr decay rate for the given parameter. + """ + layer_id = num_layers + 1 + if name.startswith("backbone") or force_is_backbone: + if ".pos_embed" in name or ".patch_embed" in name or ".mask_token" in name or ".cls_token" in name: + layer_id = 0 + elif force_is_backbone and ( + "pos_embed" in name or "patch_embed" in name or "mask_token" in name or "cls_token" in name + ): + layer_id = 0 + elif ".blocks." in name and ".residual." not in name: + layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1 + elif chunked_blocks and "blocks." in name and "residual." not in name: + layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1 + elif "blocks." in name and "residual."
not in name: + layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1 + + return lr_decay_rate ** (num_layers + 1 - layer_id) + + +def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0): + chunked_blocks = False + if hasattr(model, "n_blocks"): + logger.info("chunked fsdp") + n_blocks = model.n_blocks + chunked_blocks = model.chunked_blocks + elif hasattr(model, "blocks"): + logger.info("first code branch") + n_blocks = len(model.blocks) + elif hasattr(model, "backbone"): + logger.info("second code branch") + n_blocks = len(model.backbone.blocks) + else: + logger.info("else code branch") + n_blocks = 0 + all_param_groups = [] + + for name, param in model.named_parameters(): + name = name.replace("_fsdp_wrapped_module.", "") + if not param.requires_grad: + continue + decay_rate = get_vit_lr_decay_rate( + name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks + ) + d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name} + + if "last_layer" in name: + d.update({"is_last_layer": True}) + + if name.endswith(".bias") or "norm" in name or "gamma" in name: + d.update({"wd_multiplier": 0.0}) + + if "patch_embed" in name: + d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult}) + + all_param_groups.append(d) + logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""") + + return all_param_groups + + +def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")): + fused_params_groups = defaultdict(lambda: {"params": []}) + for d in all_params_groups: + identifier = "" + for k in keys: + identifier += k + str(d[k]) + "_" + + for k in keys: + fused_params_groups[identifier][k] = d[k] + fused_params_groups[identifier]["params"].append(d["params"]) + + return fused_params_groups.values() diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..53e63eb427f6d5396c8dc153ab07e825c72b68b4 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/dinov2/utils/utils.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
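get_vit_lr_decay_rate() above assigns layer_id 0 to the embeddings, layer_id i+1 to transformer block i, and num_layers + 1 to everything else, then returns lr_decay_rate ** (num_layers + 1 - layer_id), so earlier layers receive smaller learning-rate multipliers. Worked values for a 12-block ViT with an illustrative decay rate of 0.9:

rate, num_layers = 0.9, 12
for layer_id, label in [
    (0, "pos_embed / patch_embed"),   # embeddings
    (1, "blocks.0"),                  # first transformer block
    (12, "blocks.11"),                # last transformer block
    (13, "non-backbone params"),      # default: num_layers + 1
]:
    print(f"{label:24s} lr_multiplier = {rate ** (num_layers + 1 - layer_id):.3f}")
# pos_embed / patch_embed  lr_multiplier = 0.254
# blocks.0                 lr_multiplier = 0.282
# blocks.11                lr_multiplier = 0.900
# non-backbone params      lr_multiplier = 1.000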
+ +import logging +import os +import random +import subprocess +from urllib.parse import urlparse + +import numpy as np +import torch +from torch import nn + + +logger = logging.getLogger("dinov2") + + +def load_pretrained_weights(model, pretrained_weights, checkpoint_key): + if urlparse(pretrained_weights).scheme: # If it looks like an URL + state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu") + else: + state_dict = torch.load(pretrained_weights, map_location="cpu") + if checkpoint_key is not None and checkpoint_key in state_dict: + logger.info(f"Take key {checkpoint_key} in provided checkpoint dict") + state_dict = state_dict[checkpoint_key] + # remove `module.` prefix + state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + # remove `backbone.` prefix induced by multicrop wrapper + state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()} + msg = model.load_state_dict(state_dict, strict=False) + logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg)) + + +def fix_random_seeds(seed=31): + """ + Fix random seeds. + """ + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + np.random.seed(seed) + random.seed(seed) + + +def get_sha(): + cwd = os.path.dirname(os.path.abspath(__file__)) + + def _run(command): + return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() + + sha = "N/A" + diff = "clean" + branch = "N/A" + try: + sha = _run(["git", "rev-parse", "HEAD"]) + subprocess.check_output(["git", "diff"], cwd=cwd) + diff = _run(["git", "diff-index", "HEAD"]) + diff = "has uncommitted changes" if diff else "clean" + branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) + except Exception: + pass + message = f"sha: {sha}, status: {diff}, branch: {branch}" + return message + + +class CosineScheduler(object): + def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0): + super().__init__() + self.final_value = final_value + self.total_iters = total_iters + + freeze_schedule = np.zeros((freeze_iters)) + + warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) + + iters = np.arange(total_iters - warmup_iters - freeze_iters) + schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters))) + self.schedule = np.concatenate((freeze_schedule, warmup_schedule, schedule)) + + assert len(self.schedule) == self.total_iters + + def __getitem__(self, it): + if it >= self.total_iters: + return self.final_value + else: + return self.schedule[it] + + +def has_batchnorms(model): + bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm) + for name, module in model.named_modules(): + if isinstance(module, bn_types): + return True + return False diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/hubconf.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..b36b42cd2136182ea956d8be785cf492418163d8 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/hubconf.py @@ -0,0 +1,162 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
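CosineScheduler above concatenates an optional frozen-at-zero prefix, a linear warmup, and a cosine decay into one precomputed array, and indexing past total_iters clamps to final_value; this is how the lr/wd/momentum/teacher-temp schedules in train.py are consumed, one lookup per iteration. A quick sketch with illustrative values, assuming the dinov2 package from this diff is importable:

from dinov2.utils.utils import CosineScheduler

sched = CosineScheduler(
    base_value=1.0, final_value=0.0, total_iters=100,
    warmup_iters=10, start_warmup_value=0.0, freeze_iters=5,
)
print(sched[0])    # 0.0   -- inside the frozen prefix
print(sched[14])   # 1.0   -- end of the linear warmup
print(sched[57])   # ~0.51 -- partway down the cosine decay
print(sched[500])  # 0.0   -- past total_iters, clamped to final_value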
+ +from enum import Enum +from typing import Union + +import torch + +_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" + + +def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: + compact_arch_name = arch_name.replace("_", "")[:4] + registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" + return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" + + +class Weights(Enum): + LVD142M = "LVD142M" + + +def _make_dinov2_model( + *, + arch_name: str = "vit_large", + img_size: int = 518, + patch_size: int = 14, + init_values: float = 1.0, + ffn_layer: str = "mlp", + block_chunks: int = 0, + num_register_tokens: int = 0, + interpolate_antialias: bool = False, + interpolate_offset: float = 0.1, + pretrained: bool = True, + weights: Union[Weights, str] = Weights.LVD142M, + **kwargs, +): + import vision_transformer as vits + + if isinstance(weights, str): + try: + weights = Weights[weights] + except KeyError: + raise AssertionError(f"Unsupported weights: {weights}") + + model_base_name = _make_dinov2_model_name(arch_name, patch_size) + vit_kwargs = dict( + img_size=img_size, + patch_size=patch_size, + init_values=init_values, + ffn_layer=ffn_layer, + block_chunks=block_chunks, + num_register_tokens=num_register_tokens, + interpolate_antialias=interpolate_antialias, + interpolate_offset=interpolate_offset, + ) + vit_kwargs.update(**kwargs) + model = vits.__dict__[arch_name](**vit_kwargs) + + if pretrained: + model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens) + url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth" + state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu") + model.load_state_dict(state_dict, strict=True) + + return model + + +def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs) + + +def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + **kwargs, + ) + + +def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset. 
+ """ + return _make_dinov2_model( + arch_name="vit_small", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_base", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_large", + pretrained=pretrained, + weights=weights, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) + + +def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs): + """ + DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name="vit_giant2", + ffn_layer="swiglufused", + weights=weights, + pretrained=pretrained, + num_register_tokens=4, + interpolate_antialias=True, + interpolate_offset=0.0, + **kwargs, + ) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/setup.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..001987cfeef6c5fe3469ea09cd4698352fa90939 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/setup.py @@ -0,0 +1,87 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from pathlib import Path +import re +from typing import List, Tuple + +from setuptools import setup, find_packages + + +NAME = "dinov2" +DESCRIPTION = "PyTorch code and models for the DINOv2 self-supervised learning method." 
+ +URL = "https://github.com/facebookresearch/dinov2" +AUTHOR = "FAIR" +REQUIRES_PYTHON = ">=3.9.0" +HERE = Path(__file__).parent + + +try: + with open(HERE / "README.md", encoding="utf-8") as f: + long_description = "\n" + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + + +def get_requirements(path: str = HERE / "requirements.txt") -> Tuple[List[str], List[str]]: + requirements = [] + extra_indices = [] + with open(path) as f: + for line in f.readlines(): + line = line.rstrip("\r\n") + if line.startswith("--extra-index-url "): + extra_indices.append(line[18:]) + continue + requirements.append(line) + return requirements, extra_indices + + +def get_package_version() -> str: + with open(HERE / "dinov2/__init__.py") as f: + result = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M) + if result: + return result.group(1) + raise RuntimeError("Can't get package version") + + +requirements, extra_indices = get_requirements() +version = get_package_version() +dev_requirements, _ = get_requirements(HERE / "requirements-dev.txt") + + +setup( + name=NAME, + version=version, + description=DESCRIPTION, + long_description=long_description, + long_description_content_type="text/markdown", + author=AUTHOR, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(), + package_data={ + "": ["*.yaml"], + }, + install_requires=requirements, + dependency_links=extra_indices, + extras_require={ + "dev": dev_requirements, + }, + install_package_data=True, + license="CC-BY-NC", + license_files=("LICENSE",), + classifiers=[ + # Trove classifiers: https://github.com/pypa/trove-classifiers/blob/main/src/trove_classifiers/__init__.py + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: Other/Proprietary License", + "Programming Language :: Python :: 3.9", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules", + ], +) diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/utils.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6641404093652d5a2f19b4cf283d976ec39e64 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/utils.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
+ +import itertools +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2" + + +def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str: + compact_arch_name = arch_name.replace("_", "")[:4] + registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else "" + return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}" + + +class CenterPadding(nn.Module): + def __init__(self, multiple): + super().__init__() + self.multiple = multiple + + def _get_pad(self, size): + new_size = math.ceil(size / self.multiple) * self.multiple + pad_size = new_size - size + pad_size_left = pad_size // 2 + pad_size_right = pad_size - pad_size_left + return pad_size_left, pad_size_right + + @torch.inference_mode() + def forward(self, x): + pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1])) + output = F.pad(x, pads) + return output diff --git a/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/vision_transformer.py b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..121318f9c77a69a4467888cce44e49549e9954c0 --- /dev/null +++ b/head_extractor/build/lib/mmseg/torchhub/facebookresearch_dinov2_main/vision_transformer.py @@ -0,0 +1,395 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +from functools import partial +import math +import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + 
num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == "swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + if 
self.register_tokens is not None: + nn.init.normal_(self.register_tokens, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0 + w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset + # w0, h0 = w0 + 0.1, h0 + 0.1 + + sqrt_N = math.sqrt(N) + sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2), + scale_factor=(sx, sy), + # (int(w0), int(h0)), # to solve the upsampling shape issue + mode="bicubic", + antialias=self.interpolate_antialias + ) + + assert int(w0) == patch_pos_embed.shape[-2] + assert int(h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + if self.register_tokens is not None: + x = torch.cat( + ( + x[:, :1], + self.register_tokens.expand(x.shape[0], -1, -1), + x[:, 1:], + ), + dim=1, + ) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. 
If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. If it's a list, take them + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret["x_norm_clstoken"]) + + +def init_weights_vit_timm(module: nn.Module, name: str = ""): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_base(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_large(patch_size=16, num_register_tokens=0, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, 
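+        # An assumption based on the upstream DINOv2 layers, not on this
+        # file: MemEffAttention uses xFormers' memory-efficient attention
+        # kernel when that optional dependency is installed, and falls back
+        # to the standard attention implementation otherwise.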
attn_class=MemEffAttention), + num_register_tokens=num_register_tokens, + **kwargs, + ) + return model diff --git a/head_extractor/build/lib/mmseg/utils/__init__.py b/head_extractor/build/lib/mmseg/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d1160c2c160b0ba0e99e2a84f157f7c656111004 --- /dev/null +++ b/head_extractor/build/lib/mmseg/utils/__init__.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# yapf: disable +from .class_names import (ade_classes, ade_palette, bdd100k_classes, + bdd100k_palette, cityscapes_classes, + cityscapes_palette, cocostuff_classes, + cocostuff_palette, dataset_aliases, get_classes, + get_palette, isaid_classes, isaid_palette, + loveda_classes, loveda_palette, potsdam_classes, + potsdam_palette, stare_classes, stare_palette, + synapse_classes, synapse_palette, vaihingen_classes, + vaihingen_palette, voc_classes, voc_palette, + deepfashion_palette, deepfashion_classes, + fashion_3category_classes, fashion_3category_palette, + human_parsing_classes, human_parsing_palette, + lip_classes, lip_palette, + human_union_classes, human_union_palette, + deep_fashion_10k_classes, deep_fashion_10k_palette, + imaterialist_classes, imaterialist_palette, + imaterialist_5cat_classes, imaterialist_5cat_palette, + union_new_classes, union_new_palette, + union_new_add_mask_classes, union_new_add_mask_palette) + +# yapf: enable +from .collect_env import collect_env +from .get_templates import get_predefined_templates +from .io import datafrombytes +from .misc import add_prefix, stack_batch +from .set_env import register_all_modules +from .tokenizer import tokenize +from .typing_utils import (ConfigType, ForwardResults, MultiConfig, + OptConfigType, OptMultiConfig, OptSampleList, + SampleList, TensorDict, TensorList) + +# isort: off +from .mask_classification import MatchMasks, seg_data_to_instance_data + +__all__ = [ + 'union_new_add_mask_classes', + 'union_new_add_mask_palette', + 'union_new_classes', + 'union_new_palette', + 'imaterialist_5cat_classes', + 'imaterialist_5cat_palette', + 'imaterialist_classes', + 'imaterialist_palette', + 'human_union_classes', + 'human_union_palette', + 'deep_fashion_10k_classes', + 'deep_fashion_10k_palette', + 'lip_classes', + 'lip_palette', + 'human_parsing_classes', + 'human_parsing_palette', + 'fashion_3category_palette', + 'fashion_3category_classes', + 'deepfashion_palette', + 'deepfashion_classes', + 'collect_env', + 'register_all_modules', + 'stack_batch', + 'add_prefix', + 'ConfigType', + 'OptConfigType', + 'MultiConfig', + 'OptMultiConfig', + 'SampleList', + 'OptSampleList', + 'TensorDict', + 'TensorList', + 'ForwardResults', + 'cityscapes_classes', + 'ade_classes', + 'voc_classes', + 'cocostuff_classes', + 'loveda_classes', + 'potsdam_classes', + 'vaihingen_classes', + 'isaid_classes', + 'stare_classes', + 'cityscapes_palette', + 'ade_palette', + 'voc_palette', + 'cocostuff_palette', + 'loveda_palette', + 'potsdam_palette', + 'vaihingen_palette', + 'isaid_palette', + 'stare_palette', + 'dataset_aliases', + 'get_classes', + 'get_palette', + 'datafrombytes', + 'synapse_palette', + 'synapse_classes', + 'get_predefined_templates', + 'tokenize', + 'seg_data_to_instance_data', + 'MatchMasks', + 'bdd100k_classes', + 'bdd100k_palette', +] diff --git a/head_extractor/build/lib/mmseg/utils/bpe_simple_vocab_16e6.txt.gz b/head_extractor/build/lib/mmseg/utils/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 
0000000000000000000000000000000000000000..36a15856e00a06a9fbed8cdd34d2393fea4a3113 --- /dev/null +++ b/head_extractor/build/lib/mmseg/utils/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/head_extractor/build/lib/mmseg/utils/class_names.py b/head_extractor/build/lib/mmseg/utils/class_names.py new file mode 100644 index 0000000000000000000000000000000000000000..2636b83bd906a20ca04026bd6b1dba9fc819e59d --- /dev/null +++ b/head_extractor/build/lib/mmseg/utils/class_names.py @@ -0,0 +1,837 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.utils import is_str + + +def fashion_3category_classes(): + """fashion_3category class names for external use.""" + return [ + 'background', 'upper_body', 'lower_body', 'whole_body' + ] + +def fashion_3category_palette(): + """fashion_3category palette for external use.""" + return [ + [0,0,0], [255,0,0], [0, 255, 0], [0, 0, 255] + ] + +def human_parsing_classes(): + return [ + 'Background', + 'Hat', 'Hair', 'Sunglasses', 'Upper-clothes', + 'Skirt', 'Pants', 'Dress', 'Belt', 'Left-shoe', 'Right-shoe', + 'Face', 'Left-leg', 'Right-leg', 'Left-arm', 'Right-arm', 'Bag', 'Scarf', + ] + +def human_parsing_palette(): + return [ + [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], + [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], + [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], + [64, 0, 128], [66, 66, 66], [123, 66, 123], [22, 33, 44], [77, 88, 99], [23, 24, 77] + ] + +def human_union_classes(): + return [ + 'background', + 'hat', + 'hair', + 'sunglasses', + 'upper-clothes', + 'skirt', + 'pants', + 'dress', + 'belt', + 'shoes', + 'face', + 'legs', + 'arms', + 'bag', + 'scarf', + 'glove', + 'socks', + 'jumpsuits' + ] + +def human_union_palette(): + return [ + [0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], + [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], + [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], + [64, 0, 128], [66, 66, 66], [123, 66, 123], [22, 33, 44], [77, 88, 99], [23, 24, 77] + ] + +def deep_fashion_10k_classes(): + return [ + 'background', + 'top', + 'outer', + 'skirt', + 'dress', + 'pants', + 'leggings', + 'headwear', + 'eyeglass', + 'neckwear', + 'belt', + 'footwear', + 'bag', + 'hair', + 'face', + 'skin', + 'ring', + 'wrist wearing', + 'socks', + 'gloves', + 'necklace', + 'rompers', + 'earrings', + 'tie' + ] + +def deep_fashion_10k_palette(): + return [ + [0, 0, 0], [255, 0, 0], [0, 128, 0], [0, 0, 255], + [0, 128, 128], [238, 130, 238], [128, 128, 128], [255, 255, 0], + [255, 153, 18], [255, 125, 64], [127, 255, 0], [175, 238, 238], + [138, 43, 226], [210, 105, 30], [0, 0, 139], [72, 61, 139], + [255, 20, 147], [255, 192, 203], [205, 92, 92], [32, 178, 170], + [132, 112, 255], [160, 82, 45], [255, 222, 173], [240, 230, 140], + ] + +def union_new_classes(): + return [ + 'background', + 'top', + 'outer', + 'skirt', + 'dress', + 'pants', + 'leggings', + 'accessories', + 'belt', + 'footwear', + 'bag', + 'hair', + 'skin', + 'rompers', + ] + +def union_new_palette(): + return [ + [0, 0, 0], # background + [255, 0, 0], # top + [4, 63, 120], # outer + [127, 127, 127], # skirt + [80, 205, 207], # dress + [0, 255, 0], # pants + [230, 83, 223], # leggings + [207, 135, 41], # accessories, including wrist wear,ring,tie,etc... 
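+        # NB: every *_palette() here is a list of RGB triplets indexed by
+        # class id, so it must stay aligned one-to-one with the matching
+        # *_classes() list above.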
+ [0, 51, 51], # belt + [0, 153, 255], # footwear + [167,78,103], # bag + [0, 0, 255], # hair + [142, 124, 195], # skin + [74, 28, 28], # rompers + ] + +def union_new_add_mask_classes(): + return [ + 'background', + 'top', + 'outer', + 'skirt', + 'dress', + 'pants', + 'leggings', + 'accessories', + 'belt', + 'footwear', + 'bag', + 'hair', + 'skin', + 'rompers', + 'face_mask' + ] + +def union_new_add_mask_palette(): + return [ + [0, 0, 0], # background + [255, 0, 0], # top + [4, 63, 120], # outer + [127, 127, 127], # skirt + [80, 205, 207], # dress + [0, 255, 0], # pants + [230, 83, 223], # leggings + [207, 135, 41], # accessories, including wrist wear,ring,tie,etc... + [0, 51, 51], # belt + [0, 153, 255], # footwear + [167,78,103], # bag + [0, 0, 255], # hair + [142, 124, 195], # skin + [74, 28, 28], # rompers + [147, 196, 125], # face mask + ] + +def imaterialist_classes(): + return [ + 'background', + 'shirt, blouse', + 'top, t-shirt, sweatshirt', + 'sweater', + 'cardigan', + "jacket", + "vest", + "pants", + "shorts", + "skirt", + "coat", + "dress", + "jumpsuit", + "cape", + "glasses", + "hat", + "headband, head covering, hair accessory", + "tie", + "glove", + "watch", + "belt", + "leg warmer", + "tights, stockings", + "sock", + "shoe", + "bag, wallet", + "scarf", + "umbrella", + "hood", + "collar", + "lapel", + "epaulette", + "sleeve", + "pocket", + "neckline", + "buckle", + "zipper", + "applique", + "bead", + "bow", + "flower", + "fringe", + "ribbon", + "rivet", + "ruffle", + "sequin", + "tassel", + ] + +def imaterialist_palette(): + return [ + [0, 0, 0], + [234, 191, 155], [186, 99, 123], [46, 100, 157], [154, 71, 196], + [15, 185, 171], [13, 89, 100], [67, 216, 41], [212, 139, 166], + [10, 101, 73], [198, 51, 168], [38, 174, 154], [150, 192, 158], + [194, 243, 120], [10, 224, 173], [214, 94, 149], [211, 126, 18], + [96, 7, 165], [255, 35, 14], [83, 127, 78], [106, 23, 51], + [41, 244, 224], [38, 86, 244], [244, 234, 150], [233, 247, 180], + [222, 117, 26], [2, 90, 51], [27, 176, 90], [178, 160, 25], + [75, 52, 236], [119, 65, 186], [163, 254, 113], [39, 140, 118], + [235, 112, 193], [134, 107, 77], [57, 169, 93], [251, 104, 47], + [224, 14, 49], [20, 123, 134], [178, 32, 212], [116, 194, 248], + [211, 196, 233], [93, 36, 29], [113, 99, 55], [5, 7, 250], + [172, 174, 41], [101, 98, 209], + ] + +def imaterialist_5cat_classes(): + return [ + 'background', + 'upperbody', + 'lowerbody', + 'head_related', + 'others',] + +def imaterialist_5cat_palette(): + return [ + [0, 0, 0], [0, 0, 255], [255, 0, 0], [0, 255, 0], [128, 0, 196], + ] + +def cityscapes_classes(): + """Cityscapes class names for external use.""" + return [ + 'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +def ade_classes(): + """ADE20K class names for external use.""" + return [ + 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', + 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', + 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', + 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', + 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', + 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', + 'stairs', 'runway', 'case', 'pool table', 
'pillow', 'screen door', + 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', + 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', + 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', + 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver', + 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', + 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', + 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', + 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', + 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', + 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', + 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', + 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag' + ] + + +def voc_classes(): + """Pascal VOC class names for external use.""" + return [ + 'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', + 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', + 'tvmonitor' + ] + +def deepfashion_classes(): + """deepfashion class names for external use.""" + return [ + 'background', + 'sleeve top', 'long sleeve top', 'short sleeve outwear', 'long sleeve outwear', + 'vest', 'sling', 'shorts', 'trousers', 'skirt', 'short sleeve dress', + 'long sleeve dress', 'vest dress', 'sling dress' + ] + + +def pcontext_classes(): + """Pascal Context class names for external use.""" + return [ + 'aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', 'bird', + 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet', 'car', 'cat', + 'ceiling', 'chair', 'cloth', 'computer', 'cow', 'cup', 'curtain', + 'dog', 'door', 'fence', 'floor', 'flower', 'food', 'grass', 'ground', + 'horse', 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', + 'person', 'plate', 'platform', 'pottedplant', 'road', 'rock', 'sheep', + 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', 'table', 'track', + 'train', 'tree', 'truck', 'tvmonitor', 'wall', 'water', 'window', + 'wood' + ] + + +def cocostuff_classes(): + """CocoStuff class names for external use.""" + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner', + 'blanket', 'branch', 'bridge', 'building-other', 'bush', 'cabinet', + 'cage', 'cardboard', 'carpet', 'ceiling-other', 'ceiling-tile', + 'cloth', 'clothes', 'clouds', 'counter', 'cupboard', 'curtain', + 'desk-stuff', 'dirt', 'door-stuff', 
'fence', 'floor-marble', + 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood', 'flower', + 'fog', 'food-other', 'fruit', 'furniture-other', 'grass', 'gravel', + 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat', 'metal', + 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net', 'paper', + 'pavement', 'pillow', 'plant-other', 'plastic', 'platform', + 'playingfield', 'railing', 'railroad', 'river', 'road', 'rock', 'roof', + 'rug', 'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper', + 'snow', 'solid-other', 'stairs', 'stone', 'straw', 'structural-other', + 'table', 'tent', 'textile-other', 'towel', 'tree', 'vegetable', + 'wall-brick', 'wall-concrete', 'wall-other', 'wall-panel', + 'wall-stone', 'wall-tile', 'wall-wood', 'water-other', 'waterdrops', + 'window-blind', 'window-other', 'wood' + ] + + +def loveda_classes(): + """LoveDA class names for external use.""" + return [ + 'background', 'building', 'road', 'water', 'barren', 'forest', + 'agricultural' + ] + + +def potsdam_classes(): + """Potsdam class names for external use.""" + return [ + 'impervious_surface', 'building', 'low_vegetation', 'tree', 'car', + 'clutter' + ] + + +def vaihingen_classes(): + """Vaihingen class names for external use.""" + return [ + 'impervious_surface', 'building', 'low_vegetation', 'tree', 'car', + 'clutter' + ] + + +def isaid_classes(): + """iSAID class names for external use.""" + return [ + 'background', 'ship', 'store_tank', 'baseball_diamond', 'tennis_court', + 'basketball_court', 'Ground_Track_Field', 'Bridge', 'Large_Vehicle', + 'Small_Vehicle', 'Helicopter', 'Swimming_pool', 'Roundabout', + 'Soccer_ball_field', 'plane', 'Harbor' + ] + + +def stare_classes(): + """stare class names for external use.""" + return ['background', 'vessel'] + + +def mapillary_v1_classes(): + """mapillary_v1 class names for external use.""" + return [ + 'Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', + 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', + 'Pedestrian Area', 'Rail Track', 'Road', 'Service Lane', 'Sidewalk', + 'Bridge', 'Building', 'Tunnel', 'Person', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Crosswalk', 'Lane Marking - General', + 'Mountain', 'Sand', 'Sky', 'Snow', 'Terrain', 'Vegetation', 'Water', + 'Banner', 'Bench', 'Bike Rack', 'Billboard', 'Catch Basin', + 'CCTV Camera', 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', + 'Phone Booth', 'Pothole', 'Street Light', 'Pole', 'Traffic Sign Frame', + 'Utility Pole', 'Traffic Light', 'Traffic Sign (Back)', + 'Traffic Sign (Front)', 'Trash Can', 'Bicycle', 'Boat', 'Bus', 'Car', + 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', + 'Truck', 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled' + ] + + +def mapillary_v1_palette(): + """mapillary_v1_ palette for external use.""" + return [[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], + [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], + [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], + [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], + [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], + [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], + [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], + [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], + [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], + [33, 33, 33], [100, 128, 160], [142, 
0, 0], [70, 100, 150], + [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], + [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], + [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], + [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, 10], [0, 0, 0]] + + +def mapillary_v2_classes(): + """mapillary_v2 class names for external use.""" + return [ + 'Bird', 'Ground Animal', 'Ambiguous Barrier', 'Concrete Block', 'Curb', + 'Fence', 'Guard Rail', 'Barrier', 'Road Median', 'Road Side', + 'Lane Separator', 'Temporary Barrier', 'Wall', 'Bike Lane', + 'Crosswalk - Plain', 'Curb Cut', 'Driveway', 'Parking', + 'Parking Aisle', 'Pedestrian Area', 'Rail Track', 'Road', + 'Road Shoulder', 'Service Lane', 'Sidewalk', 'Traffic Island', + 'Bridge', 'Building', 'Garage', 'Tunnel', 'Person', 'Person Group', + 'Bicyclist', 'Motorcyclist', 'Other Rider', + 'Lane Marking - Dashed Line', 'Lane Marking - Straight Line', + 'Lane Marking - Zigzag Line', 'Lane Marking - Ambiguous', + 'Lane Marking - Arrow (Left)', 'Lane Marking - Arrow (Other)', + 'Lane Marking - Arrow (Right)', + 'Lane Marking - Arrow (Split Left or Straight)', + 'Lane Marking - Arrow (Split Right or Straight)', + 'Lane Marking - Arrow (Straight)', 'Lane Marking - Crosswalk', + 'Lane Marking - Give Way (Row)', 'Lane Marking - Give Way (Single)', + 'Lane Marking - Hatched (Chevron)', + 'Lane Marking - Hatched (Diagonal)', 'Lane Marking - Other', + 'Lane Marking - Stop Line', 'Lane Marking - Symbol (Bicycle)', + 'Lane Marking - Symbol (Other)', 'Lane Marking - Text', + 'Lane Marking (only) - Dashed Line', 'Lane Marking (only) - Crosswalk', + 'Lane Marking (only) - Other', 'Lane Marking (only) - Test', + 'Mountain', 'Sand', 'Sky', 'Snow', 'Terrain', 'Vegetation', 'Water', + 'Banner', 'Bench', 'Bike Rack', 'Catch Basin', 'CCTV Camera', + 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', 'Parking Meter', + 'Phone Booth', 'Pothole', 'Signage - Advertisement', + 'Signage - Ambiguous', 'Signage - Back', 'Signage - Information', + 'Signage - Other', 'Signage - Store', 'Street Light', 'Pole', + 'Pole Group', 'Traffic Sign Frame', 'Utility Pole', 'Traffic Cone', + 'Traffic Light - General (Single)', 'Traffic Light - Pedestrians', + 'Traffic Light - General (Upright)', + 'Traffic Light - General (Horizontal)', 'Traffic Light - Cyclists', + 'Traffic Light - Other', 'Traffic Sign - Ambiguous', + 'Traffic Sign (Back)', 'Traffic Sign - Direction (Back)', + 'Traffic Sign - Direction (Front)', 'Traffic Sign (Front)', + 'Traffic Sign - Parking', 'Traffic Sign - Temporary (Back)', + 'Traffic Sign - Temporary (Front)', 'Trash Can', 'Bicycle', 'Boat', + 'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', + 'Trailer', 'Truck', 'Vehicle Group', 'Wheeled Slow', 'Water Valve', + 'Car Mount', 'Dynamic', 'Ego Vehicle', 'Ground', 'Static', 'Unlabeled' + ] + + +def mapillary_v2_palette(): + """mapillary_v2_ palette for external use.""" + return [[165, 42, 42], [0, 192, 0], [250, 170, 31], [250, 170, 32], + [196, 196, 196], [190, 153, 153], [180, 165, 180], [90, 120, 150], + [250, 170, 33], [250, 170, 34], [128, 128, 128], [250, 170, 35], + [102, 102, 156], [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 36], [250, 170, 160], [250, 170, 37], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], [110, 110, 110], + [244, 35, 232], [128, 196, 128], [150, 100, 100], [70, 70, 70], + [150, 150, 150], [150, 120, 90], [220, 20, 60], [220, 20, 
60], + [255, 0, 0], [255, 0, 100], [255, 0, 200], [255, 255, 255], + [255, 255, 255], [250, 170, 29], [250, 170, 28], [250, 170, 26], + [250, 170, 25], [250, 170, 24], [250, 170, 22], [250, 170, 21], + [250, 170, 20], [255, 255, 255], [250, 170, 19], [250, 170, 18], + [250, 170, 12], [250, 170, 11], [255, 255, 255], [255, 255, 255], + [250, 170, 16], [250, 170, 15], [250, 170, 15], [255, 255, 255], + [255, 255, 255], [255, 255, 255], [255, 255, 255], [64, 170, 64], + [230, 160, 50], [70, 130, 180], [190, 255, 255], [152, 251, 152], + [107, 142, 35], [0, 170, 30], [255, 255, 128], [250, 0, 30], + [100, 140, 180], [220, 128, 128], [222, 40, 40], [100, 170, 30], + [40, 40, 40], [33, 33, 33], [100, 128, 160], [20, 20, 255], + [142, 0, 0], [70, 100, 150], [250, 171, 30], [250, 172, 30], + [250, 173, 30], [250, 174, 30], [250, 175, 30], [250, 176, 30], + [210, 170, 100], [153, 153, 153], [153, 153, 153], [128, 128, 128], + [0, 0, 80], [210, 60, 60], [250, 170, 30], [250, 170, 30], + [250, 170, 30], [250, 170, 30], [250, 170, 30], [250, 170, 30], + [192, 192, 192], [192, 192, 192], [192, 192, 192], [220, 220, 0], + [220, 220, 0], [0, 0, 196], [192, 192, 192], [220, 220, 0], + [140, 140, 20], [119, 11, 32], [150, 0, 255], [0, 60, 100], + [0, 0, 142], [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], + [0, 0, 110], [0, 0, 70], [0, 0, 142], [0, 0, 192], [170, 170, 170], + [32, 32, 32], [111, 74, 0], [120, 10, 10], [81, 0, 81], + [111, 111, 0], [0, 0, 0]] + + +def cityscapes_palette(): + """Cityscapes palette for external use.""" + return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], + [0, 0, 230], [119, 11, 32]] + + +def ade_palette(): + """ADE20K palette for external use.""" + return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 
255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + + +def voc_palette(): + """Pascal VOC palette for external use.""" + return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], + [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], + [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], + [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], + [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] + +def deepfashion_palette(): + """deepfashion palette for external use.""" + return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], + [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], + [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], + [64, 0, 128], [66, 66, 66]] + +def pcontext_palette(): + """Pascal Context palette for external use.""" + return [[180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], + [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230], + [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], + [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140], + [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200], + [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71], + [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92], + [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], + [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8], + [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8], + [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255], + [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140], + [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], + [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], + [0, 235, 255], [0, 173, 255], [31, 0, 255]] + + +def cocostuff_palette(): + """CocoStuff palette for external use.""" + return [[0, 192, 64], [0, 192, 64], [0, 64, 96], [128, 192, 192], + [0, 64, 64], [0, 192, 224], [0, 192, 192], [128, 192, 64], + [0, 192, 96], [128, 192, 64], [128, 32, 192], [0, 0, 224], + [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192], + [0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192], + [128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128], + [64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160], [0, 32, 0], + [0, 128, 128], [64, 128, 160], [128, 160, 0], [0, 128, 0], + [192, 128, 32], [128, 96, 128], [0, 0, 128], [64, 0, 32], + [0, 224, 128], [128, 0, 0], [192, 0, 160], [0, 96, 128], + [128, 128, 128], [64, 0, 160], [128, 224, 128], [128, 128, 64], + [192, 0, 32], [128, 96, 0], [128, 0, 192], [0, 128, 32], + [64, 224, 0], [0, 0, 64], [128, 128, 160], [64, 96, 0], + [0, 128, 192], [0, 128, 160], [192, 224, 0], [0, 128, 64], + [128, 128, 32], [192, 32, 128], [0, 64, 192], [0, 0, 32], + [64, 160, 128], [128, 64, 64], [128, 0, 160], [64, 32, 128], + [128, 192, 192], [0, 0, 160], [192, 160, 128], [128, 192, 0], + [128, 0, 96], [192, 32, 0], [128, 64, 128], [64, 128, 96], + [64, 160, 0], [0, 64, 0], [192, 
128, 224], [64, 32, 0], + [0, 192, 128], [64, 128, 224], [192, 160, 0], [0, 192, 0], + [192, 128, 96], [192, 96, 128], [0, 64, 128], [64, 0, 96], + [64, 224, 128], [128, 64, 0], [192, 0, 224], [64, 96, 128], + [128, 192, 128], [64, 0, 224], [192, 224, 128], [128, 192, 64], + [192, 0, 96], [192, 96, 0], [128, 64, 192], [0, 128, 96], + [0, 224, 0], [64, 64, 64], [128, 128, 224], [0, 96, 0], + [64, 192, 192], [0, 128, 224], [128, 224, 0], [64, 192, 64], + [128, 128, 96], [128, 32, 128], [64, 0, 192], [0, 64, 96], + [0, 160, 128], [192, 0, 64], [128, 64, 224], [0, 32, 128], + [192, 128, 192], [0, 64, 224], [128, 160, 128], [192, 128, 0], + [128, 64, 32], [128, 32, 64], [192, 0, 128], [64, 192, 32], + [0, 160, 64], [64, 0, 0], [192, 192, 160], [0, 32, 64], + [64, 128, 128], [64, 192, 160], [128, 160, 64], [64, 128, 0], + [192, 192, 32], [128, 96, 192], [64, 0, 128], [64, 64, 32], + [0, 224, 192], [192, 0, 0], [192, 64, 160], [0, 96, 192], + [192, 128, 128], [64, 64, 160], [128, 224, 192], [192, 128, 64], + [192, 64, 32], [128, 96, 64], [192, 0, 192], [0, 192, 32], + [64, 224, 64], [64, 0, 64], [128, 192, 160], [64, 96, 64], + [64, 128, 192], [0, 192, 160], [192, 224, 64], [64, 128, 64], + [128, 192, 32], [192, 32, 192], [64, 64, 192], [0, 64, 32], + [64, 160, 192], [192, 64, 64], [128, 64, 160], [64, 32, 192], + [192, 192, 192], [0, 64, 160], [192, 160, 192], [192, 192, 0], + [128, 64, 96], [192, 32, 64], [192, 64, 128], [64, 192, 96], + [64, 160, 64], [64, 64, 0]] + + +def loveda_palette(): + """LoveDA palette for external use.""" + return [[255, 255, 255], [255, 0, 0], [255, 255, 0], [0, 0, 255], + [159, 129, 183], [0, 255, 0], [255, 195, 128]] + + +def potsdam_palette(): + """Potsdam palette for external use.""" + return [[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], + [255, 255, 0], [255, 0, 0]] + + +def vaihingen_palette(): + """Vaihingen palette for external use.""" + return [[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], + [255, 255, 0], [255, 0, 0]] + + +def isaid_palette(): + """iSAID palette for external use.""" + return [[0, 0, 0], [0, 0, 63], [0, 63, 63], [0, 63, 0], [0, 63, 127], + [0, 63, 191], [0, 63, 255], [0, 127, 63], [0, 127, + 127], [0, 0, 127], + [0, 0, 191], [0, 0, 255], [0, 191, 127], [0, 127, 191], + [0, 127, 255], [0, 100, 155]] + + +def stare_palette(): + """STARE palette for external use.""" + return [[120, 120, 120], [6, 230, 230]] + + +def synapse_palette(): + """Synapse palette for external use.""" + return [[0, 0, 0], [0, 0, 255], [0, 255, 0], [255, 0, 0], [0, 255, 255], + [255, 0, 255], [255, 255, 0], [60, 255, 255], [240, 240, 240]] + + +def synapse_classes(): + """Synapse class names for external use.""" + return [ + 'background', 'aorta', 'gallbladder', 'left_kidney', 'right_kidney', + 'liver', 'pancreas', 'spleen', 'stomach' + ] + + +def lip_classes(): + """LIP class names for external use.""" + return [ + 'background', 'hat', 'hair', 'glove', 'sunglasses', 'upperclothes', + 'dress', 'coat', 'socks', 'pants', 'jumpsuits', 'scarf', 'skirt', + 'face', 'leftArm', 'rightArm', 'leftLeg', 'rightLeg', 'leftShoe', + 'rightShoe' + ] + + +def lip_palette(): + """LIP palette for external use.""" + return [ + [0, 0, 0], + [128, 0, 0], + [255, 0, 0], + [0, 85, 0], + [170, 0, 51], + [255, 85, 0], + [0, 0, 85], + [0, 119, 221], + [85, 85, 0], + [0, 85, 85], + [85, 51, 0], + [52, 86, 128], + [0, 128, 0], + [0, 0, 255], + [51, 170, 221], + [0, 255, 255], + [85, 255, 170], + [170, 255, 85], + [255, 255, 0], + [255, 170, 0], + ] + + +def 
bdd100k_classes():
+    """BDD100K class names for external use (the class names are compatible
+    with Cityscapes)."""
+    return [
+        'road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
+        'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky',
+        'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle',
+        'bicycle'
+    ]
+
+
+def bdd100k_palette():
+    """BDD100K palette for external use (same as Cityscapes)."""
+    return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156],
+            [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0],
+            [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60],
+            [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100],
+            [0, 0, 230], [119, 11, 32]]
+
+
+def hsidrive_classes():
+    """HSI Drive 2.0 class names for external use."""
+    return [
+        'unlabelled', 'road', 'road marks', 'vegetation', 'painted metal',
+        'sky', 'concrete', 'pedestrian', 'water', 'unpainted metal', 'glass'
+    ]
+
+
+def hsidrive_palette():
+    """HSI Drive 2.0 palette for external use."""
+    return [[0, 0, 0], [77, 77, 77], [255, 255, 255], [0, 255, 0], [255, 0, 0],
+            [0, 0, 255], [102, 51, 0], [255, 255, 0], [0, 207, 250],
+            [255, 166, 0], [0, 204, 204]]
+
+
+dataset_aliases = {
+    # keep the historical misspelling 'fashon_3category' as an alias, but
+    # also accept the correctly spelled name so lookups by the canonical
+    # name do not fail
+    'fashion_3category': ['fashion_3category', 'fashon_3category'],
+    'cityscapes': ['cityscapes'],
+    'ade': ['ade', 'ade20k'],
+    'voc': ['voc', 'pascal_voc', 'voc12', 'voc12aug'],
+    'pcontext': ['pcontext', 'pascal_context', 'voc2010'],
+    'loveda': ['loveda'],
+    'potsdam': ['potsdam'],
+    'vaihingen': ['vaihingen'],
+    'cocostuff': [
+        'cocostuff', 'cocostuff10k', 'cocostuff164k', 'coco-stuff',
+        'coco-stuff10k', 'coco-stuff164k', 'coco_stuff', 'coco_stuff10k',
+        'coco_stuff164k'
+    ],
+    'isaid': ['isaid', 'iSAID'],
+    'stare': ['stare', 'STARE'],
+    'lip': ['LIP', 'lip'],
+    'human_union': ['human_union', 'Human_Union'],
+    'union_new': ['union_new'],
+    'union_new_add_mask': ['union_new_add_mask'],
+    'deep_fashion_10k': ['deep_fashion_10k', 'Deep_Fashion_10K'],
+    'imaterialist': ['iMaterialist', 'imaterialist'],
+    'imaterialist_5cat': ['iMaterialist_5cat', 'imaterialist_5cat'],
+    'mapillary_v1': ['mapillary_v1'],
+    'mapillary_v2': ['mapillary_v2'],
+    'bdd100k': ['bdd100k'],
+    'hsidrive': [
+        'hsidrive', 'HSIDrive', 'HSI-Drive', 'hsidrive20', 'HSIDrive20',
+        'HSI-Drive20'
+    ]
+}
+
+
+def get_classes(dataset):
+    """Get class names of a dataset."""
+    alias2name = {}
+    for name, aliases in dataset_aliases.items():
+        for alias in aliases:
+            alias2name[alias] = name
+
+    if is_str(dataset):
+        if dataset in alias2name:
+            labels = eval(alias2name[dataset] + '_classes()')
+        else:
+            raise ValueError(f'Unrecognized dataset: {dataset}')
+    else:
+        raise TypeError(f'dataset must be a str, but got {type(dataset)}')
+    return labels
+
+
+def get_palette(dataset):
+    """Get class palette (RGB) of a dataset."""
+    alias2name = {}
+    for name, aliases in dataset_aliases.items():
+        for alias in aliases:
+            alias2name[alias] = name
+
+    if is_str(dataset):
+        if dataset in alias2name:
+            labels = eval(alias2name[dataset] + '_palette()')
+        else:
+            raise ValueError(f'Unrecognized dataset: {dataset}')
+    else:
+        raise TypeError(f'dataset must be a str, but got {type(dataset)}')
+    return labels
diff --git a/head_extractor/build/lib/mmseg/utils/collect_env.py b/head_extractor/build/lib/mmseg/utils/collect_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5d6ea290283e3af2f29475f82d225072cf39d99
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/utils/collect_env.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab.
All rights reserved. +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_base_env + +import mmseg + + +def collect_env(): + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' + + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/head_extractor/build/lib/mmseg/utils/get_templates.py b/head_extractor/build/lib/mmseg/utils/get_templates.py new file mode 100644 index 0000000000000000000000000000000000000000..7e9032ba96cbe750134676fe46fc26fb607779f5 --- /dev/null +++ b/head_extractor/build/lib/mmseg/utils/get_templates.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +PREDEFINED_TEMPLATES = { + 'imagenet': [ + 'a bad photo of a {}.', + 'a photo of many {}.', + 'a sculpture of a {}.', + 'a photo of the hard to see {}.', + 'a low resolution photo of the {}.', + 'a rendering of a {}.', + 'graffiti of a {}.', + 'a bad photo of the {}.', + 'a cropped photo of the {}.', + 'a tattoo of a {}.', + 'the embroidered {}.', + 'a photo of a hard to see {}.', + 'a bright photo of a {}.', + 'a photo of a clean {}.', + 'a photo of a dirty {}.', + 'a dark photo of the {}.', + 'a drawing of a {}.', + 'a photo of my {}.', + 'the plastic {}.', + 'a photo of the cool {}.', + 'a close-up photo of a {}.', + 'a black and white photo of the {}.', + 'a painting of the {}.', + 'a painting of a {}.', + 'a pixelated photo of the {}.', + 'a sculpture of the {}.', + 'a bright photo of the {}.', + 'a cropped photo of a {}.', + 'a plastic {}.', + 'a photo of the dirty {}.', + 'a jpeg corrupted photo of a {}.', + 'a blurry photo of the {}.', + 'a photo of the {}.', + 'a good photo of the {}.', + 'a rendering of the {}.', + 'a {} in a video game.', + 'a photo of one {}.', + 'a doodle of a {}.', + 'a close-up photo of the {}.', + 'a photo of a {}.', + 'the origami {}.', + 'the {} in a video game.', + 'a sketch of a {}.', + 'a doodle of the {}.', + 'a origami {}.', + 'a low resolution photo of a {}.', + 'the toy {}.', + 'a rendition of the {}.', + 'a photo of the clean {}.', + 'a photo of a large {}.', + 'a rendition of a {}.', + 'a photo of a nice {}.', + 'a photo of a weird {}.', + 'a blurry photo of a {}.', + 'a cartoon {}.', + 'art of a {}.', + 'a sketch of the {}.', + 'a embroidered {}.', + 'a pixelated photo of a {}.', + 'itap of the {}.', + 'a jpeg corrupted photo of the {}.', + 'a good photo of a {}.', + 'a plushie {}.', + 'a photo of the nice {}.', + 'a photo of the small {}.', + 'a photo of the weird {}.', + 'the cartoon {}.', + 'art of the {}.', + 'a drawing of the {}.', + 'a photo of the large {}.', + 'a black and white photo of a {}.', + 'the plushie {}.', + 'a dark photo of a {}.', + 'itap of a {}.', + 'graffiti of the {}.', + 'a toy {}.', + 'itap of my {}.', + 'a photo of a cool {}.', + 'a photo of a small {}.', + 'a tattoo of the {}.', + ], + 'vild': [ + 'a photo of a {}.', + 'This is a photo of a {}', + 'There is a {} in the scene', + 'There is the {} in the scene', + 'a photo of a {} in the scene', + 'a photo of a small {}.', + 'a photo of a medium {}.', + 'a photo of a large {}.', + 'This is a photo of a small {}.', + 'This is a photo of a medium {}.', + 'This is a photo of a large {}.', + 'There is a small {} in the scene.', + 'There is a medium {} in the scene.', + 'There is a large {} in the scene.', + ], +} + + +def 
get_predefined_templates(template_set_name: str) -> List[str]:
+    if template_set_name not in PREDEFINED_TEMPLATES:
+        raise ValueError(f'Template set {template_set_name} not found')
+    return PREDEFINED_TEMPLATES[template_set_name]
diff --git a/head_extractor/build/lib/mmseg/utils/io.py b/head_extractor/build/lib/mmseg/utils/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..7029c3cddda02c89cbb50cee9f8b7e7fa57378d9
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/utils/io.py
@@ -0,0 +1,42 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import gzip
+import io
+import pickle
+
+import cv2
+import numpy as np
+
+
+def datafrombytes(content: bytes, backend: str = 'numpy') -> np.ndarray:
+    """Data decoding from bytes.
+
+    Args:
+        content (bytes): The data bytes got from files or other streams.
+        backend (str): The data decoding backend type. Options are 'numpy',
+            'nifti', 'cv2' and 'pickle'. Defaults to 'numpy'.
+
+    Returns:
+        numpy.ndarray: Loaded data array.
+    """
+    if backend == 'pickle':
+        data = pickle.loads(content)
+    else:
+        with io.BytesIO(content) as f:
+            if backend == 'nifti':
+                f = gzip.open(f)
+                try:
+                    from nibabel import FileHolder, Nifti1Image
+                except ImportError:
+                    # nibabel is an optional dependency; fail fast with a
+                    # clear error instead of hitting a NameError below
+                    raise ImportError(
+                        'nifti files io depends on nibabel, please run '
+                        '`pip install nibabel` to install it')
+                fh = FileHolder(fileobj=f)
+                data = Nifti1Image.from_file_map({'header': fh, 'image': fh})
+                data = Nifti1Image.from_bytes(data.to_bytes()).get_fdata()
+            elif backend == 'numpy':
+                data = np.load(f)
+            elif backend == 'cv2':
+                data = np.frombuffer(f.read(), dtype=np.uint8)
+                data = cv2.imdecode(data, cv2.IMREAD_UNCHANGED)
+            else:
+                raise ValueError(f'unsupported backend: {backend}')
+    return data
diff --git a/head_extractor/build/lib/mmseg/utils/mask_classification.py b/head_extractor/build/lib/mmseg/utils/mask_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..205d5259754abfe07e0d84ae0739cf08043815ff
--- /dev/null
+++ b/head_extractor/build/lib/mmseg/utils/mask_classification.py
@@ -0,0 +1,205 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+from mmcv.ops import point_sample
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmseg.registry import TASK_UTILS
+from mmseg.utils import ConfigType, SampleList
+
+
+def seg_data_to_instance_data(ignore_index: int,
+                              batch_data_samples: SampleList):
+    """Convert the paradigm of ground truth from semantic segmentation to
+    instance segmentation.
+
+    Args:
+        ignore_index (int): The label index to be ignored.
+        batch_data_samples (List[SegDataSample]): The Data
+            Samples. It usually includes information such as
+            `gt_sem_seg`.
+
+    Returns:
+        List[InstanceData]: Batch of gt_instance. Each element usually
+            includes ``labels``, the unique ground truth label ids of one
+            image, with shape (num_gt, ), and ``masks``, the ground truth
+            mask of each instance of that image, with shape (num_gt, h, w).
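+
+    Example (an illustrative sketch, not from the source; assumes a
+    ``SegDataSample`` whose ``gt_sem_seg`` contains class ids 0 and 2 plus
+    pixels equal to ``ignore_index``):
+
+        >>> # gt = seg_data_to_instance_data(255, [data_sample])
+        >>> # gt[0].labels -> tensor([0, 2])
+        >>> # gt[0].masks.shape -> (2, h, w), one binary mask per class id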
+ """ + batch_gt_instances = [] + + for data_sample in batch_data_samples: + gt_sem_seg = data_sample.gt_sem_seg.data + classes = torch.unique( + gt_sem_seg, + sorted=False, + return_inverse=False, + return_counts=False) + + # remove ignored region + gt_labels = classes[classes != ignore_index] + + masks = [] + for class_id in gt_labels: + masks.append(gt_sem_seg == class_id) + + if len(masks) == 0: + gt_masks = torch.zeros( + (0, gt_sem_seg.shape[-2], + gt_sem_seg.shape[-1])).to(gt_sem_seg).long() + else: + gt_masks = torch.stack(masks).squeeze(1).long() + + instance_data = InstanceData(labels=gt_labels, masks=gt_masks) + batch_gt_instances.append(instance_data) + return batch_gt_instances + + +class MatchMasks: + """Match the predictions to category labels. + + Args: + num_points (int): the number of sampled points to compute cost. + num_queries (int): the number of prediction masks. + num_classes (int): the number of classes. + assigner (BaseAssigner): the assigner to compute matching. + """ + + def __init__(self, + num_points: int, + num_queries: int, + num_classes: int, + assigner: ConfigType = None): + assert assigner is not None, "\'assigner\' in decode_head.train_cfg" \ + 'cannot be None' + assert num_points > 0, 'num_points should be a positive integer.' + self.num_points = num_points + self.num_queries = num_queries + self.num_classes = num_classes + self.assigner = TASK_UTILS.build(assigner) + + def get_targets(self, cls_scores: List[Tensor], mask_preds: List[Tensor], + batch_gt_instances: List[InstanceData]) -> Tuple: + """Compute best mask matches for all images for a decoder layer. + + Args: + cls_scores (List[Tensor]): Mask score logits from a single + decoder layer for all images. Each with shape (num_queries, + cls_out_channels). + mask_preds (List[Tensor]): Mask logits from a single decoder + layer for all images. Each with shape (num_queries, h, w). + batch_gt_instances (List[InstanceData]): each contains + ``labels`` and ``masks``. + + Returns: + tuple: a tuple containing the following targets. + + - labels (List[Tensor]): Labels of all images.\ + Each with shape (num_queries, ). + - mask_targets (List[Tensor]): Mask targets of\ + all images. Each with shape (num_queries, h, w). + - mask_weights (List[Tensor]): Mask weights of\ + all images. Each with shape (num_queries, ). + - avg_factor (int): Average factor that is used to + average the loss. `avg_factor` is usually equal + to the number of positive priors. + """ + batch_size = cls_scores.shape[0] + results = dict({ + 'labels': [], + 'mask_targets': [], + 'mask_weights': [], + }) + for i in range(batch_size): + labels, mask_targets, mask_weights\ + = self._get_targets_single(cls_scores[i], + mask_preds[i], + batch_gt_instances[i]) + results['labels'].append(labels) + results['mask_targets'].append(mask_targets) + results['mask_weights'].append(mask_weights) + + # shape (batch_size, num_queries) + labels = torch.stack(results['labels'], dim=0) + # shape (batch_size, num_gts, h, w) + mask_targets = torch.cat(results['mask_targets'], dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(results['mask_weights'], dim=0) + + avg_factor = sum( + [len(gt_instances.labels) for gt_instances in batch_gt_instances]) + + res = (labels, mask_targets, mask_weights, avg_factor) + + return res + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData) \ + -> Tuple[Tensor, Tensor, Tensor]: + """Compute a set of best mask matches for one image. 
+ + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). + """ + gt_labels = gt_instances.labels + gt_masks = gt_instances.masks + # when "gt_labels" is empty, classify all queries to background + if len(gt_labels) == 0: + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + mask_targets = gt_labels + mask_weights = gt_labels.new_zeros((self.num_queries, )) + return labels, mask_targets, mask_weights + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + # shape (num_queries, num_points) + mask_points_pred = point_sample( + mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, + 1)).squeeze(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample( + gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, + 1)).squeeze(1) + + sampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_points_masks) + sampled_pred_instances = InstanceData( + scores=cls_score, masks=mask_points_pred) + # assign and sample + matched_quiery_inds, matched_label_inds = self.assigner.assign( + pred_instances=sampled_pred_instances, + gt_instances=sampled_gt_instances) + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[matched_quiery_inds] = gt_labels[matched_label_inds] + + mask_weights = gt_labels.new_zeros((self.num_queries, )) + mask_weights[matched_quiery_inds] = 1 + mask_targets = gt_masks[matched_label_inds] + + return labels, mask_targets, mask_weights diff --git a/head_extractor/build/lib/mmseg/utils/misc.py b/head_extractor/build/lib/mmseg/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc469e8320d375135846cfb0474a0fc8d9b15d0 --- /dev/null +++ b/head_extractor/build/lib/mmseg/utils/misc.py @@ -0,0 +1,128 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F + +from .typing_utils import SampleList + + +def add_prefix(inputs, prefix): + """Add prefix for dict. + + Args: + inputs (dict): The input dict with str keys. + prefix (str): The prefix to add. + + Returns: + + dict: The dict with keys updated with ``prefix``. + """ + + outputs = dict() + for name, value in inputs.items(): + outputs[f'{prefix}.{name}'] = value + + return outputs + + +def stack_batch(inputs: List[torch.Tensor], + data_samples: Optional[SampleList] = None, + size: Optional[tuple] = None, + size_divisor: Optional[int] = None, + pad_val: Union[int, float] = 0, + seg_pad_val: Union[int, float] = 255) -> torch.Tensor: + """Stack multiple inputs to form a batch and pad the images and gt_sem_segs + to the max shape use the right bottom padding mode. + + Args: + inputs (List[Tensor]): The input multiple tensors. each is a + CHW 3D-tensor. 
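+            Their spatial sizes may differ; only ``ndim`` and the channel
+            count have to match (both are checked by asserts in the body),
+            and every tensor is padded to the common maximum H and W.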
+ data_samples (list[:obj:`SegDataSample`]): The list of data samples. + It usually includes information such as `gt_sem_seg`. + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (int, float): The padding value. Defaults to 0 + seg_pad_val (int, float): The padding value. Defaults to 255 + + Returns: + Tensor: The 4D-tensor. + List[:obj:`SegDataSample`]: After the padding of the gt_seg_map. + """ + assert isinstance(inputs, list), \ + f'Expected input type to be list, but got {type(inputs)}' + assert len({tensor.ndim for tensor in inputs}) == 1, \ + f'Expected the dimensions of all inputs must be the same, ' \ + f'but got {[tensor.ndim for tensor in inputs]}' + assert inputs[0].ndim == 3, f'Expected tensor dimension to be 3, ' \ + f'but got {inputs[0].ndim}' + assert len({tensor.shape[0] for tensor in inputs}) == 1, \ + f'Expected the channels of all inputs must be the same, ' \ + f'but got {[tensor.shape[0] for tensor in inputs]}' + + # only one of size and size_divisor should be valid + assert (size is not None) ^ (size_divisor is not None), \ + 'only one of size and size_divisor should be valid' + + padded_inputs = [] + padded_samples = [] + inputs_sizes = [(img.shape[-2], img.shape[-1]) for img in inputs] + max_size = np.stack(inputs_sizes).max(0) + if size_divisor is not None and size_divisor > 1: + # the last two dims are H,W, both subject to divisibility requirement + max_size = (max_size + + (size_divisor - 1)) // size_divisor * size_divisor + + for i in range(len(inputs)): + tensor = inputs[i] + if size is not None: + width = max(size[-1] - tensor.shape[-1], 0) + height = max(size[-2] - tensor.shape[-2], 0) + # (padding_left, padding_right, padding_top, padding_bottom) + padding_size = (0, width, 0, height) + elif size_divisor is not None: + width = max(max_size[-1] - tensor.shape[-1], 0) + height = max(max_size[-2] - tensor.shape[-2], 0) + padding_size = (0, width, 0, height) + else: + padding_size = [0, 0, 0, 0] + + # pad img + pad_img = F.pad(tensor, padding_size, value=pad_val) + padded_inputs.append(pad_img) + # pad gt_sem_seg + if data_samples is not None: + data_sample = data_samples[i] + pad_shape = None + if 'gt_sem_seg' in data_sample: + gt_sem_seg = data_sample.gt_sem_seg.data + del data_sample.gt_sem_seg.data + data_sample.gt_sem_seg.data = F.pad( + gt_sem_seg, padding_size, value=seg_pad_val) + pad_shape = data_sample.gt_sem_seg.shape + if 'gt_edge_map' in data_sample: + gt_edge_map = data_sample.gt_edge_map.data + del data_sample.gt_edge_map.data + data_sample.gt_edge_map.data = F.pad( + gt_edge_map, padding_size, value=seg_pad_val) + pad_shape = data_sample.gt_edge_map.shape + if 'gt_depth_map' in data_sample: + gt_depth_map = data_sample.gt_depth_map.data + del data_sample.gt_depth_map.data + data_sample.gt_depth_map.data = F.pad( + gt_depth_map, padding_size, value=seg_pad_val) + pad_shape = data_sample.gt_depth_map.shape + data_sample.set_metainfo({ + 'img_shape': tensor.shape[-2:], + 'pad_shape': pad_shape, + 'padding_size': padding_size + }) + padded_samples.append(data_sample) + else: + padded_samples.append( + dict( + img_padding_size=padding_size, + pad_shape=pad_img.shape[-2:])) + + return torch.stack(padded_inputs, dim=0), padded_samples diff --git a/head_extractor/build/lib/mmseg/utils/set_env.py b/head_extractor/build/lib/mmseg/utils/set_env.py new file mode 100644 index 0000000000000000000000000000000000000000..c948950d62a7463295c1055a27a9a0ce881d9fad --- /dev/null +++ 
b/head_extractor/build/lib/mmseg/utils/set_env.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import warnings + +from mmengine import DefaultScope + + +def register_all_modules(init_default_scope: bool = True) -> None: + """Register all modules in mmseg into the registries. + + Args: + init_default_scope (bool): Whether to initialize the mmseg default scope. + When `init_default_scope=True`, the global default scope will be + set to `mmseg`, and all registries will build modules from mmseg's + registry node. To understand more about the registry, please refer + to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md + Defaults to True. + """ # noqa + import mmseg.datasets # noqa: F401,F403 + import mmseg.engine # noqa: F401,F403 + import mmseg.evaluation # noqa: F401,F403 + import mmseg.models # noqa: F401,F403 + import mmseg.structures # noqa: F401,F403 + + if init_default_scope: + never_created = DefaultScope.get_current_instance() is None \ + or not DefaultScope.check_instance_created('mmseg') + if never_created: + DefaultScope.get_instance('mmseg', scope_name='mmseg') + return + current_scope = DefaultScope.get_current_instance() + if current_scope.scope_name != 'mmseg': + warnings.warn('The current default scope ' + f'"{current_scope.scope_name}" is not "mmseg", ' + '`register_all_modules` will force the current ' + 'default scope to be "mmseg". If this is not ' + 'expected, please set `init_default_scope=False`.') + # avoid name conflict + new_instance_name = f'mmseg-{datetime.datetime.now()}' + DefaultScope.get_instance(new_instance_name, scope_name='mmseg') diff --git a/head_extractor/build/lib/mmseg/utils/tokenizer.py b/head_extractor/build/lib/mmseg/utils/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d56f5fae602506a27b9ae8835415e8dea7b611b7 --- /dev/null +++ b/head_extractor/build/lib/mmseg/utils/tokenizer.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""CLIP tokenizer. + +Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright +(c) 2021 OpenAI. +""" +import gzip +import html +import os +from functools import lru_cache +from typing import List, Union + +import ftfy +import regex as re +import torch + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """Returns a list of utf-8 bytes and a corresponding list of unicode strings. + + The reversible bpe codes work on unicode strings. This means you need a + large # of unicode characters in your vocab if you want to avoid UNKs. When + you're at something like a 10B token dataset you end up needing around 5K + for decent coverage. This is a significant percentage of your normal, say, + 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and + unicode strings. It also avoids mapping to whitespace/control characters that + the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return the set of symbol pairs in a word. + + Word is represented as a tuple of symbols (symbols being variable-length + strings).
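+ + Example (an illustrative doctest sketch added for clarity; it is not + part of the original source): + + >>> sorted(get_pairs(('h', 'e', 'l', 'l', 'o'))) + [('e', 'l'), ('h', 'e'), ('l', 'l'), ('l', 'o')]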
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer: + + def __init__(self, bpe_path: str = default_bpe(), special_tokens=None): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + if not special_tokens: + special_tokens = ['', ''] + else: + special_tokens = ['', '' + ] + special_tokens + vocab.extend(special_tokens) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {t: t for t in special_tokens} + special = '|'.join(special_tokens) + self.pat = re.compile( + special + + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + self.vocab_size = len(self.encoder) + self.all_special_ids = [self.encoder[t] for t in special_tokens] + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa: E722, E261 + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text + + +_tokenizer = SimpleTokenizer() + + +def decode(output_ids: torch.Tensor): + output_ids = output_ids.cpu().numpy() + return _tokenizer.decode(output_ids) + + +def tokenize(texts: Union[str, List[str]], + context_length: int = 77) -> torch.LongTensor: + """Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all CLIP models use 77 as the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, + shape = [number of 
input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder['<start_of_text>'] + eot_token = _tokenizer.encoder['<end_of_text>'] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + tokens = tokens[:context_length] # Truncate + tokens[-1] = eot_token + result[i, :len(tokens)] = torch.tensor(tokens) + + return result + + +class HFTokenizer: + """HuggingFace tokenizer wrapper.""" + + def __init__(self, tokenizer_name: str): + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + + def save_pretrained(self, dest): + self.tokenizer.save_pretrained(dest) + + def __call__(self, + texts: Union[str, List[str]], + context_length: int = 77) -> torch.Tensor: + # same cleaning as for the default tokenizer, except lowercasing: + # adding lowercasing (for case-sensitive tokenizers) would make it + # more robust but less sensitive to nuance + if isinstance(texts, str): + texts = [texts] + texts = [whitespace_clean(basic_clean(text)) for text in texts] + input_ids = self.tokenizer( + texts, + return_tensors='pt', + max_length=context_length, + padding='max_length', + truncation=True, + ).input_ids + return input_ids diff --git a/head_extractor/build/lib/mmseg/utils/typing_utils.py b/head_extractor/build/lib/mmseg/utils/typing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fba7d3b92bba8301171d2a0fffadfabfcd112976 --- /dev/null +++ b/head_extractor/build/lib/mmseg/utils/typing_utils.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Collecting some commonly used type hints in mmseg.""" +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +from mmengine.config import ConfigDict + +from mmseg.structures import SegDataSample + +# Type hint of config data +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +# Type hint of one or more config data +MultiConfig = Union[ConfigType, Sequence[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +SampleList = Sequence[SegDataSample] +OptSampleList = Optional[SampleList] + +# Type hint of Tensor +TensorDict = Dict[str, torch.Tensor] +TensorList = Sequence[torch.Tensor] + +ForwardResults = Union[Dict[str, torch.Tensor], List[SegDataSample], + Tuple[torch.Tensor], torch.Tensor] diff --git a/head_extractor/build/lib/mmseg/version.py b/head_extractor/build/lib/mmseg/version.py new file mode 100644 index 0000000000000000000000000000000000000000..b76bb4580ddfa0ba0ba13fa4896c49bac9cef65a --- /dev/null +++ b/head_extractor/build/lib/mmseg/version.py @@ -0,0 +1,18 @@ +# Copyright (c) Open-MMLab. All rights reserved.
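+# A quick doctest-style sketch of how `parse_version_info` below behaves +# (the inputs are illustrative examples, not version strings taken from +# this repo): +# >>> parse_version_info('1.2.2') +# (1, 2, 2) +# >>> parse_version_info('1.2.0rc1') +# (1, 2, 0, 'rc1')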
+ +__version__ = '1.2.2' + + +def parse_version_info(version_str): + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/head_extractor/build/lib/mmseg/visualization/__init__.py b/head_extractor/build/lib/mmseg/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8cbb211e5243aafb4ab3d91f6a6f7ce0735b13a9 --- /dev/null +++ b/head_extractor/build/lib/mmseg/visualization/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .local_visualizer import SegLocalVisualizer + +__all__ = ['SegLocalVisualizer'] diff --git a/head_extractor/build/lib/mmseg/visualization/local_visualizer.py b/head_extractor/build/lib/mmseg/visualization/local_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3d652c7bbe9d93ca481fb7a7ed4bb976eec80d --- /dev/null +++ b/head_extractor/build/lib/mmseg/visualization/local_visualizer.py @@ -0,0 +1,349 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +import cv2 +import mmcv +import numpy as np +import torch +from mmengine.dist import master_only +from mmengine.structures import PixelData +from mmengine.visualization import Visualizer + +from mmseg.registry import VISUALIZERS +from mmseg.structures import SegDataSample +from mmseg.utils import get_classes, get_palette + + +@VISUALIZERS.register_module() +class SegLocalVisualizer(Visualizer): + """Local Visualizer. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): The original image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + classes (list, optional): Input classes for result rendering, as the + prediction of segmentation model is a segment map with label + indices, `classes` is a list which includes items corresponding to + the label indices. If classes is not defined, visualizer will take + `cityscapes` classes by default. Defaults to None. + palette (list, optional): Input palette for result rendering, which is + a list of colors corresponding to the classes. Defaults to None. + dataset_name (str, optional): Dataset name or alias. The visualizer + will use the meta information of the dataset, i.e. classes + and palette, but the `classes` and `palette` have higher priority. + Defaults to None. + alpha (float): The transparency of segmentation mask. + Defaults to 0.8. + + Examples: + >>> import numpy as np + >>> import torch + >>> from mmengine.structures import PixelData + >>> from mmseg.structures import SegDataSample + >>> from mmseg.visualization import SegLocalVisualizer + + >>> seg_local_visualizer = SegLocalVisualizer() + >>> image = np.random.randint(0, 256, + ...
size=(10, 12, 3)).astype('uint8') + >>> gt_sem_seg_data = dict(data=torch.randint(0, 2, (1, 10, 12))) + >>> gt_sem_seg = PixelData(**gt_sem_seg_data) + >>> gt_seg_data_sample = SegDataSample() + >>> gt_seg_data_sample.gt_sem_seg = gt_sem_seg + >>> seg_local_visualizer.dataset_meta = dict( + ... classes=('background', 'foreground'), + ... palette=[[120, 120, 120], [6, 230, 230]]) + >>> seg_local_visualizer.add_datasample('visualizer_example', + ... image, gt_seg_data_sample) + >>> seg_local_visualizer.add_datasample( + ... 'visualizer_example', image, + ... gt_seg_data_sample, show=True) + """ # noqa + + def __init__(self, + name: str = 'visualizer', + image: Optional[np.ndarray] = None, + vis_backends: Optional[Dict] = None, + save_dir: Optional[str] = None, + classes: Optional[List] = None, + palette: Optional[List] = None, + dataset_name: Optional[str] = None, + alpha: float = 0.8, + **kwargs): + super().__init__(name, image, vis_backends, save_dir, **kwargs) + self.alpha: float = alpha + # classes first, then palette: matches set_dataset_meta's signature + self.set_dataset_meta(classes, palette, dataset_name) + + def _get_center_loc(self, mask: np.ndarray) -> np.ndarray: + """Get the center coordinate of a semantic seg mask. + + Args: + mask (np.ndarray): A binary mask of one class, taken from + ``sem_seg``. + """ + loc = np.argwhere(mask == 1) + + loc_sort = np.array( + sorted(loc.tolist(), key=lambda row: (row[0], row[1]))) + y_list = loc_sort[:, 0] + unique, indices, counts = np.unique( + y_list, return_index=True, return_counts=True) + y_loc = unique[counts.argmax()] + y_most_freq_loc = loc[loc_sort[:, 0] == y_loc] + center_num = len(y_most_freq_loc) // 2 + x = y_most_freq_loc[center_num][1] + y = y_most_freq_loc[center_num][0] + return np.array([x, y]) + + def _draw_sem_seg(self, + image: np.ndarray, + sem_seg: PixelData, + classes: Optional[List], + palette: Optional[List], + with_labels: Optional[bool] = True) -> np.ndarray: + """Draw semantic seg of GT or prediction. + + Args: + image (np.ndarray): The image to draw. + sem_seg (:obj:`PixelData`): Data structure for pixel-level + annotations or predictions. + classes (list, optional): Input classes for result rendering, as + the prediction of segmentation model is a segment map with + label indices, `classes` is a list which includes items + corresponding to the label indices. If classes is not defined, + visualizer will take `cityscapes` classes by default. + Defaults to None. + palette (list, optional): Input palette for result rendering, which + is a list of colors corresponding to the classes. + Defaults to None. + with_labels (bool, optional): Whether to add semantic labels to the + visualization result. Defaults to True. + + Returns: + np.ndarray: The drawn image whose channels are RGB.
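+ + Example (an illustrative sketch, not part of the original docs; the + shapes and palette values are made up): + >>> visualizer = SegLocalVisualizer() + >>> image = np.random.randint( + ... 0, 256, size=(10, 12, 3)).astype('uint8') + >>> sem_seg = PixelData(data=torch.randint(0, 2, (1, 10, 12))) + >>> drawn = visualizer._draw_sem_seg( + ... image, sem_seg, classes=('bg', 'fg'), + ... palette=[[0, 0, 0], [0, 255, 0]], with_labels=False)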
+ """ + num_classes = len(classes) + + sem_seg = sem_seg.cpu().data + ids = np.unique(sem_seg)[::-1] + legal_indices = ids < num_classes + ids = ids[legal_indices] + labels = np.array(ids, dtype=np.int64) + + colors = [palette[label] for label in labels] + + mask = np.zeros_like(image, dtype=np.uint8) + for label, color in zip(labels, colors): + mask[sem_seg[0] == label, :] = color + + if with_labels: + font = cv2.FONT_HERSHEY_SIMPLEX + # (0,1] to change the size of the text relative to the image + scale = 0.05 + fontScale = min(image.shape[0], image.shape[1]) / (25 / scale) + fontColor = (255, 255, 255) + if image.shape[0] < 300 or image.shape[1] < 300: + thickness = 1 + rectangleThickness = 1 + else: + thickness = 2 + rectangleThickness = 2 + lineType = 2 + + if isinstance(sem_seg[0], torch.Tensor): + masks = sem_seg[0].numpy() == labels[:, None, None] + else: + masks = sem_seg[0] == labels[:, None, None] + masks = masks.astype(np.uint8) + for mask_num in range(len(labels)): + classes_id = labels[mask_num] + classes_color = colors[mask_num] + loc = self._get_center_loc(masks[mask_num]) + text = classes[classes_id] + (label_width, label_height), baseline = cv2.getTextSize( + text, font, fontScale, thickness) + mask = cv2.rectangle(mask, loc, + (loc[0] + label_width + baseline, + loc[1] + label_height + baseline), + classes_color, -1) + mask = cv2.rectangle(mask, loc, + (loc[0] + label_width + baseline, + loc[1] + label_height + baseline), + (0, 0, 0), rectangleThickness) + mask = cv2.putText(mask, text, (loc[0], loc[1] + label_height), + font, fontScale, fontColor, thickness, + lineType) + color_seg = (image * (1 - self.alpha) + mask * self.alpha).astype( + np.uint8) + self.set_image(color_seg) + return color_seg + + def _draw_depth_map(self, image: np.ndarray, + depth_map: PixelData) -> np.ndarray: + """Draws a depth map on a given image. + + This function takes an image and a depth map as input, + renders the depth map, and concatenates it with the original image. + Finally, it updates the internal image state of the visualizer with + the concatenated result. + + Args: + image (np.ndarray): The original image where the depth map will + be drawn. The array should be in the format HxWx3 where H is + the height, W is the width. + + depth_map (PixelData): Depth map to be drawn. The depth map + should be in the form of a PixelData object. It will be + converted to a torch tensor if it is a numpy array. + + Returns: + np.ndarray: The concatenated image with the depth map drawn. + + Example: + >>> depth_map_data = PixelData(data=torch.rand(1, 10, 10)) + >>> image = np.random.randint(0, 256, + >>> size=(10, 10, 3)).astype('uint8') + >>> visualizer = SegLocalVisualizer() + >>> visualizer._draw_depth_map(image, depth_map_data) + """ + depth_map = depth_map.cpu().data + if isinstance(depth_map, np.ndarray): + depth_map = torch.from_numpy(depth_map) + if depth_map.ndim == 2: + depth_map = depth_map[None] + + depth_map = self.draw_featmap(depth_map, resize_shape=image.shape[:2]) + out_image = np.concatenate((image, depth_map), axis=0) + self.set_image(out_image) + return out_image + + def set_dataset_meta(self, + classes: Optional[List] = None, + palette: Optional[List] = None, + dataset_name: Optional[str] = None) -> None: + """Set meta information to visualizer. + + Args: + classes (list, optional): Input classes for result rendering, as + the prediction of segmentation model is a segment map with + label indices, `classes` is a list which includes items + responding to the label indices. 
If classes is not defined, + visualizer will take `cityscapes` classes by default. + Defaults to None. + palette (list, optional): Input palette for result rendering, which + is a list of colors corresponding to the classes. + Defaults to None. + dataset_name (str, optional): Dataset name or alias. The + visualizer will use the meta information of the dataset, i.e. + classes and palette, but the `classes` and `palette` have + higher priority. Defaults to None. + """ # noqa + # Set default value. When calling + # `SegLocalVisualizer().dataset_meta=xxx`, + # it will override the default value. + if dataset_name is None: + dataset_name = 'cityscapes' + classes = classes if classes else get_classes(dataset_name) + palette = palette if palette else get_palette(dataset_name) + assert len(classes) == len( + palette), 'The length of classes should be equal to the length of palette' + self.dataset_meta: dict = {'classes': classes, 'palette': palette} + + @master_only + def add_datasample( + self, + name: str, + image: np.ndarray, + data_sample: Optional[SegDataSample] = None, + draw_gt: bool = True, + draw_pred: bool = True, + show: bool = False, + wait_time: float = 0, + # TODO: Supported in mmengine's Visualizer. + out_file: Optional[str] = None, + step: int = 0, + with_labels: Optional[bool] = True) -> None: + """Draw datasample and save to all backends. + + - If GT and prediction are plotted at the same time, they are + displayed in a stitched image where the left image is the + ground truth and the right image is the prediction. + - If ``show`` is True, all storage backends are ignored, and + the images will be displayed in a local window. + - If ``out_file`` is specified, the drawn image will be + saved to ``out_file``. It is usually used when the display + is not available. + + Args: + name (str): The image identifier. + image (np.ndarray): The image to draw. + data_sample (:obj:`SegDataSample`, optional): The SegDataSample + holding the GT and/or prediction annotations to draw. + Defaults to None. + draw_gt (bool): Whether to draw GT SegDataSample. Defaults to True. + draw_pred (bool): Whether to draw Prediction SegDataSample. + Defaults to True. + show (bool): Whether to display the drawn image. Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + out_file (str): Path to output file. Defaults to None. + step (int): Global step value to record. Defaults to 0. + with_labels (bool, optional): Whether to add semantic labels to the + visualization result. Defaults to True. + """ + classes = self.dataset_meta.get('classes', None) + palette = self.dataset_meta.get('palette', None) + + gt_img_data = None + pred_img_data = None + + if draw_gt and data_sample is not None: + if 'gt_sem_seg' in data_sample: + assert classes is not None, 'class information is ' \ + 'not provided when ' \ + 'visualizing semantic ' \ + 'segmentation results.' + gt_img_data = self._draw_sem_seg(image, data_sample.gt_sem_seg, + classes, palette, with_labels) + + if 'gt_depth_map' in data_sample: + gt_img_data = gt_img_data if gt_img_data is not None else image + gt_img_data = self._draw_depth_map(gt_img_data, + data_sample.gt_depth_map) + + if draw_pred and data_sample is not None: + + if 'pred_sem_seg' in data_sample: + + assert classes is not None, 'class information is ' \ + 'not provided when ' \ + 'visualizing semantic ' \ + 'segmentation results.'
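+ # Predictions are rendered with the same classes/palette as the + # ground truth so that the two panels remain directly comparable + # when they are stitched side by side below.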
+ pred_img_data = self._draw_sem_seg(image, + data_sample.pred_sem_seg, + classes, palette, + with_labels) + + if 'pred_depth_map' in data_sample: + pred_img_data = pred_img_data if pred_img_data is not None \ + else image + pred_img_data = self._draw_depth_map( + pred_img_data, data_sample.pred_depth_map) + + if gt_img_data is not None and pred_img_data is not None: + drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1) + elif gt_img_data is not None: + drawn_img = gt_img_data + else: + drawn_img = pred_img_data + + if show: + self.show(drawn_img, win_name=name, wait_time=wait_time) + + if out_file is not None: + mmcv.imwrite(mmcv.rgb2bgr(drawn_img), out_file) + else: + self.add_image(name, drawn_img, step) diff --git a/head_extractor/mmcv-2.1.0/.circleci/config.yml b/head_extractor/mmcv-2.1.0/.circleci/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4660422fa105ae4acae54c884fa09079a4a2bffd --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.circleci/config.yml @@ -0,0 +1,32 @@ +version: 2.1 + +# this allows you to use CircleCI's dynamic configuration feature +setup: true + +# the path-filtering orb is required to continue a pipeline based on +# the path of an updated fileset +orbs: + path-filtering: circleci/path-filtering@0.1.2 + +workflows: + # the always-run workflow is always triggered, regardless of the pipeline parameters. + always-run: + jobs: + # the path-filtering/filter job determines which pipeline + # parameters to update. + - path-filtering/filter: + name: check-updated-files + # 3-column, whitespace-delimited mapping. One mapping per + # line: + # + mapping: | + mmcv/.* lint_only false + requirements/.* lint_only false + tests/.* lint_only false + .circleci/.* lint_only false + base-revision: main + # this is the path of the configuration we should trigger once + # path filtering and pipeline parameter value updates are + # complete. In this case, we are using the parent dynamic + # configuration itself. 
+ config-path: .circleci/test.yml diff --git a/head_extractor/mmcv-2.1.0/.circleci/docker/Dockerfile b/head_extractor/mmcv-2.1.0/.circleci/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..ab4961206a7fd660a0eeec91e268ba30c293365b --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.circleci/docker/Dockerfile @@ -0,0 +1,17 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# Set MKL_THREADING_LAYER=GNU to fix issue: +# https://github.com/pytorch/pytorch/issues/37377 +ENV MKL_THREADING_LAYER GNU + +ARG DEBIAN_FRONTEND=noninteractive + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx ffmpeg libturbojpeg git diff --git a/head_extractor/mmcv-2.1.0/.circleci/test.yml b/head_extractor/mmcv-2.1.0/.circleci/test.yml new file mode 100644 index 0000000000000000000000000000000000000000..d9ddbce2558c2e1339dda385891e2f3eb5288cda --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.circleci/test.yml @@ -0,0 +1,298 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + +jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + build_without_torch: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + default: "3.7.4" + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 ffmpeg libturbojpeg + - run: + name: Upgrade pip + command: | + pip install pip --upgrade + pip --version + - run: + name: Install MMEngine from main branch + command: pip install git+https://github.com/open-mmlab/mmengine.git@main + - run: + name: Build MMCV from source + command: pip install -e . 
-v + environment: + MMCV_WITH_OPS: 0 + - run: + name: Install unit tests dependencies + command: pip install -r requirements/test.txt + - run: + name: Run unit tests + command: pytest tests/test_image tests/test_transforms tests/test_video tests/test_arraymisc.py tests/test_visualization.py tests/test_utils/test_env.py --ignore=tests/test_image/test_io.py + build_without_ops: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + default: "3.7.4" + torch: + type: string + torchvision: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 ffmpeg libturbojpeg + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install MMEngine from main branch + command: pip install git+https://github.com/open-mmlab/mmengine.git@main + - run: + name: Create sdist and untar + command: | + sed -i "s/os.getenv('MMCV_WITH_OPS', '1')/os.getenv('MMCV_WITH_OPS', '0')/g" setup.py + python setup.py sdist + tar zxvf dist/mmcv* -C /tmp + rm -r mmcv + - run: + name: Build and install from sdist + command: | + pushd /tmp/mmcv* + pip install -e . -v + popd + - run: + name: Install unit tests dependencies + command: pip install -r requirements/test.txt + - run: + name: Run unit tests + command: pytest tests --ignore=tests/test_ops + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + torch: + type: string + torchvision: + type: string + docker: + - image: cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 ffmpeg libturbojpeg + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install MMEngine from main branch + command: pip install git+https://github.com/open-mmlab/mmengine.git@main + - run: + name: Install ninja to speed the compilation + command: pip install ninja + - run: + name: Create sdist and untar + command: | + python setup.py sdist + tar zxvf dist/mmcv* -C /tmp + rm -r mmcv + - run: + name: Build and install from sdist + command: | + pushd /tmp/mmcv* + pip install -e . 
-v + popd + - run: + name: Install unit tests dependencies + command: pip install -r requirements/test.txt + - run: + name: Run unit tests + command: | + coverage run --branch --source mmcv -m pytest tests/ + coverage xml + coverage report -m + build_cuda: + parameters: + torch: + type: string + cuda: + type: enum + enum: ["10.1", "10.2", "11.1", "11.7", "11.8"] + cudnn: + type: integer + default: 7 + machine: + image: linux-cuda-11:default + docker_layer_caching: true + resource_class: gpu.nvidia.small.multi + steps: + - checkout + - run: + name: Install nvidia-container-toolkit and Restart Docker + command: | + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + sudo systemctl restart docker + - run: + name: Build Docker image + command: | + docker build .circleci/docker -t mmcv:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> + docker run --gpus all -t -d -v /home/circleci/project:/mmcv -w /mmcv --name mmcv mmcv:gpu + - run: + name: Install MMEngine from main branch + command: docker exec mmcv pip install git+https://github.com/open-mmlab/mmengine.git@main + - run: + name: Install ninja to speed the compilation + command: docker exec mmcv pip install ninja + - run: + name: Build MMCV from source + command: docker exec mmcv pip install -e . -v + - run: + name: Install unit tests dependencies + command: docker exec mmcv pip install -r requirements/test.txt + - run: + name: Run unit tests + command: docker exec mmcv python -m pytest tests/ + +workflows: + pr_stage_lint: + when: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - main + pr_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - main + - build_without_torch: + name: build_without_torch + requires: + - lint + - build_without_ops: + name: build_without_ops + torch: 1.8.1 + torchvision: 0.9.1 + requires: + - build_without_torch + - build_cpu: + name: minimum_version_cpu + torch: 1.8.1 + torchvision: 0.9.1 + python: 3.7.4 + requires: + - build_without_ops + - build_cpu: + name: maximum_version_cpu + torch: 2.1.0 + torchvision: 0.16.0 + python: 3.9.0 + requires: + - minimum_version_cpu + - hold_cuda_test: + type: approval + - build_cuda: + name: minimum_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "10.2" + requires: + - hold_cuda_test + - build_cuda: + name: maximum_version_gpu + torch: 2.1.0 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.8" + cudnn: 8 + requires: + - hold_cuda_test + merge_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: minimum_version_gpu + torch: 1.8.1 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "10.2" + filters: + branches: + only: + - main + - build_cuda: + name: maximum_version_gpu + torch: 2.1.0 + # Use double quotation mark to explicitly specify its type + # as string instead of number + cuda: "11.8" + cudnn: 8 + filters: + branches: + only: + - main diff --git a/head_extractor/mmcv-2.1.0/.dev_scripts/check_installation.py b/head_extractor/mmcv-2.1.0/.dev_scripts/check_installation.py new file mode 100644 index 0000000000000000000000000000000000000000..739c1e11028865b5230438b88b706f80e203576d --- /dev/null +++ 
b/head_extractor/mmcv-2.1.0/.dev_scripts/check_installation.py @@ -0,0 +1,44 @@ +import numpy as np +import torch + +from mmcv.ops import box_iou_rotated +from mmcv.utils import collect_env + + +def check_installation(): + """Check whether mmcv has been installed successfully.""" + np_boxes1 = np.asarray( + [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6], + [7.0, 7.0, 8.0, 8.0, 0.4]], + dtype=np.float32) + np_boxes2 = np.asarray( + [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5], + [5.0, 5.0, 6.0, 7.0, 0.4]], + dtype=np.float32) + boxes1 = torch.from_numpy(np_boxes1) + boxes2 = torch.from_numpy(np_boxes2) + + # test mmcv with CPU ops + box_iou_rotated(boxes1, boxes2) + print('CPU ops were compiled successfully.') + + # test mmcv with both CPU and CUDA ops + if torch.cuda.is_available(): + boxes1 = boxes1.cuda() + boxes2 = boxes2.cuda() + box_iou_rotated(boxes1, boxes2) + print('CUDA ops were compiled successfully.') + else: + print('No CUDA runtime was found, so the check of CUDA ops is skipped.') + + +if __name__ == '__main__': + print('Start checking the installation of mmcv ...') + check_installation() + print('mmcv has been installed successfully.\n') + + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + print('Environment information:') + print(dash_line + env_info + '\n' + dash_line) diff --git a/head_extractor/mmcv-2.1.0/.dockerignore b/head_extractor/mmcv-2.1.0/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..8c22f226d3e2d8a625515290691d2cfc6ed87f2e --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.dockerignore @@ -0,0 +1,6 @@ +.git +.gitignore +*.egg-info +.eggs/ +.mypy-cache +pip-wheel-metadata diff --git a/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/1-bug-report.yml b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/1-bug-report.yml new file mode 100644 index 0000000000000000000000000000000000000000..d6d0a081b60f28a964d77ea5fdfbb713b14e9273 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/1-bug-report.yml @@ -0,0 +1,94 @@ +name: "🐞 Bug report" +description: "Create a report to help us reproduce and fix the bug" +labels: bug +title: "[Bug] " + +body: + - type: markdown + attributes: + value: | + ## Note + For general usage questions or idea discussions, please post them to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions) + Please fill in as **much** of the following form as you're able to. **The clearer the description, the less time it will take to solve it.** + + - type: checkboxes + attributes: + label: Prerequisite + description: Please check the following items before creating a new issue. + options: + - label: I have searched [Issues](https://github.com/open-mmlab/mmcv/issues) and [Discussions](https://github.com/open-mmlab/mmcv/discussions) but cannot get the expected help. + required: true + - label: The bug has not been fixed in the latest version (https://github.com/open-mmlab/mmcv). + required: true + + - type: textarea + attributes: + label: Environment + description: | + Please run `python -c "from mmcv.utils import collect_env; print(collect_env())"` to collect necessary environment information and copy-paste it here. + You may add additional information that may be helpful for locating the problem, such as + - How you installed PyTorch \[e.g., pip, conda, source\] + - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
+ validations: + required: true + + - type: textarea + attributes: + label: Reproduces the problem - code sample + description: | + Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. + placeholder: | + ```python + # Sample code to reproduce the problem + ``` + validations: + required: true + + - type: textarea + attributes: + label: Reproduces the problem - command or script + description: | + What command or script did you run? + placeholder: | + ```shell + The command or script you run. + ``` + validations: + required: true + + - type: textarea + attributes: + label: Reproduces the problem - error message + description: | + Please provide the error message or logs you got, with the full traceback. + + Tip: You can attach images or log files by dragging them into the text area.. + placeholder: | + ``` + The error message or logs you got, with the full traceback. + ``` + validations: + required: true + + - type: textarea + attributes: + label: Additional information + description: | + Tell us anything else you think we should know. + + Tip: You can attach images or log files by dragging them into the text area. + placeholder: | + 1. What's your expected result? + 2. What dataset did you use? + 3. What do you think might be the reason? + + - type: markdown + attributes: + value: | + ## Acknowledgement + Thanks for taking the time to fill out this report. + + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**Here**](https://github.com/open-mmlab/mmcv/pulls)! + Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing. + + Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬 diff --git a/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/2-feature_request.yml b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/2-feature_request.yml new file mode 100644 index 0000000000000000000000000000000000000000..eb122cdb88aba478a14d283c6c9bb0cdb3cf4e7a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/2-feature_request.yml @@ -0,0 +1,39 @@ +name: 🚀 Feature request +description: Suggest an idea for this project +labels: [feature-request] +title: "[Feature] " + +body: + - type: markdown + attributes: + value: | + ## Note + For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions) + + Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.** + + - type: textarea + attributes: + label: What is the feature? + description: Tell us more about the feature and how this feature can help. + placeholder: | + E.g., It is inconvenient when \[....\]. + validations: + required: true + + - type: textarea + attributes: + label: Any other context? + description: | + Have you considered any alternative solutions or features? If so, what are they? Also, feel free to add any other context or screenshots about the feature request here. + + - type: markdown + attributes: + value: | + ## Acknowledgement + Thanks for taking the time to fill out this report. + + We strongly appreciate you creating a new PR to implement it [**Here**](https://github.com/open-mmlab/mmcv/pulls)! + Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing. 
+ + Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬 diff --git a/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/3-documentation.yml b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/3-documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..e68fd841f1030eebd9ac2e5c709c6ded422e470e --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/3-documentation.yml @@ -0,0 +1,37 @@ +name: 📚 Documentation +description: Report an issue related to the documentation. +labels: "docs" +title: "[Docs] " + +body: + - type: markdown + attributes: + value: | + ## Note + For general usage questions or idea discussions, please post them to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions) + Please fill in as **much** of the following form as you're able to. **The clearer the description, the less time it will take to solve it.** + + - type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of the issue. + validations: + required: true + + - type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. + + - type: markdown + attributes: + value: | + ## Acknowledgement + Thanks for taking the time to fill out this report. + + If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**here**](https://github.com/open-mmlab/mmcv/pulls)! + Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing. + + Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬 diff --git a/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/config.yml b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..3338ac5a94a872444e152d8ce8064fb8f4dc6a29 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,12 @@ +blank_issues_enabled: false + +contact_links: + - name: 💬 Forum + url: https://github.com/open-mmlab/mmcv/discussions + about: Ask general usage questions and discuss with other mmcv community members + - name: MMCV Documentation + url: https://mmcv.readthedocs.io/en/latest/ + about: Check if your question is answered in docs + - name: 🌐 Explore OpenMMLab + url: https://openmmlab.com/ + about: Get to know more about OpenMMLab diff --git a/head_extractor/mmcv-2.1.0/.github/pull_request_template.md b/head_extractor/mmcv-2.1.0/.github/pull_request_template.md new file mode 100644 index 0000000000000000000000000000000000000000..0980b85db1c5fc90b2a8c32aa5fbdf923b25bf32 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/pull_request_template.md @@ -0,0 +1,33 @@ +Thanks for your contribution; we appreciate it a lot. The following instructions will make your pull request healthier and help it get feedback more easily. If you do not understand some items, don't worry: just make the pull request and seek help from maintainers. + +## Motivation + +Please describe the motivation of this PR and the goal you want to achieve through this PR. + +## Modification + +Please briefly describe what modification is made in this PR. + +## BC-breaking (Optional) + +Does the modification introduce changes that break the backward-compatibility of the downstream repositories?
+If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR. + +## Use cases (Optional) + +If this PR introduces a new feature, it is better to list some use cases here, and update the documentation. + +## Checklist + +**Before PR**: + +- [ ] I have read and followed the workflow indicated in the [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) to create this PR. +- [ ] Pre-commit or linting tools indicated in [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) are used to fix the potential lint issues. +- [ ] Bug fixes are covered by unit tests, the case that causes the bug should be added in the unit tests. +- [ ] New functionalities are covered by complete unit tests. If not, please add more unit test to ensure the correctness. +- [ ] The documentation has been modified accordingly, including docstring or example tutorials. + +**After PR**: + +- [ ] If the modification has potential influence on downstream or other related projects, this PR should be tested with some of those projects, like MMDet or MMCls. +- [ ] CLA has been signed and all committers have signed the CLA in this PR. diff --git a/head_extractor/mmcv-2.1.0/.github/workflows/build_macos_wheel.yml b/head_extractor/mmcv-2.1.0/.github/workflows/build_macos_wheel.yml new file mode 100644 index 0000000000000000000000000000000000000000..b9f7b815ddb1f1e3b4a1573d1a03c344bc7482c5 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/workflows/build_macos_wheel.yml @@ -0,0 +1,76 @@ +name: build macos wheel + +on: push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_macos10_wheel: + runs-on: macos-latest + if: contains(github.event.head_commit.message, 'Bump version to') + strategy: + matrix: + torch: [1.8.0, 1.9.0, 1.10.0, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0] + python-version: [3.7, 3.8, 3.9, '3.10', '3.11'] + include: + - torch: 1.8.0 + torchvision: 0.9.0 + - torch: 1.9.0 + torchvision: 0.10.0 + - torch: 1.10.0 + torchvision: 0.11.0 + - torch: 1.11.0 + torchvision: 0.12.0 + - torch: 1.12.0 + torchvision: 0.13.0 + - torch: 1.13.0 + torchvision: 0.14.0 + - torch: 2.0.0 + torchvision: 0.15.1 + - torch: 2.1.0 + torchvision: 0.16.0 + exclude: + - torch: 1.8.0 + python-version: '3.10' + - torch: 1.9.0 + python-version: '3.10' + - torch: 1.10.0 + python-version: '3.10' + - torch: 1.8.0 + python-version: '3.11' + - torch: 1.9.0 + python-version: '3.11' + - torch: 1.10.0 + python-version: '3.11' + - torch: 1.10.0 + python-version: '3.11' + - torch: 1.11.0 + python-version: '3.11' + - torch: 1.12.0 + python-version: '3.11' + - torch: 1.13.0 + python-version: '3.11' + - torch: 2.0.0 + python-version: 3.7 + - torch: 2.1.0 + python-version: 3.7 + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install psutil + run: pip install psutil + - name: Install PyTorch + run: pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} --no-cache-dir + - name: Build and install + run: | + pip install wheel + python setup.py bdist_wheel + - uses: actions/upload-artifact@v3 + with: + name: ${{matrix.torch}} + path: dist/ diff --git a/head_extractor/mmcv-2.1.0/.github/workflows/lint.yml b/head_extractor/mmcv-2.1.0/.github/workflows/lint.yml new file mode 100644 index 
0000000000000000000000000000000000000000..ac5afd597a9c1696c3f20699edebd30cde43314a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/workflows/lint.yml @@ -0,0 +1,29 @@ +name: lint + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + - name: Install pre-commit hook + run: | + pip install pre-commit + pre-commit install + - name: Linting + run: pre-commit run --all-files + - name: Format c/cuda codes with clang-format + uses: DoozyX/clang-format-lint-action@v0.11 + with: + source: mmcv/ops/csrc + extensions: h,c,cpp,hpp,cu,cuh + style: google diff --git a/head_extractor/mmcv-2.1.0/.github/workflows/merge_stage_test.yml b/head_extractor/mmcv-2.1.0/.github/workflows/merge_stage_test.yml new file mode 100644 index 0000000000000000000000000000000000000000..206d2b577a2ea9518242b2d79d99c656c91a6142 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/workflows/merge_stage_test.yml @@ -0,0 +1,380 @@ +name: merge_stage_test + +on: + push: + paths-ignore: + - ".github/**.md" + - "docker/**" + - "docs/**" + - 'examples/**' + - '.dev_scripts/**' + - "README.md" + - "README_zh-CN.md" + - "CONTRIBUTING.md" + - ".pre-commit-config.yaml" + - ".pre-commit-config-zh-cn.yaml" + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_without_torch: + runs-on: ubuntu-22.04 + env: + MMCV_WITH_OPS: 0 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests/test_image tests/test_transforms tests/test_video tests/test_arraymisc.py tests/test_visualization.py tests/test_utils/test_env.py --ignore=tests/test_image/test_io.py + build_without_ops: + runs-on: ubuntu-22.04 + env: + MMCV_WITH_OPS: 0 + strategy: + matrix: + python-version: [3.7] + torch: [1.8.1, 1.9.1] + include: + - torch: 1.8.1 + torchvision: 0.9.1 + - torch: 1.9.1 + torchvision: 0.10.1 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg + - name: Install PyTorch + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Build MMCV from source + run: pip install -e . 
-v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests --ignore=tests/test_ops + build_cpu_py: + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: [3.8, 3.9, '3.10'] + torch: [1.13.0] + include: + - torch: 1.13.0 + torchvision: 0.14.0 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests and generate coverage report + run: | + coverage run --branch --source mmcv -m pytest tests/ + coverage xml + coverage report -m + build_cpu_pt: + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: [3.7] + torch: [1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0] + include: + - torch: 1.8.1 + torchvision: 0.9.1 + - torch: 1.9.1 + torchvision: 0.10.1 + - torch: 1.10.1 + torchvision: 0.11.2 + - torch: 1.11.0 + torchvision: 0.12.0 + - torch: 1.12.0 + torchvision: 0.13.0 + - torch: 1.13.0 + torchvision: 0.14.0 + - torch: 2.0.0 + torchvision: 0.15.1 + python-version: 3.8 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 + exclude: + - torch: 2.0.0 + python-version: 3.7 + - torch: 2.1.0 + python-version: 3.7 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . 
-v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests and generate coverage report + run: | + coverage run --branch --source mmcv -m pytest tests/ + coverage xml + coverage report -m + # Only upload coverage report for python3.7 && pytorch1.8.1 cpu + - name: Upload coverage to Codecov + if: ${{matrix.torch == '1.8.1' && matrix.python-version == '3.8'}} + uses: codecov/codecov-action@v1.0.14 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false + build_cu102: + runs-on: ubuntu-22.04 + container: + image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel + env: + FORCE_CUDA: 1 + MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Fetch GPG keys + run: | + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + - name: Install system dependencies + run: apt-get update && apt-get install -y git ffmpeg libturbojpeg + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests and generate coverage report + run: | + coverage run --branch --source mmcv -m pytest tests/ + coverage xml + coverage report -m + build_cu111: + runs-on: ubuntu-22.04 + container: + image: pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel + env: + FORCE_CUDA: 1 + MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Fetch GPG keys + run: | + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + - name: Install system dependencies + run: apt-get update && apt-get install -y git ffmpeg libturbojpeg + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . 
-v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests and generate coverage report + run: | + coverage run --branch --source mmcv -m pytest tests/ + coverage xml + coverage report -m + build_cu116: + runs-on: ubuntu-22.04 + container: + image: pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel + env: + FORCE_CUDA: 1 + MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Fetch GPG keys + run: | + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + - name: Install system dependencies + run: apt-get update && apt-get install -y git ffmpeg libturbojpeg + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests and generate coverage report + run: | + coverage run --branch --source mmcv -m pytest tests + coverage xml + coverage report -m + build_windows_without_ops: + runs-on: windows-2019 + env: + MMCV_WITH_OPS: 0 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: python -m pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py + build_windows: + runs-on: windows-2019 + strategy: + matrix: + torch: [1.8.1, 2.1.0] + include: + - torch: 1.8.1 + torchvision: 0.9.1 + python-version: 3.7 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: python -m pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . 
-v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests/ --ignore tests/test_image/test_io.py + build_macos: + runs-on: macos-latest + strategy: + matrix: + torch: [1.8.1, 2.1.0] + include: + - torch: 1.8.1 + torchvision: 0.9.1 + python-version: 3.7 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: brew install ffmpeg jpeg-turbo + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }} + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests/ diff --git a/head_extractor/mmcv-2.1.0/.github/workflows/pr_stage_test.yml b/head_extractor/mmcv-2.1.0/.github/workflows/pr_stage_test.yml new file mode 100644 index 0000000000000000000000000000000000000000..8dfefeed7cf0cca6586c5a3c872d5bf8771c8ccb --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/workflows/pr_stage_test.yml @@ -0,0 +1,196 @@ +name: pr_stage_test + +on: + pull_request: + paths-ignore: + - ".github/**.md" + - "docker/**" + - "docs/**" + - 'examples/**' + - '.dev_scripts/**' + - "README.md" + - "README_zh-CN.md" + - "CONTRIBUTING.md" + - ".pre-commit-config.yaml" + - ".pre-commit-config-zh-cn.yaml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build_cu102: + runs-on: ubuntu-22.04 + container: + image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel + env: + FORCE_CUDA: 1 + MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Fetch GPG keys + run: | + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + - name: Install system dependencies + run: apt-get update && apt-get install -y git ffmpeg libturbojpeg + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . 
-v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests and generate coverage report + run: | + coverage run --branch --source mmcv -m pytest tests/ + coverage xml + coverage report -m + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1.0.14 + with: + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false + build_cu111: + runs-on: ubuntu-22.04 + container: + image: pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel + env: + FORCE_CUDA: 1 + MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Fetch GPG keys + run: | + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + - name: Install system dependencies + run: apt-get update && apt-get install -y git ffmpeg libturbojpeg + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests and generate coverage report + run: | + coverage run --branch --source mmcv -m pytest tests/ + coverage xml + coverage report -m + build_windows_without_ops: + runs-on: windows-2019 + env: + MMCV_WITH_OPS: 0 + strategy: + matrix: + python-version: [3.7] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: python -m pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . 
-v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py + build_windows: + runs-on: windows-2019 + strategy: + matrix: + torch: [1.8.1, 2.1.0] + include: + - torch: 1.8.1 + torchvision: 0.9.1 + python-version: 3.7 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Upgrade pip and wheel + run: python -m pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . -v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests/ --ignore tests/test_image/test_io.py + build_macos: + runs-on: macos-latest + strategy: + matrix: + torch: [1.8.1, 2.1.0] + include: + - torch: 1.8.1 + torchvision: 0.9.1 + python-version: 3.7 + - torch: 2.1.0 + torchvision: 0.16.0 + python-version: 3.8 + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: brew install ffmpeg jpeg-turbo + - name: Upgrade pip and wheel + run: pip install pip wheel --upgrade + - name: Install PyTorch + run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }} + - name: Install MMEngine from main branch + run: pip install git+https://github.com/open-mmlab/mmengine.git@main + - name: Install ninja to speed the compilation + run: pip install ninja psutil + - name: Build MMCV from source + run: pip install -e . 
-v + - name: Install unit tests dependencies + run: pip install -r requirements/test.txt + - name: Run unit tests + run: pytest tests/ diff --git a/head_extractor/mmcv-2.1.0/.github/workflows/publish-to-pypi.yml b/head_extractor/mmcv-2.1.0/.github/workflows/publish-to-pypi.yml new file mode 100644 index 0000000000000000000000000000000000000000..5ed8e7cd96b2d1ed06d9a1553676024ff60e0e2f --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.github/workflows/publish-to-pypi.yml @@ -0,0 +1,46 @@ +name: deploy + +on: push + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-n-publish_without_ops: + runs-on: ubuntu-22.04 + if: startsWith(github.event.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + - name: Upgrade Setuptools + run: pip install setuptools wheel --upgrade + - name: Build MMCV + run: | + sed -i "s/os.getenv('MMCV_WITH_OPS', '1')/os.getenv('MMCV_WITH_OPS', '0')/g" setup.py + python setup.py sdist bdist_wheel + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} + + build-n-publish_with_ops: + runs-on: ubuntu-22.04 + if: startsWith(github.event.ref, 'refs/tags') + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + - name: Upgrade Setuptools + run: pip install setuptools --upgrade + - name: Build MMCV with ops + run: python setup.py sdist + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} diff --git a/head_extractor/mmcv-2.1.0/.gitignore b/head_extractor/mmcv-2.1.0/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1769eff2de51a7d5b451afd1140850bd58c0f7b8 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.gitignore @@ -0,0 +1,125 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# PyTorch checkpoint +*.pth + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +mlu-ops/ +mlu-ops.* + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/en/_build/ +docs/en/api/generated/ +docs/zh_cn/_build/ +docs/zh_cn/api/generated/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# editors and IDEs +.idea/ +.vscode/ + +# custom +.DS_Store + +# datasets and logs and checkpoints +data/ +work_dir/ + +src/ diff --git a/head_extractor/mmcv-2.1.0/.pre-commit-config-zh-cn.yaml b/head_extractor/mmcv-2.1.0/.pre-commit-config-zh-cn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..313c83c84be160867736215ee35232857f2a35d2 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.pre-commit-config-zh-cn.yaml @@ -0,0 +1,72 @@ +exclude: ^tests/data/ +repos: + - repo: https://gitee.com/openmmlab/mirrors-flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://gitee.com/openmmlab/mirrors-isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://gitee.com/openmmlab/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://gitee.com/openmmlab/mirrors-codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://gitee.com/openmmlab/mirrors-mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://gitee.com/openmmlab/mirrors-docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + - repo: https://github.com/asottile/pyupgrade + rev: v3.0.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] + - repo: https://gitee.com/openmmlab/pre-commit-hooks + rev: v0.2.0 # Use the ref you want to point at + hooks: + - id: check-copyright + args: ["mmcv", "tests", "--excludes", "mmcv/ops"] + - repo: https://gitee.com/openmmlab/mirrors-mypy + rev: v0.812 + hooks: + - id: mypy + exclude: |- + (?x)( + ^test + | ^docs + ) + # - repo: local + # hooks: + # - id: clang-format + # name: clang-format + # description: Format files with ClangFormat + # entry: clang-format -style=google -i + # language: system + # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ diff --git a/head_extractor/mmcv-2.1.0/.pre-commit-config.yaml b/head_extractor/mmcv-2.1.0/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76ded556f9fa40d49f2a773162a5483c091e1c9e --- /dev/null +++ b/head_extractor/mmcv-2.1.0/.pre-commit-config.yaml @@ -0,0 +1,72 @@ +exclude: ^tests/data/ +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 
+    hooks:
+      - id: flake8
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.11.5
+    hooks:
+      - id: isort
+  - repo: https://github.com/pre-commit/mirrors-yapf
+    rev: v0.32.0
+    hooks:
+      - id: yapf
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: trailing-whitespace
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: requirements-txt-fixer
+      - id: double-quote-string-fixer
+      - id: check-merge-conflict
+      - id: fix-encoding-pragma
+        args: ["--remove"]
+      - id: mixed-line-ending
+        args: ["--fix=lf"]
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.1
+    hooks:
+      - id: codespell
+  - repo: https://github.com/executablebooks/mdformat
+    rev: 0.7.9
+    hooks:
+      - id: mdformat
+        args: ["--number"]
+        additional_dependencies:
+          - mdformat-openmmlab
+          - mdformat_frontmatter
+          - linkify-it-py
+  - repo: https://github.com/myint/docformatter
+    rev: v1.3.1
+    hooks:
+      - id: docformatter
+        args: ["--in-place", "--wrap-descriptions", "79"]
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.0.0
+    hooks:
+      - id: pyupgrade
+        args: ["--py36-plus"]
+  - repo: https://github.com/open-mmlab/pre-commit-hooks
+    rev: v0.2.0  # Use the ref you want to point at
+    hooks:
+      - id: check-copyright
+        args: ["mmcv", "tests", "--excludes", "mmcv/ops"]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v0.812
+    hooks:
+      - id: mypy
+        exclude: |-
+          (?x)(
+              ^test
+              | ^docs
+          )
+  # - repo: local
+  #   hooks:
+  #     - id: clang-format
+  #       name: clang-format
+  #       description: Format files with ClangFormat
+  #       entry: clang-format -style=google -i
+  #       language: system
+  #       files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
diff --git a/head_extractor/mmcv-2.1.0/.readthedocs.yml b/head_extractor/mmcv-2.1.0/.readthedocs.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7d5f1c2060a64e5cf9c2bec433cd24532a283164
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/.readthedocs.yml
@@ -0,0 +1,9 @@
+version: 2
+
+formats: all
+
+python:
+  version: 3.7
+  install:
+    - requirements: requirements/runtime.txt
+    - requirements: requirements/docs.txt
diff --git a/head_extractor/mmcv-2.1.0/CITATION.cff b/head_extractor/mmcv-2.1.0/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..786117aac3e063efc18ad1b55e163d570a09e379
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/CITATION.cff
@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+  - name: "MMCV Contributors"
+title: "OpenMMLab Computer Vision Foundation"
+date-released: 2018-08-22
+url: "https://github.com/open-mmlab/mmcv"
+license: Apache-2.0
diff --git a/head_extractor/mmcv-2.1.0/CONTRIBUTING.md b/head_extractor/mmcv-2.1.0/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..a60cd994305bc8b548c71951c9b57d544c1ec21d
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/CONTRIBUTING.md
@@ -0,0 +1,258 @@
+## Contributing to OpenMMLab
+
+Welcome to the MMCV community. We are committed to building a cutting-edge computer vision foundational library, and all kinds of contributions are welcome, including but not limited to
+
+**Fix bugs**
+
+You can directly post a Pull Request to fix typos in code or documents.
+
+The steps to fix a bug in the code implementation are as follows.
+
+1. If the modification involves significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss it with you and propose a proper solution.
+
+2. Post a pull request after fixing the bug and adding the corresponding unit test.
+
+**New Feature or Enhancement**
+
+1. If the modification involves significant changes, you should create an issue to discuss with our developers and propose a proper design.
+2. Post a Pull Request after implementing the new feature or enhancement and add the corresponding unit test.
+
+**Document**
+
+You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable.
+
+### Pull Request Workflow
+
+If you're not familiar with Pull Requests, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the development mode of Pull Requests, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)
+
+#### 1. Fork and clone
+
+If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile.
+
+Then, you can clone the repositories to local:
+
+```shell
+git clone git@github.com:{username}/mmcv.git
+```
+
+After that, you should add the official repository as the upstream repository
+
+```bash
+git remote add upstream git@github.com:open-mmlab/mmcv
+```
+
+Check whether the remote repository has been added successfully by `git remote -v`
+
+```bash
+origin git@github.com:{username}/mmcv.git (fetch)
+origin git@github.com:{username}/mmcv.git (push)
+upstream git@github.com:open-mmlab/mmcv (fetch)
+upstream git@github.com:open-mmlab/mmcv (push)
+```
+
+> Here's a brief introduction to origin and upstream. When we use "git clone", we create an "origin" remote by default, which points to the repository we cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". If the pushed code conflicts with the latest code in the official repository ("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically.
+
+#### 2. Configure pre-commit
+
+You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMCV directory.
+
+```shell
+pip install -U pre-commit
+pre-commit install
+```
+
+Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`.
+
+```shell
+pre-commit run --all-files
+```
+
+If the installation process is interrupted, you can repeatedly run `pre-commit run ...` to continue the installation.
+
+If the code does not conform to the code style specification, pre-commit will raise a warning and fix some of the errors automatically.
+
+If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option (**only for a temporary commit**).
+
+```shell
+git commit -m "xxx" --no-verify
+```
+
+#### 3. Create a development branch
+
+After configuring pre-commit, we should create a branch based on the master branch to develop the new feature or fix the bug.
The proposed branch name is `username/pr_name`
+
+```shell
+git checkout -b yhc/refactor_contributing_doc
+```
+
+In subsequent development, if the master branch of the local repository is behind the master branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command:
+
+```shell
+git pull upstream master
+```
+
+#### 4. Commit the code and pass the unit test
+
+- MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html). A minimal sketch of annotated code is shown after this section.
+
+- The committed code should pass the unit tests
+
+  ```shell
+  # Pass all unit tests
+  pytest tests
+
+  # Pass the unit test of runner
+  pytest tests/test_runner/test_runner.py
+  ```
+
+  If the unit tests fail due to missing dependencies, you can install them referring to the [guidance](#unit-test)
+
+- If the documents are modified/added, we should check the rendering results referring to the [guidance](#document-rendering)
+
+#### 5. Push the code to remote
+
+We can push the local commits to the remote repository after passing the unit tests and the pre-commit check. You can associate the local branch with a remote branch by adding the `-u` option.
+
+```shell
+git push -u origin {branch_name}
+```
+
+This will allow you to use the `git push` command to push code directly next time, without having to specify the branch or the remote repository.
+
+#### 6. Create a Pull Request
+
+(1) Create a pull request in GitHub's Pull request interface
+
+(2) Modify the PR description according to the guidelines so that other developers can better understand your changes
+
+Find more details about the Pull Request description in the [pull request guidelines](#pr-specs).
+
+**note**
+
+(a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see the [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))
+
+(b) If it is your first contribution, please sign the CLA
+
+(c) Check whether the Pull Request passes the CI
+
+MMCV will run unit tests for the posted Pull Request on different platforms (Linux, Windows, Mac), based on different versions of Python, PyTorch, and CUDA, to make sure the code is correct. We can see the specific test information by clicking `Details` next to each CI check so that we can modify the code.
+
+(3) If the Pull Request passes the CI, then you can wait for reviews from other developers. You'll modify the code based on the reviewers' comments, and repeat steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP.
+
+#### 7. Resolve conflicts
+
+If your local branch conflicts with the latest master branch of "upstream", you'll need to resolve them. There are two ways to do this:
+
+```shell
+git fetch --all --prune
+git rebase upstream/master
+```
+
+or
+
+```shell
+git fetch --all --prune
+git merge upstream/master
+```
+
+If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts.
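+
+Since the mypy check in step 4 trips up many first-time contributors, here is the minimal sketch promised above of what annotated code looks like. The function name and logic are illustrative only, not taken from the MMCV codebase:
+
+```python
+from typing import List, Optional
+
+import torch
+
+
+def concat_scores(scores: List[torch.Tensor],
+                  weight: Optional[float] = None) -> torch.Tensor:
+    """Concatenate per-image score tensors, optionally scaling the result.
+
+    Every argument and the return value carry a type annotation, which is
+    what lets mypy verify call sites statically.
+    """
+    out = torch.cat(scores, dim=0)
+    if weight is not None:  # mypy narrows Optional[float] to float here
+        out = out * weight
+    return out
+```
+
+You can check a single file with `mypy path/to/your_file.py` (hypothetical path); the mirrors-mypy pre-commit hook configured above runs the same check on every commit.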
+
+### Guidance
+
+#### Unit test
+
+If you cannot run the unit tests of some modules because some dependencies are missing, such as the [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) module, you can try to install the following dependencies:
+
+```shell
+# Linux
+sudo apt-get update -y
+sudo apt-get install -y libturbojpeg
+sudo apt-get install -y ffmpeg
+
+# Windows
+conda install ffmpeg
+```
+
+We should also make sure the committed code does not decrease the coverage of the unit tests. We can run the following command to check the unit test coverage:
+
+```shell
+python -m coverage run -m pytest /path/to/test_file
+python -m coverage html
+# check file in htmlcov/index.html
+```
+
+#### Document rendering
+
+If the documents are modified/added, we should check the rendering results. We can install the dependencies and run the following command to render the documents and check the results:
+
+```shell
+pip install -r requirements/docs.txt
+cd docs/zh_cn/
+# or docs/en
+make html
+# check file in ./docs/zh_cn/_build/html/index.html
+```
+
+### Code style
+
+#### Python
+
+We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.
+
+We use the following tools for linting and formatting:
+
+- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools.
+- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports.
+- [yapf](https://github.com/google/yapf): A formatter for Python files.
+- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files.
+- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.
+- [docformatter](https://github.com/myint/docformatter): A formatter to format docstrings.
+
+Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg).
+
+We use a [pre-commit hook](https://pre-commit.com/) that checks and formats `flake8`, `yapf`, `isort`, `trailing whitespaces`, and `markdown files`,
+fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, and `mixed-line-ending`, and sorts `requirements.txt` automatically on every commit.
+The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml).
+
+#### C++ and CUDA
+
+We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
+
+### PR Specs
+
+1. Use the [pre-commit](https://pre-commit.com) hook to avoid code style issues
+
+2. One short-lived branch should be matched with only one PR
+
+3. Accomplish a detailed change in one PR. Avoid large PRs
+
+   - Bad: Support Faster R-CNN
+   - Acceptable: Add a box head to Faster R-CNN
+   - Good: Add a parameter to box head to support custom conv-layer number
+
+4. Provide a clear and significant commit message
+
+5. Provide a clear and meaningful PR description
+
+   - The task name should be clarified in the title.
The general format is: \[Prefix\] Short description of the PR (Suffix) + - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) + - Introduce main changes, results and influences on other modules in short description + - Associate related issues and pull requests with a milestone diff --git a/head_extractor/mmcv-2.1.0/CONTRIBUTING_zh-CN.md b/head_extractor/mmcv-2.1.0/CONTRIBUTING_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..00622031dd567957829f38d0425d3d23741c8f2f --- /dev/null +++ b/head_extractor/mmcv-2.1.0/CONTRIBUTING_zh-CN.md @@ -0,0 +1,274 @@ +## 贡献代码 + +欢迎加入 MMCV 社区,我们致力于打造最前沿的计算机视觉基础库,我们欢迎任何类型的贡献,包括但不限于 + +**修复错误** + +修复代码实现错误的步骤如下: + +1. 如果提交的代码改动较大,建议先提交 issue,并正确描述 issue 的现象、原因和复现方式,讨论后确认修复方案。 +2. 修复错误并补充相应的单元测试,提交拉取请求。 + +**新增功能或组件** + +1. 如果新功能或模块涉及较大的代码改动,建议先提交 issue,确认功能的必要性。 +2. 实现新增功能并添单元测试,提交拉取请求。 + +**文档补充** + +修复文档可以直接提交拉取请求 + +添加文档或将文档翻译成其他语言步骤如下 + +1. 提交 issue,确认添加文档的必要性。 +2. 添加文档,提交拉取请求。 + +### 拉取请求工作流 + +如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) + +#### 1. 复刻仓库 + +当你第一次提交拉取请求时,先复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮,复刻后的代码库将会出现在你的 GitHub 个人主页下。 + + + +将代码克隆到本地 + +```shell +git clone git@github.com:{username}/mmcv.git +``` + +添加原代码库为上游代码库 + +```bash +git remote add upstream git@github.com:open-mmlab/mmcv +``` + +检查 remote 是否添加成功,在终端输入 `git remote -v` + +```bash +origin git@github.com:{username}/mmcv.git (fetch) +origin git@github.com:{username}/mmcv.git (push) +upstream git@github.com:open-mmlab/mmcv (fetch) +upstream git@github.com:open-mmlab/mmcv (push) +``` + +> 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 git clone 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 open-mmlab。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。 + +#### 2. 配置 pre-commit + +在本地开发环境中,我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格,以确保代码风格的统一。在提交代码,需要先安装 pre-commit(需要在 MMCV 目录下执行): + +```shell +pip install -U pre-commit +pre-commit install +``` + +检查 pre-commit 是否配置成功,并安装 `.pre-commit-config.yaml` 中的钩子: + +```shell +pre-commit run --all-files +``` + + + + + +> 如果你是中国用户,由于网络原因,可能会出现安装失败的情况,这时可以使用国内源 + +> pre-commit install -c .pre-commit-config-zh-cn.yaml + +> pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml + +如果安装过程被中断,可以重复执行 `pre-commit run ...` 继续安装。 + +如果提交的代码不符合代码风格规范,pre-commit 会发出警告,并自动修复部分错误。 + + + +如果我们想临时绕开 pre-commit 的检查提交一次代码,可以在 `git commit` 时加上 `--no-verify`(需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查)。 + +```shell +git commit -m "xxx" --no-verify +``` + +#### 3. 创建开发分支 + +安装完 pre-commit 之后,我们需要基于 master 创建开发分支,建议的分支命名规则为 `username/pr_name`。 + +```shell +git checkout -b yhc/refactor_contributing_doc +``` + +在后续的开发中,如果本地仓库的 master 分支落后于 upstream 的 master 分支,我们需要先拉取 upstream 的代码进行同步,再执行上面的命令 + +```shell +git pull upstream master +``` + +#### 4. 提交代码并在本地通过单元测试 + +- MMCV 引入了 mypy 来做静态类型检查,以增加代码的鲁棒性。因此我们在提交代码时,需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。 + +- 提交的代码同样需要通过单元测试 + + ```shell + # 通过全量单元测试 + pytest tests + + # 我们需要保证提交的代码能够通过修改模块的单元测试,以 runner 为例 + pytest tests/test_runner/test_runner.py + ``` + + 如果你由于缺少依赖无法运行修改模块的单元测试,可以参考[指引-单元测试](#单元测试) + +- 如果修改/添加了文档,参考[指引](#文档渲染)确认文档渲染正常。 + +#### 5. 
推送代码到远程 + +代码通过单元测试和 pre-commit 检查后,将代码推送到远程仓库,如果是第一次推送,可以在 `git push` 后加上 `-u` 参数以关联远程分支 + +```shell +git push -u origin {branch_name} +``` + +这样下次就可以直接使用 `git push` 命令推送代码了,而无需指定分支和远程仓库。 + +#### 6. 提交拉取请求(PR) + +(1) 在 GitHub 的 Pull request 界面创建拉取请求 + + +(2) 根据指引修改 PR 描述,以便于其他开发者更好地理解你的修改 + + + +描述规范详见[拉取请求规范](#拉取请求规范) + +  + +**注意事项** + +(a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 Issue(具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) + +(b) 如果是第一次为 OpenMMLab 做贡献,需要签署 CLA + + + +(c) 检查提交的 PR 是否通过 CI(集成测试) + + + +MMCV 会在不同的平台(Linux、Window、Mac),基于不同版本的 Python、PyTorch、CUDA 对提交的代码进行单元测试,以保证代码的正确性,如果有任何一个没有通过,我们可点击上图中的 `Details` 来查看具体的测试信息,以便于我们修改代码。 + +(3) 如果 PR 通过了 CI,那么就可以等待其他开发者的 review,并根据 reviewer 的意见,修改代码,并重复 [4](#4-提交代码并本地通过单元测试)-[5](#5-推送代码到远程) 步骤,直到 reviewer 同意合入 PR。 + + + +所有 reviewer 同意合入 PR 后,我们会尽快将 PR 合并到主分支。 + +#### 7. 解决冲突 + +随着时间的推移,我们的代码库会不断更新,这时候,如果你的 PR 与主分支存在冲突,你需要解决冲突,解决冲突的方式有两种: + +```shell +git fetch --all --prune +git rebase upstream/master +``` + +或者 + +```shell +git fetch --all --prune +git merge upstream/master +``` + +如果你非常善于处理冲突,那么可以使用 rebase 的方式来解决冲突,因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用,那么可以使用 `merge` 的方式来解决冲突。 + +### 指引 + +#### 单元测试 + +如果你无法正常执行部分模块的单元测试,例如 [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) 模块,可能是你的当前环境没有安装以下依赖 + +```shell +# Linux +sudo apt-get update -y +sudo apt-get install -y libturbojpeg +sudo apt-get install -y ffmpeg + +# Windows +conda install ffmpeg +``` + +在提交修复代码错误或新增特性的拉取请求时,我们应该尽可能的让单元测试覆盖所有提交的代码,计算单元测试覆盖率的方法如下 + +```shell +python -m coverage run -m pytest /path/to/test_file +python -m coverage html +# check file in htmlcov/index.html +``` + +#### 文档渲染 + +在提交修复代码错误或新增特性的拉取请求时,可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。 +本地生成渲染后的文档的方法如下 + +```shell +pip install -r requirements/docs.txt +cd docs/zh_cn/ +# or docs/en +make html +# check file in ./docs/zh_cn/_build/html/index.html +``` + +### 代码风格 + +#### Python + +[PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范,我们使用以下工具检查和格式化代码 + +- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具,是多个检查工具的封装 +- [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具 +- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具 +- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误 +- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具 +- [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具 + +yapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到 + +通过配置 [pre-commit hook](https://pre-commit.com/) ,我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`, +修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`,调整 `requirments.txt` 的包顺序。 +pre-commit 钩子的配置可以在 [.pre-commit-config](./.pre-commit-config.yaml) 找到。 + +pre-commit 具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。 + +更具体的规范请参考 [OpenMMLab 代码规范](code_style.md)。 + +#### C++ and CUDA + +C++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) + +### 拉取请求规范 + +1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 + +2. 一个`拉取请求`对应一个短期分支 + +3. 粒度要细,一个`拉取请求`只做一件事情,避免超大的`拉取请求` + + - Bad:实现 Faster R-CNN + - Acceptable:给 Faster R-CNN 添加一个 box head + - Good:给 box head 增加一个参数来支持自定义的 conv 层数 + +4. 每次 Commit 时需要提供清晰且有意义 commit 信息 + +5. 
提供清晰且有意义的`拉取请求`描述 + + - 标题写明白任务名称,一般格式:\[Prefix\] Short description of the pull request (Suffix) + - prefix: 新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被review) + - 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 + - 关联相关的`议题` (issue) 和其他`拉取请求` + +6. 如果引入了其他三方库,或借鉴了三方库的代码,请确认他们的许可证和 mmcv 兼容,并在借鉴的代码上补充 `This code is inspired from http://` diff --git a/head_extractor/mmcv-2.1.0/LICENSE b/head_extractor/mmcv-2.1.0/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..f02314255d824c0816b0bf1648aac8ab78976199 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/LICENSE @@ -0,0 +1,203 @@ +Copyright (c) OpenMMLab. All rights reserved + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2020 Open-MMLab. All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/head_extractor/mmcv-2.1.0/LICENSES.md b/head_extractor/mmcv-2.1.0/LICENSES.md new file mode 100644 index 0000000000000000000000000000000000000000..3cdeddf6ff1d09ed8e2d9042f2d930e20599a0b1 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/LICENSES.md @@ -0,0 +1,11 @@ +# Licenses for special operations + +In this file, we list the operations with other licenses instead of Apache 2.0. Users should be careful about adopting these operations in any commercial matters. + +| Operation | Files | License | +| :--------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | +| upfirdn2d | [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu) | NVIDIA License | +| fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License | +| bias_act | [mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu) | NVIDIA License | +| filtered_lrelu | [mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu) | NVIDIA License | +| conv2d_gradfix | [mmcv/ops/conv2d_gradfix.py](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/conv2d_gradfix.py) | NVIDIA License | diff --git a/head_extractor/mmcv-2.1.0/MANIFEST.in b/head_extractor/mmcv-2.1.0/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..622635caa1ec01f78d95c684b87658df87c63b38 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/MANIFEST.in @@ -0,0 +1,6 @@ +include requirements/runtime.txt +include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp +include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp +include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp +include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm +recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm diff --git 
a/head_extractor/mmcv-2.1.0/README.md b/head_extractor/mmcv-2.1.0/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f0ca668b1220e5124bb9c8aac63a095e62009a64 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/README.md @@ -0,0 +1,187 @@ +
+  <div>&nbsp;</div>
+  <div align="center">
+    <b><font size="5"> OpenMMLab website </font></b>
+    <sup>
+      <a href="https://openmmlab.com">
+        <i><font size="4">HOT</font></i>
+      </a>
+    </sup>
+    &nbsp;&nbsp;&nbsp;&nbsp;
+    <b><font size="5"> OpenMMLab platform </font></b>
+    <sup>
+      <a href="https://platform.openmmlab.com">
+        <i><font size="4">TRY IT OUT</font></i>
+      </a>
+    </sup>
+  </div>
+  <div>&nbsp;</div>
+
+ +[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) +[![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/) +[![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads) +[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) +[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) +[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) + +[📘Documentation](https://mmcv.readthedocs.io/en/latest/) | +[🛠️Installation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) | +[🤔Reporting Issues](https://github.com/open-mmlab/mmcv/issues/new/choose) + +
+ +
+ +English | [简体中文](README_zh-CN.md) + +
+
+## Highlights
+
+The OpenMMLab team released a new generation of training engine [MMEngine](https://github.com/open-mmlab/mmengine) at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a universal and powerful runner, an open architecture with a more unified interface, and a more customizable training process.
+
+The official MMCV v2.0.0 was released on April 6, 2023. Version 2.x removed the components related to the training process and added a data transformation module. Also, starting from 2.x, the package **mmcv** was renamed to **mmcv-lite** and **mmcv-full** was renamed to **mmcv**. For details, see the [Compatibility Documentation](docs/en/compatibility.md).
+
+MMCV will maintain both the [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) (corresponding to the original [master](https://github.com/open-mmlab/mmcv/tree/master) branch) and **2.x** (corresponding to the **main** branch, now the default branch) versions simultaneously. For details, see the [Branch Maintenance Plan](README.md#branch-maintenance-plan).
+
+## Introduction
+
+MMCV is a foundational library for computer vision research and it provides the following functionalities:
+
+- [Image/Video processing](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_process.html)
+- [Image and annotation visualization](https://mmcv.readthedocs.io/en/latest/understand_mmcv/visualization.html)
+- [Image transformation](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_transform.html)
+- [Various CNN architectures](https://mmcv.readthedocs.io/en/latest/understand_mmcv/cnn.html)
+- [High-quality implementation of common CPU and CUDA ops](https://mmcv.readthedocs.io/en/latest/understand_mmcv/ops.html)
+
+It supports the following systems:
+
+- Linux
+- Windows
+- macOS
+
+See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage.
+
+Note: MMCV requires Python 3.7+.
+
+## Installation
+
+There are two versions of MMCV:
+
+- **mmcv**: comprehensive, with full features and various CUDA ops out of the box. It takes a longer time to build.
+- **mmcv-lite**: lite, without CUDA ops but with all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops.
+
+**Note**: Do not install both versions in the same environment; otherwise, you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`.
+
+### Install mmcv
+
+Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). For Apple silicon users, please use PyTorch 1.13+.
+
+The command to install mmcv:
+
+```bash
+pip install -U openmim
+mim install mmcv
+```
+
+If you need to specify the version of mmcv, you can use the following command:
+
+```bash
+mim install mmcv==2.0.0
+```
+
+If you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, there may be no pre-built package matching your PyTorch, CUDA, or mmcv version, in which case you can [build mmcv from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html). A quick way to verify the resulting installation is sketched after the logs below.
+
+<details>
+<summary>Installation log using pre-built packages</summary>
+
+Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
+Collecting mmcv<br />
+Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl
+
+</details>
+
+<details>
+<summary>Installation log using source packages</summary>
+
+Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
+Collecting mmcv==2.0.0<br />
+Downloading mmcv-2.0.0.tar.gz
+
+</details>
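+
+Whichever way the package was obtained, a quick sanity check is to import it and try one compiled op. This is a sketch, not an official MMCV verification script; it assumes PyTorch and the full `mmcv` package (not `mmcv-lite`) are installed:
+
+```python
+# Post-install sanity check (illustrative only, not an official MMCV script).
+# Importing a compiled op succeeds only for the full `mmcv` package built
+# with the C++/CUDA ops; it raises ImportError on `mmcv-lite`.
+import mmcv
+
+print('mmcv version:', mmcv.__version__)
+
+try:
+    from mmcv.ops import nms  # any compiled op would do here
+    print('compiled ops are available:', callable(nms))
+except ImportError as err:
+    print('compiled ops are missing (mmcv-lite or a broken build):', err)
+```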
+
+For more installation methods, please refer to the [Installation documentation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html).
+
+### Install mmcv-lite
+
+If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).
+
+```bash
+pip install -U openmim
+mim install mmcv-lite
+```
+
+## FAQ
+
+If you face installation issues, CUDA-related problems, or RuntimeErrors, you may first refer to the [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html) to see if there is a solution. If the problem is still not solved, feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).
+
+## Citation
+
+If you find this project useful in your research, please consider citing:
+
+```latex
+@misc{mmcv,
+    title={{MMCV: OpenMMLab} Computer Vision Foundation},
+    author={MMCV Contributors},
+    howpublished = {\url{https://github.com/open-mmlab/mmcv}},
+    year={2018}
+}
+```
+
+## Contributing
+
+We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline.
+
+## License
+
+MMCV is released under the Apache 2.0 license, while some specific operations in this library are under other licenses. If you are using our code for commercial purposes, please refer to [LICENSES.md](LICENSES.md) and check carefully.
+
+## Branch Maintenance Plan
+
+MMCV currently has four branches, namely main, 1.x, master, and 2.x, where 2.x is an alias for the main branch, and master is an alias for the 1.x branch. The 2.x and master branches will be deleted in the future. MMCV's branches go through the following three stages:
+
+| Phase                | Time                  | Branch                                                                                                                                   | Description                                                                                                                                                   |
+| -------------------- | --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| RC Period            | 2022.9.1 - 2023.4.5   | Release candidate code (2.x version) was released on the 2.x branch. The default master branch was still the 1.x version                  | The master and 2.x branches iterated normally                                                                                                                     |
+| Compatibility Period | 2023.4.6 - 2023.12.31 | **The 2.x branch has been renamed to the main branch and set as the default branch**, and the 1.x branch corresponds to the 1.x version   | We still maintain the old version 1.x and respond to user needs, but try not to introduce changes that break compatibility; the main branch iterates normally    |
+| Maintenance Period   | From 2024.1.1         | The default main branch corresponds to the 2.x version and the 1.x branch to the 1.x version                                              | The 1.x branch is in the maintenance phase, with no new feature support; the main branch iterates normally                                                       |
+
+## Projects in OpenMMLab
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models.
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision.
+- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. diff --git a/head_extractor/mmcv-2.1.0/README_zh-CN.md b/head_extractor/mmcv-2.1.0/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..e48f33bd7f402858e42c916cdc0186d13ec404f2 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/README_zh-CN.md @@ -0,0 +1,210 @@ +
+ +
 
+
+ OpenMMLab website
+ HOT
+
+ OpenMMLab platform
+ TRY IT OUT
+
+
+
 
+
+[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/)
+[![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/)
+[![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads)
+[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv)
+[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions)
+[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv)
+[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE)
+
+[📘Documentation](https://mmcv.readthedocs.io/zh_CN/latest/) |
+[🛠️Installation](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html) |
+[🤔Reporting Issues](https://github.com/open-mmlab/mmcv/issues/new/choose)
+
+ +
+ +[English](README.md) | 简体中文 + +
+ +
+
+## Highlights
+
+The OpenMMLab team released [MMEngine](https://github.com/open-mmlab/mmengine), a new-generation training engine, at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a more advanced and universal runner, an open architecture with more unified interfaces, and a more customizable training process.
+
+MMCV v2.0.0 was officially released on April 6, 2023. In the 2.x version, it removed the components related to the training process and added a data transformation module. Also, starting from 2.x, the package **mmcv** was renamed to **mmcv-lite** and **mmcv-full** to **mmcv**. For details, see the [compatibility documentation](docs/zh_cn/compatibility.md).
+
+MMCV maintains both the [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) version (corresponding to the original [master](https://github.com/open-mmlab/mmcv/tree/master) branch) and the **2.x** version (corresponding to the **main** branch, now the default branch); see the [branch maintenance plan](README_zh-CN.md#branch-maintenance-plan) for details.
+
+## Introduction
+
+MMCV is a foundational library for computer vision that provides the following functionality:
+
+- [Image and video processing](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_process.html)
+- [Image and annotation visualization](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/visualization.html)
+- [Image transformation](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_transform.html)
+- [Various CNN architectures](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/cnn.html)
+- [High-quality implementations of common CUDA ops](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/ops.html)
+
+MMCV supports multiple platforms, including:
+
+- Linux
+- Windows
+- macOS
+
+For more features and usage, please refer to the [documentation](http://mmcv.readthedocs.io/zh_CN/latest).
+
+Note: MMCV requires Python 3.7+.
+
+## Installation
+
+There are two versions of MMCV:
+
+- **mmcv**: the full version, with all features and rich out-of-the-box CUDA ops. Note that the full version may take longer to compile.
+- **mmcv-lite**: the lite version, without CUDA ops but with all other features, similar to MMCV before v1.0. Consider the lite version if you do not need the CUDA ops.
+
+**Note**: Do not install both versions in the same environment; otherwise, you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full mmcv is strongly recommended if CUDA is available`.
+
+### Install mmcv
+
+Before installing mmcv, make sure that PyTorch has been successfully installed in your environment; you can refer to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). If you are using a Mac with apple silicon, please install PyTorch 1.13+.
+
+The command to install mmcv:
+
+```bash
+pip install -U openmim
+mim install mmcv
+```
+
+If you need to specify the version of mmcv, you can use the following command:
+
+```bash
+mim install mmcv==2.0.0
+```
+
+If you find that the above installation command installs a source package (ending with `.tar.gz`) instead of a pre-built package (ending with `.whl`), we may not provide a pre-built mmcv package matching the PyTorch and CUDA versions of your environment; in this case, you can [build mmcv from source](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html).
+
+Installation log when using a pre-built package
+
+Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
+Collecting mmcv
+Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl + +
+ +
+Installation log when building from the source package
+
+Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
+Collecting mmcv==2.0.0
+Downloading mmcv-2.0.0.tar.gz + +
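+
+Once the installation finishes, you can verify that mmcv is importable and check its version; this is a minimal sketch, matching the check used in the release Dockerfile later in this repository:
+
+```bash
+# prints the installed mmcv version if the installation succeeded
+python -c 'import mmcv; print(mmcv.__version__)'
+```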
+
+For more installation methods, please refer to the [installation documentation](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html).
+
+### Install mmcv-lite
+
+If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).
+
+```bash
+pip install -U openmim
+mim install mmcv-lite
+```
+
+## FAQ
+
+If you encounter installation or runtime problems, please check the [FAQ](https://mmcv.readthedocs.io/zh_CN/latest/faq.html) first to see if a solution already exists. If the problem is still not solved, feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues).
+
+## Contributing
+
+We appreciate all the contributors' efforts to improve MMCV. Please refer to the [contributing guideline](CONTRIBUTING.md) for instructions on participating in the project.
+
+## License
+
+`MMCV` is currently released under the Apache 2.0 license, but some of its functionality is not covered by Apache 2.0. We list these features and their corresponding licenses in detail in [LICENSES.md](LICENSES.md); please check this document carefully if you are using our code for profit-making activities.
+
+## Branch Maintenance Plan
+
+MMCV currently has four branches, namely main, 1.x, master, and 2.x, where 2.x is an alias for the main branch and master is an alias for the 1.x branch. The 2.x and master branches will be deleted in the future. MMCV's branches go through the following three stages:
+
+| Phase                | Time                  | Branch                                                                                                                      | Description                                                                                                                                                              |
+| -------------------- | --------------------- | ---------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| RC Period            | 2022.9.1 - 2023.4.5   | Release candidate code is released on the 2.x branch; the default master branch still corresponds to the 1.x version       | The master and 2.x branches iterate normally                                                                                                                             |
+| Compatibility Period | 2023.4.6 - 2023.12.31 | **The 2.x branch is renamed to the main branch and set as the default branch**; the 1.x branch corresponds to the 1.x version | We keep maintaining and developing the old 1.x version and respond to user needs, but try not to introduce changes that break backward compatibility; the main branch iterates normally |
+| Maintenance Period   | 2024.1.1 - TBD        | The default main branch corresponds to the 2.x version; the 1.x branch corresponds to the 1.x version                      | The 1.x branch enters the maintenance phase with no new feature support; the main branch iterates normally                                                              |
+
+## Other Projects in OpenMMLab
+
+- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models
+- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision
+- [MIM](https://github.com/open-mmlab/mim): MIM is the unified entry point for OpenMMLab projects, algorithms, and models
+- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab object detection toolbox
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark
+- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab full-pipeline text detection, recognition, and understanding toolbox
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab few-shot learning toolbox and benchmark
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation video understanding toolbox
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab unified video perception platform
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow estimation toolbox and benchmark
+- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework
+
+## Welcome to the OpenMMLab Community
+
+Scan the QR code below to follow the OpenMMLab team's [official Zhihu account](https://www.zhihu.com/people/openmmlab), join the OpenMMLab team's [official QQ group](https://jq.qq.com/?_wv=1027&k=K0QI8ByU), or add the WeChat assistant "OpenMMLabwx" to join the official WeChat group.
+
+ +
+
+In the OpenMMLab community, we will
+
+- 📢 share the latest core technologies of AI frameworks
+- 💻 explain the source code of commonly used PyTorch modules
+- 📰 release news related to OpenMMLab
+- 🚀 introduce cutting-edge algorithms developed by OpenMMLab
+- 🏃 provide more efficient answers to questions and feedback
+- 🔥 offer a platform to communicate with developers from all walks of life
+
+Packed with useful content 📘 and waiting for you 💗, the OpenMMLab community looks forward to having you 👬
diff --git a/head_extractor/mmcv-2.1.0/TERMINOLOGY.md b/head_extractor/mmcv-2.1.0/TERMINOLOGY.md
new file mode 100644
index 0000000000000000000000000000000000000000..07411b7774c2ed713f472c1287b98b871c7f4d02
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/TERMINOLOGY.md
@@ -0,0 +1,30 @@
+# English-Chinese terminology comparison (英汉术语对照)
+
+This document is used as a reference for English-Chinese terminology translation.
+
+该文档用作中英文翻译对照参考。
+
+| English           | 中文         |
+| :---------------: | :----------: |
+| annotation        | 标注         |
+| backbone          | 主干网络     |
+| benchmark         | 基准测试     |
+| checkpoint        | 模型权重文件 |
+| classifier        | 分类器       |
+| cls_head          | 分类头       |
+| decoder           | 解码器       |
+| detector          | 检测器       |
+| encoder           | 编码器       |
+| finetune          | 微调         |
+| ground truth      | 真实标签     |
+| hook              | 钩子         |
+| localizer         | 定位器       |
+| neck              | 模型颈部     |
+| pipeline          | 流水线       |
+| recognizer        | 识别器       |
+| register          | 注册器       |
+| schedule          | 调整         |
+| scheduler         | 调度器       |
+| segmentor         | 分割器       |
+| tensor            | 张量         |
+| training schedule | 训练策略     |
diff --git a/head_extractor/mmcv-2.1.0/_docker/README.md b/head_extractor/mmcv-2.1.0/_docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..60d5c9de5da8faa7e0ae7e0def19a4320a2a7a5e
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/_docker/README.md
@@ -0,0 +1,70 @@
+# Docker images
+
+There are two `Dockerfile` files to build docker images: one builds an image with the mmcv pre-built package and the other with the mmcv development environment.
+
+```text
+.
+|-- README.md
+|-- dev # build with mmcv development environment
+|   `-- Dockerfile
+`-- release # build with mmcv pre-built package
+    `-- Dockerfile
+```
+
+## Build docker images
+
+### Build with mmcv pre-built package
+
+Build with the local repository
+
+```bash
+git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
+docker build -t mmcv -f docker/release/Dockerfile .
+```
+
+Or build with the remote repository
+
+```bash
+docker build -t mmcv https://github.com/open-mmlab/mmcv.git#master:docker/release
+```
+
+The [Dockerfile](release/Dockerfile) installs the latest released version of mmcv by default, but you can pass the `MMCV` build argument to install a specific version.
+
+```bash
+docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0rc1 .
+```
+
+If you want to use other versions of PyTorch and CUDA, you can pass them as build arguments as well. For example, to build an image with PyTorch 1.9.0 and CUDA 11.1:
+
+```bash
+docker build -t mmcv -f docker/release/Dockerfile \
+    --build-arg PYTORCH=1.9.0 \
+    --build-arg CUDA=11.1 \
+    --build-arg CUDNN=8 \
+    --build-arg MMCV=2.0.0rc1 .
+```
+
+More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).
+
+### Build with mmcv development environment
+
+If you want to build a docker image with the mmcv development environment, you can use the following command
+
+```bash
+git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
+docker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 .
+```
+
+Note that `CUDA_ARCH` is the compute capability of your GPU. You can find it at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute) or query it with PyTorch as shown below.
+
+The building process may take 10 minutes or more.
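+
+If you are unsure of your GPU's compute capability, you can also query it from PyTorch; this is a sketch, assuming a CUDA-enabled PyTorch installation is available on the host:
+
+```bash
+# reports the (major, minor) compute capability of the current GPU
+python -c 'import torch; print(torch.cuda.get_device_capability())'
+```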
+
+## Run images
+
+```bash
+docker run --gpus all --shm-size=8g -it mmcv
+```
+
+See [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usage details.
diff --git a/head_extractor/mmcv-2.1.0/_docker/dev/Dockerfile b/head_extractor/mmcv-2.1.0/_docker/dev/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..a4d9e23fcfaa6e1af104aaa0e9cbb2a348b3cd34
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/_docker/dev/Dockerfile
@@ -0,0 +1,31 @@
+ARG PYTORCH="1.8.1"
+ARG CUDA="10.2"
+ARG CUDNN="7"
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+# To fix GPG key error when running apt-get update
+RUN rm /etc/apt/sources.list.d/cuda.list \
+    && rm /etc/apt/sources.list.d/nvidia-ml.list \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# Install git and system dependencies for opencv-python
+RUN apt-get update && apt-get install -y git \
+    && apt-get update && apt-get install -y libgl1 libglib2.0-0
+
+# Install system dependencies for unit tests
+RUN apt-get install -y ffmpeg libturbojpeg \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build mmcv from source in develop mode
+ARG HTTPS_PROXY=""
+ENV https_proxy=${HTTPS_PROXY}
+ENV FORCE_CUDA="1"
+ARG CUDA_ARCH=""
+ENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH}
+RUN git clone https://github.com/open-mmlab/mmcv.git /mmcv
+WORKDIR /mmcv
+RUN git checkout 2.x && git rev-parse --short HEAD
+RUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install
diff --git a/head_extractor/mmcv-2.1.0/_docker/release/Dockerfile b/head_extractor/mmcv-2.1.0/_docker/release/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..d5e25e9eb70a87ab1c47a629cc6ed9706ade83c6
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/_docker/release/Dockerfile
@@ -0,0 +1,23 @@
+ARG PYTORCH="1.8.1"
+ARG CUDA="10.2"
+ARG CUDNN="7"
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+# To fix GPG key error when running apt-get update
+RUN rm /etc/apt/sources.list.d/cuda.list \
+    && rm /etc/apt/sources.list.d/nvidia-ml.list \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+    && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# Install system dependencies for opencv-python
+RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install mmcv
+ARG MMCV=""
+RUN if [ "${MMCV}" = "" ]; then pip install -U openmim && mim install 'mmcv>=2.0.0rc1'; else pip install -U openmim && mim install mmcv==${MMCV}; fi
+
+# Verify the installation
+RUN python -c 'import mmcv;print(mmcv.__version__)'
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2410ea555e905acb450792a427596764e16f62d3
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# flake8: noqa
+from .arraymisc import *
+from .image import *
+from .transforms import *
+from .version import *
+from .video import *
+from .visualization import *
+
+# The following modules are not imported to this level, so mmcv may be used
+# without PyTorch.
+# - op
+# - utils
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/_ext.cpython-311-x86_64-linux-gnu.so b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/_ext.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..1e3c364a3d95b7ff2880cced7d9c2bd867ee7a5f
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/_ext.cpython-311-x86_64-linux-gnu.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c65eb946113c0f4c13d448af14625175b676402c1edc105f78c427459bd7ec2b
+size 32237896
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/arraymisc/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/arraymisc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b4700d6139ae3d604ff6e542468cce4200c020c
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/arraymisc/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .quantization import dequantize, quantize
+
+__all__ = ['quantize', 'dequantize']
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/arraymisc/quantization.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/arraymisc/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6182710d51787061304cfc7304ec97d565822536
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/arraymisc/quantization.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import numpy as np
+
+
+def quantize(arr: np.ndarray,
+             min_val: Union[int, float],
+             max_val: Union[int, float],
+             levels: int,
+             dtype=np.int64) -> np.ndarray:
+    """Quantize an array of (-inf, inf) to [0, levels-1].
+
+    Args:
+        arr (ndarray): Input array.
+        min_val (int or float): Minimum value to be clipped.
+        max_val (int or float): Maximum value to be clipped.
+        levels (int): Quantization levels.
+        dtype (np.dtype): The type of the quantized array.
+
+    Returns:
+        ndarray: Quantized array.
+    """
+    if not (isinstance(levels, int) and levels > 1):
+        raise ValueError(
+            f'levels must be a positive integer, but got {levels}')
+    if min_val >= max_val:
+        raise ValueError(
+            f'min_val ({min_val}) must be smaller than max_val ({max_val})')
+
+    arr = np.clip(arr, min_val, max_val) - min_val
+    quantized_arr = np.minimum(
+        np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)
+
+    return quantized_arr
+
+
+def dequantize(arr: np.ndarray,
+               min_val: Union[int, float],
+               max_val: Union[int, float],
+               levels: int,
+               dtype=np.float64) -> np.ndarray:
+    """Dequantize an array.
+
+    Args:
+        arr (ndarray): Input array.
+        min_val (int or float): Minimum value to be clipped.
+        max_val (int or float): Maximum value to be clipped.
+        levels (int): Quantization levels.
+        dtype (np.dtype): The type of the dequantized array.
+
+    Returns:
+        ndarray: Dequantized array.
+ """ + if not (isinstance(levels, int) and levels > 1): + raise ValueError( + f'levels must be a positive integer, but got {levels}') + if min_val >= max_val: + raise ValueError( + f'min_val ({min_val}) must be smaller than max_val ({max_val})') + + dequantized_arr = (arr + 0.5).astype(dtype) * (max_val - + min_val) / levels + min_val + + return dequantized_arr diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..10e7e027e4da544f42a6a4fe3400d9413a57e081 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .alexnet import AlexNet +# yapf: disable +from .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, + ConvTranspose2d, ConvTranspose3d, ConvWS2d, + DepthwiseSeparableConvModule, GeneralizedAttention, + HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d, + NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish, + build_activation_layer, build_conv_layer, + build_norm_layer, build_padding_layer, build_plugin_layer, + build_upsample_layer, conv_ws_2d, is_norm) +# yapf: enable +from .resnet import ResNet, make_res_layer +from .rfsearch import Conv2dRFSearchOp, RFSearchHook +from .utils import fuse_conv_bn, get_model_complexity_info +from .vgg import VGG, make_vgg_layer + +__all__ = [ + 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer', + 'ConvModule', 'build_activation_layer', 'build_conv_layer', + 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', + 'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d', + 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention', + 'Scale', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', + 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d', + 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'fuse_conv_bn', + 'get_model_complexity_info', 'Conv2dRFSearchOp', 'RFSearchHook' +] diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/alexnet.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..309be24b66049c86837c67d24ee0e790e6396abc --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/alexnet.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from typing import Optional + +import torch +import torch.nn as nn +from mmengine.runner import load_checkpoint + + +class AlexNet(nn.Module): + """AlexNet backbone. + + Args: + num_classes (int): number of classes for classification. 
+ """ + + def __init__(self, num_classes: int = -1): + super().__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + # use default initializer + pass + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + + x = self.features(x) + if self.num_classes > 0: + x = x.view(x.size(0), 256 * 6 * 6) + x = self.classifier(x) + + return x diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6c74986953bf1a23a246c92c51fd14e033b6d682 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .activation import build_activation_layer
+from .context_block import ContextBlock
+from .conv import build_conv_layer
+from .conv2d_adaptive_padding import Conv2dAdaptivePadding
+from .conv_module import ConvModule
+from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
+from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
+from .drop import Dropout, DropPath
+from .generalized_attention import GeneralizedAttention
+from .hsigmoid import HSigmoid
+from .hswish import HSwish
+from .non_local import NonLocal1d, NonLocal2d, NonLocal3d
+from .norm import build_norm_layer, is_norm
+from .padding import build_padding_layer
+from .plugin import build_plugin_layer
+from .scale import LayerScale, Scale
+from .swish import Swish
+from .upsample import build_upsample_layer
+from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
+                       Linear, MaxPool2d, MaxPool3d)
+
+__all__ = [
+    'ConvModule', 'build_activation_layer', 'build_conv_layer',
+    'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
+    'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',
+    'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',
+    'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d',
+    'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding',
+    'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d',
+    'Conv3d', 'Dropout', 'DropPath', 'LayerScale'
+]
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/activation.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae99714b940913c946fa169883584ea193f645ea
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/activation.py
@@ -0,0 +1,114 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.registry import MODELS
+from mmengine.utils import digit_version
+from mmengine.utils.dl_utils import TORCH_VERSION
+
+for module in [
+        nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,
+        nn.Sigmoid, nn.Tanh
+]:
+    MODELS.register_module(module=module)
+
+if digit_version(torch.__version__) >= digit_version('1.7.0'):
+    MODELS.register_module(module=nn.SiLU, name='SiLU')
+else:
+
+    class SiLU(nn.Module):
+        """Sigmoid Weighted Linear Unit."""
+
+        def __init__(self, inplace=False):
+            super().__init__()
+            self.inplace = inplace
+
+        def forward(self, inputs) -> torch.Tensor:
+            if self.inplace:
+                return inputs.mul_(torch.sigmoid(inputs))
+            else:
+                return inputs * torch.sigmoid(inputs)
+
+    MODELS.register_module(module=SiLU, name='SiLU')
+
+
+@MODELS.register_module(name='Clip')
+@MODELS.register_module()
+class Clamp(nn.Module):
+    """Clamp activation layer.
+
+    This activation function clamps the feature map values within
+    :math:`[min, max]`. More details can be found in ``torch.clamp()``.
+
+    Args:
+        min (Number, optional): Lower bound of the range to be clamped to.
+            Defaults to -1.
+        max (Number, optional): Upper bound of the range to be clamped to.
+            Defaults to 1.
+    """
+
+    def __init__(self, min: float = -1., max: float = 1.):
+        super().__init__()
+        self.min = min
+        self.max = max
+
+    def forward(self, x) -> torch.Tensor:
+        """Forward function.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: Clamped tensor.
+ """ + return torch.clamp(x, min=self.min, max=self.max) + + +class GELU(nn.Module): + r"""Applies the Gaussian Error Linear Units function: + + .. math:: + \text{GELU}(x) = x * \Phi(x) + where :math:`\Phi(x)` is the Cumulative Distribution Function for + Gaussian Distribution. + + Shape: + - Input: :math:`(N, *)` where `*` means, any number of additional + dimensions + - Output: :math:`(N, *)`, same shape as the input + + .. image:: scripts/activation_images/GELU.png + + Examples:: + + >>> m = nn.GELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.gelu(input) + + +if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.4')): + MODELS.register_module(module=GELU) +else: + MODELS.register_module(module=nn.GELU) + + +def build_activation_layer(cfg: Dict) -> nn.Module: + """Build activation layer. + + Args: + cfg (dict): The activation layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate an activation layer. + + Returns: + nn.Module: Created activation layer. + """ + return MODELS.build(cfg) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/context_block.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/context_block.py new file mode 100644 index 0000000000000000000000000000000000000000..1e78df8648b779124091a8595282aad7a8d0d305 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/context_block.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + +import torch +from mmengine.model import constant_init, kaiming_init +from mmengine.registry import MODELS +from torch import nn + + +def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None: + if isinstance(m, nn.Sequential): + constant_init(m[-1], val=0) + else: + constant_init(m, val=0) + + +@MODELS.register_module() +class ContextBlock(nn.Module): + """ContextBlock module in GCNet. + + See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + (https://arxiv.org/abs/1904.11492) for details. + + Args: + in_channels (int): Channels of the input feature map. + ratio (float): Ratio of channels of transform bottleneck + pooling_type (str): Pooling method for context modeling. + Options are 'att' and 'avg', stand for attention pooling and + average pooling respectively. Default: 'att'. + fusion_types (Sequence[str]): Fusion method for feature fusion, + Options are 'channels_add', 'channel_mul', stand for channelwise + addition and multiplication respectively. 
Default: ('channel_add',) + """ + + _abbr_ = 'context_block' + + def __init__(self, + in_channels: int, + ratio: float, + pooling_type: str = 'att', + fusion_types: tuple = ('channel_add', )): + super().__init__() + assert pooling_type in ['avg', 'att'] + assert isinstance(fusion_types, (list, tuple)) + valid_fusion_types = ['channel_add', 'channel_mul'] + assert all([f in valid_fusion_types for f in fusion_types]) + assert len(fusion_types) > 0, 'at least one fusion should be used' + self.in_channels = in_channels + self.ratio = ratio + self.planes = int(in_channels * ratio) + self.pooling_type = pooling_type + self.fusion_types = fusion_types + if pooling_type == 'att': + self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) + self.softmax = nn.Softmax(dim=2) + else: + self.avg_pool = nn.AdaptiveAvgPool2d(1) + if 'channel_add' in fusion_types: + self.channel_add_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_add_conv = None + if 'channel_mul' in fusion_types: + self.channel_mul_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_mul_conv = None + self.reset_parameters() + + def reset_parameters(self): + if self.pooling_type == 'att': + kaiming_init(self.conv_mask, mode='fan_in') + self.conv_mask.inited = True + + if self.channel_add_conv is not None: + last_zero_init(self.channel_add_conv) + if self.channel_mul_conv is not None: + last_zero_init(self.channel_mul_conv) + + def spatial_pool(self, x: torch.Tensor) -> torch.Tensor: + batch, channel, height, width = x.size() + if self.pooling_type == 'att': + input_x = x + # [N, C, H * W] + input_x = input_x.view(batch, channel, height * width) + # [N, 1, C, H * W] + input_x = input_x.unsqueeze(1) + # [N, 1, H, W] + context_mask = self.conv_mask(x) + # [N, 1, H * W] + context_mask = context_mask.view(batch, 1, height * width) + # [N, 1, H * W] + context_mask = self.softmax(context_mask) + # [N, 1, H * W, 1] + context_mask = context_mask.unsqueeze(-1) + # [N, 1, C, 1] + context = torch.matmul(input_x, context_mask) + # [N, C, 1, 1] + context = context.view(batch, channel, 1, 1) + else: + # [N, C, 1, 1] + context = self.avg_pool(x) + + return context + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # [N, C, 1, 1] + context = self.spatial_pool(x) + + out = x + if self.channel_mul_conv is not None: + # [N, C, 1, 1] + channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) + out = out * channel_mul_term + if self.channel_add_conv is not None: + # [N, C, 1, 1] + channel_add_term = self.channel_add_conv(context) + out = out + channel_add_term + + return out diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..a00b0a52cee31ff7d4dd8df00b1e1046767a6903 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import inspect
+from typing import Dict, Optional
+
+from mmengine.registry import MODELS
+from torch import nn
+
+MODELS.register_module('Conv1d', module=nn.Conv1d)
+MODELS.register_module('Conv2d', module=nn.Conv2d)
+MODELS.register_module('Conv3d', module=nn.Conv3d)
+MODELS.register_module('Conv', module=nn.Conv2d)
+
+
+def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module:
+    """Build convolution layer.
+
+    Args:
+        cfg (None or dict): The conv layer config, which should contain:
+            - type (str): Layer type.
+            - layer args: Args needed to instantiate a conv layer.
+        args (argument list): Arguments passed to the `__init__`
+            method of the corresponding conv layer.
+        kwargs (keyword arguments): Keyword arguments passed to the `__init__`
+            method of the corresponding conv layer.
+
+    Returns:
+        nn.Module: Created conv layer.
+    """
+    if cfg is None:
+        cfg_ = dict(type='Conv2d')
+    else:
+        if not isinstance(cfg, dict):
+            raise TypeError('cfg must be a dict')
+        if 'type' not in cfg:
+            raise KeyError('the cfg dict must contain the key "type"')
+        cfg_ = cfg.copy()
+
+    layer_type = cfg_.pop('type')
+    if inspect.isclass(layer_type):
+        return layer_type(*args, **kwargs, **cfg_)  # type: ignore
+    # Switch registry to the target scope. If `conv_layer` cannot be found
+    # in the registry, fallback to search `conv_layer` in the
+    # mmengine.MODELS.
+    with MODELS.switch_scope_and_registry(None) as registry:
+        conv_layer = registry.get(layer_type)
+    if conv_layer is None:
+        raise KeyError(f'Cannot find {layer_type} in registry under scope '
+                       f'name {registry.scope}')
+    layer = conv_layer(*args, **kwargs, **cfg_)
+
+    return layer
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv2d_adaptive_padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ac9949e4830c64161036b519594685f7dae72c2
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv2d_adaptive_padding.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Tuple, Union
+
+import torch
+from mmengine.registry import MODELS
+from torch import nn
+from torch.nn import functional as F
+
+
+@MODELS.register_module()
+class Conv2dAdaptivePadding(nn.Conv2d):
+    """Implementation of 2D convolution in TensorFlow with `padding` as
+    "same", which applies padding to the input (if needed) so that the input
+    image gets fully covered by the filter and stride you specified. For
+    stride 1, this ensures that the output image size is the same as the
+    input. For a stride of 2, the output dimensions are half of the input,
+    for example.
+
+    Args:
+        in_channels (int): Number of channels in the input image
+        out_channels (int): Number of channels produced by the convolution
+        kernel_size (int or tuple): Size of the convolving kernel
+        stride (int or tuple, optional): Stride of the convolution. Default: 1
+        padding (int or tuple, optional): Zero-padding added to both sides of
+            the input. Default: 0
+        dilation (int or tuple, optional): Spacing between kernel elements.
+            Default: 1
+        groups (int, optional): Number of blocked connections from input
+            channels to output channels. Default: 1
+        bias (bool, optional): If ``True``, adds a learnable bias to the
+            output.
Default: ``True`` + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): + super().__init__(in_channels, out_channels, kernel_size, stride, 0, + dilation, groups, bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + img_h, img_w = x.size()[-2:] + kernel_h, kernel_w = self.weight.size()[-2:] + stride_h, stride_w = self.stride + output_h = math.ceil(img_h / stride_h) + output_w = math.ceil(img_w / stride_w) + pad_h = ( + max((output_h - 1) * self.stride[0] + + (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0)) + pad_w = ( + max((output_w - 1) * self.stride[1] + + (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0)) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv_module.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..760e3788125300e47769bce0bc34156e8385791b --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv_module.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from functools import partial +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmengine.model import constant_init, kaiming_init +from mmengine.registry import MODELS +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm + +from .activation import build_activation_layer +from .conv import build_conv_layer +from .norm import build_norm_layer +from .padding import build_padding_layer + + +def efficient_conv_bn_eval_forward(bn: _BatchNorm, + conv: nn.modules.conv._ConvNd, + x: torch.Tensor): + """ + Implementation based on https://arxiv.org/abs/2305.11624 + "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" + It leverages the associative law between convolution and affine transform, + i.e., normalize (weight conv feature) = (normalize weight) conv feature. + It works for Eval mode of ConvBN blocks during validation, and can be used + for training as well. It reduces memory and computation cost. + + Args: + bn (_BatchNorm): a BatchNorm module. + conv (nn._ConvNd): a conv module + x (torch.Tensor): Input feature map. 
+ """ + # These lines of code are designed to deal with various cases + # like bn without affine transform, and conv without bias + weight_on_the_fly = conv.weight + if conv.bias is not None: + bias_on_the_fly = conv.bias + else: + bias_on_the_fly = torch.zeros_like(bn.running_var) + + if bn.weight is not None: + bn_weight = bn.weight + else: + bn_weight = torch.ones_like(bn.running_var) + + if bn.bias is not None: + bn_bias = bn.bias + else: + bn_bias = torch.zeros_like(bn.running_var) + + # shape of [C_out, 1, 1, 1] in Conv2d + weight_coeff = torch.rsqrt(bn.running_var + + bn.eps).reshape([-1] + [1] * + (len(conv.weight.shape) - 1)) + # shape of [C_out, 1, 1, 1] in Conv2d + coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff + + # shape of [C_out, C_in, k, k] in Conv2d + weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly + # shape of [C_out] in Conv2d + bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() *\ + (bias_on_the_fly - bn.running_mean) + + return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly) + + +@MODELS.register_module() +class ConvModule(nn.Module): + """A conv block that bundles conv/norm/activation layers. + + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. + bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + inplace (bool): Whether to use inplace mode for activation. + Default: True. + with_spectral_norm (bool): Whether use spectral norm in conv module. + Default: False. + padding_mode (str): If the `padding_mode` has not been supported by + current `Conv2d` in PyTorch, we will use our own padding layer + instead. Currently, we support ['zeros', 'circular'] with official + implementation and ['reflect'] with our own implementation. + Default: 'zeros'. + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Common examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). 
+ Default: ('conv', 'norm', 'act'). + efficient_conv_bn_eval (bool): Whether use efficient conv when the + consecutive bn is in eval mode (either training or testing), as + proposed in https://arxiv.org/abs/2305.11624 . Default: `False`. + """ + + _abbr_ = 'conv_block' + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: Union[bool, str] = 'auto', + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + act_cfg: Optional[Dict] = dict(type='ReLU'), + inplace: bool = True, + with_spectral_norm: bool = False, + padding_mode: str = 'zeros', + order: tuple = ('conv', 'norm', 'act'), + efficient_conv_bn_eval: bool = False): + super().__init__() + assert conv_cfg is None or isinstance(conv_cfg, dict) + assert norm_cfg is None or isinstance(norm_cfg, dict) + assert act_cfg is None or isinstance(act_cfg, dict) + official_padding_mode = ['zeros', 'circular'] + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.inplace = inplace + self.with_spectral_norm = with_spectral_norm + self.with_explicit_padding = padding_mode not in official_padding_mode + self.order = order + assert isinstance(self.order, tuple) and len(self.order) == 3 + assert set(order) == {'conv', 'norm', 'act'} + + self.with_norm = norm_cfg is not None + self.with_activation = act_cfg is not None + # if the conv layer is before a norm layer, bias is unnecessary. + if bias == 'auto': + bias = not self.with_norm + self.with_bias = bias + + if self.with_explicit_padding: + pad_cfg = dict(type=padding_mode) + self.padding_layer = build_padding_layer(pad_cfg, padding) + + # reset padding to 0 for conv module + conv_padding = 0 if self.with_explicit_padding else padding + # build convolution layer + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=conv_padding, + dilation=dilation, + groups=groups, + bias=bias) + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + if self.with_spectral_norm: + self.conv = nn.utils.spectral_norm(self.conv) + + # build normalization layers + if self.with_norm: + # norm layer is after conv layer + if order.index('norm') > order.index('conv'): + norm_channels = out_channels + else: + norm_channels = in_channels + self.norm_name, norm = build_norm_layer( + norm_cfg, norm_channels) # type: ignore + self.add_module(self.norm_name, norm) + if self.with_bias: + if isinstance(norm, (_BatchNorm, _InstanceNorm)): + warnings.warn( + 'Unnecessary conv bias before batch/instance norm') + else: + self.norm_name = None # type: ignore + + self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) + + # build activation layer + if self.with_activation: + act_cfg_ = act_cfg.copy() # type: ignore + # nn.Tanh has no 'inplace' argument + if act_cfg_['type'] not in [ + 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU' + ]: + act_cfg_.setdefault('inplace', inplace) + self.activate = build_activation_layer(act_cfg_) + + # Use msra init by 
default + self.init_weights() + + @property + def norm(self): + if self.norm_name: + return getattr(self, self.norm_name) + else: + return None + + def init_weights(self): + # 1. It is mainly for customized conv layers with their own + # initialization manners by calling their own ``init_weights()``, + # and we do not want ConvModule to override the initialization. + # 2. For customized conv layers without their own initialization + # manners (that is, they don't have their own ``init_weights()``) + # and PyTorch's conv layers, they will be initialized by + # this method with default ``kaiming_init``. + # Note: For PyTorch's conv layers, they will be overwritten by our + # initialization implementation using default ``kaiming_init``. + if not hasattr(self.conv, 'init_weights'): + if self.with_activation and self.act_cfg['type'] == 'LeakyReLU': + nonlinearity = 'leaky_relu' + a = self.act_cfg.get('negative_slope', 0.01) + else: + nonlinearity = 'relu' + a = 0 + kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) + if self.with_norm: + constant_init(self.norm, 1, bias=0) + + def forward(self, + x: torch.Tensor, + activate: bool = True, + norm: bool = True) -> torch.Tensor: + layer_index = 0 + while layer_index < len(self.order): + layer = self.order[layer_index] + if layer == 'conv': + if self.with_explicit_padding: + x = self.padding_layer(x) + # if the next operation is norm and we have a norm layer in + # eval mode and we have enabled `efficient_conv_bn_eval` for + # the conv operator, then activate the optimized forward and + # skip the next norm operator since it has been fused + if layer_index + 1 < len(self.order) and \ + self.order[layer_index + 1] == 'norm' and norm and \ + self.with_norm and not self.norm.training and \ + self.efficient_conv_bn_eval_forward is not None: + self.conv.forward = partial( + self.efficient_conv_bn_eval_forward, self.norm, + self.conv) + layer_index += 1 + x = self.conv(x) + del self.conv.forward + else: + x = self.conv(x) + elif layer == 'norm' and norm and self.with_norm: + x = self.norm(x) + elif layer == 'act' and activate and self.with_activation: + x = self.activate(x) + layer_index += 1 + return x + + def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True): + # efficient_conv_bn_eval works for conv + bn + # with `track_running_stats` option + if efficient_conv_bn_eval and self.norm \ + and isinstance(self.norm, _BatchNorm) \ + and self.norm.track_running_stats: + self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward # noqa: E501 + else: + self.efficient_conv_bn_eval_forward = None # type: ignore + + @staticmethod + def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd, + bn: torch.nn.modules.batchnorm._BatchNorm, + efficient_conv_bn_eval=True) -> 'ConvModule': + """Create a ConvModule from a conv and a bn module.""" + self = ConvModule.__new__(ConvModule) + super(ConvModule, self).__init__() + + self.conv_cfg = None + self.norm_cfg = None + self.act_cfg = None + self.inplace = False + self.with_spectral_norm = False + self.with_explicit_padding = False + self.order = ('conv', 'norm', 'act') + + self.with_norm = True + self.with_activation = False + self.with_bias = conv.bias is not None + + # build convolution layer + self.conv = conv + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = self.conv.padding + 
self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + # build normalization layers + self.norm_name, norm = 'bn', bn + self.add_module(self.norm_name, norm) + + self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) + + return self diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv_ws.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv_ws.py new file mode 100644 index 0000000000000000000000000000000000000000..261f5c1aa9aa9b80891e6330e6d576c3a8ce3e5d --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/conv_ws.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.registry import MODELS + + +def conv_ws_2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + eps: float = 1e-5) -> torch.Tensor: + c_in = weight.size(0) + weight_flat = weight.view(c_in, -1) + mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) + std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) + weight = (weight - mean) / (std + eps) + return F.conv2d(input, weight, bias, stride, padding, dilation, groups) + + +@MODELS.register_module('ConvWS') +class ConvWS2d(nn.Conv2d): + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True, + eps: float = 1e-5): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups, self.eps) + + +@MODELS.register_module(name='ConvAWS') +class ConvAWS2d(nn.Conv2d): + """AWS (Adaptive Weight Standardization) + + This is a variant of Weight Standardization + (https://arxiv.org/pdf/1903.10520.pdf) + It is used in DetectoRS to avoid NaN + (https://arxiv.org/pdf/2006.02334.pdf) + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the conv kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. + Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If set True, adds a learnable bias to the + output. 
Default: True
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: Union[int, Tuple[int, int]],
+                 stride: Union[int, Tuple[int, int]] = 1,
+                 padding: Union[int, Tuple[int, int]] = 0,
+                 dilation: Union[int, Tuple[int, int]] = 1,
+                 groups: int = 1,
+                 bias: bool = True):
+        super().__init__(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias)
+        self.register_buffer('weight_gamma',
+                             torch.ones(self.out_channels, 1, 1, 1))
+        self.register_buffer('weight_beta',
+                             torch.zeros(self.out_channels, 1, 1, 1))
+
+    def _get_weight(self, weight: torch.Tensor) -> torch.Tensor:
+        weight_flat = weight.view(weight.size(0), -1)
+        mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+        std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+        weight = (weight - mean) / std
+        weight = self.weight_gamma * weight + self.weight_beta
+        return weight
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        weight = self._get_weight(self.weight)
+        return F.conv2d(x, weight, self.bias, self.stride, self.padding,
+                        self.dilation, self.groups)
+
+    def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,
+                              local_metadata: Dict, strict: bool,
+                              missing_keys: List[str],
+                              unexpected_keys: List[str],
+                              error_msgs: List[str]) -> None:
+        """Override default load function.
+
+        AWS overrides the function _load_from_state_dict to recover
+        weight_gamma and weight_beta if they are missing. If weight_gamma and
+        weight_beta are found in the checkpoint, this function will return
+        after super()._load_from_state_dict. Otherwise, it will compute the
+        mean and std of the pretrained weights and store them in weight_beta
+        and weight_gamma.
+        """
+
+        self.weight_gamma.data.fill_(-1)
+        local_missing_keys: List = []
+        super()._load_from_state_dict(state_dict, prefix, local_metadata,
+                                      strict, local_missing_keys,
+                                      unexpected_keys, error_msgs)
+        if self.weight_gamma.data.mean() > 0:
+            for k in local_missing_keys:
+                missing_keys.append(k)
+            return
+        weight = self.weight.data
+        weight_flat = weight.view(weight.size(0), -1)
+        mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+        std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+        self.weight_beta.data.copy_(mean)
+        self.weight_gamma.data.copy_(std)
+        missing_gamma_beta = [
+            k for k in local_missing_keys
+            if k.endswith('weight_gamma') or k.endswith('weight_beta')
+        ]
+        for k in missing_gamma_beta:
+            local_missing_keys.remove(k)
+        for k in local_missing_keys:
+            missing_keys.append(k)
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/depthwise_separable_conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf1fe4cad3812007573211fa2bede28b23822122
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/depthwise_separable_conv_module.py
@@ -0,0 +1,99 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from .conv_module import ConvModule
+
+
+class DepthwiseSeparableConvModule(nn.Module):
+    """Depthwise separable convolution module.
+
+    See https://arxiv.org/pdf/1704.04861.pdf for details.
+
+    This module can replace a ConvModule with the conv block replaced by two
+    conv blocks: a depthwise conv block and a pointwise conv block.
The depthwise + conv block contains depthwise-conv/norm/activation layers. The pointwise + conv block contains pointwise-conv/norm/activation layers. It should be + noted that there will be norm/activation layer in the depthwise conv block + if `norm_cfg` and `act_cfg` are specified. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. Default: 1. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. Default: 0. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. Default: 1. + norm_cfg (dict): Default norm config for both depthwise ConvModule and + pointwise ConvModule. Default: None. + act_cfg (dict): Default activation config for both depthwise ConvModule + and pointwise ConvModule. Default: dict(type='ReLU'). + dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is + 'default', it will be the same as `norm_cfg`. Default: 'default'. + dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is + 'default', it will be the same as `act_cfg`. Default: 'default'. + pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is + 'default', it will be the same as `norm_cfg`. Default: 'default'. + pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is + 'default', it will be the same as `act_cfg`. Default: 'default'. + kwargs (optional): Other shared arguments for depthwise and pointwise + ConvModule. See ConvModule for ref. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + norm_cfg: Optional[Dict] = None, + act_cfg: Dict = dict(type='ReLU'), + dw_norm_cfg: Union[Dict, str] = 'default', + dw_act_cfg: Union[Dict, str] = 'default', + pw_norm_cfg: Union[Dict, str] = 'default', + pw_act_cfg: Union[Dict, str] = 'default', + **kwargs): + super().__init__() + assert 'groups' not in kwargs, 'groups should not be specified' + + # if norm/activation config of depthwise/pointwise ConvModule is not + # specified, use default config. 
+ dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 + dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg + pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 + pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg + + # depthwise convolution + self.depthwise_conv = ConvModule( + in_channels, + in_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels, + norm_cfg=dw_norm_cfg, # type: ignore + act_cfg=dw_act_cfg, # type: ignore + **kwargs) + + self.pointwise_conv = ConvModule( + in_channels, + out_channels, + 1, + norm_cfg=pw_norm_cfg, # type: ignore + act_cfg=pw_act_cfg, # type: ignore + **kwargs) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/drop.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..fe82a2560515858341836de3fa563ed4db3a3e14 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/drop.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, Optional + +import torch +import torch.nn as nn +from mmengine.registry import MODELS + + +def drop_path(x: torch.Tensor, + drop_prob: float = 0., + training: bool = False) -> torch.Tensor: + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + # handle tensors with different dimensions, not just 4D tensors. + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + output = x.div(keep_prob) * random_tensor.floor() + return output + + +@MODELS.register_module() +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + + Args: + drop_prob (float): Probability of the path to be zeroed. Default: 0.1 + """ + + def __init__(self, drop_prob: float = 0.1): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return drop_path(x, self.drop_prob, self.training) + + +@MODELS.register_module() +class Dropout(nn.Dropout): + """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of + ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with + ``DropPath`` + + Args: + drop_prob (float): Probability of the elements to be + zeroed. Default: 0.5. + inplace (bool): Do the operation inplace or not. Default: False. 
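The per-sample mask shape used by drop_path above is worth spelling out; a tiny sketch with an arbitrary example tensor:

    import torch

    x = torch.rand(8, 197, 768)                    # (batch, tokens, dim); sizes arbitrary
    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)  # as computed in drop_path above
    assert shape == (8, 1, 1)                      # one keep/drop decision per sample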
+ """ + + def __init__(self, drop_prob: float = 0.5, inplace: bool = False): + super().__init__(p=drop_prob, inplace=inplace) + + +def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any: + """Builder for drop out layers.""" + return MODELS.build(cfg, default_args=default_args) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/generalized_attention.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/generalized_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..239823c787c8c56947c8b4afe2e0987c42a86abb --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/generalized_attention.py @@ -0,0 +1,411 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import kaiming_init +from mmengine.registry import MODELS + + +@MODELS.register_module() +class GeneralizedAttention(nn.Module): + """GeneralizedAttention module. + + See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' + (https://arxiv.org/abs/1904.05873) for details. + + Args: + in_channels (int): Channels of the input feature map. + spatial_range (int): The spatial range. -1 indicates no spatial range + constraint. Default: -1. + num_heads (int): The head number of empirical_attention module. + Default: 9. + position_embedding_dim (int): The position embedding dimension. + Default: -1. + position_magnitude (int): A multiplier acting on coord difference. + Default: 1. + kv_stride (int): The feature stride acting on key/value feature map. + Default: 2. + q_stride (int): The feature stride acting on query feature map. + Default: 1. + attention_type (str): A binary indicator string for indicating which + items in generalized empirical_attention module are used. + Default: '1111'. + + - '1000' indicates 'query and key content' (appr - appr) item, + - '0100' indicates 'query content and relative position' + (appr - position) item, + - '0010' indicates 'key content only' (bias - appr) item, + - '0001' indicates 'relative position only' (bias - position) item. 
+ """ + + _abbr_ = 'gen_attention_block' + + def __init__(self, + in_channels: int, + spatial_range: int = -1, + num_heads: int = 9, + position_embedding_dim: int = -1, + position_magnitude: int = 1, + kv_stride: int = 2, + q_stride: int = 1, + attention_type: str = '1111'): + + super().__init__() + + # hard range means local range for non-local operation + self.position_embedding_dim = ( + position_embedding_dim + if position_embedding_dim > 0 else in_channels) + + self.position_magnitude = position_magnitude + self.num_heads = num_heads + self.in_channels = in_channels + self.spatial_range = spatial_range + self.kv_stride = kv_stride + self.q_stride = q_stride + self.attention_type = [bool(int(_)) for _ in attention_type] + self.qk_embed_dim = in_channels // num_heads + out_c = self.qk_embed_dim * num_heads + + if self.attention_type[0] or self.attention_type[1]: + self.query_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_c, + kernel_size=1, + bias=False) + self.query_conv.kaiming_init = True + + if self.attention_type[0] or self.attention_type[2]: + self.key_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_c, + kernel_size=1, + bias=False) + self.key_conv.kaiming_init = True + + self.v_dim = in_channels // num_heads + self.value_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=self.v_dim * num_heads, + kernel_size=1, + bias=False) + self.value_conv.kaiming_init = True + + if self.attention_type[1] or self.attention_type[3]: + self.appr_geom_fc_x = nn.Linear( + self.position_embedding_dim // 2, out_c, bias=False) + self.appr_geom_fc_x.kaiming_init = True + + self.appr_geom_fc_y = nn.Linear( + self.position_embedding_dim // 2, out_c, bias=False) + self.appr_geom_fc_y.kaiming_init = True + + if self.attention_type[2]: + stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) + appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv + self.appr_bias = nn.Parameter(appr_bias_value) + + if self.attention_type[3]: + stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) + geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv + self.geom_bias = nn.Parameter(geom_bias_value) + + self.proj_conv = nn.Conv2d( + in_channels=self.v_dim * num_heads, + out_channels=in_channels, + kernel_size=1, + bias=True) + self.proj_conv.kaiming_init = True + self.gamma = nn.Parameter(torch.zeros(1)) + + if self.spatial_range >= 0: + # only works when non local is after 3*3 conv + if in_channels == 256: + max_len = 84 + elif in_channels == 512: + max_len = 42 + + max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) + local_constraint_map = np.ones( + (max_len, max_len, max_len_kv, max_len_kv), dtype=int) + for iy in range(max_len): + for ix in range(max_len): + local_constraint_map[ + iy, ix, + max((iy - self.spatial_range) // + self.kv_stride, 0):min((iy + self.spatial_range + + 1) // self.kv_stride + + 1, max_len), + max((ix - self.spatial_range) // + self.kv_stride, 0):min((ix + self.spatial_range + + 1) // self.kv_stride + + 1, max_len)] = 0 + + self.local_constraint_map = nn.Parameter( + torch.from_numpy(local_constraint_map).byte(), + requires_grad=False) + + if self.q_stride > 1: + self.q_downsample = nn.AvgPool2d( + kernel_size=1, stride=self.q_stride) + else: + self.q_downsample = None + + if self.kv_stride > 1: + self.kv_downsample = nn.AvgPool2d( + kernel_size=1, stride=self.kv_stride) + else: + self.kv_downsample = None + + self.init_weights() + + def get_position_embedding(self, + h, + w, + h_kv, + w_kv, + q_stride, + kv_stride, + device, + dtype, + feat_dim, + 
wave_length=1000): + # the default type of Tensor is float32, leading to type mismatch + # in fp16 mode. Cast it to support fp16 mode. + h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype) + h_idxs = h_idxs.view((h, 1)) * q_stride + + w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype) + w_idxs = w_idxs.view((w, 1)) * q_stride + + h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to( + device=device, dtype=dtype) + h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride + + w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to( + device=device, dtype=dtype) + w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride + + # (h, h_kv, 1) + h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0) + h_diff *= self.position_magnitude + + # (w, w_kv, 1) + w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0) + w_diff *= self.position_magnitude + + feat_range = torch.arange(0, feat_dim / 4).to( + device=device, dtype=dtype) + + dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype) + dim_mat = dim_mat**((4. / feat_dim) * feat_range) + dim_mat = dim_mat.view((1, 1, -1)) + + embedding_x = torch.cat( + ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2) + + embedding_y = torch.cat( + ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2) + + return embedding_x, embedding_y + + def forward(self, x_input: torch.Tensor) -> torch.Tensor: + num_heads = self.num_heads + + # use empirical_attention + if self.q_downsample is not None: + x_q = self.q_downsample(x_input) + else: + x_q = x_input + n, _, h, w = x_q.shape + + if self.kv_downsample is not None: + x_kv = self.kv_downsample(x_input) + else: + x_kv = x_input + _, _, h_kv, w_kv = x_kv.shape + + if self.attention_type[0] or self.attention_type[1]: + proj_query = self.query_conv(x_q).view( + (n, num_heads, self.qk_embed_dim, h * w)) + proj_query = proj_query.permute(0, 1, 3, 2) + + if self.attention_type[0] or self.attention_type[2]: + proj_key = self.key_conv(x_kv).view( + (n, num_heads, self.qk_embed_dim, h_kv * w_kv)) + + if self.attention_type[1] or self.attention_type[3]: + position_embed_x, position_embed_y = self.get_position_embedding( + h, w, h_kv, w_kv, self.q_stride, self.kv_stride, + x_input.device, x_input.dtype, self.position_embedding_dim) + # (n, num_heads, w, w_kv, dim) + position_feat_x = self.appr_geom_fc_x(position_embed_x).\ + view(1, w, w_kv, num_heads, self.qk_embed_dim).\ + permute(0, 3, 1, 2, 4).\ + repeat(n, 1, 1, 1, 1) + + # (n, num_heads, h, h_kv, dim) + position_feat_y = self.appr_geom_fc_y(position_embed_y).\ + view(1, h, h_kv, num_heads, self.qk_embed_dim).\ + permute(0, 3, 1, 2, 4).\ + repeat(n, 1, 1, 1, 1) + + position_feat_x /= math.sqrt(2) + position_feat_y /= math.sqrt(2) + + # accelerate for saliency only + if (np.sum(self.attention_type) == 1) and self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, num_heads, 1, self.qk_embed_dim).\ + repeat(n, 1, 1, 1) + + energy = torch.matmul(appr_bias, proj_key).\ + view(n, num_heads, 1, h_kv * w_kv) + + h = 1 + w = 1 + else: + # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for + if not self.attention_type[0]: + energy = torch.zeros( + n, + num_heads, + h, + w, + h_kv, + w_kv, + dtype=x_input.dtype, + device=x_input.device) + + # attention_type[0]: appr - appr + # attention_type[1]: appr - position + # attention_type[2]: bias - appr + # attention_type[3]: bias - position + if self.attention_type[0] or self.attention_type[2]: + if self.attention_type[0] and self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, 
num_heads, 1, self.qk_embed_dim) + energy = torch.matmul(proj_query + appr_bias, proj_key).\ + view(n, num_heads, h, w, h_kv, w_kv) + + elif self.attention_type[0]: + energy = torch.matmul(proj_query, proj_key).\ + view(n, num_heads, h, w, h_kv, w_kv) + + elif self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, num_heads, 1, self.qk_embed_dim).\ + repeat(n, 1, 1, 1) + + energy += torch.matmul(appr_bias, proj_key).\ + view(n, num_heads, 1, 1, h_kv, w_kv) + + if self.attention_type[1] or self.attention_type[3]: + if self.attention_type[1] and self.attention_type[3]: + geom_bias = self.geom_bias.\ + view(1, num_heads, 1, self.qk_embed_dim) + + proj_query_reshape = (proj_query + geom_bias).\ + view(n, num_heads, h, w, self.qk_embed_dim) + + energy_x = torch.matmul( + proj_query_reshape.permute(0, 1, 3, 2, 4), + position_feat_x.permute(0, 1, 2, 4, 3)) + energy_x = energy_x.\ + permute(0, 1, 3, 2, 4).unsqueeze(4) + + energy_y = torch.matmul( + proj_query_reshape, + position_feat_y.permute(0, 1, 2, 4, 3)) + energy_y = energy_y.unsqueeze(5) + + energy += energy_x + energy_y + + elif self.attention_type[1]: + proj_query_reshape = proj_query.\ + view(n, num_heads, h, w, self.qk_embed_dim) + proj_query_reshape = proj_query_reshape.\ + permute(0, 1, 3, 2, 4) + position_feat_x_reshape = position_feat_x.\ + permute(0, 1, 2, 4, 3) + position_feat_y_reshape = position_feat_y.\ + permute(0, 1, 2, 4, 3) + + energy_x = torch.matmul(proj_query_reshape, + position_feat_x_reshape) + energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4) + + energy_y = torch.matmul(proj_query_reshape, + position_feat_y_reshape) + energy_y = energy_y.unsqueeze(5) + + energy += energy_x + energy_y + + elif self.attention_type[3]: + geom_bias = self.geom_bias.\ + view(1, num_heads, self.qk_embed_dim, 1).\ + repeat(n, 1, 1, 1) + + position_feat_x_reshape = position_feat_x.\ + view(n, num_heads, w * w_kv, self.qk_embed_dim) + + position_feat_y_reshape = position_feat_y.\ + view(n, num_heads, h * h_kv, self.qk_embed_dim) + + energy_x = torch.matmul(position_feat_x_reshape, geom_bias) + energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv) + + energy_y = torch.matmul(position_feat_y_reshape, geom_bias) + energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1) + + energy += energy_x + energy_y + + energy = energy.view(n, num_heads, h * w, h_kv * w_kv) + + if self.spatial_range >= 0: + cur_local_constraint_map = \ + self.local_constraint_map[:h, :w, :h_kv, :w_kv].\ + contiguous().\ + view(1, 1, h*w, h_kv*w_kv) + + energy = energy.masked_fill_(cur_local_constraint_map.bool(), + float('-inf')) + + attention = F.softmax(energy, 3) + + proj_value = self.value_conv(x_kv) + proj_value_reshape = proj_value.\ + view((n, num_heads, self.v_dim, h_kv * w_kv)).\ + permute(0, 1, 3, 2) + + out = torch.matmul(attention, proj_value_reshape).\ + permute(0, 1, 3, 2).\ + contiguous().\ + view(n, self.v_dim * self.num_heads, h, w) + + out = self.proj_conv(out) + + # output is downsampled, upsample back to input size + if self.q_downsample is not None: + out = F.interpolate( + out, + size=x_input.shape[2:], + mode='bilinear', + align_corners=False) + + out = self.gamma * out + x_input + return out + + def init_weights(self): + for m in self.modules(): + if hasattr(m, 'kaiming_init') and m.kaiming_init: + kaiming_init( + m, + mode='fan_in', + nonlinearity='leaky_relu', + bias=0, + distribution='uniform', + a=1) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/hsigmoid.py 
b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/hsigmoid.py new file mode 100644 index 0000000000000000000000000000000000000000..423e0aad9ae154cf651d289327bc19da940cf449 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/hsigmoid.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +import torch.nn as nn +from mmengine.registry import MODELS + + +@MODELS.register_module() +class HSigmoid(nn.Module): + """Hard Sigmoid Module. Apply the hard sigmoid function: + Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) + Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1) + + Note: + In MMCV v1.4.4, we modified the default value of args to align with + PyTorch official. + + Args: + bias (float): Bias of the input feature map. Default: 3.0. + divisor (float): Divisor of the input feature map. Default: 6.0. + min_value (float): Lower bound value. Default: 0.0. + max_value (float): Upper bound value. Default: 1.0. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + bias: float = 3.0, + divisor: float = 6.0, + min_value: float = 0.0, + max_value: float = 1.0): + super().__init__() + warnings.warn( + 'In MMCV v1.4.4, we modified the default value of args to align ' + 'with PyTorch official. Previous Implementation: ' + 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). ' + 'Current Implementation: ' + 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).') + self.bias = bias + self.divisor = divisor + assert self.divisor != 0 + self.min_value = min_value + self.max_value = max_value + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = (x + self.bias) / self.divisor + + return x.clamp_(self.min_value, self.max_value) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/hswish.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/hswish.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6dd006d424bd39a3f99ceefda816408309d71c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/hswish.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.registry import MODELS +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION + + +class HSwish(nn.Module): + """Hard Swish Module. + + This module applies the hard swish function: + + .. math:: + Hswish(x) = x * ReLU6(x + 3) / 6 + + Args: + inplace (bool): can optionally do the operation in-place. + Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, inplace: bool = False): + super().__init__() + self.act = nn.ReLU6(inplace) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * self.act(x + 3) / 6 + + +if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.7')): + # Hardswish is not supported when PyTorch version < 1.6. + # And Hardswish in PyTorch 1.6 does not support inplace. 
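On recent PyTorch versions the registered nn.Hardswish and the HSwish fallback above compute the same function; a quick numeric check, assuming a PyTorch new enough to ship nn.Hardswish:

    import torch
    import torch.nn as nn

    x = torch.linspace(-5, 5, steps=11)
    manual = x * nn.ReLU6()(x + 3) / 6          # the HSwish fallback above
    assert torch.allclose(manual, nn.Hardswish()(x))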
+ MODELS.register_module(module=HSwish) +else: + MODELS.register_module(module=nn.Hardswish, name='HSwish') diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/non_local.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/non_local.py new file mode 100644 index 0000000000000000000000000000000000000000..8dd4465cd62fcb07ec1bc3410ebd272f427ec6b1 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/non_local.py @@ -0,0 +1,308 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta +from typing import Dict, Optional + +import torch +import torch.nn as nn +from mmengine.model import constant_init, normal_init +from mmengine.registry import MODELS + +from .conv_module import ConvModule + + +class _NonLocalNd(nn.Module, metaclass=ABCMeta): + """Basic Non-local module. + + This module is proposed in + "Non-local Neural Networks" + Paper reference: https://arxiv.org/abs/1711.07971 + Code reference: https://github.com/AlexHex7/Non-local_pytorch + + Args: + in_channels (int): Channels of the input feature map. + reduction (int): Channel reduction ratio. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. + Default: True. + conv_cfg (None | dict): The config dict for convolution layers. + If not specified, it will use `nn.Conv2d` for convolution layers. + Default: None. + norm_cfg (None | dict): The config dict for normalization layers. + Default: None. (This parameter is only applicable to conv_out.) + mode (str): Options are `gaussian`, `concatenation`, + `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. + """ + + def __init__(self, + in_channels: int, + reduction: int = 2, + use_scale: bool = True, + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + mode: str = 'embedded_gaussian', + **kwargs): + super().__init__() + self.in_channels = in_channels + self.reduction = reduction + self.use_scale = use_scale + self.inter_channels = max(in_channels // reduction, 1) + self.mode = mode + + if mode not in [ + 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' + ]: + raise ValueError("Mode should be in 'gaussian', 'concatenation', " + f"'embedded_gaussian' or 'dot_product', but got " + f'{mode} instead.') + + # g, theta, phi are defaulted as `nn.ConvNd`. + # Here we use ConvModule for potential usage. 
+ self.g = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) # type: ignore + self.conv_out = ConvModule( + self.inter_channels, + self.in_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + if self.mode != 'gaussian': + self.theta = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) + self.phi = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) + + if self.mode == 'concatenation': + self.concat_project = ConvModule( + self.inter_channels * 2, + 1, + kernel_size=1, + stride=1, + padding=0, + bias=False, + act_cfg=dict(type='ReLU')) + + self.init_weights(**kwargs) + + def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None: + if self.mode != 'gaussian': + for m in [self.g, self.theta, self.phi]: + normal_init(m.conv, std=std) + else: + normal_init(self.g.conv, std=std) + if zeros_init: + if self.conv_out.norm_cfg is None: + constant_init(self.conv_out.conv, 0) + else: + constant_init(self.conv_out.norm, 0) + else: + if self.conv_out.norm_cfg is None: + normal_init(self.conv_out.conv, std=std) + else: + normal_init(self.conv_out.norm, std=std) + + def gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def embedded_gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + if self.use_scale: + # theta_x.shape[-1] is `self.inter_channels` + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def dot_product(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + pairwise_weight /= pairwise_weight.shape[-1] + return pairwise_weight + + def concatenation(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + h = theta_x.size(2) + w = phi_x.size(3) + theta_x = theta_x.repeat(1, 1, 1, w) + phi_x = phi_x.repeat(1, 1, h, 1) + + concat_feature = torch.cat([theta_x, phi_x], dim=1) + pairwise_weight = self.concat_project(concat_feature) + n, _, h, w = pairwise_weight.size() + pairwise_weight = pairwise_weight.view(n, h, w) + pairwise_weight /= pairwise_weight.shape[-1] + + return pairwise_weight + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # Assume `reduction = 1`, then `inter_channels = C` + # or `inter_channels = C` when `mode="gaussian"` + + # NonLocal1d x: [N, C, H] + # NonLocal2d x: [N, C, H, W] + # NonLocal3d x: [N, C, T, H, W] + n = x.size(0) + + # NonLocal1d g_x: [N, H, C] + # NonLocal2d g_x: [N, HxW, C] + # NonLocal3d g_x: [N, TxHxW, C] + g_x = self.g(x).view(n, self.inter_channels, -1) + g_x = g_x.permute(0, 2, 1) + + # NonLocal1d theta_x: 
[N, H, C], phi_x: [N, C, H] + # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW] + # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW] + if self.mode == 'gaussian': + theta_x = x.view(n, self.in_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + if self.sub_sample: + phi_x = self.phi(x).view(n, self.in_channels, -1) + else: + phi_x = x.view(n, self.in_channels, -1) + elif self.mode == 'concatenation': + theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) + phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) + else: + theta_x = self.theta(x).view(n, self.inter_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + phi_x = self.phi(x).view(n, self.inter_channels, -1) + + pairwise_func = getattr(self, self.mode) + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = pairwise_func(theta_x, phi_x) + + # NonLocal1d y: [N, H, C] + # NonLocal2d y: [N, HxW, C] + # NonLocal3d y: [N, TxHxW, C] + y = torch.matmul(pairwise_weight, g_x) + # NonLocal1d y: [N, C, H] + # NonLocal2d y: [N, C, H, W] + # NonLocal3d y: [N, C, T, H, W] + y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, + *x.size()[2:]) + + output = x + self.conv_out(y) + + return output + + +class NonLocal1d(_NonLocalNd): + """1D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv1d'). + """ + + def __init__(self, + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv1d'), + **kwargs): + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool1d(kernel_size=2) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + +@MODELS.register_module() +class NonLocal2d(_NonLocalNd): + """2D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv2d'). + """ + + _abbr_ = 'nonlocal_block' + + def __init__(self, + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv2d'), + **kwargs): + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + +class NonLocal3d(_NonLocalNd): + """3D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv3d'). 
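A minimal shape-level usage sketch for the 2D variant registered above; configs are left at their defaults and the sizes are arbitrary (assuming the usual mmcv.cnn re-export):

    import torch
    from mmcv.cnn import NonLocal2d

    block = NonLocal2d(in_channels=64, reduction=2, mode='embedded_gaussian')
    x = torch.rand(2, 64, 20, 20)
    assert block(x).shape == x.shape  # non-local blocks preserve the input shape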
+ """ + + def __init__(self, + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv3d'), + **kwargs): + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/norm.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/norm.py new file mode 100644 index 0000000000000000000000000000000000000000..5aabab21a0096c185a929b11861c057cbea4b84d --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/norm.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +from typing import Dict, Tuple, Union + +import torch.nn as nn +from mmengine.registry import MODELS +from mmengine.utils import is_tuple_of +from mmengine.utils.dl_utils.parrots_wrapper import (SyncBatchNorm, _BatchNorm, + _InstanceNorm) + +MODELS.register_module('BN', module=nn.BatchNorm2d) +MODELS.register_module('BN1d', module=nn.BatchNorm1d) +MODELS.register_module('BN2d', module=nn.BatchNorm2d) +MODELS.register_module('BN3d', module=nn.BatchNorm3d) +MODELS.register_module('SyncBN', module=SyncBatchNorm) +MODELS.register_module('GN', module=nn.GroupNorm) +MODELS.register_module('LN', module=nn.LayerNorm) +MODELS.register_module('IN', module=nn.InstanceNorm2d) +MODELS.register_module('IN1d', module=nn.InstanceNorm1d) +MODELS.register_module('IN2d', module=nn.InstanceNorm2d) +MODELS.register_module('IN3d', module=nn.InstanceNorm3d) + + +def infer_abbr(class_type): + """Infer abbreviation from the class name. + + When we build a norm layer with `build_norm_layer()`, we want to preserve + the norm type in variable names, e.g, self.bn1, self.gn. This method will + infer the abbreviation to map class types to abbreviations. + + Rule 1: If the class has the property "_abbr_", return the property. + Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or + InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and + "in" respectively. + Rule 3: If the class name contains "batch", "group", "layer" or "instance", + the abbreviation of this layer will be "bn", "gn", "ln" and "in" + respectively. + Rule 4: Otherwise, the abbreviation falls back to "norm". + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. + """ + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ + if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN + return 'in' + elif issubclass(class_type, _BatchNorm): + return 'bn' + elif issubclass(class_type, nn.GroupNorm): + return 'gn' + elif issubclass(class_type, nn.LayerNorm): + return 'ln' + else: + class_name = class_type.__name__.lower() + if 'batch' in class_name: + return 'bn' + elif 'group' in class_name: + return 'gn' + elif 'layer' in class_name: + return 'ln' + elif 'instance' in class_name: + return 'in' + else: + return 'norm_layer' + + +def build_norm_layer(cfg: Dict, + num_features: int, + postfix: Union[int, str] = '') -> Tuple[str, nn.Module]: + """Build normalization layer. 
+ + Args: + cfg (dict): The norm layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate a norm layer. + - requires_grad (bool, optional): Whether stop gradient updates. + num_features (int): Number of input channels. + postfix (int | str): The postfix to be appended into norm abbreviation + to create named layer. + + Returns: + tuple[str, nn.Module]: The first element is the layer name consisting + of abbreviation and postfix, e.g., bn1, gn. The second element is the + created norm layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + + if inspect.isclass(layer_type): + norm_layer = layer_type + else: + # Switch registry to the target scope. If `norm_layer` cannot be found + # in the registry, fallback to search `norm_layer` in the + # mmengine.MODELS. + with MODELS.switch_scope_and_registry(None) as registry: + norm_layer = registry.get(layer_type) + if norm_layer is None: + raise KeyError(f'Cannot find {norm_layer} in registry under ' + f'scope name {registry.scope}') + abbr = infer_abbr(norm_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + requires_grad = cfg_.pop('requires_grad', True) + cfg_.setdefault('eps', 1e-5) + if norm_layer is not nn.GroupNorm: + layer = norm_layer(num_features, **cfg_) + if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'): + layer._specify_ddp_gpu_num(1) + else: + assert 'num_groups' in cfg_ + layer = norm_layer(num_channels=num_features, **cfg_) + + for param in layer.parameters(): + param.requires_grad = requires_grad + + return name, layer + + +def is_norm(layer: nn.Module, + exclude: Union[type, tuple, None] = None) -> bool: + """Check if a layer is a normalization layer. + + Args: + layer (nn.Module): The layer to be checked. + exclude (type | tuple[type]): Types to be excluded. + + Returns: + bool: Whether the layer is a norm layer. + """ + if exclude is not None: + if not isinstance(exclude, tuple): + exclude = (exclude, ) + if not is_tuple_of(exclude, type): + raise TypeError( + f'"exclude" must be either None or type or a tuple of types, ' + f'but got {type(exclude)}: {exclude}') + + if exclude and isinstance(layer, exclude): + return False + + all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm) + return isinstance(layer, all_norm_bases) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/padding.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/padding.py new file mode 100644 index 0000000000000000000000000000000000000000..3b29996b94a246d63a3661c345219eb1955a03d5 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/padding.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +from typing import Dict + +import torch.nn as nn +from mmengine.registry import MODELS + +MODELS.register_module('zero', module=nn.ZeroPad2d) +MODELS.register_module('reflect', module=nn.ReflectionPad2d) +MODELS.register_module('replicate', module=nn.ReplicationPad2d) + + +def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module: + """Build padding layer. + + Args: + cfg (dict): The padding layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate a padding layer. 
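A short sketch of the build_norm_layer API defined above; the returned name is the inferred abbreviation plus the postfix (assuming the usual mmcv.cnn re-export):

    import torch.nn as nn
    from mmcv.cnn import build_norm_layer

    name, layer = build_norm_layer(dict(type='BN', requires_grad=False), 64, postfix=1)
    assert name == 'bn1' and isinstance(layer, nn.BatchNorm2d)
    assert not layer.weight.requires_grad  # 'requires_grad' is popped from the cfg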
+ + Returns: + nn.Module: Created padding layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + + cfg_ = cfg.copy() + padding_type = cfg_.pop('type') + if inspect.isclass(padding_type): + return padding_type(*args, **kwargs, **cfg_) + # Switch registry to the target scope. If `padding_layer` cannot be found + # in the registry, fallback to search `padding_layer` in the + # mmengine.MODELS. + with MODELS.switch_scope_and_registry(None) as registry: + padding_layer = registry.get(padding_type) + if padding_layer is None: + raise KeyError(f'Cannot find {padding_layer} in registry under scope ' + f'name {registry.scope}') + layer = padding_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/plugin.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..3195ed13cf764febf5827224edf1abf9dc951efe --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/plugin.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import platform +from typing import Dict, Tuple, Union + +import torch.nn as nn +from mmengine.registry import MODELS + +if platform.system() == 'Windows': + import regex as re # type: ignore +else: + import re # type: ignore + + +def infer_abbr(class_type: type) -> str: + """Infer abbreviation from the class name. + + This method will infer the abbreviation to map class types to + abbreviations. + + Rule 1: If the class has the property "abbr", return the property. + Rule 2: Otherwise, the abbreviation falls back to snake case of class + name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``. + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. + """ + + def camel2snack(word): + """Convert camel case word into snack case. + + Modified from `inflection lib + `_. + + Example:: + + >>> camel2snack("FancyBlock") + 'fancy_block' + """ + + word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word) + word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word) + word = word.replace('-', '_') + return word.lower() + + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ # type: ignore + else: + return camel2snack(class_type.__name__) + + +def build_plugin_layer(cfg: Dict, + postfix: Union[int, str] = '', + **kwargs) -> Tuple[str, nn.Module]: + """Build plugin layer. + + Args: + cfg (dict): cfg should contain: + + - type (str): identify plugin layer type. + - layer args: args needed to instantiate a plugin layer. + postfix (int, str): appended into norm abbreviation to + create named layer. Default: ''. + + Returns: + tuple[str, nn.Module]: The first one is the concatenation of + abbreviation and postfix. The second is the created plugin layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if inspect.isclass(layer_type): + plugin_layer = layer_type + else: + # Switch registry to the target scope. 
If `plugin_layer` cannot be + # found in the registry, fallback to search `plugin_layer` in the + # mmengine.MODELS. + with MODELS.switch_scope_and_registry(None) as registry: + plugin_layer = registry.get(layer_type) + if plugin_layer is None: + raise KeyError( + f'Cannot find {plugin_layer} in registry under scope ' + f'name {registry.scope}') + abbr = infer_abbr(plugin_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + layer = plugin_layer(**kwargs, **cfg_) + + return name, layer diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/scale.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/scale.py new file mode 100644 index 0000000000000000000000000000000000000000..a47379898f75117e5ca2176d9a5f225f563d7b1e --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/scale.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + + +class Scale(nn.Module): + """A learnable scale parameter. + + This layer scales the input by a learnable factor. It multiplies a + learnable scale parameter of shape (1,) with input of any shape. + + Args: + scale (float): Initial value of scale factor. Default: 1.0 + """ + + def __init__(self, scale: float = 1.0): + super().__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * self.scale + + +class LayerScale(nn.Module): + """LayerScale layer. + + Args: + dim (int): Dimension of input features. + inplace (bool): Whether performs operation in-place. + Default: `False`. + data_format (str): The input data format, could be 'channels_last' + or 'channels_first', representing (B, C, H, W) and + (B, N, C) format data respectively. Default: 'channels_last'. + scale (float): Initial value of scale factor. Default: 1.0 + """ + + def __init__(self, + dim: int, + inplace: bool = False, + data_format: str = 'channels_last', + scale: float = 1e-5): + super().__init__() + assert data_format in ('channels_last', 'channels_first'), \ + "'data_format' could only be channels_last or channels_first." + self.inplace = inplace + self.data_format = data_format + self.weight = nn.Parameter(torch.ones(dim) * scale) + + def forward(self, x) -> torch.Tensor: + if self.data_format == 'channels_first': + shape = tuple((1, -1, *(1 for _ in range(x.dim() - 2)))) + else: + shape = tuple((*(1 for _ in range(x.dim() - 1)), -1)) + if self.inplace: + return x.mul_(self.weight.view(*shape)) + else: + return x * self.weight.view(*shape) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/swish.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/swish.py new file mode 100644 index 0000000000000000000000000000000000000000..75ad75b9d73f11375ed63491d9e29efd6f43f143 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/swish.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmengine.registry import MODELS + + +@MODELS.register_module() +class Swish(nn.Module): + """Swish Module. + + This module applies the swish function: + + .. math:: + Swish(x) = x * Sigmoid(x) + + Returns: + Tensor: The output tensor. 
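A small sketch of the LayerScale broadcasting defined earlier in scale.py; sizes are arbitrary, and the import path is simply the file added above:

    import torch
    from mmcv.cnn.bricks.scale import LayerScale

    ls_last = LayerScale(dim=768, data_format='channels_last')   # (B, N, C) inputs
    ls_first = LayerScale(dim=64, data_format='channels_first')  # (B, C, H, W) inputs
    assert ls_last(torch.rand(2, 197, 768)).shape == (2, 197, 768)
    assert ls_first(torch.rand(2, 64, 8, 8)).shape == (2, 64, 8, 8)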
+ """ + + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * torch.sigmoid(x) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/transformer.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f83b9a6977bf821985cb4c2f78de84fcf103fffb --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/transformer.py @@ -0,0 +1,951 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +import warnings +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.config import ConfigDict +from mmengine.model import BaseModule, ModuleList, Sequential +from mmengine.registry import MODELS +from mmengine.utils import deprecated_api_warning, to_2tuple + +from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, + build_norm_layer) +from .drop import build_dropout +from .scale import LayerScale + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import \ + MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) + +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv`` rather than ``mmcv-lite`` ' + 'if you need this module. ') + + +def build_positional_encoding(cfg, default_args=None): + """Builder for Position Encoding.""" + return MODELS.build(cfg, default_args=default_args) + + +def build_attention(cfg, default_args=None): + """Builder for attention.""" + return MODELS.build(cfg, default_args=default_args) + + +def build_feedforward_network(cfg, default_args=None): + """Builder for feed-forward network (FFN).""" + return MODELS.build(cfg, default_args=default_args) + + +def build_transformer_layer(cfg, default_args=None): + """Builder for transformer layer.""" + return MODELS.build(cfg, default_args=default_args) + + +def build_transformer_layer_sequence(cfg, default_args=None): + """Builder for transformer encoder and transformer decoder.""" + return MODELS.build(cfg, default_args=default_args) + + +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". 
+ + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). + + Returns: + Tensor: The tensor with adaptive padding + """ + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv2d". + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: 16. + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. 
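The init_out_size computed in __init__ below follows the standard conv output-size formula; a worked example with illustrative numbers (224x224 input, 16x16 patches, stride 16, no padding, dilation 1):

    h_out = (224 + 2 * 0 - 1 * (16 - 1) - 1) // 16 + 1
    assert h_out == 14  # 224 / 16 = 14 patches per side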
+ """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv2d', + kernel_size=16, + stride=16, + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map ((used in Swin Transformer)). + Our implementation uses `nn.Unfold` to + merge patches, which is about 25% faster than the original + implementation. However, we need to modify pretrained + models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. 
+ norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=2, + stride=None, + padding='corner', + dilation=1, + bias=False, + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of unfold + padding = 0 + else: + self.adaptive_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold( + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, input_size): + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). + """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + + if self.adaptive_padding: + x = self.adaptive_padding(x) + H, W = x.shape[-2:] + + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + x = self.sampler(x) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size + + +@MODELS.register_module() +class MultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='Dropout', drop_prob=0.), + init_cfg=None, + batch_first=False, + **kwargs): + super().__init__(init_cfg) + if 'dropout' in kwargs: + warnings.warn( + 'The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ', DeprecationWarning) + attn_drop = kwargs['dropout'] + dropout_layer['drop_prob'] = kwargs.pop('dropout') + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiheadAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. + + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + out = self.attn( + query=query, + key=key, + value=value, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + +@MODELS.register_module() +class FFN(BaseModule): + """Implements feed-forward networks (FFNs) with identity connection. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + layer_scale_init_value (float): Initial value of scale factor in + LayerScale. Default: 1.0 + """ + + @deprecated_api_warning( + { + 'dropout': 'ffn_drop', + 'add_residual': 'add_identity' + }, + cls_name='FFN') + def __init__(self, + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0., + dropout_layer=None, + add_identity=True, + init_cfg=None, + layer_scale_init_value=0.): + super().__init__(init_cfg) + assert num_fcs >= 2, 'num_fcs should be no less ' \ + f'than 2. got {num_fcs}.' + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + + layers = [] + in_channels = embed_dims + for _ in range(num_fcs - 1): + layers.append( + Sequential( + Linear(in_channels, feedforward_channels), + build_activation_layer(act_cfg), nn.Dropout(ffn_drop))) + in_channels = feedforward_channels + layers.append(Linear(feedforward_channels, embed_dims)) + layers.append(nn.Dropout(ffn_drop)) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + self.add_identity = add_identity + + if layer_scale_init_value > 0: + self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value) + else: + self.gamma2 = nn.Identity() + + @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') + def forward(self, x, identity=None): + """Forward function for `FFN`. + + The function would add x to the output tensor if residue is None. 
+ """ + out = self.layers(x) + out = self.gamma2(out) + if not self.add_identity: + return self.dropout_layer(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +@MODELS.register_module() +class BaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ', DeprecationWarning) + ffn_cfgs[new_name] = kwargs[ori_name] + + super().__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & { + 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' 
+ + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index], + dict(type='FFN'))) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
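+
+        Example:
+            An illustrative post-norm layer (the configs and sizes below
+            are arbitrary; `MultiheadAttention` and `FFN` are the wrapper
+            modules defined in this file):
+
+            >>> layer = BaseTransformerLayer(
+            ...     attn_cfgs=dict(type='MultiheadAttention',
+            ...                    embed_dims=32, num_heads=4),
+            ...     ffn_cfgs=dict(type='FFN', embed_dims=32,
+            ...                   feedforward_channels=64),
+            ...     operation_order=('self_attn', 'norm', 'ffn', 'norm'))
+            >>> query = torch.rand(10, 2, 32)  # (num_queries, bs, embed_dims)
+            >>> layer(query).shape
+            torch.Size([10, 2, 32])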
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + + +@MODELS.register_module() +class TransformerLayerSequence(BaseModule): + """Base class for TransformerEncoder and TransformerDecoder in vision + transformer. + + As base-class of Encoder and Decoder in vision transformer. + Support customization such as specifying different kind + of `transformer_layer` in `transformer_coder`. + + Args: + transformerlayer (list[obj:`mmcv.ConfigDict`] | + obj:`mmcv.ConfigDict`): Config of transformerlayer + in TransformerCoder. If it is obj:`mmcv.ConfigDict`, + it would be repeated `num_layer` times to a + list[`mmcv.ConfigDict`]. Default: None. + num_layers (int): The number of `TransformerLayer`. Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): + super().__init__(init_cfg) + if isinstance(transformerlayers, dict): + transformerlayers = [ + copy.deepcopy(transformerlayers) for _ in range(num_layers) + ] + else: + assert isinstance(transformerlayers, list) and \ + len(transformerlayers) == num_layers + self.num_layers = num_layers + self.layers = ModuleList() + for i in range(num_layers): + self.layers.append(build_transformer_layer(transformerlayers[i])) + self.embed_dims = self.layers[0].embed_dims + self.pre_norm = self.layers[0].pre_norm + + def forward(self, + query, + key, + value, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerCoder`. + + Args: + query (Tensor): Input query with shape + `(num_queries, bs, embed_dims)`. + key (Tensor): The key tensor with shape + `(num_keys, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_keys, bs, embed_dims)`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor], optional): Each element is 2D Tensor + which is used in calculation of corresponding attention in + operation_order. 
Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in self-attention + Default: None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: results with shape [num_queries, bs, embed_dims]. + """ + for layer in self.layers: + query = layer( + query, + key, + value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + **kwargs) + return query diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/upsample.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/upsample.py new file mode 100644 index 0000000000000000000000000000000000000000..78fb5bf371712d13a72edf5d57151dca8fce6953 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/upsample.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +from typing import Dict + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import xavier_init +from mmengine.registry import MODELS + +MODELS.register_module('nearest', module=nn.Upsample) +MODELS.register_module('bilinear', module=nn.Upsample) + + +@MODELS.register_module(name='pixel_shuffle') +class PixelShufflePack(nn.Module): + """Pixel Shuffle upsample layer. + + This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to + achieve a simple upsampling with pixel shuffle. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scale_factor (int): Upsample ratio. + upsample_kernel (int): Kernel size of the conv layer to expand the + channels. + """ + + def __init__(self, in_channels: int, out_channels: int, scale_factor: int, + upsample_kernel: int): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.scale_factor = scale_factor + self.upsample_kernel = upsample_kernel + self.upsample_conv = nn.Conv2d( + self.in_channels, + self.out_channels * scale_factor * scale_factor, + self.upsample_kernel, + padding=(self.upsample_kernel - 1) // 2) + self.init_weights() + + def init_weights(self): + xavier_init(self.upsample_conv, distribution='uniform') + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.upsample_conv(x) + x = F.pixel_shuffle(x, self.scale_factor) + return x + + +def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: + """Build upsample layer. + + Args: + cfg (dict): The upsample layer config, which should contain: + + - type (str): Layer type. + - scale_factor (int): Upsample ratio, which is not applicable to + deconv. + - layer args: Args needed to instantiate a upsample layer. + args (argument list): Arguments passed to the ``__init__`` + method of the corresponding conv layer. + kwargs (keyword arguments): Keyword arguments passed to the + ``__init__`` method of the corresponding conv layer. + + Returns: + nn.Module: Created upsample layer. + """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'type' not in cfg: + raise KeyError( + f'the cfg dict must contain the key "type", but got {cfg}') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + + if inspect.isclass(layer_type): + upsample = layer_type + # Switch registry to the target scope. 
If `upsample` cannot be found + # in the registry, fallback to search `upsample` in the + # mmengine.MODELS. + else: + with MODELS.switch_scope_and_registry(None) as registry: + upsample = registry.get(layer_type) + if upsample is None: + raise KeyError(f'Cannot find {upsample} in registry under scope ' + f'name {registry.scope}') + if upsample is nn.Upsample: + cfg_['mode'] = layer_type + layer = upsample(*args, **kwargs, **cfg_) + return layer diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/wrappers.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..fc98c35584430a85910ef7a776e83d1ab9cd036a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/bricks/wrappers.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 + +Wrap some nn modules to support empty tensor input. Currently, these wrappers +are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask +heads are trained on only positive RoIs. +""" +import math + +import torch +import torch.nn as nn +from mmengine.registry import MODELS +from torch.nn.modules.utils import _pair, _triple + +if torch.__version__ == 'parrots': + TORCH_VERSION = torch.__version__ +else: + # torch.__version__ could be 1.3.1+cu92, we only need the first two + # for comparison + TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) + + +def obsolete_torch_version(torch_version, version_threshold) -> bool: + return torch_version == 'parrots' or torch_version <= version_threshold + + +class NewEmptyTensorOp(torch.autograd.Function): + + @staticmethod + def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad: torch.Tensor) -> tuple: + shape = ctx.shape + return NewEmptyTensorOp.apply(grad, shape), None + + +@MODELS.register_module('Conv', force=True) +class Conv2d(nn.Conv2d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@MODELS.register_module('Conv3d', force=True) +class Conv3d(nn.Conv3d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. 
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@MODELS.register_module() +@MODELS.register_module('deconv') +class ConvTranspose2d(nn.ConvTranspose2d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@MODELS.register_module() +@MODELS.register_module('deconv3d') +class ConvTranspose3d(nn.ConvTranspose3d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +class MaxPool2d(nn.MaxPool2d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # PyTorch 1.9 does not support empty tensor inference yet + if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), + _pair(self.padding), _pair(self.stride), + _pair(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class MaxPool3d(nn.MaxPool3d): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # PyTorch 1.9 does not support empty tensor inference yet + if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size), + _triple(self.padding), + _triple(self.stride), + _triple(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class Linear(torch.nn.Linear): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # empty tensor forward of Linear layer is supported in Pytorch 1.6 + if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0: + out_shape = [x.shape[0], self.out_features] + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. 
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/resnet.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..8fc6abf6ac60b982a8c7998e0545bc55f9ceee78 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/resnet.py @@ -0,0 +1,321 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from typing import Optional, Sequence, Tuple, Union + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmengine.model import constant_init, kaiming_init +from mmengine.runner import load_checkpoint +from torch import Tensor + + +def conv3x3(in_planes: int, + out_planes: int, + stride: int = 1, + dilation: int = 1): + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): + super().__init__() + assert style in ['pytorch', 'caffe'] + self.conv1 = conv3x3(inplanes, planes, stride, dilation) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + assert not with_cp + + def forward(self, x: Tensor) -> Tensor: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): + """Bottleneck block. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. 
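+
+        Example (illustrative; shows only where the stride is placed):
+
+        >>> b = Bottleneck(64, 16, stride=2, style='pytorch')
+        >>> b.conv1.stride, b.conv2.stride
+        ((1, 1), (2, 2))
+        >>> b = Bottleneck(64, 16, stride=2, style='caffe')
+        >>> b.conv1.stride, b.conv2.stride
+        ((2, 2), (1, 1))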
+ """ + super().__init__() + assert style in ['pytorch', 'caffe'] + if style == 'pytorch': + conv1_stride = 1 + conv2_stride = stride + else: + conv1_stride = stride + conv2_stride = 1 + self.conv1 = nn.Conv2d( + inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d( + planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + def forward(self, x: Tensor) -> Tensor: + + def _inner_forward(x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + with_cp: bool = False) -> nn.Module: + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + style=style, + with_cp=with_cp)) + inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp)) + + return nn.Sequential(*layers) + + +class ResNet(nn.Module): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + num_stages (int): Resnet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze + running stats (mean and var). + bn_frozen (bool): Whether to freeze weight and bias of BN layers. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth: int, + num_stages: int = 4, + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3), + style: str = 'pytorch', + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + with_cp: bool = False): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + assert num_stages >= 1 and num_stages <= 4 + block, stage_blocks = self.arch_settings[depth] + stage_blocks = stage_blocks[:num_stages] # type: ignore + assert len(strides) == len(dilations) == num_stages + assert max(out_indices) < num_stages + + self.out_indices = out_indices + self.style = style + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + self.with_cp = with_cp + + self.inplanes: int = 64 + self.conv1 = nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.res_layers = [] + for i, num_blocks in enumerate(stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = 64 * 2**i + res_layer = make_res_layer( + block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + with_cp=with_cp) + self.inplanes = planes * block.expansion # type: ignore + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = block.expansion * 64 * 2**( # type: ignore + len(stage_blocks) - 1) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode: bool = True) -> None: + super().train(mode) + if self.bn_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if self.bn_frozen: + for params in m.parameters(): + params.requires_grad = False + if mode and self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for param in self.bn1.parameters(): + param.requires_grad = False + self.bn1.eval() + self.bn1.weight.requires_grad = False + self.bn1.bias.requires_grad = False + for i in range(1, self.frozen_stages + 1): + mod = getattr(self, f'layer{i}') + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/__init__.py new file mode 100644 
index 0000000000000000000000000000000000000000..04d45725dc40a15c086f21fc5ce73373318c578e
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp
+from .search import RFSearchHook
+
+__all__ = ['BaseConvRFSearchOp', 'Conv2dRFSearchOp', 'RFSearchHook']
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/operator.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fa45abb0a282954cd5e06503596141c9a314de4
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/operator.py
@@ -0,0 +1,169 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmengine.logging import print_log
+from mmengine.model import BaseModule
+from torch import Tensor
+
+from .utils import expand_rates, get_single_padding
+
+
+class BaseConvRFSearchOp(BaseModule):
+    """Base class of ConvRFSearchOp.
+
+    Args:
+        op_layer (nn.Module): pytorch module, e.g., Conv2d.
+        global_config (dict): config dict.
+    """
+
+    def __init__(self, op_layer: nn.Module, global_config: dict):
+        super().__init__()
+        self.op_layer = op_layer
+        self.global_config = global_config
+
+    def normlize(self, weights: nn.Parameter) -> nn.Parameter:
+        """Normalize weights.
+
+        Args:
+            weights (nn.Parameter): Weights to be normalized.
+
+        Returns:
+            nn.Parameter: Normalized weights.
+        """
+        abs_weights = torch.abs(weights)
+        normalized_weights = abs_weights / torch.sum(abs_weights)
+        return normalized_weights
+
+
+class Conv2dRFSearchOp(BaseConvRFSearchOp):
+    """Enable Conv2d with receptive field searching ability.
+
+    Args:
+        op_layer (nn.Module): pytorch module, e.g., Conv2d.
+        global_config (dict): config dict. Defaults to None.
+            By default this must include:
+
+            - "init_alphas": The value for initializing weights of each branch.
+            - "num_branches": The controller of the size of
+              search space (the number of branches).
+            - "exp_rate": The controller of the sparsity of search space.
+            - "mmin": The minimum dilation rate.
+            - "mmax": The maximum dilation rate.
+
+            Extra keys may exist, but are used by RFSearchHook, e.g., "step",
+            "max_step", "search_interval", and "skip_layer".
+        verbose (bool): Determines whether to print rf-next
+            related logging messages.
+            Defaults to True.
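+
+    Example:
+        A minimal sketch (illustrative; the config values below are
+        arbitrary). With ``num_branches=3`` and ``exp_rate=0.5``, a 3x3
+        conv with dilation (1, 1) expands to the branches [(1, 1), (2, 2)]:
+
+        >>> conv = nn.Conv2d(16, 16, 3, padding=1)
+        >>> global_config = dict(init_alphas=0.01, num_branches=3,
+        ...                      exp_rate=0.5, mmin=1, mmax=24)
+        >>> op = Conv2dRFSearchOp(conv, global_config)
+        >>> out = op(torch.rand(1, 16, 32, 32))  # weighted sum of branches
+        >>> out.shape
+        torch.Size([1, 16, 32, 32])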
+ """ + + def __init__(self, + op_layer: nn.Module, + global_config: dict, + verbose: bool = True): + super().__init__(op_layer, global_config) + assert global_config is not None, 'global_config is None' + self.num_branches = global_config['num_branches'] + assert self.num_branches in [2, 3] + self.verbose = verbose + init_dilation = op_layer.dilation + self.dilation_rates = expand_rates(init_dilation, global_config) + if self.op_layer.kernel_size[ + 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0: + self.dilation_rates = [(op_layer.dilation[0], r[1]) + for r in self.dilation_rates] + if self.op_layer.kernel_size[ + 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0: + self.dilation_rates = [(r[0], op_layer.dilation[1]) + for r in self.dilation_rates] + + self.branch_weights = nn.Parameter(torch.Tensor(self.num_branches)) + if self.verbose: + print_log(f'Expand as {self.dilation_rates}', 'current') + nn.init.constant_(self.branch_weights, global_config['init_alphas']) + + def forward(self, input: Tensor) -> Tensor: + norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)]) + if len(self.dilation_rates) == 1: + outputs = [ + nn.functional.conv2d( + input, + weight=self.op_layer.weight, + bias=self.op_layer.bias, + stride=self.op_layer.stride, + padding=self.get_padding(self.dilation_rates[0]), + dilation=self.dilation_rates[0], + groups=self.op_layer.groups, + ) + ] + else: + outputs = [ + nn.functional.conv2d( + input, + weight=self.op_layer.weight, + bias=self.op_layer.bias, + stride=self.op_layer.stride, + padding=self.get_padding(r), + dilation=r, + groups=self.op_layer.groups, + ) * norm_w[i] for i, r in enumerate(self.dilation_rates) + ] + output = outputs[0] + for i in range(1, len(self.dilation_rates)): + output += outputs[i] + return output + + def estimate_rates(self) -> None: + """Estimate new dilation rate based on trained branch_weights.""" + norm_w = self.normlize(self.branch_weights[:len(self.dilation_rates)]) + if self.verbose: + print_log( + 'Estimate dilation {} with weight {}.'.format( + self.dilation_rates, + norm_w.detach().cpu().numpy().tolist()), 'current') + + sum0, sum1, w_sum = 0, 0, 0 + for i in range(len(self.dilation_rates)): + sum0 += norm_w[i].item() * self.dilation_rates[i][0] + sum1 += norm_w[i].item() * self.dilation_rates[i][1] + w_sum += norm_w[i].item() + estimated = [ + np.clip( + int(round(sum0 / w_sum)), self.global_config['mmin'], + self.global_config['mmax']).item(), + np.clip( + int(round(sum1 / w_sum)), self.global_config['mmin'], + self.global_config['mmax']).item() + ] + self.op_layer.dilation = tuple(estimated) + self.op_layer.padding = self.get_padding(self.op_layer.dilation) + self.dilation_rates = [tuple(estimated)] + if self.verbose: + print_log(f'Estimate as {tuple(estimated)}', 'current') + + def expand_rates(self) -> None: + """Expand dilation rate.""" + dilation = self.op_layer.dilation + dilation_rates = expand_rates(dilation, self.global_config) + if self.op_layer.kernel_size[ + 0] == 1 or self.op_layer.kernel_size[0] % 2 == 0: + dilation_rates = [(dilation[0], r[1]) for r in dilation_rates] + if self.op_layer.kernel_size[ + 1] == 1 or self.op_layer.kernel_size[1] % 2 == 0: + dilation_rates = [(r[0], dilation[1]) for r in dilation_rates] + + self.dilation_rates = copy.deepcopy(dilation_rates) + if self.verbose: + print_log(f'Expand as {self.dilation_rates}', 'current') + nn.init.constant_(self.branch_weights, + self.global_config['init_alphas']) + + def get_padding(self, dilation) -> tuple: + padding = 
(get_single_padding(self.op_layer.kernel_size[0],
+                                      self.op_layer.stride[0], dilation[0]),
+                   get_single_padding(self.op_layer.kernel_size[1],
+                                      self.op_layer.stride[1], dilation[1]))
+        return padding
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/search.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/search.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4add4b23afd1585fd434931e27dc92187ba1f6f
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/search.py
@@ -0,0 +1,239 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+from typing import Dict, Optional
+
+import mmengine
+import torch  # noqa
+import torch.nn as nn
+from mmengine.hooks import Hook
+from mmengine.logging import print_log
+from mmengine.registry import HOOKS
+
+from .operator import BaseConvRFSearchOp, Conv2dRFSearchOp  # noqa
+from .utils import get_single_padding, write_to_json
+
+
+@HOOKS.register_module()
+class RFSearchHook(Hook):
+    """Receptive field search via dilation rates.
+
+    Please refer to `RF-Next: Efficient Receptive Field
+    Search for Convolutional Neural Networks
+    <https://arxiv.org/abs/2206.06637>`_ for more details.
+
+
+    Args:
+        mode (str, optional): It can be set to the following types:
+            'search', 'fixed_single_branch', or 'fixed_multi_branch'.
+            Defaults to 'search'.
+        config (Dict, optional): config dict of search.
+            By default this config contains "search",
+            and config["search"] must include:
+
+            - "step": recording the current searching step.
+            - "max_step": The maximum number of searching steps
+              to update the structures.
+            - "search_interval": The interval (epoch/iteration)
+              between two updates.
+            - "exp_rate": The controller of the sparsity of search space.
+            - "init_alphas": The value for initializing weights of each branch.
+            - "mmin": The minimum dilation rate.
+            - "mmax": The maximum dilation rate.
+            - "num_branches": The controller of the size of
+              search space (the number of branches).
+            - "skip_layer": The modules in skip_layer will be ignored
+              during the receptive field search.
+        rfstructure_file (str, optional): Path to load searched receptive
+            fields of the model. Defaults to None.
+        by_epoch (bool, optional): Determine to perform step by epoch or
+            by iteration. If set to True, it will step by epoch. Otherwise,
+            by iteration. Defaults to True.
+        verbose (bool): Determines whether to print rf-next related logging
+            messages. Defaults to True.
+    """
+
+    def __init__(self,
+                 mode: str = 'search',
+                 config: Dict = {},
+                 rfstructure_file: Optional[str] = None,
+                 by_epoch: bool = True,
+                 verbose: bool = True):
+        assert mode in ['search', 'fixed_single_branch', 'fixed_multi_branch']
+        assert config is not None
+        self.config = config
+        self.config['structure'] = {}
+        self.verbose = verbose
+        if rfstructure_file is not None:
+            rfstructure = mmengine.load(rfstructure_file)['structure']
+            self.config['structure'] = rfstructure
+        self.mode = mode
+        self.num_branches = self.config['search']['num_branches']
+        self.by_epoch = by_epoch
+
+    def init_model(self, model: nn.Module):
+        """init model with search ability.
+ + Args: + model (nn.Module): pytorch model + + Raises: + NotImplementedError: only support three modes: + search/fixed_single_branch/fixed_multi_branch + """ + if self.verbose: + print_log('RFSearch init begin.', 'current') + if self.mode == 'search': + if self.config['structure']: + self.set_model(model, search_op='Conv2d') + self.wrap_model(model, search_op='Conv2d') + elif self.mode == 'fixed_single_branch': + self.set_model(model, search_op='Conv2d') + elif self.mode == 'fixed_multi_branch': + self.set_model(model, search_op='Conv2d') + self.wrap_model(model, search_op='Conv2d') + else: + raise NotImplementedError + if self.verbose: + print_log('RFSearch init end.', 'current') + + def after_train_epoch(self, runner): + """Performs a dilation searching step after one training epoch.""" + if self.by_epoch and self.mode == 'search': + self.step(runner.model, runner.work_dir) + + def after_train_iter(self, runner, batch_idx, data_batch, outputs): + """Performs a dilation searching step after one training iteration.""" + if not self.by_epoch and self.mode == 'search': + self.step(runner.model, runner.work_dir) + + def step(self, model: nn.Module, work_dir: str) -> None: + """Performs a dilation searching step. + + Args: + model (nn.Module): pytorch model + work_dir (str): Directory to save the searching results. + """ + self.config['search']['step'] += 1 + if (self.config['search']['step'] + ) % self.config['search']['search_interval'] == 0 and (self.config[ + 'search']['step']) < self.config['search']['max_step']: + self.estimate_and_expand(model) + for name, module in model.named_modules(): + if isinstance(module, BaseConvRFSearchOp): + self.config['structure'][name] = module.op_layer.dilation + + write_to_json( + self.config, + os.path.join( + work_dir, + 'local_search_config_step%d.json' % + self.config['search']['step'], + ), + ) + + def estimate_and_expand(self, model: nn.Module) -> None: + """estimate and search for RFConvOp. + + Args: + model (nn.Module): pytorch model + """ + for module in model.modules(): + if isinstance(module, BaseConvRFSearchOp): + module.estimate_rates() + module.expand_rates() + + def wrap_model(self, + model: nn.Module, + search_op: str = 'Conv2d', + prefix: str = '') -> None: + """wrap model to support searchable conv op. + + Args: + model (nn.Module): pytorch model + search_op (str): The module that uses RF search. + Defaults to 'Conv2d'. + init_rates (int, optional): Set to other initial dilation rates. + Defaults to None. + prefix (str): Prefix for function recursion. Defaults to ''. + """ + op = 'torch.nn.' + search_op + for name, module in model.named_children(): + if prefix == '': + fullname = 'module.' + name + else: + fullname = prefix + '.' + name + if self.config['search']['skip_layer'] is not None: + if any(layer in fullname + for layer in self.config['search']['skip_layer']): + continue + if isinstance(module, eval(op)): + if 1 < module.kernel_size[0] and \ + 0 != module.kernel_size[0] % 2 or \ + 1 < module.kernel_size[1] and \ + 0 != module.kernel_size[1] % 2: + moduleWrap = eval(search_op + 'RFSearchOp')( + module, self.config['search'], self.verbose) + moduleWrap = moduleWrap.to(module.weight.device) + if self.verbose: + print_log( + 'Wrap model %s to %s.' 
% + (str(module), str(moduleWrap)), 'current') + setattr(model, name, moduleWrap) + elif not isinstance(module, BaseConvRFSearchOp): + self.wrap_model(module, search_op, fullname) + + def set_model(self, + model: nn.Module, + search_op: str = 'Conv2d', + init_rates: Optional[int] = None, + prefix: str = '') -> None: + """set model based on config. + + Args: + model (nn.Module): pytorch model + config (Dict): config file + search_op (str): The module that uses RF search. + Defaults to 'Conv2d'. + init_rates (int, optional): Set to other initial dilation rates. + Defaults to None. + prefix (str): Prefix for function recursion. Defaults to ''. + """ + op = 'torch.nn.' + search_op + for name, module in model.named_children(): + if prefix == '': + fullname = 'module.' + name + else: + fullname = prefix + '.' + name + if self.config['search']['skip_layer'] is not None: + if any(layer in fullname + for layer in self.config['search']['skip_layer']): + continue + if isinstance(module, eval(op)): + if 1 < module.kernel_size[0] and \ + 0 != module.kernel_size[0] % 2 or \ + 1 < module.kernel_size[1] and \ + 0 != module.kernel_size[1] % 2: + if isinstance(self.config['structure'][fullname], int): + self.config['structure'][fullname] = [ + self.config['structure'][fullname], + self.config['structure'][fullname] + ] + module.dilation = ( + self.config['structure'][fullname][0], + self.config['structure'][fullname][1], + ) + module.padding = ( + get_single_padding( + module.kernel_size[0], module.stride[0], + self.config['structure'][fullname][0]), + get_single_padding( + module.kernel_size[1], module.stride[1], + self.config['structure'][fullname][1])) + setattr(model, name, module) + if self.verbose: + print_log( + 'Set module %s dilation as: [%d %d]' % + (fullname, module.dilation[0], module.dilation[1]), + 'current') + elif not isinstance(module, BaseConvRFSearchOp): + self.set_model(module, search_op, init_rates, fullname) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/utils.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4c8168e343d6bded761390f1be9a38b58727badf --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/rfsearch/utils.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine +import numpy as np + + +def write_to_json(config: dict, filename: str): + """save config to json file. + + Args: + config (dict): Config to be saved. + filename (str): Path to save config. + """ + + with open(filename, 'w', encoding='utf-8') as f: + mmengine.dump(config, f, file_format='json') + + +def expand_rates(dilation: tuple, config: dict) -> list: + """expand dilation rate according to config. 
+
+    Args:
+        dilation (tuple): The dilation rate to be expanded, arranged as
+            (dilation_h, dilation_w).
+        config (dict): config dict containing "exp_rate", "num_branches",
+            "mmin" and "mmax".
+
+    Returns:
+        list: list of expanded dilation rates
+    """
+    exp_rate = config['exp_rate']
+
+    large_rates = []
+    small_rates = []
+    for _ in range(config['num_branches'] // 2):
+        large_rates.append(
+            tuple([
+                np.clip(
+                    int(round((1 + exp_rate) * dilation[0])), config['mmin'],
+                    config['mmax']).item(),
+                np.clip(
+                    int(round((1 + exp_rate) * dilation[1])), config['mmin'],
+                    config['mmax']).item()
+            ]))
+        small_rates.append(
+            tuple([
+                np.clip(
+                    int(round((1 - exp_rate) * dilation[0])), config['mmin'],
+                    config['mmax']).item(),
+                np.clip(
+                    int(round((1 - exp_rate) * dilation[1])), config['mmin'],
+                    config['mmax']).item()
+            ]))
+
+    small_rates.reverse()
+
+    if config['num_branches'] % 2 == 0:
+        rate_list = small_rates + large_rates
+    else:
+        rate_list = small_rates + [dilation] + large_rates
+
+    unique_rate_list = list(set(rate_list))
+    unique_rate_list.sort(key=rate_list.index)
+    return unique_rate_list
+
+
+def get_single_padding(kernel_size: int,
+                       stride: int = 1,
+                       dilation: int = 1) -> int:
+    padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
+    return padding
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdec9399f6544a90de6ac4238a60b05b8888c907
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .flops_counter import get_model_complexity_info
+from .fuse_conv_bn import fuse_conv_bn
+
+__all__ = ['get_model_complexity_info', 'fuse_conv_bn']
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/flops_counter.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/flops_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..b09edbcdff063c5a8276bafdd8d69b440539108e
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/flops_counter.py
@@ -0,0 +1,604 @@
+# Modified from flops-counter.pytorch by Vladislav Sovrasov
+# original repo: https://github.com/sovrasov/flops-counter.pytorch
+
+# MIT License
+
+# Copyright (c) 2018 Vladislav Sovrasov
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+ +import sys +import warnings +from functools import partial +from typing import Any, Callable, Dict, Optional, TextIO, Tuple + +import numpy as np +import torch +import torch.nn as nn + +from mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, Linear, + MaxPool2d, MaxPool3d) + + +def get_model_complexity_info(model: nn.Module, + input_shape: tuple, + print_per_layer_stat: bool = True, + as_strings: bool = True, + input_constructor: Optional[Callable] = None, + flush: bool = False, + ost: TextIO = sys.stdout) -> tuple: + """Get complexity information of a model. + + This method can calculate FLOPs and parameter counts of a model with + corresponding input shape. It can also print complexity information for + each layer in a model. + + Supported layers are listed as below: + - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, + ``nn.LeakyReLU``, ``nn.ReLU6``. + - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + - Linear: ``nn.Linear``. + - Deconvolution: ``nn.ConvTranspose2d``. + - Upsample: ``nn.Upsample``. + + Args: + model (nn.Module): The model for complexity calculation. + input_shape (tuple): Input shape used for calculation. + print_per_layer_stat (bool): Whether to print complexity information + for each layer in a model. Default: True. + as_strings (bool): Output FLOPs and params counts in a string form. + Default: True. + input_constructor (None | callable): If specified, it takes a callable + method that generates input. otherwise, it will generate a random + tensor with input shape to calculate FLOPs. Default: None. + flush (bool): same as that in :func:`print`. Default: False. + ost (stream): same as ``file`` param in :func:`print`. + Default: sys.stdout. + + Returns: + tuple[float | str]: If ``as_strings`` is set to True, it will return + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. + """ + assert type(input_shape) is tuple + assert len(input_shape) >= 1 + assert isinstance(model, nn.Module) + flops_model = add_flops_counting_methods(model) + flops_model.eval() + flops_model.start_flops_count() + if input_constructor: + input = input_constructor(input_shape) + _ = flops_model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (1, *input_shape), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + except StopIteration: + # Avoid StopIteration for models which have no parameters, + # like `nn.Relu()`, `nn.AvgPool2d`, etc. + batch = torch.ones(()).new_empty((1, *input_shape)) + + _ = flops_model(batch) + + flops_count, params_count = flops_model.compute_average_flops_cost() + if print_per_layer_stat: + print_model_with_flops( + flops_model, flops_count, params_count, ost=ost, flush=flush) + flops_model.stop_flops_count() + + if as_strings: + return flops_to_string(flops_count), params_to_string(params_count) + + return flops_count, params_count + + +def flops_to_string(flops: float, + units: Optional[str] = 'GFLOPs', + precision: int = 2) -> str: + """Convert FLOPs number into a string. 
+ + Note that here one multiply-add is counted as one FLOP. + + Args: + flops (float): FLOPs number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', + 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically + choose the most suitable unit for FLOPs. Default: 'GFLOPs'. + precision (int): Number of digits after the decimal point. Default: 2. + + Returns: + str: The converted FLOPs number with units. + + Examples: + >>> flops_to_string(1e9) + '1.0 GFLOPs' + >>> flops_to_string(2e5, 'MFLOPs') + '0.2 MFLOPs' + >>> flops_to_string(3e-9, None) + '3e-09 FLOPs' + """ + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.**9, precision)) + ' GFLOPs' + elif flops // 10**6 > 0: + return str(round(flops / 10.**6, precision)) + ' MFLOPs' + elif flops // 10**3 > 0: + return str(round(flops / 10.**3, precision)) + ' KFLOPs' + else: + return str(flops) + ' FLOPs' + else: + if units == 'GFLOPs': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MFLOPs': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KFLOPs': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' FLOPs' + + +def params_to_string(num_params: float, + units: Optional[str] = None, + precision: int = 2) -> str: + """Convert parameter number into a string. + + Args: + num_params (float): Parameter number to be converted. + units (str | None): Converted parameter units. Options are None, 'M', + 'K' and ''. If set to None, it will automatically choose the most + suitable unit for the parameter number. Default: None. + precision (int): Number of digits after the decimal point. Default: 2. + + Returns: + str: The converted parameter number with units. + + Examples: + >>> params_to_string(1e9) + '1000.0 M' + >>> params_to_string(2e5) + '200.0 k' + >>> params_to_string(3e-9) + '3e-09' + """ + if units is None: + if num_params // 10**6 > 0: + return str(round(num_params / 10**6, precision)) + ' M' + elif num_params // 10**3 > 0: + return str(round(num_params / 10**3, precision)) + ' k' + else: + return str(num_params) + else: + if units == 'M': + return str(round(num_params / 10.**6, precision)) + ' ' + units + elif units == 'K': + return str(round(num_params / 10.**3, precision)) + ' ' + units + else: + return str(num_params) + + +def print_model_with_flops(model: nn.Module, + total_flops: float, + total_params: float, + units: Optional[str] = 'GFLOPs', + precision: int = 3, + ost: TextIO = sys.stdout, + flush: bool = False) -> None: + """Print a model with FLOPs for each layer. + + Args: + model (nn.Module): The model to be printed. + total_flops (float): Total FLOPs of the model. + total_params (float): Total parameter counts of the model. + units (str | None): Converted FLOPs units. Default: 'GFLOPs'. + precision (int): Number of digits after the decimal point. Default: 3. + ost (stream): same as `file` param in :func:`print`. + Default: sys.stdout. + flush (bool): same as that in :func:`print`. Default: False.
+ + Example: + >>> class ExampleModel(nn.Module): + + >>> def __init__(self): + >>> super().__init__() + >>> self.conv1 = nn.Conv2d(3, 8, 3) + >>> self.conv2 = nn.Conv2d(8, 256, 3) + >>> self.conv3 = nn.Conv2d(256, 8, 3) + >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + >>> self.flatten = nn.Flatten() + >>> self.fc = nn.Linear(8, 1) + + >>> def forward(self, x): + >>> x = self.conv1(x) + >>> x = self.conv2(x) + >>> x = self.conv3(x) + >>> x = self.avg_pool(x) + >>> x = self.flatten(x) + >>> x = self.fc(x) + >>> return x + + >>> model = ExampleModel() + >>> x = (3, 16, 16) + to print the complexity information state for each layer, you can use + >>> get_model_complexity_info(model, x) + or directly use + >>> print_model_with_flops(model, 4579784.0, 37361) + ExampleModel( + 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, + (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 + (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) + (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) + (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) + (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) + (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) + ) + """ + + def accumulate_params(self): + if is_supported_instance(self): + return self.__params__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_params() + return sum + + def accumulate_flops(self): + if is_supported_instance(self): + return self.__flops__ / model.__batch_counter__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(self): + accumulated_num_params = self.accumulate_params() + accumulated_flops_cost = self.accumulate_flops() + return ', '.join([ + params_to_string( + accumulated_num_params, units='M', precision=precision), + f'{accumulated_num_params / total_params:.3%} Params', + flops_to_string( + accumulated_flops_cost, units=units, precision=precision), + f'{accumulated_flops_cost / total_flops:.3%} FLOPs', + self.original_extra_repr() + ]) + + def add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + m.accumulate_params = accumulate_params.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, 'original_extra_repr'): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, 'accumulate_flops'): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost, flush=flush) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model: nn.Module) -> float: + """Calculate parameter number of a model. + + Args: + model (nn.module): The model for parameter number calculation. + + Returns: + float: Parameter number of the model. 
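+ + Example (illustrative; assumes ``torch.nn`` imported as ``nn``): + >>> get_model_parameters_number(nn.Linear(10, 5)) # 10 * 5 weights + 5 biases + 55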
+ """ + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return num_params + + +def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module: + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501 + net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501 + net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501 + net_main_module) + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501 + net_main_module) + + net_main_module.reset_flops_count() + + return net_main_module + + +def compute_average_flops_cost(self) -> Tuple[float, float]: + """Compute average FLOPs cost. + + A method to compute average FLOPs cost, which will be available after + `add_flops_counting_methods()` is called on a desired net object. + + Returns: + float: Current mean flops consumption per image. + """ + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + params_sum = get_model_parameters_number(self) + return flops_sum / batches_count, params_sum + + +def start_flops_count(self) -> None: + """Activate the computation of mean flops consumption per image. + + A method to activate the computation of mean flops consumption per image. + which will be available after ``add_flops_counting_methods()`` is called on + a desired net object. It should be called before running the network. + """ + add_batch_counter_hook_function(self) + + def add_flops_counter_hook_function(module: nn.Module) -> None: + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + else: + handle = module.register_forward_hook( + get_modules_mapping()[type(module)]) + + module.__flops_handle__ = handle + + self.apply(partial(add_flops_counter_hook_function)) + + +def stop_flops_count(self) -> None: + """Stop computing the mean flops consumption per image. + + A method to stop computing the mean flops consumption per image, which will + be available after ``add_flops_counting_methods()`` is called on a desired + net object. It can be called to pause the computation whenever. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self) -> None: + """Reset statistics computed so far. + + A method to Reset computed statistics, which will be available after + `add_flops_counting_methods()` is called on a desired net object. 
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +# ---- Internal functions +def empty_flops_counter_hook(module: nn.Module, input: tuple, + output: Any) -> None: + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + output_last_dim = output.shape[ + -1] # pytorch checks dimensions, so here we don't care much + module.__flops__ += int(np.prod(input[0].shape) * output_last_dim) + + +def pool_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + module.__flops__ += int(np.prod(input[0].shape)) + + +def norm_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + batch_flops = np.prod(input[0].shape) + if (getattr(module, 'affine', False) + or getattr(module, 'elementwise_affine', False)): + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + # Can have multiple inputs, getting the first one + batch_size = input[0].shape[0] + input_height, input_width = input[0].shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * in_channels * filters_per_channel) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_width + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + # Can have multiple inputs, getting the first one + batch_size = input[0].shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = int( + np.prod(kernel_dims)) * in_channels * filters_per_channel + + active_elements_count = batch_size * int(np.prod(output_dims)) + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None: + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + batch_size = len(input[0]) + else: + warnings.warn('No positional inputs found 
for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module: nn.Module) -> None: + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module: nn.Module) -> None: + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module: nn.Module) -> None: + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module: nn.Module) -> None: + if is_supported_instance(module): + if hasattr(module, '__flops__') or hasattr(module, '__params__'): + warnings.warn('variables __flops__ or __params__ are already ' + 'defined for the module ' + type(module).__name__ + + '. ptflops can affect your code!') + module.__flops__ = 0 + module.__params__ = get_model_parameters_number(module) + + +def is_supported_instance(module: nn.Module) -> bool: + if type(module) in get_modules_mapping(): + return True + return False + + +def remove_flops_counter_hook_function(module: nn.Module) -> None: + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +def get_modules_mapping() -> Dict: + return { + # convolutions + nn.Conv1d: conv_flops_counter_hook, + nn.Conv2d: conv_flops_counter_hook, + Conv2d: conv_flops_counter_hook, + nn.Conv3d: conv_flops_counter_hook, + Conv3d: conv_flops_counter_hook, + # activations + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # poolings + nn.MaxPool1d: pool_flops_counter_hook, + nn.AvgPool1d: pool_flops_counter_hook, + nn.AvgPool2d: pool_flops_counter_hook, + nn.MaxPool2d: pool_flops_counter_hook, + MaxPool2d: pool_flops_counter_hook, + nn.MaxPool3d: pool_flops_counter_hook, + MaxPool3d: pool_flops_counter_hook, + nn.AvgPool3d: pool_flops_counter_hook, + nn.AdaptiveMaxPool1d: pool_flops_counter_hook, + nn.AdaptiveAvgPool1d: pool_flops_counter_hook, + nn.AdaptiveMaxPool2d: pool_flops_counter_hook, + nn.AdaptiveAvgPool2d: pool_flops_counter_hook, + nn.AdaptiveMaxPool3d: pool_flops_counter_hook, + nn.AdaptiveAvgPool3d: pool_flops_counter_hook, + # normalizations + nn.BatchNorm1d: norm_flops_counter_hook, + nn.BatchNorm2d: norm_flops_counter_hook, + nn.BatchNorm3d: norm_flops_counter_hook, + nn.GroupNorm: norm_flops_counter_hook, + nn.InstanceNorm1d: norm_flops_counter_hook, + nn.InstanceNorm2d: norm_flops_counter_hook, + nn.InstanceNorm3d: norm_flops_counter_hook, + nn.LayerNorm: norm_flops_counter_hook, + # FC + nn.Linear: linear_flops_counter_hook, + Linear: linear_flops_counter_hook, + # Upscale + nn.Upsample: upsample_flops_counter_hook, + # Deconvolution + nn.ConvTranspose2d: deconv_flops_counter_hook, + ConvTranspose2d: deconv_flops_counter_hook, + } diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/fuse_conv_bn.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/fuse_conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..6ccaab3bf1eb3ce615bad910d6dc45a467bb1fe4 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/utils/fuse_conv_bn.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + + +def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module: + """Fuse conv and bn into one module. + + Args: + conv (nn.Module): Conv to be fused. + bn (nn.Module): BN to be fused. + + Returns: + nn.Module: Fused module. + """ + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_conv_bn(module: nn.Module) -> nn.Module: + """Recursively fuse conv and bn in a module. + + During inference, the functionality of batch norm layers is turned off: + only the per-channel running mean and variance are used. This exposes the + chance to fuse BN into the preceding conv layers to save computation and + simplify the network structure. + + Args: + module (nn.Module): Module to be fused. + + Returns: + nn.Module: Fused module. + """ + last_conv = None + last_conv_name = None + + for name, child in module.named_children(): + if isinstance(child, + (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = _fuse_conv_bn(last_conv, child) + module._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. + module._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_conv_bn(child) + return module diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/vgg.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..a7f3116062c3943bb85fd7540b23a31918622a24 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/cnn/vgg.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +from typing import List, Optional, Sequence, Tuple, Union + +import torch.nn as nn +from mmengine.model import constant_init, kaiming_init, normal_init +from mmengine.runner import load_checkpoint +from torch import Tensor + + +def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + padding=dilation, + dilation=dilation) + + +def make_vgg_layer(inplanes: int, + planes: int, + num_blocks: int, + dilation: int = 1, + with_bn: bool = False, + ceil_mode: bool = False) -> List[nn.Module]: + layers = [] + for _ in range(num_blocks): + layers.append(conv3x3(inplanes, planes, dilation)) + if with_bn: + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + inplanes = planes + layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) + + return layers + + +class VGG(nn.Module): + """VGG backbone. + + Args: + depth (int): Depth of VGG, from {11, 13, 16, 19}. + with_bn (bool): Use BatchNorm or not. + num_classes (int): Number of classes for classification. + num_stages (int): VGG stages, normally 5. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (all params fixed). -1 means + not freezing any parameters.
+ bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze + running stats (mean and var). + bn_frozen (bool): Whether to freeze weight and bias of BN layers. + """ + + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4) + } + + def __init__(self, + depth: int, + with_bn: bool = False, + num_classes: int = -1, + num_stages: int = 5, + dilations: Sequence[int] = (1, 1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3, 4), + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + ceil_mode: bool = False, + with_last_pool: bool = True): + super().__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for vgg') + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + assert max(out_indices) <= num_stages + + self.num_classes = num_classes + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + + self.inplanes = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks * (2 + with_bn) + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + planes = 64 * 2**i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.inplanes, + planes, + num_blocks, + dilation=dilation, + with_bn=with_bn, + ceil_mode=ceil_mode) + vgg_layers.extend(vgg_layer) + self.inplanes = planes + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = 'features' + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained: Optional[str] = None) -> None: + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]: + outs = [] + vgg_layers = getattr(self, self.module_name) + for i in range(len(self.stage_blocks)): + for j in range(*self.range_sub_modules[i]): + vgg_layer = vgg_layers[j] + x = vgg_layer(x) + if i in self.out_indices: + outs.append(x) + if self.num_classes > 0: + x = x.view(x.size(0), -1) + x = self.classifier(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode: bool = True) -> None: + super().train(mode) + if self.bn_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if self.bn_frozen: + for params in m.parameters(): + params.requires_grad = False + vgg_layers = getattr(self, self.module_name) + if mode and self.frozen_stages >= 0: + for i in range(self.frozen_stages): + for j in range(*self.range_sub_modules[i]): + mod = vgg_layers[j] + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff 
--git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..92ecec4046a6f5ee25b4ea07215ed7c7c810dcfa --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, + gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, + rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) +from .geometric import (cutout, imcrop, imflip, imflip_, impad, + impad_to_multiple, imrescale, imresize, imresize_like, + imresize_to_multiple, imrotate, imshear, imtranslate, + rescale_size) +from .io import imfrombytes, imread, imwrite, supported_backends, use_backend +from .misc import tensor2imgs +from .photometric import (adjust_brightness, adjust_color, adjust_contrast, + adjust_hue, adjust_lighting, adjust_sharpness, + auto_contrast, clahe, imdenormalize, imequalize, + iminvert, imnormalize, imnormalize_, lut_transform, + posterize, solarize) + +__all__ = [ + 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', + 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', + 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', + 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', + 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', + 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', + 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', + 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', + 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', + 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting', + 'adjust_hue' +] diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/colorspace.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/colorspace.py new file mode 100644 index 0000000000000000000000000000000000000000..08f9952408c8e0bb38b17c10e2089e900ed418c2 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/colorspace.py @@ -0,0 +1,309 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Callable, Union + +import cv2 +import numpy as np + + +def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray: + """Convert an image from the src colorspace to dst colorspace. + + Args: + img (ndarray): The input image. + src (str): The source colorspace, e.g., 'rgb', 'hsv'. + dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. + + Returns: + ndarray: The converted image. + """ + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + out_img = cv2.cvtColor(img, code) + return out_img + + +def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: + """Convert a BGR image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: + """Convert a RGB image to grayscale image. + + Args: + img (ndarray): The input image. 
+ keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def gray2bgr(img: np.ndarray) -> np.ndarray: + """Convert a grayscale image to BGR image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted BGR image. + """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + return out_img + + +def gray2rgb(img: np.ndarray) -> np.ndarray: + """Convert a grayscale image to RGB image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted RGB image. + """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) + return out_img + + +def _convert_input_type_range(img: np.ndarray) -> np.ndarray: + """Convert the type and range of the input image. + + It converts the input image to np.float32 type and range of [0, 1]. + It is mainly used for pre-processing the input image in colorspace + conversion functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + (ndarray): The converted image with type of np.float32 and range of + [0, 1]. + """ + img_type = img.dtype + img = img.astype(np.float32) + if img_type == np.float32: + pass + elif img_type == np.uint8: + img /= 255. + else: + raise TypeError('The img type should be np.float32 or np.uint8, ' + f'but got {img_type}') + return img + + +def _convert_output_type_range( + img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray: + """Convert the type and range of the image according to dst_type. + + It converts the image to desired type and range. If `dst_type` is np.uint8, + images will be converted to np.uint8 type with range [0, 255]. If + `dst_type` is np.float32, it converts the image to np.float32 type with + range [0, 1]. + It is mainly used for post-processing images in colorspace conversion + functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The image to be converted with np.float32 type and + range [0, 255]. + dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it + converts the image to np.uint8 type with range [0, 255]. If + dst_type is np.float32, it converts the image to np.float32 type + with range [0, 1]. + + Returns: + (ndarray): The converted image with desired type and range. + """ + if dst_type not in (np.uint8, np.float32): + raise TypeError('The dst_type should be np.float32 or np.uint8, ' + f'but got {dst_type}') + if dst_type == np.uint8: + img = img.round() + else: + img /= 255. + return img.astype(dst_type) + + +def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: + """Convert a RGB image to YCbCr image. + + This function produces the same results as Matlab's `rgb2ycbcr` function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. 
+ y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0 + else: + out_img = np.matmul( + img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], + [24.966, 112.0, -18.214]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: + """Convert a BGR image to YCbCr image. + + The bgr version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2rgb(img: np.ndarray) -> np.ndarray: + """Convert a YCbCr image to RGB image. + + This function produces the same results as Matlab's ycbcr2rgb function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted RGB image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0, -0.00153632, 0.00791071], + [0.00625893, -0.00318811, 0]]) * 255.0 + [ + -222.921, 135.576, -276.836 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2bgr(img: np.ndarray) -> np.ndarray: + """Convert a YCbCr image to BGR image. + + The bgr version of ycbcr2rgb. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted BGR image. The output image has the same type + and range as input image. 
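+ + Example (illustrative): + >>> img = np.random.rand(4, 4, 3).astype(np.float32) # YCbCr in [0, 1] + >>> ycbcr2bgr(img).shape + (4, 4, 3)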
+ """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0.00791071, -0.00153632, 0], + [0, -0.00318811, 0.00625893]]) * 255.0 + [ + -276.836, 135.576, -222.921 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def convert_color_factory(src: str, dst: str) -> Callable: + + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + + def convert_color(img: np.ndarray) -> np.ndarray: + out_img = cv2.cvtColor(img, code) + return out_img + + convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} + image. + + Args: + img (ndarray or str): The input image. + + Returns: + ndarray: The converted {dst.upper()} image. + """ + + return convert_color + + +bgr2rgb = convert_color_factory('bgr', 'rgb') + +rgb2bgr = convert_color_factory('rgb', 'bgr') + +bgr2hsv = convert_color_factory('bgr', 'hsv') + +hsv2bgr = convert_color_factory('hsv', 'bgr') + +bgr2hls = convert_color_factory('bgr', 'hls') + +hls2bgr = convert_color_factory('hls', 'bgr') diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/geometric.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/geometric.py new file mode 100644 index 0000000000000000000000000000000000000000..d62ebff35caf99858c9d73566fc1db0eb3831b2c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/geometric.py @@ -0,0 +1,788 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers +from typing import List, Optional, Tuple, Union, no_type_check + +import cv2 +import numpy as np +from mmengine.utils import to_2tuple + +from .io import imread_backend + +try: + from PIL import Image +except ImportError: + Image = None + + +def _scale_size( + size: Tuple[int, int], + scale: Union[float, int, Tuple[float, float], Tuple[int, int]], +) -> Tuple[int, int]: + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float | int | tuple(float) | tuple(int)): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + if isinstance(scale, (float, int)): + scale = (scale, scale) + w, h = size + return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) + + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + +cv2_border_modes = { + 'constant': cv2.BORDER_CONSTANT, + 'replicate': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT, + 'wrap': cv2.BORDER_WRAP, + 'reflect_101': cv2.BORDER_REFLECT_101, + 'transparent': cv2.BORDER_TRANSPARENT, + 'isolated': cv2.BORDER_ISOLATED +} + +# Pillow >=v9.1.0 use a slightly different naming scheme for filters. +# Set pillow_interp_codes according to the naming scheme used. 
+if Image is not None: + if hasattr(Image, 'Resampling'): + pillow_interp_codes = { + 'nearest': Image.Resampling.NEAREST, + 'bilinear': Image.Resampling.BILINEAR, + 'bicubic': Image.Resampling.BICUBIC, + 'box': Image.Resampling.BOX, + 'lanczos': Image.Resampling.LANCZOS, + 'hamming': Image.Resampling.HAMMING + } + else: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } + + +def imresize( + img: np.ndarray, + size: Tuple[int, int], + return_scale: bool = False, + interpolation: str = 'bilinear', + out: Optional[np.ndarray] = None, + backend: Optional[str] = None +) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: + """Resize an image to a given size. + + Args: + img (ndarray): The input image. + size (tuple[int]): Target size (w, h). + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize. ' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only supports uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +@no_type_check +def imresize_to_multiple( + img: np.ndarray, + divisor: Union[int, Tuple[int, int]], + size: Union[int, Tuple[int, int], None] = None, + scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int], + None] = None, + keep_ratio: bool = False, + return_scale: bool = False, + interpolation: str = 'bilinear', + out: Optional[np.ndarray] = None, + backend: Optional[str] = None +) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: + """Resize an image according to a given size or scale factor and then round + up the resized or rescaled image size to the nearest value that can be + divided by the divisor. + + Args: + img (ndarray): The input image. + divisor (int | tuple): Resized image size will be a multiple of + divisor. If divisor is a tuple, divisor should be + (w_divisor, h_divisor). + size (None | int | tuple[int]): Target size (w, h). Default: None. + scale_factor (None | float | int | tuple[float] | tuple[int]): + Multiplier for spatial size. Should match input size if it is a + tuple and the 2D style is (w_scale_factor, h_scale_factor). + Default: None. + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Default: False. + return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if size is not None and scale_factor is not None: + raise ValueError('only one of size or scale_factor should be defined') + elif size is None and scale_factor is None: + raise ValueError('one of size or scale_factor should be defined') + elif size is not None: + size = to_2tuple(size) + if keep_ratio: + size = rescale_size((w, h), size, return_scale=False) + else: + size = _scale_size((w, h), scale_factor) + + divisor = to_2tuple(divisor) + size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor)) + resized_img, w_scale, h_scale = imresize( + img, + size, + return_scale=True, + interpolation=interpolation, + out=out, + backend=backend) + if return_scale: + return resized_img, w_scale, h_scale + else: + return resized_img + + +def imresize_like( + img: np.ndarray, + dst_img: np.ndarray, + return_scale: bool = False, + interpolation: str = 'bilinear', + backend: Optional[str] = None +) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: + """Resize image to the same size of a given image. + + Args: + img (ndarray): The input image. + dst_img (ndarray): The target image. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = dst_img.shape[:2] + return imresize(img, (w, h), return_scale, interpolation, backend=backend) + + +def rescale_size(old_size: tuple, + scale: Union[float, int, Tuple[int, int]], + return_scale: bool = False) -> tuple: + """Calculate the new size to be rescaled to. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | int | tuple[int]): The scaling factor or maximum size. + If it is a float number or an integer, then the image will be + rescaled by this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imrescale( + img: np.ndarray, + scale: Union[float, int, Tuple[int, int]], + return_scale: bool = False, + interpolation: str = 'bilinear', + backend: Optional[str] = None +) -> Union[np.ndarray, Tuple[np.ndarray, float]]: + """Resize image while keeping the aspect ratio. 
+ + Args: + img (ndarray): The input image. + scale (float | int | tuple[int]): The scaling factor or maximum size. + If it is a float number or an integer, then the image will be + rescaled by this factor, else if it is a tuple of 2 integers, then + the image will be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + ndarray: The rescaled image. + """ + h, w = img.shape[:2] + new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) + rescaled_img = imresize( + img, new_size, interpolation=interpolation, backend=backend) + if return_scale: + return rescaled_img, scale_factor + else: + return rescaled_img + + +def imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: + """Flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image. + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return np.flip(img, axis=1) + elif direction == 'vertical': + return np.flip(img, axis=0) + else: + return np.flip(img, axis=(0, 1)) + + +def imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def imrotate(img: np.ndarray, + angle: float, + center: Optional[Tuple[float, float]] = None, + scale: float = 1.0, + border_value: int = 0, + interpolation: str = 'bilinear', + auto_bound: bool = False, + border_mode: str = 'constant') -> np.ndarray: + """Rotate an image. + + Args: + img (np.ndarray): Image to be rotated. + angle (float): Rotation angle in degrees, positive values mean + clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the rotation in + the source image. If not specified, the center of the image will be + used. + scale (float): Isotropic scale factor. + border_value (int): Border value used in case of a constant border. + Defaults to 0. + interpolation (str): Same as :func:`resize`. + auto_bound (bool): Whether to adjust the image size to cover the whole + rotated image. + border_mode (str): Pixel extrapolation method. Defaults to 'constant'. + + Returns: + np.ndarray: The rotated image. 
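+ + Example (illustrative; with ``auto_bound`` the output bounds swap for a + 90-degree turn): + >>> img = np.zeros((10, 20, 3), dtype=np.uint8) + >>> imrotate(img, 90, auto_bound=True).shape + (20, 10, 3)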
+ """ + if center is not None and auto_bound: + raise ValueError('`auto_bound` conflicts with `center`') + h, w = img.shape[:2] + if center is None: + center = ((w - 1) * 0.5, (h - 1) * 0.5) + assert isinstance(center, tuple) + + matrix = cv2.getRotationMatrix2D(center, -angle, scale) + if auto_bound: + cos = np.abs(matrix[0, 0]) + sin = np.abs(matrix[0, 1]) + new_w = h * sin + w * cos + new_h = h * cos + w * sin + matrix[0, 2] += (new_w - w) * 0.5 + matrix[1, 2] += (new_h - h) * 0.5 + w = int(np.round(new_w)) + h = int(np.round(new_h)) + rotated = cv2.warpAffine( + img, + matrix, (w, h), + flags=cv2_interp_codes[interpolation], + borderMode=cv2_border_modes[border_mode], + borderValue=border_value) + return rotated + + +def bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray: + """Clip bboxes to fit the image shape. + + Args: + bboxes (ndarray): Shape (..., 4*k) + img_shape (tuple[int]): (height, width) of the image. + + Returns: + ndarray: Clipped bboxes. + """ + assert bboxes.shape[-1] % 4 == 0 + cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype) + cmin[0::2] = img_shape[1] - 1 + cmin[1::2] = img_shape[0] - 1 + clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0) + return clipped_bboxes + + +def bbox_scaling(bboxes: np.ndarray, + scale: float, + clip_shape: Optional[Tuple[int, int]] = None) -> np.ndarray: + """Scaling bboxes w.r.t the box center. + + Args: + bboxes (ndarray): Shape(..., 4). + scale (float): Scaling factor. + clip_shape (tuple[int], optional): If specified, bboxes that exceed the + boundary will be clipped according to the given shape (h, w). + + Returns: + ndarray: Scaled bboxes. + """ + if float(scale) == 1.0: + scaled_bboxes = bboxes.copy() + else: + w = bboxes[..., 2] - bboxes[..., 0] + 1 + h = bboxes[..., 3] - bboxes[..., 1] + 1 + dw = (w * (scale - 1)) * 0.5 + dh = (h * (scale - 1)) * 0.5 + scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1) + if clip_shape is not None: + return bbox_clip(scaled_bboxes, clip_shape) + else: + return scaled_bboxes + + +def imcrop( + img: np.ndarray, + bboxes: np.ndarray, + scale: float = 1.0, + pad_fill: Union[float, list, None] = None +) -> Union[np.ndarray, List[np.ndarray]]: + """Crop image patches. + + 3 steps: scale the bboxes -> clip bboxes -> crop and pad. + + Args: + img (ndarray): Image to be cropped. + bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. + scale (float, optional): Scale ratio of bboxes, the default value + 1.0 means no scaling. + pad_fill (Number | list[Number]): Value to be filled for padding. + Default: None, which means no padding. + + Returns: + list[ndarray] | ndarray: The cropped image patches. + """ + chn = 1 if img.ndim == 2 else img.shape[2] + if pad_fill is not None: + if isinstance(pad_fill, (int, float)): + pad_fill = [pad_fill for _ in range(chn)] + assert len(pad_fill) == chn + + _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes + scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32) + clipped_bbox = bbox_clip(scaled_bboxes, img.shape) + + patches = [] + for i in range(clipped_bbox.shape[0]): + x1, y1, x2, y2 = tuple(clipped_bbox[i, :]) + if pad_fill is None: + patch = img[y1:y2 + 1, x1:x2 + 1, ...] 
+ else: + _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) + patch_h = _y2 - _y1 + 1 + patch_w = _x2 - _x1 + 1 + if chn == 1: + patch_shape = (patch_h, patch_w) + else: + patch_shape = (patch_h, patch_w, chn) # type: ignore + patch = np.array( + pad_fill, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + x_start = 0 if _x1 >= 0 else -_x1 + y_start = 0 if _y1 >= 0 else -_y1 + w = x2 - x1 + 1 + h = y2 - y1 + 1 + patch[y_start:y_start + h, x_start:x_start + w, + ...] = img[y1:y1 + h, x1:x1 + w, ...] + patches.append(patch) + + if bboxes.ndim == 1: + return patches[0] + else: + return patches + + +def impad(img: np.ndarray, + *, + shape: Optional[Tuple[int, int]] = None, + padding: Union[int, tuple, None] = None, + pad_val: Union[float, List] = 0, + padding_mode: str = 'constant') -> np.ndarray: + """Pad the given image to a certain shape or pad on all sides with + specified padding mode and padding value. + + Args: + img (ndarray): Image to be padded. + shape (tuple[int]): Expected padding shape (h, w). Default: None. + padding (int or tuple[int]): Padding on each border. If a single int is + provided this is used to pad all borders. If tuple of length 2 is + provided this is the padding on left/right and top/bottom + respectively. If a tuple of length 4 is provided this is the + padding for the left, top, right and bottom borders respectively. + Default: None. Note that `shape` and `padding` can not be both + set. + pad_val (Number | Sequence[Number]): Values to be filled in padding + areas when padding_mode is 'constant'. Default: 0. + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Default: constant. + + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with 2 + elements on both sides in reflect mode will result in + [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last value + on the edge. For example, padding [1, 2, 3, 4] with 2 elements on + both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + ndarray: The padded image. + """ + + assert (shape is not None) ^ (padding is not None) + if shape is not None: + width = max(shape[1] - img.shape[1], 0) + height = max(shape[0] - img.shape[0], 0) + padding = (0, 0, width, height) + + # check pad_val + if isinstance(pad_val, tuple): + assert len(pad_val) == img.shape[-1] + elif not isinstance(pad_val, numbers.Number): + raise TypeError('pad_val must be a int or a tuple. ' + f'But received {type(pad_val)}') + + # check padding + if isinstance(padding, tuple) and len(padding) in [2, 4]: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + elif isinstance(padding, numbers.Number): + padding = (padding, padding, padding, padding) + else: + raise ValueError('Padding must be a int or a 2, or 4 element tuple.' 
+ f'But received {padding}') + + # check padding mode + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] + + border_type = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT + } + img = cv2.copyMakeBorder( + img, + padding[1], + padding[3], + padding[0], + padding[2], + border_type[padding_mode], + value=pad_val) + + return img + + +def impad_to_multiple(img: np.ndarray, + divisor: int, + pad_val: Union[float, List] = 0) -> np.ndarray: + """Pad an image to ensure each edge to be multiple to some number. + + Args: + img (ndarray): Image to be padded. + divisor (int): Padded image edges will be multiple to divisor. + pad_val (Number | Sequence[Number]): Same as :func:`impad`. + + Returns: + ndarray: The padded image. + """ + pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor + pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor + return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) + + +def cutout(img: np.ndarray, + shape: Union[int, Tuple[int, int]], + pad_val: Union[int, float, tuple] = 0) -> np.ndarray: + """Randomly cut out a rectangle from the original img. + + Args: + img (ndarray): Image to be cutout. + shape (int | tuple[int]): Expected cutout shape (h, w). If given as a + int, the value will be used for both h and w. + pad_val (int | float | tuple[int | float]): Values to be filled in the + cut area. Defaults to 0. + + Returns: + ndarray: The cutout image. + """ + + channels = 1 if img.ndim == 2 else img.shape[2] + if isinstance(shape, int): + cut_h, cut_w = shape, shape + else: + assert isinstance(shape, tuple) and len(shape) == 2, \ + f'shape must be a int or a tuple with length 2, but got type ' \ + f'{type(shape)} instead.' + cut_h, cut_w = shape + if isinstance(pad_val, (int, float)): + pad_val = tuple([pad_val] * channels) + elif isinstance(pad_val, tuple): + assert len(pad_val) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(pad_val), channels) + else: + raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') + + img_h, img_w = img.shape[:2] + y0 = np.random.uniform(img_h) + x0 = np.random.uniform(img_w) + + y1 = int(max(0, y0 - cut_h / 2.)) + x1 = int(max(0, x0 - cut_w / 2.)) + y2 = min(img_h, y1 + cut_h) + x2 = min(img_w, x1 + cut_w) + + if img.ndim == 2: + patch_shape = (y2 - y1, x2 - x1) + else: + patch_shape = (y2 - y1, x2 - x1, channels) # type: ignore + + img_cutout = img.copy() + patch = np.array( + pad_val, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + img_cutout[y1:y2, x1:x2, ...] = patch + + return img_cutout + + +def _get_shear_matrix(magnitude: Union[int, float], + direction: str = 'horizontal') -> np.ndarray: + """Generate the shear matrix for transformation. + + Args: + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + + Returns: + ndarray: The shear matrix with dtype float32. + """ + if direction == 'horizontal': + shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) + elif direction == 'vertical': + shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) + return shear_matrix + + +def imshear(img: np.ndarray, + magnitude: Union[int, float], + direction: str = 'horizontal', + border_value: Union[int, Tuple[int, int]] = 0, + interpolation: str = 'bilinear') -> np.ndarray: + """Shear an image. 
+ + Args: + img (ndarray): Image to be sheared with format (h, w) + or (h, w, c). + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The sheared image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) # type: ignore + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`') + shear_matrix = _get_shear_matrix(magnitude, direction) + sheared = cv2.warpAffine( + img, + shear_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. shearing masks whose channels large + # than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], # type: ignore + flags=cv2_interp_codes[interpolation]) + return sheared + + +def _get_translate_matrix(offset: Union[int, float], + direction: str = 'horizontal') -> np.ndarray: + """Generate the translate matrix. + + Args: + offset (int | float): The offset used for translate. + direction (str): The translate direction, either + "horizontal" or "vertical". + + Returns: + ndarray: The translate matrix with dtype float32. + """ + if direction == 'horizontal': + translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]]) + elif direction == 'vertical': + translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]]) + return translate_matrix + + +def imtranslate(img: np.ndarray, + offset: Union[int, float], + direction: str = 'horizontal', + border_value: Union[int, tuple] = 0, + interpolation: str = 'bilinear') -> np.ndarray: + """Translate an image. + + Args: + img (ndarray): Image to be translated with format + (h, w) or (h, w, c). + offset (int | float): The offset used for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The translated image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`.') + translate_matrix = _get_translate_matrix(offset, direction) + translated = cv2.warpAffine( + img, + translate_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. 
translating masks whose channels + # large than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], + flags=cv2_interp_codes[interpolation]) + return translated diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/io.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/io.py new file mode 100644 index 0000000000000000000000000000000000000000..e10d443da6554865afc98cb2441a0cc8eddf0e16 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/io.py @@ -0,0 +1,364 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import os.path as osp +import warnings +from pathlib import Path +from typing import Optional, Union + +import cv2 +import mmengine.fileio as fileio +import numpy as np +from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, + IMREAD_UNCHANGED) +from mmengine.utils import is_filepath, is_str + +try: + from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG +except ImportError: + TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None + +try: + from PIL import Image, ImageOps +except ImportError: + Image = None + +try: + import tifffile +except ImportError: + tifffile = None + +jpeg = None +supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile'] + +imread_flags = { + 'color': IMREAD_COLOR, + 'grayscale': IMREAD_GRAYSCALE, + 'unchanged': IMREAD_UNCHANGED, + 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR, + 'grayscale_ignore_orientation': + IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE +} + +imread_backend = 'cv2' + + +def use_backend(backend: str) -> None: + """Select a backend for image decoding. + + Args: + backend (str): The image decoding backend type. Options are `cv2`, + `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG) + and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg` + file format. + """ + assert backend in supported_backends + global imread_backend + imread_backend = backend + if imread_backend == 'turbojpeg': + if TurboJPEG is None: + raise ImportError('`PyTurboJPEG` is not installed') + global jpeg + if jpeg is None: + jpeg = TurboJPEG() + elif imread_backend == 'pillow': + if Image is None: + raise ImportError('`Pillow` is not installed') + elif imread_backend == 'tifffile': + if tifffile is None: + raise ImportError('`tifffile` is not installed') + + +def _jpegflag(flag: str = 'color', channel_order: str = 'bgr'): + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'color': + if channel_order == 'bgr': + return TJPF_BGR + elif channel_order == 'rgb': + return TJCS_RGB + elif flag == 'grayscale': + return TJPF_GRAY + else: + raise ValueError('flag must be "color" or "grayscale"') + + +def _pillow2array(img, + flag: str = 'color', + channel_order: str = 'bgr') -> np.ndarray: + """Convert a pillow image to numpy array. + + Args: + img (:obj:`PIL.Image.Image`): The image loaded using PIL + flag (str): Flags specifying the color type of a loaded image, + candidates are 'color', 'grayscale' and 'unchanged'. + Default to 'color'. + channel_order (str): The channel order of the output image array, + candidates are 'bgr' and 'rgb'. Default to 'bgr'. 
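For the decode-backend registry above, a small sketch of how the global switch behaves; the optional backends may be absent, so the call is guarded:

```python
import mmcv

# The process-wide default is 'cv2'; use_backend() swaps the decoder that
# imread()/imfrombytes() use from this point on.
try:
    mmcv.use_backend('turbojpeg')   # JPEG-only, 'color'/'grayscale' flags
except ImportError:
    mmcv.use_backend('cv2')         # fall back to the default backend

# img = mmcv.imread('demo.jpg')    # placeholder path; decoded by whichever
#                                  # backend is active at call time
```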
+ + Returns: + np.ndarray: The converted numpy array + """ + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'unchanged': + array = np.array(img) + if array.ndim >= 3 and array.shape[2] >= 3: # color image + array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR + else: + # Handle exif orientation tag + if flag in ['color', 'grayscale']: + img = ImageOps.exif_transpose(img) + # If the image mode is not 'RGB', convert it to 'RGB' first. + if img.mode != 'RGB': + if img.mode != 'LA': + # Most formats except 'LA' can be directly converted to RGB + img = img.convert('RGB') + else: + # When the mode is 'LA', the default conversion will fill in + # the canvas with black, which sometimes shadows black objects + # in the foreground. + # + # Therefore, a random color (124, 117, 104) is used for canvas + img_rgba = img.convert('RGBA') + img = Image.new('RGB', img_rgba.size, (124, 117, 104)) + img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha + if flag in ['color', 'color_ignore_orientation']: + array = np.array(img) + if channel_order != 'rgb': + array = array[:, :, ::-1] # RGB to BGR + elif flag in ['grayscale', 'grayscale_ignore_orientation']: + img = img.convert('L') + array = np.array(img) + else: + raise ValueError( + 'flag must be "color", "grayscale", "unchanged", ' + f'"color_ignore_orientation" or "grayscale_ignore_orientation"' + f' but got {flag}') + return array + + +def imread(img_or_path: Union[np.ndarray, str, Path], + flag: str = 'color', + channel_order: str = 'bgr', + backend: Optional[str] = None, + file_client_args: Optional[dict] = None, + *, + backend_args: Optional[dict] = None) -> np.ndarray: + """Read an image. + + Args: + img_or_path (ndarray or str or Path): Either a numpy array or str or + pathlib.Path. If it is a numpy array (loaded image), then + it will be returned as is. + flag (str): Flags specifying the color type of a loaded image, + candidates are `color`, `grayscale`, `unchanged`, + `color_ignore_orientation` and `grayscale_ignore_orientation`. + By default, `cv2` and `pillow` backend would rotate the image + according to its EXIF info unless called with `unchanged` or + `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend + always ignore image's EXIF info regardless of the flag. + The `turbojpeg` backend only supports `color` and `grayscale`. + channel_order (str): Order of channel, candidates are `bgr` and `rgb`. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. + If backend is None, the global imread_backend specified by + ``mmcv.use_backend()`` will be used. Default: None. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Default: None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + + Returns: + ndarray: Loaded image array. 
+ + Examples: + >>> import mmcv + >>> img_path = '/path/to/img.jpg' + >>> img = mmcv.imread(img_path) + >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb', + ... backend='cv2') + >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr', + ... backend='pillow') + >>> s3_img_path = 's3://bucket/img.jpg' + >>> # infer the file backend by the prefix s3 + >>> img = mmcv.imread(s3_img_path) + >>> # manually set the file backend petrel + >>> img = mmcv.imread(s3_img_path, backend_args={ + ... 'backend': 'petrel'}) + >>> http_img_path = 'http://path/to/img.jpg' + >>> img = mmcv.imread(http_img_path) + >>> img = mmcv.imread(http_img_path, backend_args={ + ... 'backend': 'http'}) + """ + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at the ' + 'same time.') + + if isinstance(img_or_path, Path): + img_or_path = str(img_or_path) + + if isinstance(img_or_path, np.ndarray): + return img_or_path + elif is_str(img_or_path): + if file_client_args is not None: + file_client = fileio.FileClient.infer_client( + file_client_args, img_or_path) + img_bytes = file_client.get(img_or_path) + else: + img_bytes = fileio.get(img_or_path, backend_args=backend_args) + return imfrombytes(img_bytes, flag, channel_order, backend) + else: + raise TypeError('"img" must be a numpy array or a str or ' + 'a pathlib.Path object') + + +def imfrombytes(content: bytes, + flag: str = 'color', + channel_order: str = 'bgr', + backend: Optional[str] = None) -> np.ndarray: + """Read an image from bytes. + + Args: + content (bytes): Image bytes got from files or other streams. + flag (str): Same as :func:`imread`. + channel_order (str): The channel order of the output, candidates + are 'bgr' and 'rgb'. Default to 'bgr'. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is + None, the global imread_backend specified by ``mmcv.use_backend()`` + will be used. Default: None. + + Returns: + ndarray: Loaded image array. + + Examples: + >>> img_path = '/path/to/img.jpg' + >>> with open(img_path, 'rb') as f: + >>> img_buff = f.read() + >>> img = mmcv.imfrombytes(img_buff) + >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb') + >>> img = mmcv.imfrombytes(img_buff, backend='pillow') + >>> img = mmcv.imfrombytes(img_buff, backend='cv2') + """ + + if backend is None: + backend = imread_backend + if backend not in supported_backends: + raise ValueError( + f'backend: {backend} is not supported. 
Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'") + if backend == 'turbojpeg': + img = jpeg.decode( # type: ignore + content, _jpegflag(flag, channel_order)) + if img.shape[-1] == 1: + img = img[:, :, 0] + return img + elif backend == 'pillow': + with io.BytesIO(content) as buff: + img = Image.open(buff) + img = _pillow2array(img, flag, channel_order) + return img + elif backend == 'tifffile': + with io.BytesIO(content) as buff: + img = tifffile.imread(buff) + return img + else: + img_np = np.frombuffer(content, np.uint8) + flag = imread_flags[flag] if is_str(flag) else flag + img = cv2.imdecode(img_np, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + + +def imwrite(img: np.ndarray, + file_path: str, + params: Optional[list] = None, + auto_mkdir: Optional[bool] = None, + file_client_args: Optional[dict] = None, + *, + backend_args: Optional[dict] = None) -> bool: + """Write image to file. + + Warning: + The parameter `auto_mkdir` will be deprecated in the future and every + file clients will make directory automatically. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. It will be deprecated. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmengine.fileio.FileClient` for details. + Default: None. It will be deprecated in future. Please use + ``backend_args`` instead. + Deprecated in version 2.0.0rc4. + backend_args (dict, optional): Instantiates the corresponding file + backend. It may contain `backend` key to specify the file + backend. If it contains, the file backend corresponding to this + value will be used and initialized with the remaining values, + otherwise the corresponding file backend will be selected + based on the prefix of the file path. Defaults to None. + New in version 2.0.0rc4. + + Returns: + bool: Successful or not. + + Examples: + >>> # write to hard disk client + >>> ret = mmcv.imwrite(img, '/path/to/img.jpg') + >>> # infer the file backend by the prefix s3 + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg') + >>> # manually set the file backend petrel + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', backend_args={ + ... 'backend': 'petrel'}) + """ + if file_client_args is not None: + warnings.warn( + '"file_client_args" will be deprecated in future. ' + 'Please use "backend_args" instead', DeprecationWarning) + if backend_args is not None: + raise ValueError( + '"file_client_args" and "backend_args" cannot be set at the ' + 'same time.') + + assert is_filepath(file_path) + file_path = str(file_path) + if auto_mkdir is not None: + warnings.warn( + 'The parameter `auto_mkdir` will be deprecated in the future and ' + 'every file clients will make directory automatically.') + + img_ext = osp.splitext(file_path)[-1] + # Encode image according to image suffix. + # For example, if image path is '/path/your/img.jpg', the encode + # format is '.jpg'. 
+ flag, img_buff = cv2.imencode(img_ext, img, params) + + if file_client_args is not None: + file_client = fileio.FileClient.infer_client(file_client_args, + file_path) + file_client.put(img_buff.tobytes(), file_path) + else: + fileio.put(img_buff.tobytes(), file_path, backend_args=backend_args) + + return flag diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/misc.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..e923cad4e5f7d210640ee51291a48d82c3b84c32 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/misc.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import numpy as np + +import mmcv + +try: + import torch +except ImportError: + torch = None + + +def tensor2imgs(tensor, + mean: Optional[tuple] = None, + std: Optional[tuple] = None, + to_rgb: bool = True) -> list: + """Convert tensor to 3-channel images or 1-channel gray images. + + Args: + tensor (torch.Tensor): Tensor that contains multiple images, shape ( + N, C, H, W). :math:`C` can be either 3 or 1. + mean (tuple[float], optional): Mean of images. If None, + (0, 0, 0) will be used for tensor with 3-channel, + while (0, ) for tensor with 1-channel. Defaults to None. + std (tuple[float], optional): Standard deviation of images. If None, + (1, 1, 1) will be used for tensor with 3-channel, + while (1, ) for tensor with 1-channel. Defaults to None. + to_rgb (bool, optional): Whether the tensor was converted to RGB + format in the first place. If so, convert it back to BGR. + For the tensor with 1 channel, it must be False. Defaults to True. + + Returns: + list[np.ndarray]: A list that contains multiple images. + """ + + if torch is None: + raise RuntimeError('pytorch is not installed') + assert torch.is_tensor(tensor) and tensor.ndim == 4 + channels = tensor.size(1) + assert channels in [1, 3] + if mean is None: + mean = (0, ) * channels + if std is None: + std = (1, ) * channels + assert (channels == len(mean) == len(std) == 3) or \ + (channels == len(mean) == len(std) == 1 and not to_rgb) + + num_imgs = tensor.size(0) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + imgs = [] + for img_id in range(num_imgs): + img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) + img = mmcv.imdenormalize( + img, mean, std, to_bgr=to_rgb).astype(np.uint8) + imgs.append(np.ascontiguousarray(img)) + return imgs diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/photometric.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/photometric.py new file mode 100644 index 0000000000000000000000000000000000000000..12cbb90822564bf14cd5176cc3c5532220db40da --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/image/photometric.py @@ -0,0 +1,561 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Optional + +import cv2 +import numpy as np +from mmengine.utils import is_tuple_of +from PIL import Image, ImageEnhance + +from .colorspace import bgr2gray, gray2bgr +from .io import imread_backend + + +def imnormalize(img, mean, std, to_rgb=True): + """Normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. 
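A denormalization round trip for `tensor2imgs` above, assuming PyTorch is installed; the ImageNet statistics are illustrative only:

```python
import numpy as np
import torch
from mmcv.image import tensor2imgs

batch = torch.randn(2, 3, 32, 32)     # (N, C, H, W), RGB, normalized
mean = (123.675, 116.28, 103.53)      # illustrative ImageNet RGB mean
std = (58.395, 57.12, 57.375)

# to_rgb=True means the tensor is in RGB and each output image is
# converted back to a contiguous BGR uint8 HxWxC array.
imgs = tensor2imgs(batch, mean=mean, std=std, to_rgb=True)
assert len(imgs) == 2
assert imgs[0].shape == (32, 32, 3) and imgs[0].dtype == np.uint8
```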
+ to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + img = img.copy().astype(np.float32) + return imnormalize_(img, mean, std, to_rgb) + + +def imnormalize_(img, mean, std, to_rgb=True): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + + +def imdenormalize(img, mean, std, to_bgr=True): + assert img.dtype != np.uint8 + mean = mean.reshape(1, -1).astype(np.float64) + std = std.reshape(1, -1).astype(np.float64) + img = cv2.multiply(img, std) # make a copy + cv2.add(img, mean, img) # inplace + if to_bgr: + cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace + return img + + +def iminvert(img): + """Invert (negate) an image. + + Args: + img (ndarray): Image to be inverted. + + Returns: + ndarray: The inverted image. + """ + return np.full_like(img, 255) - img + + +def solarize(img, thr=128): + """Solarize an image (invert all pixel values above a threshold) + + Args: + img (ndarray): Image to be solarized. + thr (int): Threshold for solarizing (0 - 255). + + Returns: + ndarray: The solarized image. + """ + img = np.where(img < thr, img, 255 - img) + return img + + +def posterize(img, bits): + """Posterize an image (reduce the number of bits for each color channel) + + Args: + img (ndarray): Image to be posterized. + bits (int): Number of bits (1 to 8) to use for posterizing. + + Returns: + ndarray: The posterized image. + """ + shift = 8 - bits + img = np.left_shift(np.right_shift(img, shift), shift) + return img + + +def adjust_color(img, alpha=1, beta=None, gamma=0, backend=None): + r"""It blends the source image and its gray image: + + .. math:: + output = img * alpha + gray\_img * beta + gamma + + Args: + img (ndarray): The input source image. + alpha (int | float): Weight for the source image. Default 1. + beta (int | float): Weight for the converted gray image. + If None, it's assigned the value (1 - `alpha`). + gamma (int | float): Scalar added to each sum. + Same as :func:`cv2.addWeighted`. Default 0. + backend (str | None): The image processing backend type. Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: Colored image which has the same size and dtype as input. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + warnings.warn("Only use 'alpha' for pillow backend.") + # Image.fromarray defaultly supports RGB, not BGR. 
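The two bit-level ops above are easy to sanity-check on a hand-picked row of pixel values:

```python
import numpy as np
import mmcv

row = np.array([[0, 100, 128, 200, 255]], dtype=np.uint8)

# solarize keeps values below thr and inverts the rest (255 - v).
assert np.array_equal(
    mmcv.solarize(row, thr=128),
    np.array([[0, 100, 127, 55, 0]], dtype=np.uint8))

# posterize to 2 bits keeps only the top two bits of every value.
assert np.array_equal(
    mmcv.posterize(row, bits=2),
    np.array([[0, 64, 128, 192, 192]], dtype=np.uint8))
```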
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + enhancer = ImageEnhance.Color(pil_image) + pil_image = enhancer.enhance(alpha) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + gray_img = bgr2gray(img) + gray_img = np.tile(gray_img[..., None], [1, 1, 3]) + if beta is None: + beta = 1 - alpha + colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) + if not colored_img.dtype == np.uint8: + # Note when the dtype of `img` is not the default `np.uint8` + # (e.g. np.float32), the value in `colored_img` got from cv2 + # is not guaranteed to be in range [0, 255], so here clip + # is needed. + colored_img = np.clip(colored_img, 0, 255) + return colored_img.astype(img.dtype) + + +def imequalize(img): + """Equalize the image histogram. + + This function applies a non-linear mapping to the input image, + in order to create a uniform distribution of grayscale values + in the output image. + + Args: + img (ndarray): Image to be equalized. + + Returns: + ndarray: The equalized image. + """ + + def _scale_channel(im, c): + """Scale the data in the corresponding channel.""" + im = im[:, :, c] + # Compute the histogram of the image channel. + histo = np.histogram(im, 256, (0, 255))[0] + # For computing the step, filter out the nonzeros. + nonzero_histo = histo[histo > 0] + step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 + if not step: + lut = np.array(range(256)) + else: + # Compute the cumulative sum, shifted by step // 2 + # and then normalized by step. + lut = (np.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. + lut = np.concatenate([[0], lut[:-1]], 0) + # handle potential integer overflow + lut[lut > 255] = 255 + # If step is zero, return the original image. + # Otherwise, index from lut. + return np.where(np.equal(step, 0), im, lut[im]) + + # Scales each channel independently and then stacks + # the result. + s1 = _scale_channel(img, 0) + s2 = _scale_channel(img, 1) + s3 = _scale_channel(img, 2) + equalized_img = np.stack([s1, s2, s3], axis=-1) + return equalized_img.astype(img.dtype) + + +def adjust_brightness(img, factor=1., backend=None): + """Adjust image brightness. + + This function controls the brightness of an image. An + enhancement factor of 0.0 gives a black image. + A factor of 1.0 gives the original image. This function + blends the source image and the degenerated black image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be brightened. + factor (float): A value controls the enhancement. + Factor 1.0 returns the original image, lower + factors mean less color (brightness, contrast, + etc), and higher values more. Default 1. + backend (str | None): The image processing backend type. Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: The brightened image. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + # Image.fromarray defaultly supports RGB, not BGR. 
+ pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + enhancer = ImageEnhance.Brightness(pil_image) + pil_image = enhancer.enhance(factor) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + degenerated = np.zeros_like(img) + # Note manually convert the dtype to np.float32, to + # achieve as close results as PIL.ImageEnhance.Brightness. + # Set beta=1-factor, and gamma=0 + brightened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + brightened_img = np.clip(brightened_img, 0, 255) + return brightened_img.astype(img.dtype) + + +def adjust_contrast(img, factor=1., backend=None): + """Adjust image contrast. + + This function controls the contrast of an image. An + enhancement factor of 0.0 gives a solid grey + image. A factor of 1.0 gives the original image. It + blends the source image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be contrasted. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + backend (str | None): The image processing backend type. Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: The contrasted image. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + # Image.fromarray defaultly supports RGB, not BGR. + pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + enhancer = ImageEnhance.Contrast(pil_image) + pil_image = enhancer.enhance(factor) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + gray_img = bgr2gray(img) + hist = np.histogram(gray_img, 256, (0, 255))[0] + mean = round(np.sum(gray_img) / np.sum(hist)) + degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) + degenerated = gray2bgr(degenerated) + contrasted_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + contrasted_img = np.clip(contrasted_img, 0, 255) + return contrasted_img.astype(img.dtype) + + +def auto_contrast(img, cutoff=0): + """Auto adjust image contrast. + + This function maximize (normalize) image contrast by first removing cutoff + percent of the lightest and darkest pixels from the histogram and remapping + the image so that the darkest pixel becomes black (0), and the lightest + becomes white (255). + + Args: + img (ndarray): Image to be contrasted. BGR order. + cutoff (int | float | tuple): The cutoff percent of the lightest and + darkest pixels to be removed. If given as tuple, it shall be + (low, high). Otherwise, the single value will be used for both. + Defaults to 0. + + Returns: + ndarray: The contrasted image. + """ + + def _auto_contrast_channel(im, c, cutoff): + im = im[:, :, c] + # Compute the histogram of the image channel. 
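Both enhancers above reduce to the same weighted blend `img * factor + degenerated * (1 - factor)`; a constant image makes the arithmetic visible:

```python
import numpy as np
import mmcv

img = np.full((4, 4, 3), 100, dtype=np.uint8)

assert mmcv.adjust_brightness(img, factor=0.).max() == 0     # solid black
assert np.array_equal(mmcv.adjust_brightness(img, factor=1.), img)
assert mmcv.adjust_brightness(img, factor=1.5).max() == 150  # 100 * 1.5

# For a constant image the contrast degenerate (its gray mean) equals the
# image itself, so any factor leaves it unchanged.
assert np.array_equal(mmcv.adjust_contrast(img, factor=0.), img)
```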
+ histo = np.histogram(im, 256, (0, 255))[0] + # Remove cut-off percent pixels from histo + histo_sum = np.cumsum(histo) + cut_low = histo_sum[-1] * cutoff[0] // 100 + cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100 + histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low + histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0) + + # Compute mapping + low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1] + # If all the values have been cut off, return the origin img + if low >= high: + return im + scale = 255.0 / (high - low) + offset = -low * scale + lut = np.array(range(256)) + lut = lut * scale + offset + lut = np.clip(lut, 0, 255) + return lut[im] + + if isinstance(cutoff, (int, float)): + cutoff = (cutoff, cutoff) + else: + assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \ + f'float or tuple, but got {type(cutoff)} instead.' + # Auto adjusts contrast for each channel independently and then stacks + # the result. + s1 = _auto_contrast_channel(img, 0, cutoff) + s2 = _auto_contrast_channel(img, 1, cutoff) + s3 = _auto_contrast_channel(img, 2, cutoff) + contrasted_img = np.stack([s1, s2, s3], axis=-1) + return contrasted_img.astype(img.dtype) + + +def adjust_sharpness(img, factor=1., kernel=None): + """Adjust image sharpness. + + This function controls the sharpness of an image. An + enhancement factor of 0.0 gives a blurred image. A + factor of 1.0 gives the original image. And a factor + of 2.0 gives a sharpened image. It blends the source + image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be sharpened. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + kernel (np.ndarray, optional): Filter kernel to be applied on the img + to obtain the degenerated img. Defaults to None. + + Note: + No value sanity check is enforced on the kernel set by users. So with + an inappropriate kernel, the ``adjust_sharpness`` may fail to perform + the function its name indicates but end up performing whatever + transform determined by the kernel. + + Returns: + ndarray: The sharpened image. + """ + + if kernel is None: + # adopted from PIL.ImageFilter.SMOOTH + kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13 + assert isinstance(kernel, np.ndarray), \ + f'kernel must be of type np.ndarray, but got {type(kernel)} instead.' + assert kernel.ndim == 2, \ + f'kernel must have a dimension of 2, but got {kernel.ndim} instead.' + + degenerated = cv2.filter2D(img, -1, kernel) + sharpened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + sharpened_img = np.clip(sharpened_img, 0, 255) + return sharpened_img.astype(img.dtype) + + +def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True): + """AlexNet-style PCA jitter. + + This data augmentation is proposed in `ImageNet Classification with Deep + Convolutional Neural Networks + `_. + + Args: + img (ndarray): Image to be adjusted lighting. BGR order. + eigval (ndarray): the eigenvalue of the convariance matrix of pixel + values, respectively. + eigvec (ndarray): the eigenvector of the convariance matrix of pixel + values, respectively. + alphastd (float): The standard deviation for distribution of alpha. + Defaults to 0.1 + to_rgb (bool): Whether to convert img to rgb. + + Returns: + ndarray: The adjusted image. 
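A sketch of the sharpness blend above; factor 0 yields the kernel-smoothed image, 1 the original, and values above 1 over-sharpen:

```python
import numpy as np
import mmcv

img = np.random.randint(0, 256, (32, 32, 3), dtype=np.uint8)

# factor=1 weights the degenerated (smoothed) image by zero.
assert np.array_equal(mmcv.adjust_sharpness(img, factor=1.), img)

blurred = mmcv.adjust_sharpness(img, factor=0.)  # PIL-style SMOOTH kernel
sharp = mmcv.adjust_sharpness(img, factor=2.)    # over-sharpened
```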
+ """ + assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \ + f'eigval and eigvec should both be of type np.ndarray, got ' \ + f'{type(eigval)} and {type(eigvec)} instead.' + + assert eigval.ndim == 1 and eigvec.ndim == 2 + assert eigvec.shape == (3, eigval.shape[0]) + n_eigval = eigval.shape[0] + assert isinstance(alphastd, float), 'alphastd should be of type float, ' \ + f'got {type(alphastd)} instead.' + + img = img.copy().astype(np.float32) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + + alpha = np.random.normal(0, alphastd, n_eigval) + alter = eigvec \ + * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \ + * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval)) + alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape) + img_adjusted = img + alter + return img_adjusted + + +def lut_transform(img, lut_table): + """Transform array by look-up table. + + The function lut_transform fills the output array with values from the + look-up table. Indices of the entries are taken from the input array. + + Args: + img (ndarray): Image to be transformed. + lut_table (ndarray): look-up table of 256 elements; in case of + multi-channel input array, the table should either have a single + channel (in this case the same table is used for all channels) or + the same number of channels as in the input array. + + Returns: + ndarray: The transformed image. + """ + assert isinstance(img, np.ndarray) + assert 0 <= np.min(img) and np.max(img) <= 255 + assert isinstance(lut_table, np.ndarray) + assert lut_table.shape == (256, ) + + return cv2.LUT(np.array(img, dtype=np.uint8), lut_table) + + +def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): + """Use CLAHE method to process the image. + + See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. + Graphics Gems, 1994:474-485.` for more information. + + Args: + img (ndarray): Image to be processed. + clip_limit (float): Threshold for contrast limiting. Default: 40.0. + tile_grid_size (tuple[int]): Size of grid for histogram equalization. + Input image will be divided into equally sized rectangular tiles. + It defines the number of tiles in row and column. Default: (8, 8). + + Returns: + ndarray: The processed image. + """ + assert isinstance(img, np.ndarray) + assert img.ndim == 2 + assert isinstance(clip_limit, (float, int)) + assert is_tuple_of(tile_grid_size, int) + assert len(tile_grid_size) == 2 + + clahe = cv2.createCLAHE(clip_limit, tile_grid_size) + return clahe.apply(np.array(img, dtype=np.uint8)) + + +def adjust_hue(img: np.ndarray, + hue_factor: float, + backend: Optional[str] = None) -> np.ndarray: + """Adjust hue of an image. + + The image hue is adjusted by converting the image to HSV and cyclically + shifting the intensities in the hue channel (H). The image is then + converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. + + Modified from + https://github.com/pytorch/vision/blob/main/torchvision/ + transforms/functional.py + + Args: + img (ndarray): Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + backend (str | None): The image processing backend type. 
Options are + `cv2`, `pillow`, `None`. If backend is None, the global + ``imread_backend`` specified by ``mmcv.use_backend()`` will be + used. Defaults to None. + + Returns: + ndarray: Hue adjusted image. + """ + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported.' + f"Supported backends are 'cv2', 'pillow'") + + if not (-0.5 <= hue_factor <= 0.5): + raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].') + if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})): + raise TypeError('img should be ndarray with dim=[2 or 3].') + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + # Image.fromarray defaultly supports RGB, not BGR. + pil_image = Image.fromarray(img[..., ::-1], mode='RGB') + input_mode = pil_image.mode + if input_mode in {'L', '1', 'I', 'F'}: + return pil_image + + h, s, v = pil_image.convert('HSV').split() + + np_h = np.array(h, dtype=np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + np_h += np.uint8(hue_factor * 255) + h = Image.fromarray(np_h, 'L') + + pil_image = Image.merge('HSV', (h, s, v)).convert(input_mode) + return np.array(pil_image, dtype=img.dtype)[..., ::-1] + else: + dtype = img.dtype + img = img.astype(np.uint8) + hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) + h, s, v = cv2.split(hsv_img) + h = h.astype(np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + h += np.uint8(hue_factor * 255) + hsv_img = cv2.merge([h, s, v]) + return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/__init__.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ffad9b2bfdbf94cf7963a48ca5252959d43fe29c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/__init__.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
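The uint8 wraparound noted in the code above is what makes the hue shift cyclic; a shape-level sketch:

```python
import numpy as np
import mmcv

img = np.random.randint(0, 256, (32, 32, 3), dtype=np.uint8)

shifted = mmcv.adjust_hue(img, hue_factor=0.25)  # quarter-turn in H
flipped = mmcv.adjust_hue(img, hue_factor=0.5)   # complementary colors
assert shifted.shape == flipped.shape == img.shape
assert shifted.dtype == img.dtype                # original dtype restored
```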
+from mmcv.utils import IS_MLU_AVAILABLE +from .active_rotated_filter import active_rotated_filter +from .assign_score_withk import assign_score_withk +from .ball_query import ball_query +from .bbox import bbox_overlaps +from .bezier_align import BezierAlign, bezier_align +from .bias_act import bias_act +from .border_align import BorderAlign, border_align +from .box_iou_quadri import box_iou_quadri +from .box_iou_rotated import box_iou_rotated +from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive +from .cc_attention import CrissCrossAttention +from .chamfer_distance import chamfer_distance +from .contour_expand import contour_expand +from .conv2d_gradfix import conv2d, conv_transpose2d +from .convex_iou import convex_giou, convex_iou +from .corner_pool import CornerPool +from .correlation import Correlation +from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d +from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack, + ModulatedDeformRoIPoolPack, deform_roi_pool) +from .deprecated_wrappers import Conv2d_deprecated as Conv2d +from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d +from .deprecated_wrappers import Linear_deprecated as Linear +from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d +from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d +from .filtered_lrelu import filtered_lrelu +from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, + sigmoid_focal_loss, softmax_focal_loss) +from .furthest_point_sample import (furthest_point_sample, + furthest_point_sample_with_dist) +from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu +from .gather_points import gather_points +from .group_points import GroupAll, QueryAndGroup, grouping_operation +from .info import get_compiler_version, get_compiling_cuda_version +from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d, + nms3d_normal, nms_bev, nms_normal_bev) +from .knn import knn +from .masked_conv import MaskedConv2d, masked_conv2d +from .min_area_polygons import min_area_polygons +from .modulated_deform_conv import (ModulatedDeformConv2d, + ModulatedDeformConv2dPack, + modulated_deform_conv2d) +from .multi_scale_deform_attn import MultiScaleDeformableAttention +from .nms import batched_nms, nms, nms_match, nms_quadri, nms_rotated, soft_nms +from .pixel_group import pixel_group +from .point_sample import (SimpleRoIAlign, point_sample, + rel_roi_point_to_rel_img_point) +from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, + points_in_boxes_part) +from .points_in_polygons import points_in_polygons +from .points_sampler import PointsSampler +from .prroi_pool import PrRoIPool, prroi_pool +from .psa_mask import PSAMask +from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated +from .roi_align import RoIAlign, roi_align +from .roi_align_rotated import RoIAlignRotated, roi_align_rotated +from .roi_pool import RoIPool, roi_pool +from .roiaware_pool3d import RoIAwarePool3d +from .roipoint_pool3d import RoIPointPool3d +from .rotated_feature_align import rotated_feature_align +from .saconv import SAConv2d +from .scatter_points import DynamicScatter, dynamic_scatter +from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, + SparseConvTranspose3d, SparseInverseConv2d, + SparseInverseConv3d, SubMConv2d, SubMConv3d) +from .sparse_modules import SparseModule, SparseSequential +from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d +from .sparse_structure 
import SparseConvTensor, scatter_nd +from .sync_bn import SyncBatchNorm +from .three_interpolate import three_interpolate +from .three_nn import three_nn +from .tin_shift import TINShift, tin_shift +from .upfirdn2d import filter2d, upfirdn2d, upsample2d +from .voxelize import Voxelization, voxelization + +__all__ = [ + 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', + 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', + 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', + 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', + 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', + 'get_compiler_version', 'get_compiling_cuda_version', 'MaskedConv2d', + 'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', + 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', + 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', + 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', + 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', + 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', + 'box_iou_rotated', 'box_iou_quadri', 'RoIPointPool3d', 'nms_rotated', + 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', + 'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated', + 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated', + 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', + 'contour_expand', 'three_nn', 'three_interpolate', + 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', + 'gather_points', 'furthest_point_sample', 'nms_quadri', + 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', + 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev', + 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization', + 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d', + 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', + 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d', + 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', + 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part', + 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons', + 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou', + 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance', + 'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d', + 'conv_transpose2d', 'filter2d', 'upsample2d', 'BezierAlign', 'bezier_align' +] + +if IS_MLU_AVAILABLE: + from .deform_conv import DeformConv2dPack_MLU # noqa:F401 + from .modulated_deform_conv import \ + ModulatedDeformConv2dPack_MLU # noqa:F401 + __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU']) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/active_rotated_filter.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/active_rotated_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..b8ba43dd41cca14e0d74b4ba7dd8316da2ba4abe --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/active_rotated_filter.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
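Before the implementation below, the shape contract of `active_rotated_filter` as plain-Python bookkeeping: rotations are folded into the output planes and orientations into the input planes.

```python
# weight:  [num_output_planes, num_input_planes, num_orientations, H, W]
# indices: [num_orientations, H, W, num_rotations]
op, ip, o, h, w = 2, 4, 8, 3, 3
r = 8

# output: [num_output_planes * num_rotations,
#          num_input_planes * num_orientations, H, W]
assert (op * r, ip * o, h, w) == (16, 32, 3, 3)
```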
+from typing import Tuple + +import torch +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', + ['active_rotated_filter_forward', 'active_rotated_filter_backward']) + + +class ActiveRotatedFilterFunction(Function): + """Encoding the orientation information and generating orientation- + sensitive features. + + The details are described in the paper `Align Deep Features for Oriented + Object Detection _`. + """ + + @staticmethod + def forward(ctx, input: torch.Tensor, + indices: torch.Tensor) -> torch.Tensor: + """ + Args: + input (torch.Tensor): Input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. + indices (torch.Tensor): Indices with shape + [num_orientations, H, W, num_rotations]. + + Returns: + torch.Tensor: Refined features with shape [num_output_planes * + num_rotations, num_input_planes * num_orientations, H, W]. + """ + ctx.save_for_backward(input, indices) + op, ip, o, h, w = input.size() + o, h, w, r = indices.size() + output = input.new_zeros((op * r, ip * o, h, w)) + ext_module.active_rotated_filter_forward(input, indices, output) + + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]: + """ + Args: + grad_output (torch.Tensor): The gradient of output features + with shape [num_output_planes * num_rotations, + num_input_planes * num_orientations, H, W]. + + Returns: + torch.Tensor: The gradient of input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. + """ + input, indices = ctx.saved_tensors + grad_in = torch.zeros_like(input) + ext_module.active_rotated_filter_backward(grad_out, indices, grad_in) + return grad_in, None + + +active_rotated_filter = ActiveRotatedFilterFunction.apply diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/assign_score_withk.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/assign_score_withk.py new file mode 100644 index 0000000000000000000000000000000000000000..deca0892bddc52b51e9d2543a9e893f0bd67ebdb --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/assign_score_withk.py @@ -0,0 +1,131 @@ +from typing import Tuple + +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward']) + + +class AssignScoreWithK(Function): + r"""Perform weighted sum to generate output features according to scores. + Modified from `PAConv `_. + + This is a memory-efficient CUDA implementation of assign_scores operation, + which first transform all point features with weight bank, then assemble + neighbor features with ``knn_idx`` and perform weighted sum of ``scores``. + + See the `paper `_ appendix Sec. D for + more detailed descriptions. + + Note: + This implementation assumes using ``neighbor`` kernel input, which is + (point_features - center_features, point_features). + See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/ + pointnet2/paconv.py#L128 for more details. 
+ """ + + @staticmethod + def forward(ctx, + scores: torch.Tensor, + point_features: torch.Tensor, + center_features: torch.Tensor, + knn_idx: torch.Tensor, + aggregate: str = 'sum') -> torch.Tensor: + """ + Args: + scores (torch.Tensor): (B, npoint, K, M), predicted scores to + aggregate weight matrices in the weight bank. + ``npoint`` is the number of sampled centers. + ``K`` is the number of queried neighbors. + ``M`` is the number of weight matrices in the weight bank. + point_features (torch.Tensor): (B, N, M, out_dim) + Pre-computed point features to be aggregated. + center_features (torch.Tensor): (B, N, M, out_dim) + Pre-computed center features to be aggregated. + knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN. + We assume the first idx in each row is the idx of the center. + aggregate (str, optional): Aggregation method. + Can be 'sum', 'avg' or 'max'. Defaults: 'sum'. + + Returns: + torch.Tensor: (B, out_dim, npoint, K), the aggregated features. + """ + agg = {'sum': 0, 'avg': 1, 'max': 2} + + B, N, M, out_dim = point_features.size() + _, npoint, K, _ = scores.size() + + output = point_features.new_zeros((B, out_dim, npoint, K)) + ext_module.assign_score_withk_forward( + point_features.contiguous(), + center_features.contiguous(), + scores.contiguous(), + knn_idx.contiguous(), + output, + B=B, + N0=N, + N1=npoint, + M=M, + K=K, + O=out_dim, + aggregate=agg[aggregate]) + + ctx.save_for_backward(output, point_features, center_features, scores, + knn_idx) + ctx.agg = agg[aggregate] + + return output + + @staticmethod + def backward( + ctx, grad_out: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]: + """ + Args: + grad_out (torch.Tensor): (B, out_dim, npoint, K) + + Returns: + tuple[torch.Tensor]: A tuple contains five elements. The first one + is the gradient of ``scores`` whose shape is (B, npoint, K, M). The + second is the gradient of ``point_features`` whose shape is + (B, N, M, out_dim). The third is the gradient of + ``center_features`` with the shape of (B, N, M, out_dim). The last + two are ``None``. + """ + _, point_features, center_features, scores, knn_idx = ctx.saved_tensors + + agg = ctx.agg + + B, N, M, out_dim = point_features.size() + _, npoint, K, _ = scores.size() + + grad_point_features = point_features.new_zeros(point_features.shape) + grad_center_features = center_features.new_zeros(center_features.shape) + grad_scores = scores.new_zeros(scores.shape) + + ext_module.assign_score_withk_backward( + grad_out.contiguous(), + point_features.contiguous(), + center_features.contiguous(), + scores.contiguous(), + knn_idx.contiguous(), + grad_point_features, + grad_center_features, + grad_scores, + B=B, + N0=N, + N1=npoint, + M=M, + K=K, + O=out_dim, + aggregate=agg) + + return grad_scores, grad_point_features, \ + grad_center_features, None, None + + +assign_score_withk = AssignScoreWithK.apply diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/ball_query.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/ball_query.py new file mode 100644 index 0000000000000000000000000000000000000000..a89b36b52b1cce8ab90274418a4d1346796d971c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/ball_query.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Optional, Tuple + +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['ball_query_forward', 'stack_ball_query_forward']) + + +class BallQuery(Function): + """Find nearby points in spherical space.""" + + @staticmethod + def forward( + ctx, + min_radius: float, + max_radius: float, + sample_num: int, + xyz: torch.Tensor, + center_xyz: torch.Tensor, + xyz_batch_cnt: Optional[torch.Tensor] = None, + center_xyz_batch_cnt: Optional[torch.Tensor] = None + ) -> torch.Tensor: + """ + Args: + min_radius (float): minimum radius of the balls. + max_radius (float): maximum radius of the balls. + sample_num (int): maximum number of features in the balls. + xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features, + or staked input (N1 + N2 ..., 3). + center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball + query, or staked input (M1 + M2 ..., 3). + xyz_batch_cnt: (batch_size): Stacked input xyz coordinates nums in + each batch, just like (N1, N2, ...). Defaults to None. + New in version 1.7.0. + center_xyz_batch_cnt: (batch_size): Stacked centers coordinates + nums in each batch, just line (M1, M2, ...). Defaults to None. + New in version 1.7.0. + + Returns: + torch.Tensor: (B, npoint, nsample) tensor with the indices of the + features that form the query balls. + """ + assert center_xyz.is_contiguous() + assert xyz.is_contiguous() + assert min_radius < max_radius + if xyz_batch_cnt is not None and center_xyz_batch_cnt is not None: + assert xyz_batch_cnt.dtype == torch.int + assert center_xyz_batch_cnt.dtype == torch.int + idx = center_xyz.new_zeros((center_xyz.shape[0], sample_num), + dtype=torch.int32) + ext_module.stack_ball_query_forward( + center_xyz, + center_xyz_batch_cnt, + xyz, + xyz_batch_cnt, + idx, + max_radius=max_radius, + nsample=sample_num, + ) + else: + B, N, _ = xyz.size() + npoint = center_xyz.size(1) + idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int32) + ext_module.ball_query_forward( + center_xyz, + xyz, + idx, + b=B, + n=N, + m=npoint, + min_radius=min_radius, + max_radius=max_radius, + nsample=sample_num) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(idx) + return idx + + @staticmethod + def backward(ctx, a=None) -> Tuple[None, None, None, None]: + return None, None, None, None + + +ball_query = BallQuery.apply diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bbox.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..4583ba7d5a867a86f4a798c524b2c48e9c8f1ae0 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bbox.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
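`ball_query` usage, guarded the same way; the radii and counts are arbitrary:

```python
import torch
from mmcv.ops import ball_query

if torch.cuda.is_available():  # requires the compiled CUDA extension
    xyz = torch.rand(2, 256, 3, device='cuda')      # all points
    centers = torch.rand(2, 32, 3, device='cuda')   # query ball centers

    # Up to 16 neighbor indices per center with distance in [0.0, 0.4).
    idx = ball_query(0.0, 0.4, 16, xyz, centers)
    assert idx.shape == (2, 32, 16) and idx.dtype == torch.int32
```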
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
+
+
+def _bbox_overlaps_cpu(bboxes1: torch.Tensor,
+                       bboxes2: torch.Tensor,
+                       mode: str = 'iou',
+                       aligned: bool = False,
+                       offset: int = 0) -> torch.Tensor:
+    assert mode in ['iou', 'iof']
+
+    if aligned:
+        lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
+        rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]
+
+        wh = (rb - lt + offset).clamp(min=0)  # [rows, 2]
+        overlap = wh[:, 0] * wh[:, 1]
+        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
+            bboxes1[:, 3] - bboxes1[:, 1] + offset)
+
+        if mode == 'iou':
+            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
+                bboxes2[:, 3] - bboxes2[:, 1] + offset)
+            ious = overlap / (area1 + area2 - overlap)
+        else:
+            ious = overlap / area1
+    else:
+        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
+        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]
+
+        wh = (rb - lt + offset).clamp(min=0)  # [rows, cols, 2]
+        overlap = wh[:, :, 0] * wh[:, :, 1]
+        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
+            bboxes1[:, 3] - bboxes1[:, 1] + offset)
+
+        if mode == 'iou':
+            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
+                bboxes2[:, 3] - bboxes2[:, 1] + offset)
+            ious = overlap / (area1[:, None] + area2 - overlap)
+        else:
+            ious = overlap / (area1[:, None])
+
+    return ious
+
+
+def bbox_overlaps(bboxes1: torch.Tensor,
+                  bboxes2: torch.Tensor,
+                  mode: str = 'iou',
+                  aligned: bool = False,
+                  offset: int = 0) -> torch.Tensor:
+    """Calculate overlap between two sets of bboxes.
+
+    If ``aligned`` is ``False``, then calculate the ious between each bbox
+    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+    bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
+            empty.
+        bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
+            empty. If aligned is ``True``, then m and n must be equal.
+        mode (str): "iou" (intersection over union) or "iof" (intersection
+            over foreground).
+
+    Returns:
+        torch.Tensor: Returns the ious between boxes. If ``aligned`` is
+            ``False``, the shape of ious is (m, n) else (m, 1).
+ + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> bbox_overlaps(bboxes1, bboxes2) + tensor([[0.5000, 0.0000, 0.0000], + [0.0000, 0.0000, 1.0000], + [0.0000, 0.0000, 0.0000]]) + + Example: + >>> empty = torch.FloatTensor([]) + >>> nonempty = torch.FloatTensor([ + >>> [0, 0, 10, 9], + >>> ]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + mode_dict = {'iou': 0, 'iof': 1} + assert mode in mode_dict.keys() + mode_flag = mode_dict[mode] + # Either the boxes are empty or the length of boxes' last dimension is 4 + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + assert offset == 1 or offset == 0 + + rows = bboxes1.size(0) + cols = bboxes2.size(0) + + if aligned: + assert rows == cols + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros((rows, cols)) + + if rows * cols == 0: + return ious + + if bboxes1.device.type == 'cpu' and torch.__version__ == 'parrots': + return _bbox_overlaps_cpu( + bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset) + + ext_module.bbox_overlaps( + bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) + + return ious diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bezier_align.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bezier_align.py new file mode 100644 index 0000000000000000000000000000000000000000..6db7f5c8d8567b4c6ad5df2eb77f6cf60a4f0bb6 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bezier_align.py @@ -0,0 +1,137 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
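The `offset` flag decides whether box coordinates are continuous (`offset=0`) or inclusive pixel indices (`offset=1`); worked numbers for two overlapping boxes, assuming a build with the compiled extension:

```python
import torch
from mmcv.ops import bbox_overlaps

a = torch.tensor([[0., 0., 9., 9.]])
b = torch.tensor([[5., 5., 14., 14.]])

# offset=0: sides 9, overlap 4 * 4 = 16, union 81 + 81 - 16 = 146.
iou0 = bbox_overlaps(a, b, aligned=True, offset=0)   # ~0.1096
# offset=1: sides 10, overlap 5 * 5 = 25, union 100 + 100 - 25 = 175.
iou1 = bbox_overlaps(a, b, aligned=True, offset=1)   # ~0.1429
```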
+from typing import Tuple, Union + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['bezier_align_forward', 'bezier_align_backward']) + + +class BezierAlignFunction(Function): + + @staticmethod + def forward(ctx, + input: torch.Tensor, + beziers: torch.Tensor, + output_size: Union[int, Tuple[int, int]], + spatial_scale: Union[int, float] = 1.0, + sampling_ratio: int = 0, + aligned: bool = True) -> torch.Tensor: + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.input_shape = input.size() + ctx.sampling_ratio = sampling_ratio + ctx.aligned = aligned + + assert beziers.size(1) == 17 + output_shape = (beziers.size(0), input.size(1), ctx.output_size[0], + ctx.output_size[1]) + output = input.new_zeros(output_shape) + ext_module.bezier_align_forward( + input, + beziers, + output, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + aligned=ctx.aligned) + + ctx.save_for_backward(beziers) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output: torch.Tensor): + beziers = ctx.saved_tensors[0] + grad_input = grad_output.new_zeros(ctx.input_shape) + grad_output = grad_output.contiguous() + ext_module.bezier_align_backward( + grad_output, + beziers, + grad_input, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + aligned=ctx.aligned) + return grad_input, None, None, None, None, None + + +bezier_align = BezierAlignFunction.apply + + +class BezierAlign(nn.Module): + """Bezier align pooling layer. + + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each + output sample. 0 to take samples densely for current models. + aligned (bool): if False, use the legacy implementation in + MMDetection. If True, align the results more perfectly. + + Note: + The implementation of BezierAlign is modified from + https://github.com/aim-uofa/AdelaiDet + + The meaning of aligned=True: + + Given a continuous coordinate c, its two neighboring pixel + indices (in our pixel model) are computed by floor(c - 0.5) and + ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete + indices [0] and [1] (which are sampled from the underlying signal + at continuous coordinates 0.5 and 1.5). But the original roi_align + (aligned=False) does not subtract the 0.5 when computing + neighboring pixel indices and therefore it uses pixels with a + slightly incorrect alignment (relative to our pixel model) when + performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; + + The difference does not make a difference to the model's + performance if ROIAlign is used together with conv layers. 
+ """ + + def __init__( + self, + output_size: Tuple, + spatial_scale: Union[int, float], + sampling_ratio: int, + aligned: bool = True, + ) -> None: + super().__init__() + + self.output_size = _pair(output_size) + self.spatial_scale = float(spatial_scale) + self.sampling_ratio = int(sampling_ratio) + self.aligned = aligned + + def forward(self, input: torch.Tensor, + beziers: torch.Tensor) -> torch.Tensor: + """BezierAlign forward. + + Args: + inputs (Tensor): input features. + beziers (Tensor): beziers for align. + """ + return bezier_align(input, beziers, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.aligned) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(output_size={self.output_size}, ' + s += f'spatial_scale={self.spatial_scale})' + s += f'sampling_ratio={self.sampling_ratio})' + s += f'aligned={self.aligned})' + return s diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bias_act.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bias_act.py new file mode 100644 index 0000000000000000000000000000000000000000..3dfa55743e0a0a6e8ad408c5937d9097cce6ea7d --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/bias_act.py @@ -0,0 +1,375 @@ +# Modified from +# https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.py + +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ +# source: https://github.com/open-mmlab/mmediting/blob/dev-1.x/mmedit/models/editors/stylegan3/stylegan3_ops/ops/bias_act.py # noqa +"""Custom PyTorch ops for efficient bias and activation.""" + +from typing import Any, Dict, Optional, Union + +import numpy as np +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['bias_act']) + + +class EasyDict(dict): + """Convenience class that behaves like a dict but allows access with the + attribute syntax.""" + + def __getattr__(self, name: str) -> Any: + try: + return self[name] + except KeyError: + raise AttributeError(name) + + def __setattr__(self, name: str, value: Any) -> None: + self[name] = value + + def __delattr__(self, name: str) -> None: + del self[name] + + +activation_funcs = { + 'linear': + EasyDict( + func=lambda x, **_: x, + def_alpha=0, + def_gain=1, + cuda_idx=1, + ref='', + has_2nd_grad=False), + 'relu': + EasyDict( + func=lambda x, **_: torch.nn.functional.relu(x), + def_alpha=0, + def_gain=np.sqrt(2), + cuda_idx=2, + ref='y', + has_2nd_grad=False), + 'lrelu': + EasyDict( + func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), + def_alpha=0.2, + def_gain=np.sqrt(2), + cuda_idx=3, + ref='y', + has_2nd_grad=False), + 'tanh': + EasyDict( + func=lambda x, **_: torch.tanh(x), + def_alpha=0, + def_gain=1, + cuda_idx=4, + ref='y', + has_2nd_grad=True), + 'sigmoid': + EasyDict( + func=lambda x, **_: torch.sigmoid(x), + def_alpha=0, + def_gain=1, + cuda_idx=5, + ref='y', + has_2nd_grad=True), + 'elu': + EasyDict( + func=lambda x, **_: torch.nn.functional.elu(x), + def_alpha=0, + def_gain=1, + cuda_idx=6, + ref='y', + has_2nd_grad=True), + 'selu': + EasyDict( + func=lambda x, **_: torch.nn.functional.selu(x), + def_alpha=0, + def_gain=1, + cuda_idx=7, + ref='y', + has_2nd_grad=True), + 'softplus': + EasyDict( + func=lambda x, **_: torch.nn.functional.softplus(x), + def_alpha=0, + def_gain=1, + cuda_idx=8, + ref='y', + has_2nd_grad=True), + 'swish': + EasyDict( + func=lambda x, **_: torch.sigmoid(x) * x, + def_alpha=0, + def_gain=np.sqrt(2), + cuda_idx=9, + ref='x', + has_2nd_grad=True), +} + +_null_tensor = torch.empty([0]) + + +def bias_act(input: torch.Tensor, + bias: Optional[torch.Tensor] = None, + dim: int = 1, + act: str = 'linear', + alpha: Optional[Union[float, int]] = None, + gain: Optional[float] = None, + clamp: Optional[float] = None, + use_custom_op: bool = True): + r"""Fused bias and activation function. + + Adds `bias` to activation tensor `input`, and evaluates activation + function `act`, and scales the result by `gain`. Each of the steps is + optional. + + In most cases, the fused op is considerably more efficient than performing + the same calculation using standard PyTorch ops. It supports first and + second order gradients, but not third order gradients. + + Args: + input (torch.Tensor): Input activation tensor. Can be of any shape. + bias (torch.Tensor): Bias vector, or `None` to disable. + Must be a 1D tensor of the same type as `input`. The shape must + be known, and it must match the dimension of `input` corresponding + to `dim`. Defaults to None. + dim (int): The dimension in `input` corresponding to the elements of + `bias`. The value of `dim` is ignored if `b` is not specified. + Defaults to 1. + act (str): Name of the activation function to evaluate, or `"linear"` + to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", + "swish", etc. See `activation_funcs` for a full list. `None` is not + allowed. Defaults to `linear`. 
+ alpha (float or int): Shape parameter for the activation + function, or `None` to use the default. Defaults to None. + gain (float): Scaling factor for the output tensor, or `None` + to use default. See `activation_funcs` for the default scaling of + each activation function. If unsure, consider specifying 1. + Defaults to None. + clamp (float): Clamp the output values to `[-clamp, +clamp]`, + or `None` to disable the clamping (default). Defaults to None. + use_custom_op (bool): Whether to use customized op. + Defaults to True. + + Returns: + torch.Tensor: Tensor of the same shape and datatype as `input`. + """ + assert isinstance(input, torch.Tensor) + if use_custom_op and input.is_cuda: + return _bias_act_cuda( + dim=dim, act=act, alpha=alpha, gain=gain, + clamp=clamp).apply(input, bias) + return _bias_act_ref( + input=input, + bias=bias, + dim=dim, + act=act, + alpha=alpha, + gain=gain, + clamp=clamp) + + +def _bias_act_ref(input: torch.Tensor, + bias: Optional[torch.Tensor] = None, + dim: int = 1, + act: str = 'linear', + alpha: Optional[Union[float, int]] = None, + gain: Optional[float] = None, + clamp: Optional[float] = None): + """Slow reference implementation of `bias_act()` using standard PyTorch + ops. + + Adds `bias` to activation tensor `input`, and evaluates activation + function `act`, and scales the result by `gain`. Each of the steps is + optional. + + In most cases, the fused op is considerably more efficient than performing + the same calculation using standard PyTorch ops. It supports first and + second order gradients, but not third order gradients. + + Args: + input (torch.Tensor): Input activation tensor. Can be of any shape. + bias (torch.Tensor): Bias vector, or `None` to disable. + Must be a 1D tensor of the same type as `input`. The shape must + be known, and it must match the dimension of `input` corresponding + to `dim`. Defaults to None. + dim (int): The dimension in `input` corresponding to the elements of + `bias`. The value of `dim` is ignored if `b` is not specified. + Defaults to 1. + act (str): Name of the activation function to evaluate, or `"linear"` + to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", + "swish", etc. See `activation_funcs` for a full list. `None` is not + allowed. Defaults to `linear`. + alpha (float or int): Shape parameter for the activation + function, or `None` to use the default. Defaults to None. + gain (float): Scaling factor for the output tensor, or `None` + to use default. See `activation_funcs` for the default scaling of + each activation function. If unsure, consider specifying 1. + Defaults to None. + clamp (float): Clamp the output values to + `[-clamp, +clamp]`, or `None` to disable the clamping (default). + Defaults to None. + + Returns: + torch.Tensor: Tensor of the same shape and datatype as `input`. + """ + assert isinstance(input, torch.Tensor) + assert clamp is None or clamp >= 0 + spec = activation_funcs[act] + alpha = float(alpha if alpha is not None else spec.def_alpha) + gain = float(gain if gain is not None else spec.def_gain) + clamp = float(clamp if clamp is not None else -1) + + # Add bias. + if bias is not None: + assert isinstance(bias, torch.Tensor) and bias.ndim == 1 + assert 0 <= dim < input.ndim + assert bias.shape[0] == input.shape[dim] + input = input + bias.reshape( + [-1 if i == dim else 1 for i in range(input.ndim)]) + + # Evaluate activation function. + alpha = float(alpha) + output = spec.func(input, alpha=alpha) + + # Scale by gain. 
+ gain = float(gain) + if gain != 1: + output = output * gain + + # Clamp. + if clamp >= 0: + # pylint: disable=invalid-unary-operand-type + output = output.clamp(-clamp, clamp) + return output + + +_bias_act_cuda_cache: Dict = dict() + + +def _bias_act_cuda(dim: int = 1, + act: str = 'linear', + alpha: Optional[Union[float, int]] = None, + gain: Optional[float] = None, + clamp: Optional[float] = None): + """"Fast CUDA implementation of `bias_act()` using custom ops. + + Args: + dim (int): The dimension in `x` corresponding to the elements of `b`. + The value of `dim` is ignored if `b` is not specified. + Defaults to 1. + act (str): Name of the activation function to evaluate, or `"linear"` + to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", + "swish", etc. See `activation_funcs` for a full list. `None` is not + allowed. Defaults to `linear`. + alpha (float | int): Shape parameter for the activation + function, or `None` to use the default. Defaults to None. + gain (float): Scaling factor for the output tensor, or `None` + to use default. See `activation_funcs` for the default scaling of + each activation function. If unsure, consider specifying 1. + Defaults to None. + clamp (float): Clamp the output values to `[-clamp, +clamp]`, + or `None` to disable the clamping (default). Defaults to None. + + Returns: + torch.Tensor: Tensor of the same shape and datatype as `x`. + """ + # Parse arguments. + assert clamp is None or clamp >= 0 + spec = activation_funcs[act] + alpha = float(alpha if alpha is not None else spec.def_alpha) + gain = float(gain if gain is not None else spec.def_gain) + clamp = float(clamp if clamp is not None else -1) + + # Lookup from cache. + key = (dim, act, alpha, gain, clamp) + if key in _bias_act_cuda_cache: + return _bias_act_cuda_cache[key] + + # Forward op. + class BiasActCuda(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, b): # pylint: disable=arguments-differ + ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride( + 1) == 1 else torch.contiguous_format + x = x.contiguous(memory_format=ctx.memory_format) + b = b.contiguous() if b is not None else _null_tensor.to(x.device) + y = x + if act != 'linear' or gain != 1 or clamp >= 0 or ( + b is not _null_tensor.to(x.device)): + y = ext_module.bias_act(x, b, _null_tensor.to(x.device), + _null_tensor.to(x.device), + _null_tensor.to(x.device), 0, dim, + spec.cuda_idx, alpha, gain, clamp) + ctx.save_for_backward( + x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to( + x.device), b if 'x' in spec.ref or spec.has_2nd_grad else + _null_tensor.to(x.device), + y if 'y' in spec.ref else _null_tensor.to(x.device)) + return y + + @staticmethod + def backward(ctx, dy): # pylint: disable=arguments-differ + dy = dy.contiguous(memory_format=ctx.memory_format) + x, b, y = ctx.saved_tensors + dx = None + db = None + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + dx = dy + if act != 'linear' or gain != 1 or clamp >= 0: + dx = BiasActCudaGrad.apply(dy, x, b, y) + + if ctx.needs_input_grad[1]: + db = dx.sum([i for i in range(dx.ndim) if i != dim]) + + return dx, db + + # Backward op. 
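    # (Editorial comment, inferred from the three call sites of
    # `ext_module.bias_act` in this file) The same kernel serves every pass:
    # the integer argument after the null-tensor slots selects the mode
    # (0 = forward, 1 = first-order gradient, 2 = second-order gradient),
    # so the backward op below simply re-invokes it with modes 1 and 2.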
+ class BiasActCudaGrad(torch.autograd.Function): + + @staticmethod + def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ + ctx.memory_format = torch.channels_last if dy.ndim > 2 and ( + dy.stride(1) == 1) else torch.contiguous_format + dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1, + dim, spec.cuda_idx, alpha, gain, clamp) + ctx.save_for_backward( + dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b, + y) + return dx + + @staticmethod + def backward(ctx, d_dx): # pylint: disable=arguments-differ + d_dx = d_dx.contiguous(memory_format=ctx.memory_format) + dy, x, b, y = ctx.saved_tensors + d_dy = None + d_x = None + d_b = None + d_y = None + + if ctx.needs_input_grad[0]: + d_dy = BiasActCudaGrad.apply(d_dx, x, b, y) + + if spec.has_2nd_grad and (ctx.needs_input_grad[1] + or ctx.needs_input_grad[2]): + d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim, + spec.cuda_idx, alpha, gain, clamp) + + if spec.has_2nd_grad and ctx.needs_input_grad[2]: + d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim]) + + return d_dy, d_x, d_b, d_y + + # Add to cache. + _bias_act_cuda_cache[key] = BiasActCuda + return BiasActCuda diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/border_align.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/border_align.py new file mode 100644 index 0000000000000000000000000000000000000000..c09501b962cfce10b1da87e6b651d61911eb8406 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/border_align.py @@ -0,0 +1,114 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# modified from +# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py + +from typing import Tuple + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['border_align_forward', 'border_align_backward']) + + +class BorderAlignFunction(Function): + + @staticmethod + def symbolic(g, input, boxes, pool_size): + return g.op( + 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) + + @staticmethod + def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, + pool_size: int) -> torch.Tensor: + ctx.pool_size = pool_size + ctx.input_shape = input.size() + + assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]' + assert boxes.size(2) == 4, \ + 'the last dimension of boxes must be (x1, y1, x2, y2)' + assert input.size(1) % 4 == 0, \ + 'the channel for input feature must be divisible by factor 4' + + # [B, C//4, H*W, 4] + output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4) + output = input.new_zeros(output_shape) + # `argmax_idx` only used for backward + argmax_idx = input.new_zeros(output_shape).to(torch.int) + + ext_module.border_align_forward( + input, boxes, output, argmax_idx, pool_size=ctx.pool_size) + + ctx.save_for_backward(boxes, argmax_idx) + return output + + @staticmethod + @once_differentiable + def backward(ctx, + grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]: + boxes, argmax_idx = ctx.saved_tensors + grad_input = grad_output.new_zeros(ctx.input_shape) + # complex head architecture may cause grad_output uncontiguous + grad_output = grad_output.contiguous() + ext_module.border_align_backward( + grad_output, + boxes, + argmax_idx, + grad_input, + pool_size=ctx.pool_size) + return grad_input, None, None + + +border_align = 
BorderAlignFunction.apply + + +class BorderAlign(nn.Module): + r"""Border align pooling layer. + + Applies border_align over the input feature based on predicted bboxes. + The details were described in the paper + `BorderDet: Border Feature for Dense Object Detection + `_. + + For each border line (e.g. top, left, bottom or right) of each box, + border_align does the following: + + 1. uniformly samples ``pool_size`` +1 positions on this line, involving + the start and end points. + 2. the corresponding features on these points are computed by bilinear + interpolation. + 3. max pooling over all the ``pool_size`` +1 positions are used for + computing pooled feature. + + Args: + pool_size (int): number of positions sampled over the boxes' borders + (e.g. top, bottom, left, right). + """ + + def __init__(self, pool_size: int): + super().__init__() + self.pool_size = pool_size + + def forward(self, input: torch.Tensor, + boxes: torch.Tensor) -> torch.Tensor: + """ + Args: + input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), + [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, + right features respectively. + boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). + + Returns: + torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is + (top,left,bottom,right) for the last dimension. + """ + return border_align(input, boxes, self.pool_size) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(pool_size={self.pool_size})' + return s diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/box_iou_quadri.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/box_iou_quadri.py new file mode 100644 index 0000000000000000000000000000000000000000..89747fdf1f03e0491351f876385ba3c1369ebaf7 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/box_iou_quadri.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri']) + + +def box_iou_quadri(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) of boxes. + + Both sets of boxes are expected to be in + (x1, y1, ..., x4, y4) format. + + If ``aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8), + indicating (x1, y1, ..., x4, y4) for each row. + bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8), + indicating (x1, y1, ..., x4, y4) for each row. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + + Returns: + torch.Tensor: Return the ious betweens boxes. If ``aligned`` is + ``False``, the shape of ious is (N, M) else (N,). 
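        An illustrative example (editorial addition, assuming the CPU path is
        available; the two axis-aligned 4x4 squares below share one third of
        their union):

        Example:
            >>> bboxes1 = torch.tensor([[0., 0., 4., 0., 4., 4., 0., 4.]])
            >>> bboxes2 = torch.tensor([[2., 0., 6., 0., 6., 4., 2., 4.]])
            >>> box_iou_quadri(bboxes1, bboxes2)
            tensor([[0.3333]])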
+ """ + assert mode in ['iou', 'iof'] + mode_dict = {'iou': 0, 'iof': 1} + mode_flag = mode_dict[mode] + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if aligned: + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros(rows * cols) + bboxes1 = bboxes1.contiguous() + bboxes2 = bboxes2.contiguous() + ext_module.box_iou_quadri( + bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) + if not aligned: + ious = ious.view(rows, cols) + return ious diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/box_iou_rotated.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/box_iou_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..a811531d4283f37cab80ae06af35d8d223d4b949 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/box_iou_rotated.py @@ -0,0 +1,156 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) + + +def box_iou_rotated(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + clockwise: bool = True) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) of boxes. + + Both sets of boxes are expected to be in + (x_center, y_center, width, height, angle) format. + + If ``aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + .. note:: + The operator assumes: + + 1) The positive direction along x axis is left -> right. + + 2) The positive direction along y axis is top -> down. + + 3) The w border is in parallel with x axis when angle = 0. + + However, there are 2 opposite definitions of the positive angular + direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports + both definitions and uses CW by default. + + Please set ``clockwise=False`` if you are using the CCW definition. + + The coordinate system when ``clockwise`` is ``True`` (default) + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha + \\\\ + y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + + The coordinate system when ``clockwise`` is ``False`` + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (-pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. 
math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha + \\\\ + y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + Args: + boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. + boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + clockwise (bool): flag indicating whether the positive angular + orientation is clockwise. default True. + `New in version 1.4.3.` + + Returns: + torch.Tensor: Return the ious betweens boxes. If ``aligned`` is + ``False``, the shape of ious is (N, M) else (N,). + """ + assert mode in ['iou', 'iof'] + mode_dict = {'iou': 0, 'iof': 1} + mode_flag = mode_dict[mode] + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if aligned: + ious = bboxes1.new_zeros(rows) + else: + if bboxes1.device.type == 'mlu': + ious = bboxes1.new_zeros([rows, cols]) + else: + ious = bboxes1.new_zeros(rows * cols) + if not clockwise: + flip_mat = bboxes1.new_ones(bboxes1.shape[-1]) + flip_mat[-1] = -1 + bboxes1 = bboxes1 * flip_mat + bboxes2 = bboxes2 * flip_mat + if bboxes1.device.type == 'npu': + scale_mat = bboxes1.new_ones(bboxes1.shape[-1]) + scale_mat[-1] = 1.0 / 0.01745329252 + bboxes1 = bboxes1 * scale_mat + bboxes2 = bboxes2 * scale_mat + bboxes1 = bboxes1.contiguous() + bboxes2 = bboxes2.contiguous() + ext_module.box_iou_rotated( + bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) + if not aligned: + ious = ious.view(rows, cols) + return ious diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/carafe.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/carafe.py new file mode 100644 index 0000000000000000000000000000000000000000..f7e79c275e2bea62ce7e08fb6e6e4629c7565600 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/carafe.py @@ -0,0 +1,300 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
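(A minimal sketch of the `box_iou_rotated` op defined above, not part of the diff; it assumes the CPU path and boxes given as (cx, cy, w, h, theta) with theta in radians. The second box below is the first shifted right by 2, so the boxes overlap on one third of their union.)

    >>> import torch
    >>> from mmcv.ops import box_iou_rotated
    >>> boxes1 = torch.tensor([[10., 10., 4., 2., 0.]])
    >>> boxes2 = torch.tensor([[10., 10., 4., 2., 0.],
    ...                        [12., 10., 4., 2., 0.]])
    >>> box_iou_rotated(boxes1, boxes2)
    tensor([[1.0000, 0.3333]])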
+from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import normal_init, xavier_init +from mmengine.registry import MODELS +from torch import Tensor +from torch.autograd import Function +from torch.nn.modules.module import Module + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward', + 'carafe_backward' +]) + + +class CARAFENaiveFunction(Function): + + @staticmethod + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + return g.op( + 'mmcv::MMCVCARAFENaive', + features, + masks, + kernel_size_i=kernel_size, + group_size_i=group_size, + scale_factor_f=scale_factor) + + @staticmethod + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + assert scale_factor >= 1 + assert masks.size(1) == kernel_size * kernel_size * group_size + assert masks.size(-1) == features.size(-1) * scale_factor + assert masks.size(-2) == features.size(-2) * scale_factor + assert features.size(1) % group_size == 0 + assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 + ctx.kernel_size = kernel_size + ctx.group_size = group_size + ctx.scale_factor = scale_factor + ctx.feature_size = features.size() + ctx.mask_size = masks.size() + + n, c, h, w = features.size() + output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) + ext_module.carafe_naive_forward( + features, + masks, + output, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': + ctx.save_for_backward(features, masks) + return output + + @staticmethod + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: + assert grad_output.is_cuda + + features, masks = ctx.saved_tensors + kernel_size = ctx.kernel_size + group_size = ctx.group_size + scale_factor = ctx.scale_factor + + grad_input = torch.zeros_like(features) + grad_masks = torch.zeros_like(masks) + ext_module.carafe_naive_backward( + grad_output.contiguous(), + features, + masks, + grad_input, + grad_masks, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + return grad_input, grad_masks, None, None, None + + +carafe_naive = CARAFENaiveFunction.apply + + +class CARAFENaive(Module): + + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() + + assert isinstance(kernel_size, int) and isinstance( + group_size, int) and isinstance(scale_factor, int) + self.kernel_size = kernel_size + self.group_size = group_size + self.scale_factor = scale_factor + + def forward(self, features: Tensor, masks: Tensor) -> Tensor: + return carafe_naive(features, masks, self.kernel_size, self.group_size, + self.scale_factor) + + +class CARAFEFunction(Function): + + @staticmethod + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + return g.op( + 'mmcv::MMCVCARAFE', + features, + masks, + kernel_size_i=kernel_size, + group_size_i=group_size, + scale_factor_f=scale_factor) + + @staticmethod + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: + assert scale_factor >= 1 + assert masks.size(1) == kernel_size * kernel_size * group_size + assert masks.size(-1) == features.size(-1) 
* scale_factor + assert masks.size(-2) == features.size(-2) * scale_factor + assert features.size(1) % group_size == 0 + assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 + ctx.kernel_size = kernel_size + ctx.group_size = group_size + ctx.scale_factor = scale_factor + ctx.feature_size = features.size() + ctx.mask_size = masks.size() + + n, c, h, w = features.size() + output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) + routput = features.new_zeros(output.size(), requires_grad=False) + rfeatures = features.new_zeros(features.size(), requires_grad=False) + rmasks = masks.new_zeros(masks.size(), requires_grad=False) + ext_module.carafe_forward( + features, + masks, + rfeatures, + routput, + rmasks, + output, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': + ctx.save_for_backward(features, masks, rfeatures) + return output + + @staticmethod + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: + features, masks, rfeatures = ctx.saved_tensors + kernel_size = ctx.kernel_size + group_size = ctx.group_size + scale_factor = ctx.scale_factor + + rgrad_output = torch.zeros_like(grad_output, requires_grad=False) + rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) + rgrad_input = torch.zeros_like(features, requires_grad=False) + rgrad_masks = torch.zeros_like(masks, requires_grad=False) + grad_input = torch.zeros_like(features, requires_grad=False) + grad_masks = torch.zeros_like(masks, requires_grad=False) + ext_module.carafe_backward( + grad_output.contiguous(), + rfeatures, + masks, + rgrad_output, + rgrad_input_hs, + rgrad_input, + rgrad_masks, + grad_input, + grad_masks, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + return grad_input, grad_masks, None, None, None + + +carafe = CARAFEFunction.apply + + +class CARAFE(Module): + """ CARAFE: Content-Aware ReAssembly of FEatures + + Please refer to `CARAFE: Content-Aware ReAssembly of FEatures + `_ for more details. + + Args: + kernel_size (int): reassemble kernel size + group_size (int): reassemble group size + scale_factor (int): upsample ratio + + Returns: + upsampled feature map + """ + + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() + + assert isinstance(kernel_size, int) and isinstance( + group_size, int) and isinstance(scale_factor, int) + self.kernel_size = kernel_size + self.group_size = group_size + self.scale_factor = scale_factor + + def forward(self, features: Tensor, masks: Tensor) -> Tensor: + return carafe(features, masks, self.kernel_size, self.group_size, + self.scale_factor) + + +@MODELS.register_module(name='carafe') +class CARAFEPack(nn.Module): + """A unified package of CARAFE upsampler that contains: 1) channel + compressor 2) content encoder 3) CARAFE op. + + Official implementation of ICCV 2019 paper + `CARAFE: Content-Aware ReAssembly of FEatures + `_. 
+ + Args: + channels (int): input feature channels + scale_factor (int): upsample ratio + up_kernel (int): kernel size of CARAFE op + up_group (int): group size of CARAFE op + encoder_kernel (int): kernel size of content encoder + encoder_dilation (int): dilation of content encoder + compressed_channels (int): output channels of channels compressor + + Returns: + upsampled feature map + """ + + def __init__(self, + channels: int, + scale_factor: int, + up_kernel: int = 5, + up_group: int = 1, + encoder_kernel: int = 3, + encoder_dilation: int = 1, + compressed_channels: int = 64): + super().__init__() + self.channels = channels + self.scale_factor = scale_factor + self.up_kernel = up_kernel + self.up_group = up_group + self.encoder_kernel = encoder_kernel + self.encoder_dilation = encoder_dilation + self.compressed_channels = compressed_channels + self.channel_compressor = nn.Conv2d(channels, self.compressed_channels, + 1) + self.content_encoder = nn.Conv2d( + self.compressed_channels, + self.up_kernel * self.up_kernel * self.up_group * + self.scale_factor * self.scale_factor, + self.encoder_kernel, + padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), + dilation=self.encoder_dilation, + groups=1) + self.init_weights() + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + normal_init(self.content_encoder, std=0.001) + + def kernel_normalizer(self, mask: Tensor) -> Tensor: + mask = F.pixel_shuffle(mask, self.scale_factor) + n, mask_c, h, w = mask.size() + # use float division explicitly, + # to void inconsistency while exporting to onnx + mask_channel = int(mask_c / float(self.up_kernel**2)) + mask = mask.view(n, mask_channel, -1, h, w) + + mask = F.softmax(mask, dim=2, dtype=mask.dtype) + mask = mask.view(n, mask_c, h, w).contiguous() + + return mask + + def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: + x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) + return x + + def forward(self, x: Tensor) -> Tensor: + compressed_x = self.channel_compressor(x) + mask = self.content_encoder(compressed_x) + mask = self.kernel_normalizer(mask) + + x = self.feature_reassemble(x, mask) + return x diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/cc_attention.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/cc_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..efde7b703c8c50ecf5aa604e756422f0be488759 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/cc_attention.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.registry import MODELS + +from mmcv.cnn import Scale + + +def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: + """Returns a diagonal matrix of size [n, n]. + + The diagonal are all "-inf". This is for avoiding calculating the + overlapped element in the Criss-Cross twice. + """ + return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) + + +@MODELS.register_module() +class CrissCrossAttention(nn.Module): + """Criss-Cross Attention Module. + + .. note:: + Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch + to a pure PyTorch and equivalent implementation. For more + details, please refer to https://github.com/open-mmlab/mmcv/pull/1201. 
+ + Speed comparison for one forward pass + + - Input size: [2,512,97,97] + - Device: 1 NVIDIA GeForce RTX 2080 Ti + + +-----------------------+---------------+------------+---------------+ + | |PyTorch version|CUDA version|Relative speed | + +=======================+===============+============+===============+ + |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x | + +-----------------------+---------------+------------+---------------+ + |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x | + +-----------------------+---------------+------------+---------------+ + + Args: + in_channels (int): Channels of the input feature map. + """ + + def __init__(self, in_channels: int) -> None: + super().__init__() + self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) + self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) + self.value_conv = nn.Conv2d(in_channels, in_channels, 1) + self.gamma = Scale(0.) + self.in_channels = in_channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """forward function of Criss-Cross Attention. + + Args: + x (torch.Tensor): Input feature with the shape of + (batch_size, in_channels, height, width). + + Returns: + torch.Tensor: Output of the layer, with the shape of + (batch_size, in_channels, height, width) + """ + B, C, H, W = x.size() + query = self.query_conv(x) + key = self.key_conv(x) + value = self.value_conv(x) + energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG( + H, query.device) + energy_H = energy_H.transpose(1, 2) + energy_W = torch.einsum('bchw,bchj->bhwj', query, key) + attn = F.softmax( + torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)] + out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H]) + out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:]) + + out = self.gamma(out) + x + out = out.contiguous() + + return out + + def __repr__(self) -> str: + s = self.__class__.__name__ + s += f'(in_channels={self.in_channels})' + return s diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/chamfer_distance.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/chamfer_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..1f908a5bbc2655de6233cd6ddfa140ee783079ba --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/chamfer_distance.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Sequence, Tuple + +import torch +from torch import Tensor +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward']) + + +class ChamferDistanceFunction(Function): + """This is an implementation of the 2D Chamfer Distance. + + It has been used in the paper `Oriented RepPoints for Aerial Object + Detection (CVPR 2022) _`. + """ + + @staticmethod + def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]: + """ + Args: + xyz1 (Tensor): Point set with shape (B, N, 2). + xyz2 (Tensor): Point set with shape (B, N, 2). + + Returns: + Sequence[Tensor]: + + - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with + shape (B, N). + - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with + shape (B, N). + - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) + with shape (B, N), which be used in compute gradient. 
+            - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
+              with shape (B, N), which is used to compute the gradient.
+        """
+        batch_size, n, _ = xyz1.size()
+        _, m, _ = xyz2.size()
+        device = xyz1.device
+        xyz1 = xyz1.contiguous()
+        xyz2 = xyz2.contiguous()
+
+        dist1 = torch.zeros(batch_size, n).to(device)
+        dist2 = torch.zeros(batch_size, m).to(device)
+        idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
+        idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)
+
+        ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
+                                            idx2)
+        ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
+        return dist1, dist2, idx1, idx2
+
+    @staticmethod
+    @once_differentiable
+    def backward(ctx,
+                 grad_dist1: Tensor,
+                 grad_dist2: Tensor,
+                 grad_idx1=None,
+                 grad_idx2=None) -> Tuple[Tensor, Tensor]:
+        """
+
+        Args:
+            grad_dist1 (Tensor): Gradient of chamfer distance
+                (xyz1 to xyz2) with shape (B, N).
+            grad_dist2 (Tensor): Gradient of chamfer distance
+                (xyz2 to xyz1) with shape (B, N).
+
+        Returns:
+            Tuple[Tensor, Tensor]:
+
+            - grad_xyz1 (Tensor): Gradient of the point set with shape \
+                (B, N, 2).
+            - grad_xyz2 (Tensor): Gradient of the point set with shape \
+                (B, N, 2).
+        """
+        xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
+        device = grad_dist1.device
+        grad_dist1 = grad_dist1.contiguous()
+        grad_dist2 = grad_dist2.contiguous()
+        grad_xyz1 = torch.zeros(xyz1.size()).to(device)
+        grad_xyz2 = torch.zeros(xyz2.size()).to(device)
+
+        ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2,
+                                             grad_dist1, grad_dist2, grad_xyz1,
+                                             grad_xyz2)
+        return grad_xyz1, grad_xyz2
+
+
+chamfer_distance = ChamferDistanceFunction.apply
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/contour_expand.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/contour_expand.py
new file mode 100644
index 0000000000000000000000000000000000000000..7184609ad9b64d421c17fdfe4a1a0dbeb62d64c8
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/contour_expand.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import numpy as np
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
+
+
+def contour_expand(kernel_mask: Union[np.array, torch.Tensor],
+                   internal_kernel_label: Union[np.array, torch.Tensor],
+                   min_kernel_area: int, kernel_num: int) -> list:
+    """Expand kernel contours so that foreground pixels are assigned into
+    instances.
+
+    Args:
+        kernel_mask (np.array or torch.Tensor): The instance kernel mask with
+            size hxw.
+        internal_kernel_label (np.array or torch.Tensor): The instance
+            internal kernel label with size hxw.
+        min_kernel_area (int): The minimum kernel area.
+        kernel_num (int): The instance kernel number.
+
+    Returns:
+        list: The instance index map with size hxw.
+ """ + assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) + assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) + assert isinstance(min_kernel_area, int) + assert isinstance(kernel_num, int) + + if isinstance(kernel_mask, np.ndarray): + kernel_mask = torch.from_numpy(kernel_mask) + if isinstance(internal_kernel_label, np.ndarray): + internal_kernel_label = torch.from_numpy(internal_kernel_label) + + if torch.__version__ == 'parrots': + if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0: + label = [] + else: + label = ext_module.contour_expand( + kernel_mask, + internal_kernel_label, + min_kernel_area=min_kernel_area, + kernel_num=kernel_num) + label = label.tolist() # type: ignore + else: + label = ext_module.contour_expand(kernel_mask, internal_kernel_label, + min_kernel_area, kernel_num) + return label diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/conv2d_gradfix.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/conv2d_gradfix.py new file mode 100644 index 0000000000000000000000000000000000000000..b93a76a844457dd78d625dd95e042864943f11c3 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/conv2d_gradfix.py @@ -0,0 +1,338 @@ +# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# NVIDIA CORPORATION and its licensors retain all intellectual property +# and proprietary rights in and to this software, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this software and related documentation without an express +# license agreement from NVIDIA CORPORATION is strictly prohibited. + +# source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa +"""Custom replacement for `torch.nn.functional.conv2d` that supports +arbitrarily high order gradients with zero performance penalty.""" + +import contextlib +import warnings +from typing import Dict, Optional, Tuple, Union + +import torch +from mmengine.utils import digit_version +from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch + +enabled = True +weight_gradients_disabled = False + + +@contextlib.contextmanager +def no_weight_gradients(disable=True): + global weight_gradients_disabled + old = weight_gradients_disabled + if disable: + weight_gradients_disabled = True + yield + weight_gradients_disabled = old + + +def conv2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + dilation: Union[int, Tuple[int, ...]] = 1, + groups: int = 1): + flag = True + if digit_version(torch.__version__) >= digit_version('1.10.0'): + warnings.warn('Since ' + 'aten:cudnn_convolution_backward_weight is ' + f'not supported in torch=={torch.__version__},' + ' rolling back to `torch.nn.functional.conv2d`') + flag = False + if _should_use_custom_op(input) and flag: + return _conv2d_gradfix( + transpose=False, + weight_shape=weight.shape, + stride=stride, + padding=padding, + output_padding=0, + dilation=dilation, + groups=groups).apply(input, weight, bias) + return torch.nn.functional.conv2d( + input=input, + weight=weight, + bias=bias, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups) + + +def conv_transpose2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, ...]] = 1, + padding: 
Union[int, Tuple[int, ...]] = 0, + output_padding: Union[int, Tuple[int, ...]] = 0, + groups: int = 1, + dilation: Union[int, Tuple[int, ...]] = 1): + if _should_use_custom_op(input): + return _conv2d_gradfix( + transpose=True, + weight_shape=weight.shape, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + dilation=dilation).apply(input, weight, bias) + return torch.nn.functional.conv_transpose2d( + input=input, + weight=weight, + bias=bias, + stride=stride, + padding=padding, + output_padding=output_padding, + groups=groups, + dilation=dilation) + + +def _should_use_custom_op(input): + assert isinstance(input, torch.Tensor) + if (not enabled) or (not torch.backends.cudnn.enabled): + return False + if input.device.type != 'cuda': + return False + return True + + +def _to_tuple(x, ndim): + xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim + assert len(xs) == ndim + assert all(isinstance(x, int) for x in xs) + return xs + + +_conv2d_gradfix_cache: Dict = dict() +_null_tensor = torch.empty([0]) + + +def _conv2d_gradfix( + transpose: bool, + weight_shape: Tuple[int, ...], + stride: Union[int, Tuple[int, ...]], + padding: Union[int, Tuple[int, ...]], + output_padding: Union[int, Tuple[int, ...]], + dilation: Union[int, Tuple[int, ...]], + groups: int, +): + # Parse arguments. + ndim = 2 + weight_shape = tuple(weight_shape) + stride = _to_tuple(stride, ndim) + padding = _to_tuple(padding, ndim) + output_padding = _to_tuple(output_padding, ndim) + dilation = _to_tuple(dilation, ndim) + + # Lookup from cache. + key = (transpose, weight_shape, stride, padding, output_padding, dilation, + groups) + if key in _conv2d_gradfix_cache: + return _conv2d_gradfix_cache[key] + + # Validate arguments. + + assert groups >= 1 + assert len(weight_shape) == ndim + 2 + assert all(stride[i] >= 1 for i in range(ndim)) # type: ignore + assert all(padding[i] >= 0 for i in range(ndim)) # type: ignore + assert all(dilation[i] >= 0 for i in range(ndim)) # type: ignore + if not transpose: + assert all(output_padding[i] == 0 for i in range(ndim)) # type: ignore + else: # transpose + for i in range(ndim): + assert 0 <= output_padding[i] < max( # type: ignore + stride[i], # type: ignore + dilation[i]) # type: ignore + + # Helpers. + common_kwargs = dict( + stride=stride, padding=padding, dilation=dilation, groups=groups) + + def calc_output_padding(input_shape, output_shape): + if transpose: + return [0, 0] + return [ + input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] - + (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1) + for i in range(ndim) + ] + + # Forward & backward. + class Conv2d(torch.autograd.Function): + + @staticmethod + def forward(ctx, input, weight, bias): + assert weight.shape == weight_shape + ctx.save_for_backward( + input if weight.requires_grad else _null_tensor, + weight if input.requires_grad else _null_tensor, + ) + ctx.input_shape = input.shape + + # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere). 
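            # (Editorial comment, describing the branch below) This fast path
            # rewrites the 1x1 convolution as a grouped batched matmul: the
            # weight is reshaped to (groups, O/g, I/g) and the input is
            # flattened over H*W, so the work runs through cuBLAS instead of
            # cuDNN. It is skipped on compute capability >= 8.0 (Ampere),
            # where cuDNN is the faster route.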
+ if weight_shape[2:] == stride == dilation == ( + 1, 1) and padding == ( + 0, 0) and torch.cuda.get_device_capability( + input.device) < (8, 0): + a = weight.reshape(groups, weight_shape[0] // groups, + weight_shape[1]) + b = input.reshape(input.shape[0], groups, + input.shape[1] // groups, -1) + c = (a.transpose(1, 2) if transpose else a) @ b.permute( + 1, 2, 0, 3).flatten(2) + c = c.reshape(-1, input.shape[0], + *input.shape[2:]).transpose(0, 1) + c = c if bias is None else c + bias.unsqueeze(0).unsqueeze( + 2).unsqueeze(3) + return c.contiguous( + memory_format=(torch.channels_last if input.stride(1) == + 1 else torch.contiguous_format)) + + # General case => cuDNN. + if transpose: + return torch.nn.functional.conv_transpose2d( + input=input, + weight=weight, + bias=bias, + output_padding=output_padding, + **common_kwargs) + return torch.nn.functional.conv2d( + input=input, weight=weight, bias=bias, **common_kwargs) + + @staticmethod + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + input_shape = ctx.input_shape + grad_input = None + grad_weight = None + grad_bias = None + + if ctx.needs_input_grad[0]: + p = calc_output_padding( + input_shape=input_shape, output_shape=grad_output.shape) + op = _conv2d_gradfix( + transpose=(not transpose), + weight_shape=weight_shape, + output_padding=p, + **common_kwargs) + grad_input = op.apply(grad_output, weight, None) + assert grad_input.shape == input_shape + + if ctx.needs_input_grad[1] and not weight_gradients_disabled: + grad_weight = Conv2dGradWeight.apply(grad_output, input) + assert grad_weight.shape == weight_shape + + if ctx.needs_input_grad[2]: + grad_bias = grad_output.sum([0, 2, 3]) + + return grad_input, grad_weight, grad_bias + + # Gradient with respect to the weights. + class Conv2dGradWeight(torch.autograd.Function): + + @staticmethod + def forward(ctx, grad_output, input): + ctx.save_for_backward( + grad_output if input.requires_grad else _null_tensor, + input if grad_output.requires_grad else _null_tensor, + ) + ctx.grad_output_shape = grad_output.shape + ctx.input_shape = input.shape + + # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere). 
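            # (Editorial comment) For 1x1 kernels the weight gradient reduces
            # to a grouped matmul between grad_output and input flattened over
            # the batch and spatial dims, so cuBLAS handles it on any
            # architecture; hence no capability check here.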
+ if weight_shape[2:] == stride == dilation == ( + 1, 1) and padding == (0, 0): + a = grad_output.reshape(grad_output.shape[0], groups, + grad_output.shape[1] // groups, + -1).permute(1, 2, 0, 3).flatten(2) + b = input.reshape(input.shape[0], groups, + input.shape[1] // groups, + -1).permute(1, 2, 0, 3).flatten(2) + c = (b @ a.transpose(1, 2) if transpose else + a @ b.transpose(1, 2)).reshape(weight_shape) + return c.contiguous( + memory_format=(torch.channels_last if input.stride(1) == + 1 else torch.contiguous_format)) + + # PyTorch consolidated convolution backward API in PR: + # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501 + # Enhance the code referring to the discussion: + # https://github.com/pytorch/pytorch/issues/74437 + if digit_version(torch.__version__) >= digit_version('1.11.0'): + empty_weight = torch.tensor( + 0.0, dtype=input.dtype, + device=input.device).expand(weight_shape) + output_padding = calc_output_padding(input.shape, + grad_output.shape) + return torch.ops.aten.convolution_backward( + grad_output, + input, + empty_weight, + None, + stride=stride, + dilation=dilation, + transposed=transpose, + padding=padding, + groups=groups, + output_padding=output_padding, + output_mask=[0, 1, 0])[1] + else: + if is_rocm_pytorch(): + name = 'aten::miopen_convolution_transpose_backward_weight' + if not transpose: + name = 'aten::miopen_convolution_backward_weight' + flags = [ + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic + ] + else: + # General case => cuDNN. + name = ('aten::cudnn_convolution_transpose_backward_weight' + if transpose else + 'aten::cudnn_convolution_backward_weight') + flags = [ + torch.backends.cudnn.benchmark, + torch.backends.cudnn.deterministic, + torch.backends.cudnn.allow_tf32 + ] + return torch._C._jit_get_operation(name)(weight_shape, + grad_output, input, + padding, stride, + dilation, groups, + *flags) + + @staticmethod + def backward(ctx, grad2_grad_weight): + grad_output, input = ctx.saved_tensors + grad_output_shape = ctx.grad_output_shape + input_shape = ctx.input_shape + grad2_grad_output = None + grad2_input = None + + if ctx.needs_input_grad[0]: + grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, + None) + assert grad2_grad_output.shape == grad_output_shape + + if ctx.needs_input_grad[1]: + p = calc_output_padding( + input_shape=input_shape, output_shape=grad_output_shape) + op = _conv2d_gradfix( + transpose=(not transpose), + weight_shape=weight_shape, + output_padding=p, + **common_kwargs) + grad2_input = op.apply(grad_output, grad2_grad_weight, None) + assert grad2_input.shape == input_shape + + return grad2_grad_output, grad2_input + + _conv2d_gradfix_cache[key] = Conv2d + return Conv2d diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/convex_iou.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/convex_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..50050363ac5b08cfa8f86dd186ab7087fac6f48a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/convex_iou.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
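(A minimal sketch of the gradfix `conv2d` wrapper defined above, not part of the diff. It assumes the op is re-exported as `mmcv.ops.conv2d`; on CPU the wrapper simply falls back to `torch.nn.functional.conv2d`, which also supports double backward, so the example runs either way.)

    >>> import torch
    >>> from mmcv.ops import conv2d
    >>> x = torch.randn(1, 3, 8, 8, requires_grad=True)
    >>> w = torch.randn(4, 3, 3, 3, requires_grad=True)
    >>> y = conv2d(x, w, padding=1)
    >>> (g,) = torch.autograd.grad(y.sum(), x, create_graph=True)
    >>> g.square().sum().backward()  # second-order gradient reaches w
    >>> w.grad.shape
    torch.Size([4, 3, 3, 3])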
+from typing import Tuple + +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou']) + + +def convex_giou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return generalized intersection-over-union (Jaccard index) between point + sets and polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (N, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + tuple[torch.Tensor, torch.Tensor]: The first element is the gious + between point sets and polygons with the shape (N,). The second + element is the gradient of point sets with the shape (N, 18). + """ + output = pointsets.new_zeros((pointsets.size(0), 19)) + ext_module.convex_giou(pointsets, polygons, output) + convex_giou = output[:, -1] + points_grad = output[:, 0:-1] + return convex_giou, points_grad + + +def convex_iou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) between point sets and + polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (K, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + torch.Tensor: Return the ious between point sets and polygons with the + shape (N, K). + """ + N, K = pointsets.size(0), polygons.size(0) + ious = pointsets.new_zeros((N, K)) + ext_module.convex_iou(pointsets, polygons, ious) + return ious diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/corner_pool.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/corner_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4ebca06304329439fec33965792e84b7513c5c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/corner_pool.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine.utils import digit_version +from torch import Tensor, nn + +_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} + + +def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: + size = x.size(dim) + output = x.clone() + + ind = 1 + while ind < size: + if flip: + cur_start = 0 + cur_len = size - ind + next_start = ind + next_len = size - ind + else: + cur_start = ind + cur_len = size - ind + next_start = 0 + next_len = size - ind + + # max_temp should be cloned for backward computation + max_temp = output.narrow(dim, cur_start, cur_len).clone() + cur_temp = output.narrow(dim, cur_start, cur_len) + next_temp = output.narrow(dim, next_start, next_len) + + cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp) + + ind = ind << 1 + + return output + + +class CornerPool(nn.Module): + """Corner Pooling. + + Corner Pooling is a new type of pooling layer that helps a + convolutional network better localize corners of bounding boxes. + + Please refer to `CornerNet: Detecting Objects as Paired Keypoints + `_ for more details. + + Code is modified from https://github.com/princeton-vl/CornerNet-Lite. + + Args: + mode (str): Pooling orientation for the pooling layer + + - 'bottom': Bottom Pooling + - 'left': Left Pooling + - 'right': Right Pooling + - 'top': Top Pooling + + Returns: + Feature map after pooling. 
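        An illustrative example (editorial addition; CornerPool is pure
        PyTorch on recent versions, so it runs on CPU as well):

        Example:
            >>> pool = CornerPool('bottom')
            >>> x = torch.rand(1, 16, 8, 8)
            >>> y = pool(x)  # cumulative max along the height axis, same shape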
+ """ + + cummax_dim_flip = { + 'bottom': (2, False), + 'left': (3, True), + 'right': (3, False), + 'top': (2, True), + } + + def __init__(self, mode: str): + super().__init__() + assert mode in self.cummax_dim_flip + self.mode = mode + + def forward(self, x: Tensor) -> Tensor: + if (torch.__version__ != 'parrots' and + digit_version(torch.__version__) >= digit_version('1.5.0')): + dim, flip = self.cummax_dim_flip[self.mode] + if flip: + x = x.flip(dim) + pool_tensor, _ = torch.cummax(x, dim=dim) + if flip: + pool_tensor = pool_tensor.flip(dim) + return pool_tensor + else: + dim, flip = self.cummax_dim_flip[self.mode] + return _corner_pool(x, dim, flip) diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/correlation.py b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..319b7646782637e9ebaac4ef07b82d1f460031b5 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/correlation.py @@ -0,0 +1,200 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['correlation_forward', 'correlation_backward']) + + +class CorrelationFunction(Function): + + @staticmethod + def forward(ctx, + input1: Tensor, + input2: Tensor, + kernel_size: int = 1, + max_displacement: int = 1, + stride: int = 1, + padding: int = 1, + dilation: int = 1, + dilation_patch: int = 1) -> Tensor: + + ctx.save_for_backward(input1, input2) + + kH, kW = ctx.kernel_size = _pair(kernel_size) + patch_size = max_displacement * 2 + 1 + ctx.patch_size = patch_size + dH, dW = ctx.stride = _pair(stride) + padH, padW = ctx.padding = _pair(padding) + dilationH, dilationW = ctx.dilation = _pair(dilation) + dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair( + dilation_patch) + + output_size = CorrelationFunction._output_size(ctx, input1) + + output = input1.new_zeros(output_size) + + ext_module.correlation_forward( + input1, + input2, + output, + kH=kH, + kW=kW, + patchH=patch_size, + patchW=patch_size, + padH=padH, + padW=padW, + dilationH=dilationH, + dilationW=dilationW, + dilation_patchH=dilation_patchH, + dilation_patchW=dilation_patchW, + dH=dH, + dW=dW) + + return output + + @staticmethod + @once_differentiable + def backward( + ctx, grad_output: Tensor + ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]: + input1, input2 = ctx.saved_tensors + + kH, kW = ctx.kernel_size + patch_size = ctx.patch_size + padH, padW = ctx.padding + dilationH, dilationW = ctx.dilation + dilation_patchH, dilation_patchW = ctx.dilation_patch + dH, dW = ctx.stride + grad_input1 = torch.zeros_like(input1) + grad_input2 = torch.zeros_like(input2) + + ext_module.correlation_backward( + grad_output, + input1, + input2, + grad_input1, + grad_input2, + kH=kH, + kW=kW, + patchH=patch_size, + patchW=patch_size, + padH=padH, + padW=padW, + dilationH=dilationH, + dilationW=dilationW, + dilation_patchH=dilation_patchH, + dilation_patchW=dilation_patchW, + dH=dH, + dW=dW) + return grad_input1, grad_input2, None, None, None, None, None, None + + @staticmethod + def _output_size(ctx, input1): + iH, iW = input1.size(2), input1.size(3) + batch_size = input1.size(0) + kH, kW = ctx.kernel_size + patch_size = 
ctx.patch_size + dH, dW = ctx.stride + padH, padW = ctx.padding + dilationH, dilationW = ctx.dilation + dilatedKH = (kH - 1) * dilationH + 1 + dilatedKW = (kW - 1) * dilationW + 1 + + oH = int((iH + 2 * padH - dilatedKH) / dH + 1) + oW = int((iW + 2 * padW - dilatedKW) / dW + 1) + + output_size = (batch_size, patch_size, patch_size, oH, oW) + return output_size + + +class Correlation(nn.Module): + r"""Correlation operator + + This correlation operator works for optical flow correlation computation. + + There are two batched tensors with shape :math:`(N, C, H, W)`, + and the correlation output's shape is :math:`(N, max\_displacement \times + 2 + 1, max\_displacement \times 2 + 1, H_{out}, W_{out})` + + where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding - + dilation \times (kernel\_size - 1) - 1} + {stride} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation + \times (kernel\_size - 1) - 1} + {stride} + 1\right\rfloor + + the correlation item :math:`(N_i, dx, dy)` is formed by taking the sliding + window convolution between input1 and shifted input2, + + .. math:: + Corr(N_i, dx, dy) = + \sum_{c=0}^{C-1} + input1(N_i, c) \star + \mathcal{S}(input2(N_i, c), dy, dx) + + where :math:`\star` is the valid 2d sliding window convolution operator, + and :math:`\mathcal{S}` means shifting the input features with zero + padding at the margins, and :math:`dx, dy` are the shifting distances, + :math:`dx, dy \in + [-max\_displacement \times dilation\_patch, max\_displacement \times + dilation\_patch]`. + + Args: + kernel_size (int): The size of the sliding window, i.e. the local + neighborhood representing the center points and involved in the + correlation computation. Defaults to 1. + max_displacement (int): The radius for computing the correlation + volume, but the actual working space can be dilated by + dilation_patch. Defaults to 1. + stride (int): The stride of the sliding blocks in the input spatial + dimensions. Defaults to 1. + padding (int): Zero padding added to all four sides of input1. + Defaults to 0. + dilation (int): The spacing of the local neighborhood that will be + involved in the correlation. Defaults to 1. + dilation_patch (int): The spacing between positions at which the + correlation is computed. Defaults to 1.
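+
+    A minimal usage sketch, assuming a CUDA-enabled build of the extension;
+    the output shape follows the formulas above, with a patch size of
+    ``2 * max_displacement + 1 = 7``:
+
+    Example:
+        >>> input1 = torch.randn(1, 32, 24, 24).cuda()
+        >>> input2 = torch.randn(1, 32, 24, 24).cuda()
+        >>> corr = Correlation(max_displacement=3)
+        >>> out = corr(input1, input2)
+        >>> out.shape
+        torch.Size([1, 7, 7, 24, 24])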
+ """ + + def __init__(self, + kernel_size: int = 1, + max_displacement: int = 1, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + dilation_patch: int = 1) -> None: + super().__init__() + self.kernel_size = kernel_size + self.max_displacement = max_displacement + self.stride = stride + self.padding = padding + self.dilation = dilation + self.dilation_patch = dilation_patch + + def forward(self, input1: Tensor, input2: Tensor) -> Tensor: + return CorrelationFunction.apply(input1, input2, self.kernel_size, + self.max_displacement, self.stride, + self.padding, self.dilation, + self.dilation_patch) + + def __repr__(self) -> str: + s = self.__class__.__name__ + s += f'(kernel_size={self.kernel_size}, ' + s += f'max_displacement={self.max_displacement}, ' + s += f'stride={self.stride}, ' + s += f'padding={self.padding}, ' + s += f'dilation={self.dilation}, ' + s += f'dilation_patch={self.dilation_patch})' + return s diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..a8453eaa8d3638394df8a0b169d8df01dfc27a11 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp @@ -0,0 +1,426 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h +#pragma once +#include +#include + +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ +#else +#include +#define HOST_DEVICE +#define HOST_DEVICE_INLINE HOST_DEVICE inline +#endif + +namespace { + +template +struct RotatedBox { + T x_ctr, y_ctr, w, h, a; +}; + +template +struct Point { + T x, y; + HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} + HOST_DEVICE_INLINE Point operator+(const Point& p) const { + return Point(x + p.x, y + p.y); + } + HOST_DEVICE_INLINE Point& operator+=(const Point& p) { + x += p.x; + y += p.y; + return *this; + } + HOST_DEVICE_INLINE Point operator-(const Point& p) const { + return Point(x - p.x, y - p.y); + } + HOST_DEVICE_INLINE Point operator*(const T coeff) const { + return Point(x * coeff, y * coeff); + } +}; + +template +HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { + return A.x * B.x + A.y * B.y; +} + +template +HOST_DEVICE_INLINE T cross_2d(const Point& A, const Point& B) { + return A.x * B.y - B.x * A.y; +} + +template +HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox& box, + Point (&pts)[4]) { + // M_PI / 180. 
== 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; +} + +template <typename T> +HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4], + const Point<T> (&pts2)[4], + Point<T> (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point<T> vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + // Solve for 2x2 Ax=b + T det = cross_2d<T>(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d<T>(vec2[j], vec12) / det; + T t2 = cross_2d<T>(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d<T>(AB, AB); + auto ADdotAD = dot_2d<T>(DA, DA); + for (int i = 0; i < 4; i++) { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d<T>(AP, AB); + auto APdotAD = -dot_2d<T>(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d<T>(AB, AB); + auto ADdotAD = dot_2d<T>(DA, DA); + for (int i = 0; i < 4; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d<T>(AP, AB); + auto APdotAD = -dot_2d<T>(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +template <typename T> +HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24], + const int& num_in, Point<T> (&q)[24], + bool shift_to_zero = false) { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x.
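+  // (The lowest, left-most point is guaranteed to be a vertex of the hull
+  //  and serves as the pivot for the polar-angle sort in Step 3.)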
+ int t = 0; + for (int i = 1; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every point (for sorting in the next step) + for (int i = 0; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d<T>(q[i], q[i]); + } + +#ifdef __CUDACC__ + // CUDA version + // In the future, we can potentially use thrust + // for sorting here to improve speed (though not guaranteed) + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d<T>(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } +#else + // CPU version + std::sort(q + 1, q + num_in, + [](const Point<T>& A, const Point<T>& B) -> bool { + T temp = cross_2d<T>(A, B); + if (fabs(temp) < 1e-6) { + return dot_2d<T>(A, A) < dot_2d<T>(B, B); + } else { + return temp > 0; + } + }); + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d<T>(q[i], q[i]); + } +#endif + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) { + if (dist[k] > 1e-8) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. + // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) { + while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { + m--; + } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In the general case we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return.
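+  // (With shift_to_zero == true the hull stays in the shifted frame, which
+  //  is all the area computations below need; with false, the original
+  //  coordinates are restored by adding `start` back to every hull point.)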
+ if (!shift_to_zero) { + for (int i = 0; i < m; i++) { + q[i] += start; + } + } + + return m; +} + +template <typename T> +HOST_DEVICE_INLINE T quadri_box_area(const Point<T> (&q)[4]) { + T area = 0; +#pragma unroll + for (int i = 1; i < 3; i++) { + area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template <typename T> +HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) { + if (m <= 2) { + return 0; + } + + T area = 0; + for (int i = 1; i < m - 1; i++) { + area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template <typename T> +HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1, + const RotatedBox<T>& box2) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point<T> intersectPts[24], orderedPts[24]; + + Point<T> pts1[4]; + Point<T> pts2[4]; + get_rotated_vertices<T>(box1, pts1); + get_rotated_vertices<T>(box2, pts2); + + int num = get_intersection_points<T>(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. + int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true); + return polygon_area<T>(orderedPts, num_convex); +} + +template <typename T> +HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point<T> (&pts1)[4], + const Point<T> (&pts2)[4]) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point<T> intersectPts[24], orderedPts[24]; + + int num = get_intersection_points<T>(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. + int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true); + return polygon_area<T>(orderedPts, num_convex); +} + +} // namespace + +template <typename T> +HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, + T const* const box2_raw, + const int mode_flag) { + // shift center to the middle point to achieve higher precision in result + RotatedBox<T> box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + const T area1 = box1.w * box1.h; + const T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + const T intersection = rotated_boxes_intersection<T>(box1, box2); + T baseS = 1.0; + if (mode_flag == 0) { + baseS = (area1 + area2 - intersection); + } else if (mode_flag == 1) { + baseS = area1; + } + const T iou = intersection / baseS; + return iou; +} + +template <typename T> +HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw, + T const* const pts2_raw, + const int mode_flag) { + // shift center to the middle point to achieve higher precision in result + Point<T> pts1[4], pts2[4]; + + auto center_shift_x = + (pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + pts2_raw[2] + pts1_raw[4] + + pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) / + 8.0; + auto center_shift_y = + (pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] + + pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) / + 8.0; + pts1[0].x = pts1_raw[0] -
center_shift_x; + pts1[0].y = pts1_raw[1] - center_shift_y; + pts1[1].x = pts1_raw[2] - center_shift_x; + pts1[1].y = pts1_raw[3] - center_shift_y; + pts1[2].x = pts1_raw[4] - center_shift_x; + pts1[2].y = pts1_raw[5] - center_shift_y; + pts1[3].x = pts1_raw[6] - center_shift_x; + pts1[3].y = pts1_raw[7] - center_shift_y; + pts2[0].x = pts2_raw[0] - center_shift_x; + pts2[0].y = pts2_raw[1] - center_shift_y; + pts2[1].x = pts2_raw[2] - center_shift_x; + pts2[1].y = pts2_raw[3] - center_shift_y; + pts2[2].x = pts2_raw[4] - center_shift_x; + pts2[2].y = pts2_raw[5] - center_shift_y; + pts2[3].x = pts2_raw[6] - center_shift_x; + pts2[3].y = pts2_raw[7] - center_shift_y; + + const T area1 = quadri_box_area(pts1); + const T area2 = quadri_box_area(pts2); + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + const T intersection = quadri_boxes_intersection(pts1, pts2); + T baseS = 1.0; + if (mode_flag == 0) { + baseS = (area1 + area2 - intersection); + } else if (mode_flag == 1) { + baseS = area1; + } + const T iou = intersection / baseS; + return iou; +} diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..36e41107ebd52d3cf5e9a71cffe6eddeed4f0765 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh @@ -0,0 +1,59 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu +#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH +#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void active_rotated_filter_forward_cuda_kernel( + const int nthreads, const scalar_t* weight_data, const int* indices_data, + const int num_input_planes, const int num_output_planes, + const int num_orientations, const int num_rotations, const int nEntry, + scalar_t* output_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t val = *(weight_data + index); + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t* target = output_data + + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx; + *target = val; + } + } +} + +template +__global__ void active_rotated_filter_backward_cuda_kernel( + const int nthreads, const scalar_t* gradWeight_data, + const int* indices_data, const int num_input_planes, + const int num_output_planes, const int num_orientations, + const int num_rotations, const int nEntry, scalar_t* weight_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t* val = weight_data + index; + *val = 0; + scalar_t tmp = 0; + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t target = + *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * 
nEntry) + j * (nEntry) + idx); + tmp = tmp + target; + } + *val = tmp; + } +} +#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9f9250844b9ceeca0df0377640c3d28e3f61cecc --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh @@ -0,0 +1,116 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH +#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K) +// output: fout(B,O,N) +// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j) +// i(k) = idx(b,i,k) +// sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j) +// avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k +// max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j))) + +template +__global__ void assign_score_withk_forward_cuda_kernel( + const int B, const int N0, const int N1, const int M, const int K, + const int O, const int aggregate, const T* points, const T* centers, + const T* scores, const int64_t* knn_idx, T* output) { + // ----- parallel loop for B, N1, K and O --------- + CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { + // ------- loop for M ---------- + const int b = (int)(i / (O * N1 * K)); + const int o = (int)(i % (O * N1 * K) / (N1 * K)); + const int n = (int)(i % (N1 * K) / K); + const int k = (int)(i % K); + const int cn = (int)knn_idx[b * K * N1 + n * K + + 0]; // The first neighbor is the center point + const int kn = (int)knn_idx[b * K * N1 + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + assert(b < B); + assert(kn < N0); + assert(cn < N0); + assert(o < O); + assert(n < N1); + const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; + T val = output[out_idx]; + for (int m = 0; m < M; m++) { + val += points[b * N0 * M * O + kn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m] - + centers[b * N0 * M * O + cn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m]; + } + output[out_idx] = val; + } +} + +template +__global__ void assign_score_withk_points_backward_cuda_kernel( + const int B, const int N0, const int N, const int M, const int K, + const int O, const int aggregate, const T* grad_out, const T* scores, + const int64_t* knn_idx, T* grad_points, T* grad_centers) { + // ----- parallel loop for B, M, O --------- + CUDA_1D_KERNEL_LOOP(i, B * M * O) { + int b = (int)(i / (M * O)); + int m = (int)(i % (M * O) / O); + int o = (int)(i % O); + + // ----- loop for N,K --------- + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + int kn = knn_idx[b * N * K + n * K + k]; + int cn = knn_idx[b * N * K + n * K + 0]; + if (kn >= N0 || kn < 0) { // if index overflows, it is out of the + // neighborhood range + continue; + } + atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, + scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + 
o, + -scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + } + } + } +} + +template +__global__ void assign_score_withk_scores_backward_cuda_kernel( + const int B, const int N0, const int N, const int M, const int K, + const int O, const int aggregate, const T* grad_out, const T* points, + const T* centers, const int64_t* knn_idx, T* grad_scores) { + // ----- parallel loop for B, N, K, M --------- + CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { + const int b = (int)(i / (N * M * K)); + const int n = (int)(i % (N * M * K) / M / K); + const int k = (int)(i % (M * K) / M); + const int m = (int)(i % M); + const int cn = knn_idx[b * N * K + n * K + 0]; + const int kn = knn_idx[b * N * K + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + + // -------------- loop for O ------------------------ + const int out_idx = b * N * K * M + n * K * M + k * M + m; + T val = grad_scores[out_idx]; + for (int o = 0; o < O; o++) { + val += (points[b * N0 * M * O + kn * M * O + m * O + o] - + centers[b * N0 * M * O + cn * M * O + m * O + o]) * + grad_out[b * O * N * K + o * N * K + n * K + k]; + } + grad_scores[out_idx] = val; + } +} + +#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..632b5c4940b33a9d8d839fa3f3b92e7b6a2bd29e --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu +#ifndef BALL_QUERY_CUDA_KERNEL_CUH +#define BALL_QUERY_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template <typename T> +__global__ void ball_query_forward_cuda_kernel(int b, int n, int m, + float min_radius, + float max_radius, int nsample, + const T* new_xyz, const T* xyz, + int* idx) { + // new_xyz: (B, M, 3) + // xyz: (B, N, 3) + // output: + // idx: (B, M, nsample) + int bs_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; + + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + + float max_radius2 = max_radius * max_radius; + float min_radius2 = min_radius * min_radius; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; + + int cnt = 0; + for (int k = 0; k < n; ++k) { + T x = xyz[k * 3 + 0]; + T y = xyz[k * 3 + 1]; + T z = xyz[k * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[l] = k; + } + } + idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; + } + } + } +} + +#endif // BALL_QUERY_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..15bd91eca629895d3a99dde3fe6614036ca31dc9 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh @@ -0,0 +1,147 @@ +// Copyright (c) OpenMMLab.
All rights reserved +#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH +#define BBOX_OVERLAPS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template <typename T> +__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1, + T& y1, T& x2, T& y2) { + x1 = bbox[base]; + y1 = bbox[base + 1]; + x2 = bbox[base + 2]; + y2 = bbox[base + 3]; +} + +template <> +__device__ __forceinline__ void load_bbox<float>(const float* bbox, + const int base, float& x1, + float& y1, float& x2, + float& y2) { + const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0]; + x1 = bbox_offset.x; + y1 = bbox_offset.y; + x2 = bbox_offset.z; + y2 = bbox_offset.w; +} + +template <typename T> +__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, + T* ious, const int num_bbox1, + const int num_bbox2, const int mode, + const bool aligned, + const int offset) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, num_bbox1) { + const int b1 = index; + const int b2 = index; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); + ious[index] = interS / baseS; + } + } else { + CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { + const int b1 = index / num_bbox2; + const int b2 = index % num_bbox2; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox<T>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox<T>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); + ious[index] = interS / baseS; + } + } +} + +#if __CUDA_ARCH__ >= 530 +__device__ __forceinline__ __half __half_area(const __half x1, const __half y1, + const __half x2, const __half y2, + const __half offset) { + const __half half_w = __hadd(__hsub(x2, x1), offset); + const __half half_h = __hadd(__hsub(y2, y1), offset); + return __hmul(half_w, half_h); +} + +__device__ __forceinline__ __half __half_max(const __half a, const __half b) { + return __hge(a, b) ? a : b; +} + +__device__ __forceinline__ __half __half_min(const __half a, const __half b) { + return __hle(a, b) ? a : b; +} + +// fp16 does not provide much speedup when aligned == true. It is useful when +// aligned == false, where it yields roughly a 40% gain.
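+// For reference: with bbox1 of shape (m, 4) and bbox2 of shape (n, 4), the
+// launcher is expected to size `ious` with m entries when aligned == true
+// (pairwise i-i overlaps, requiring m == n) and m * n entries otherwise,
+// matching the loop bounds used by the kernels in this header.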
+__device__ void bbox_overlaps_cuda_kernel_half( + const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1, + const int num_bbox2, const int mode, const bool aligned, const int offset) { + const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2; + const __half h_offset = __int2half_rn(offset); + CUDA_1D_KERNEL_LOOP(index, num_output) { + const int b1 = aligned ? index : index / num_bbox2; + const int b2 = aligned ? index : index % num_bbox2; + + const int base1 = b1 << 2; + __half b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset); + + const int base2 = b2 << 2; + __half b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset); + + const __half left = __half_max(b1_x1, b2_x1), + right = __half_min(b1_x2, b2_x2); + const __half top = __half_max(b1_y1, b2_y1), + bottom = __half_min(b1_y2, b2_y2); + const __half width = + __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f)); + const __half height = + __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f)); + const __half interS = __hmul(width, height); + + const __half baseS = __half_max( + mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area, + h_offset); + ious[index] = __hdiv(interS, baseS); + } +} +#endif // __CUDA_ARCH__ >= 530 + +#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..537610416e16aae8979d0843972e090d127b0d43 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh @@ -0,0 +1,230 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu +#ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH +#define BEZIER_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +template +__device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3, + const T u) { + return ((1. - u) * (1. - u) * (1. - u) * p0 + + 3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. 
- u) * p2 + + u * u * u * p3); +} + +template +__global__ void bezier_align_forward_cuda_kernel( + const int nthreads, + const T *bottom_data, // inputs + const T *bottom_rois, // bottom rois contains the bezier curve + T *top_data, // outputs + const int pooled_height, const int pooled_width, const T spatial_scale, + const int sampling_ratio, bool aligned, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + // beziers have size Nx(1+8*2) = Nx17 + const T *offset_bottom_rois = bottom_rois + n * 17; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + + // TODO: avoid this by using parallel annotation, for good + T p0_x = offset_bottom_rois[1] * spatial_scale; + T p0_y = offset_bottom_rois[2] * spatial_scale; + T p1_x = offset_bottom_rois[3] * spatial_scale; + T p1_y = offset_bottom_rois[4] * spatial_scale; + T p2_x = offset_bottom_rois[5] * spatial_scale; + T p2_y = offset_bottom_rois[6] * spatial_scale; + T p3_x = offset_bottom_rois[7] * spatial_scale; + T p3_y = offset_bottom_rois[8] * spatial_scale; + T p4_x = offset_bottom_rois[15] * spatial_scale; + T p4_y = offset_bottom_rois[16] * spatial_scale; + T p5_x = offset_bottom_rois[13] * spatial_scale; + T p5_y = offset_bottom_rois[14] * spatial_scale; + T p6_x = offset_bottom_rois[11] * spatial_scale; + T p6_y = offset_bottom_rois[12] * spatial_scale; + T p7_x = offset_bottom_rois[9] * spatial_scale; + T p7_y = offset_bottom_rois[10] * spatial_scale; + + // compute the coords + const T u = pw / static_cast(pooled_width); + const T v = ph / static_cast(pooled_height); + const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); + const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); + const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); + const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); + const T x_center = x1 * v + x0 * (1. - v) - offset; + const T y_center = y1 * v + y0 * (1. - v) - offset; + + T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); + T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + // When the grid is empty, output zeros == 0/1, instead of NaN. + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = y_center - (T)0.5 * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = x_center - (T)0.5 * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, + index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +template +__global__ void bezier_align_backward_cuda_kernel( + const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int sampling_ratio, bool aligned, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + // beziers have size Nx(1+8*2) = Nx17 + const T *offset_bottom_rois = bottom_rois + n * 17; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T p0_x = offset_bottom_rois[1] * spatial_scale; + T p0_y = offset_bottom_rois[2] * spatial_scale; + T p1_x = offset_bottom_rois[3] * spatial_scale; + T p1_y = offset_bottom_rois[4] * spatial_scale; + T p2_x = offset_bottom_rois[5] * spatial_scale; + T p2_y = offset_bottom_rois[6] * spatial_scale; + T p3_x = offset_bottom_rois[7] * spatial_scale; + T p3_y = offset_bottom_rois[8] * spatial_scale; + T p4_x = offset_bottom_rois[15] * spatial_scale; + T p4_y = offset_bottom_rois[16] * spatial_scale; + T p5_x = offset_bottom_rois[13] * spatial_scale; + T p5_y = offset_bottom_rois[14] * spatial_scale; + T p6_x = offset_bottom_rois[11] * spatial_scale; + T p6_y = offset_bottom_rois[12] * spatial_scale; + T p7_x = offset_bottom_rois[9] * spatial_scale; + T p7_y = offset_bottom_rois[10] * spatial_scale; + + // compute the coords + const T u = pw / static_cast(pooled_width); + const T v = ph / static_cast(pooled_height); + const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); + const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); + const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); + const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); + const T x_center = x1 * v + x0 * (1. - v) - offset; + const T y_center = y1 * v + y0 * (1. - v) - offset; + + T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); + T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T *offset_top_diff = top_diff + top_offset; + const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? 
sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = y_center - (T)0.5 * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = x_center - (T)0.5 * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, + static_cast(g1)); + atomicAdd(offset_bottom_diff + y_low * width + x_high, + static_cast(g2)); + atomicAdd(offset_bottom_diff + y_high * width + x_low, + static_cast(g3)); + atomicAdd(offset_bottom_diff + y_high * width + x_high, + static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // BezierAlignBackward + +#endif // BEZIER_ALIGN_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1d2a2197b45ef5c82412c4b75d7819a7e27674f6 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh @@ -0,0 +1,200 @@ +// Copyright (c) OpenMMLab. All rights reserved +// modified from +// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu. +// the main difference: (1) use `argmax_idx` for fast computing of gradient +// during the backward. (2) `wh` is directly computed by `boxes`, rather than +// passing it as argument to forward or backward functions. 
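+//
+// Shapes assumed by the kernels below: `input` is (N, 4C, H, W) with the four
+// C-channel groups holding the top/left/bottom/right border features, `boxes`
+// is (N, box_size, 4) in (x1, y1, x2, y2) order, and `output` / `argmax_idx`
+// are (N, C, box_size, 4).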
+ +#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH +#define BORDER_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 }; + +/*** Forward ***/ +template +__global__ void border_align_forward_cuda_kernel( + const int nthreads, const T* input, const T* boxes, T* output, + int* argmax_idx, const int channels, const int box_size, const int height, + const int width, const int pool_size) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (batch_idx, c_idx, box_idx) is an element paralleled for computing + // output, and `extreme_idx` is in range [0,3] + int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx; + const T *offset_box, *offset_input, *offset_box_x; + T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y, + val, maxval; + + extreme_idx = threadIdx.y; + // shape (N, C, box_size, 4) for output + batch_idx = index / channels / box_size; + // shape (N, box_size, 4) for boxes + box_idx = index % box_size + batch_idx * box_size; + c_idx = (index / box_size) % channels; + + offset_box = boxes + box_idx * 4; + box_width = *(offset_box + 2) - *offset_box; + box_height = *(offset_box + 3) - *(offset_box + 1); + offset_output = output + index * 4 + extreme_idx; + offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; + // shape (N, 4C, h, w) for input. + // [0,C) for top feature, [C,2C) for left feature, + // [2C,3C) for bottom feature, [3C,4C) for right feature + offset_input = + input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) * + height * width; + + // extreme_idx in [0,1] -> offset_box_x indexed at x1 + // extreme_idx in [2,3] -> offset_box_x indexed at x2 + offset_box_x = offset_box + extreme_idx / 2 * 2; + + // (x1,y1) or (x2,y2) for (x,y) + x = *offset_box_x; + y = *(offset_box_x + 1); + + switch (extreme_idx) { + // top + case BorderMode::Top: + stride = box_width / pool_size; + x_stride = stride; + y_stride = 0; + break; + // left + case BorderMode::Left: + stride = box_height / pool_size; + x_stride = 0; + y_stride = stride; + break; + // bottom + case BorderMode::Bottom: + stride = box_width / pool_size; + x_stride = -stride; + y_stride = 0; + break; + // right + case BorderMode::Right: + stride = box_height / pool_size; + x_stride = 0; + y_stride = -stride; + break; + } + + // initialize maxval and maxidx with the start position (e.g. 
(x1,y1) or + // (x2,y2)) + maxval = bilinear_interpolate(offset_input, height, width, y, x, index); + maxidx = 0; + + // do max_pool along the border + for (int i = 1; i <= pool_size; i++) { + x += x_stride; + y += y_stride; + val = bilinear_interpolate(offset_input, height, width, y, x, index); + if (val > maxval) { + maxval = val; + maxidx = i; + } + } + + // update output and argmax_idx + *offset_output = maxval; + *offset_argmax_idx = maxidx; + } +} + +/*** Backward ***/ +template +__global__ void border_align_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* boxes, + const int* argmax_idx, T* grad_input, const int channels, + const int box_size, const int height, const int width, + const int pool_size) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (batch_idx, c_idx, box_idx) is an element paralleled for computing + // output, and `extreme_idx` is in range [0,3] + int batch_idx, c_idx, box_idx, extreme_idx; + const int* offset_argmax_idx; + const T *offset_grad_output, *offset_box, *offset_box_x; + T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x, + y; + + extreme_idx = threadIdx.y; + batch_idx = index / channels / box_size; + box_idx = index % box_size + batch_idx * box_size; + c_idx = (index / box_size) % channels; + + offset_box = boxes + box_idx * 4; + box_width = *(offset_box + 2) - *offset_box; + box_height = *(offset_box + 3) - *(offset_box + 1); + offset_grad_output = grad_output + index * 4 + extreme_idx; + offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; + // [0,C) for top feature grad, [C,2C) for left feature grad, + // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad + offset_grad_input = grad_input + (batch_idx * channels * 4 + + extreme_idx * channels + c_idx) * + height * width; + + // extreme_idx in [0,1] -> offset_box_x indexed at x1 + // extreme_idx in [2,3] -> offset_box_x indexed at x2 + offset_box_x = offset_box + extreme_idx / 2 * 2; + + switch (extreme_idx) { + // top + case BorderMode::Top: + stride = box_width / pool_size; + x_stride = stride; + y_stride = 0; + break; + // left + case BorderMode::Left: + stride = box_height / pool_size; + x_stride = 0; + y_stride = stride; + break; + // bottom + case BorderMode::Bottom: + stride = box_width / pool_size; + x_stride = -stride; + y_stride = 0; + break; + // right + case BorderMode::Right: + stride = box_height / pool_size; + x_stride = 0; + y_stride = -stride; + break; + } + + // get position (x,y) which has maximum value during forward + x = *offset_box_x; + y = *(offset_box_x + 1); + x += x_stride * (T)(*offset_argmax_idx); + y += y_stride * (T)(*offset_argmax_idx); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, + x_high, y_low, y_high, index); + + // update grad_output + atomicAdd(offset_grad_input + y_low * width + x_low, + *offset_grad_output * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + *offset_grad_output * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + *offset_grad_output * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + *offset_grad_output * w4); + } +} + +#endif // BORDER_ALIGN_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh new file mode 100644 index 
0000000000000000000000000000000000000000..cf8ad5e1a324de3a11c8fc8af28a8d559a661ed6 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh @@ -0,0 +1,91 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#ifndef BOX_IOU_QUADRI_CUDA_CUH +#define BOX_IOU_QUADRI_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +template +__global__ void box_iou_quadri_cuda_kernel( + const int n_boxes1, const int n_boxes2, const T* dev_boxes1, + const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, n_boxes1) { + int b1 = index; + int b2 = index; + + int base1 = b1 * 8; + + float block_boxes1[8]; + float block_boxes2[8]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + block_boxes1[5] = dev_boxes1[base1 + 5]; + block_boxes1[6] = dev_boxes1[base1 + 6]; + block_boxes1[7] = dev_boxes1[base1 + 7]; + + int base2 = b2 * 8; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + block_boxes2[5] = dev_boxes2[base2 + 5]; + block_boxes2[6] = dev_boxes2[base2 + 6]; + block_boxes2[7] = dev_boxes2[base2 + 7]; + + dev_ious[index] = + single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); + } + } else { + CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { + int b1 = index / n_boxes2; + int b2 = index % n_boxes2; + + int base1 = b1 * 8; + + float block_boxes1[8]; + float block_boxes2[8]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + block_boxes1[5] = dev_boxes1[base1 + 5]; + block_boxes1[6] = dev_boxes1[base1 + 6]; + block_boxes1[7] = dev_boxes1[base1 + 7]; + + int base2 = b2 * 8; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + block_boxes2[5] = dev_boxes2[base2 + 5]; + block_boxes2[6] = dev_boxes2[base2 + 6]; + block_boxes2[7] = dev_boxes2[base2 + 7]; + + dev_ious[index] = + single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); + } + } +} + +#endif diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..abd47cd85437804310886de057b5a839a49481b2 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh @@ -0,0 +1,81 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu +#ifndef BOX_IOU_ROTATED_CUDA_CUH +#define BOX_IOU_ROTATED_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +template +__global__ void box_iou_rotated_cuda_kernel( + const int n_boxes1, const int n_boxes2, const T* dev_boxes1, + const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, n_boxes1) { + int b1 = index; + int b2 = index; + + int base1 = b1 * 5; + + float block_boxes1[5]; + float block_boxes2[5]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + + int base2 = b2 * 5; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + + dev_ious[index] = + single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); + } + } else { + CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { + int b1 = index / n_boxes2; + int b2 = index % n_boxes2; + + int base1 = b1 * 5; + + float block_boxes1[5]; + float block_boxes2[5]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + + int base2 = b2 * 5; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + + dev_ious[index] = + single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); + } + } +} + +#endif diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..311900fcd303483dea815a1eb996a7eb33fdc55b --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh @@ -0,0 +1,335 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef CARAFE_CUDA_KERNEL_CUH +#define CARAFE_CUDA_KERNEL_CUH + +#include + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#ifdef MMCV_WITH_HIP +#define WARP_SIZE 64 +#else +#define WARP_SIZE 32 +#endif +#define THREADS_PER_PIXEL 32 +#define MAX_SHARED_MEMORY 49152 +#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 +#define MAXIMIZE_KERNEL_SIZE true +#define kTileDim 32 +#define kBlockRows 8 +#define FULL_MASK 0xffffffff + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +__device__ inline int Loc2Index(const int n, const int c, const int h, + const int w, const int channel_num, + const int height, const int width) { + int index = w + (h + (c + n * channel_num) * height) * width; + return index; +} +#ifndef MMCV_WITH_HIP +/* TODO: move this to a common place */ +template <typename scalar_t> +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template <typename scalar_t> +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} +#endif +template <typename scalar_t> +__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) +#ifdef MMCV_WITH_HIP + val += __shfl_down(val, offset); +#else + val += __shfl_down_sync(FULL_MASK, val, offset); +#endif + return val; +} + +template <> +__device__ __forceinline__ phalf warpReduceSum(phalf val) { + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) +#ifdef MMCV_WITH_HIP + // Using PyTorch's macro for half support + __PHALF(val) += WARP_SHFL_DOWN(val, offset); +#else + __PHALF(val) += + __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset); +#endif + return val; +} + +// Splits the original matrix into submatrices with size 32 * 32. +// Each block transposes one submatrix by loading it into shared memory.
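+// The extra column in the shared-memory tile (kTileDim + 1 rather than
+// kTileDim) staggers addresses across banks so that the column-wise accesses
+// made during the transpose avoid shared-memory bank conflicts.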
+// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ +template +__global__ void BatchTranspose2DCUDAKernel(const int N, const int H, + const int W, const int dh, + const int dw, + const scalar_t *__restrict__ X, + scalar_t *__restrict__ Y) { + __shared__ scalar_t tile[kTileDim][kTileDim + 1]; + const int n = blockIdx.x / (dh * dw); + const int k = blockIdx.x % (dh * dw); + const int r = k / dw; + const int c = k % dw; + const int offset = n * H * W; + int x = c * kTileDim + threadIdx.x; + int y = r * kTileDim + threadIdx.y; + if (x < W) { + for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { + tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x]; + } + } + __syncthreads(); + x = r * kTileDim + threadIdx.x; + y = c * kTileDim + threadIdx.y; + if (x < H) { + for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { + Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; + } + } +} +template +__global__ void CARAFEForward( + const int num_kernels, const scalar_t *__restrict__ bottom_data, + const scalar_t *__restrict__ bottom_masks, const int kernel_size, + const int group_size, const int scale_factor, const int channels, + const int down_height, const int down_width, const int height, + const int width, const int mask_channels, scalar_t *__restrict__ top_data) { +#if MAXIMIZE_KERNEL_SIZE + __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; +#else + __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; +#endif + + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; + const int split_id = threadIdx.x % THREADS_PER_PIXEL; + index = index / THREADS_PER_PIXEL; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + + const int down_pw = pw / scale_factor; + const int down_ph = ph / scale_factor; + + const int start_w = down_pw - (kernel_size - 1) / 2; + const int end_w = down_pw + (kernel_size - 1) / 2 + 1; + const int start_h = down_ph - (kernel_size - 1) / 2; + const int end_h = down_ph + (kernel_size - 1) / 2 + 1; + for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { + int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels); + shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; + } + __syncthreads(); + + const int channels_per_group = ceilf(channels / (float)group_size); +#pragma unroll + for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { + int mask_group = c / channels_per_group; + scalar_t output_val = 0; +#pragma unroll + for (int iy = start_h; iy < end_h; iy++) { +#pragma unroll + for (int ix = start_w; ix < end_w; ix++) { + if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { + continue; + } + int mask_iy = iy - down_ph + (kernel_size - 1) / 2; + int mask_ix = ix - down_pw + (kernel_size - 1) / 2; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = + Loc2Index(n, iy, ix, c, down_height, down_width, channels); + + output_val += bottom_data[feat_index] * + shared_mask[mask_c * WARP_SIZE + pixel_id]; + } + } + + int top_index = Loc2Index(n, ph, pw, c, height, width, channels); + top_data[top_index] = output_val; + } +} + +template +__global__ void CARAFEBackward_Feature( + const int num_kernels, const scalar_t *__restrict__ top_diff, + const scalar_t *__restrict__ bottom_masks, const int kernel_size, + const int 
+template <typename scalar_t>
+__global__ void CARAFEBackward_Feature(
+    const int num_kernels, const scalar_t *__restrict__ top_diff,
+    const scalar_t *__restrict__ bottom_masks, const int kernel_size,
+    const int group_size, const int scale_factor, const int channels,
+    const int down_height, const int down_width, const int height,
+    const int width, const int mask_channels,
+    scalar_t *__restrict__ bottom_diff) {
+#if MAXIMIZE_KERNEL_SIZE
+  __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
+#else
+  __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
+#endif
+
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index > num_kernels - 1) {
+    return;
+  }
+
+  const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
+  const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+  // (n, c, ph, pw) is an element in the bottom_data
+  index = index / THREADS_PER_PIXEL;
+  const int pw = index % width;
+  const int ph = (index / width) % height;
+  const int n = index / width / height;
+
+  const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
+  const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
+  const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
+  const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
+  for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
+    const int mask_w = (c % kernel_size) * scale_factor;
+    const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
+    const int mask_x = start_w + mask_w;
+    const int mask_y = start_h + mask_h;
+    if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 ||
+        mask_x > width - 1) {
+      shared_mask[c * WARP_SIZE + pixel_id] = 0;
+      continue;
+    }
+    const int mask_group = c / (kernel_size * kernel_size);
+    const int mask_c =
+        (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
+    int mask_index =
+        Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
+    shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
+  }
+  __syncthreads();
+  const int channels_per_group = ceilf(channels / (float)group_size);
+#pragma unroll
+  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+    int mask_group = c / channels_per_group;
+    int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
+    scalar_t output_val = 0;
+#pragma unroll
+    for (int iy = start_h; iy < end_h; iy += scale_factor) {
+#pragma unroll
+      for (int ix = start_w; ix < end_w; ix += scale_factor) {
+        if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
+          continue;
+        }
+        int mask_iy =
+            (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+        int mask_ix =
+            (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+        int mask_c =
+            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+        int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
+        output_val +=
+            shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
+      }
+    }
+    bottom_diff[top_index] = output_val;
+  }
+}
+
+template <typename scalar_t>
+__global__ void FeatureSum(const int num_kernels,
+                           const scalar_t *__restrict__ input_data,
+                           const int scale_factor, const int channels,
+                           const int height, const int width,
+                           scalar_t *__restrict__ output_data) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index > num_kernels - 1) {
+    return;
+  }
+  const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+  index = index / THREADS_PER_PIXEL;
+  const int pw = index % width;
+  const int ph = (index / width) % height;
+  const int n = index / width / height;
+  for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+    scalar_t output_val = 0;
+    for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {
+      for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {
+        int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,
+                                 width * scale_factor, channels);
+        output_val += input_data[input_id];
+      }
+    }
+    const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);
+    output_data[output_id] = output_val;
+  }
+}
+
+template <typename scalar_t>
+__global__ void CARAFEBackward_Mask(const int num_kernels,
+                                    const scalar_t *__restrict__ top_diff,
+                                    const scalar_t *__restrict__ bottom_data,
+                                    const int kernel_size, const int group_size,
+                                    const int scale_factor, const int channels,
+                                    const int down_height, const int down_width,
+                                    const int height, const int width,
+                                    const int mask_channels,
+                                    scalar_t *__restrict__ mask_diff) {
+  int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index > num_kernels - 1) {
+    return;
+  }
+
+  const int lane_id = index % WARP_SIZE;
+  index = index / WARP_SIZE;
+  const int mask_c = index % mask_channels;
+  // (n, c, ph, pw) is an element in the bottom_data
+  index = index / mask_channels;
+  const int pw = index % width;
+  const int ph = (index / width) % height;
+  const int n = index / width / height;
+
+  const int down_pw = pw / scale_factor;
+  const int down_ph = ph / scale_factor;
+
+  const int mask_group = mask_c / (kernel_size * kernel_size);
+  const int mask_loc = mask_c % (kernel_size * kernel_size);
+
+  const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
+  const int offset_y =
+      mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;
+
+  const int down_x = down_pw + offset_x;
+  const int down_y = down_ph + offset_y;
+
+  scalar_t output_val = 0;
+
+  if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
+      down_x <= down_width - 1) {
+    const int channels_per_mask = ceilf(channels / (float)group_size);
+    const int start = channels_per_mask * mask_group;
+    const int end = min(channels_per_mask * (mask_group + 1), channels);
+    for (int c = start + lane_id; c < end; c += WARP_SIZE) {
+      int bottom_id =
+          Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
+      int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
+      output_val += top_diff[top_id] * bottom_data[bottom_id];
+    }
+  }
+#ifdef MMCV_WITH_HIP
+  __syncthreads();
+#else
+  __syncwarp();
+#endif
+  output_val = warpReduceSum(output_val);
+  if (lane_id == 0) {
+    const int mask_id =
+        Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
+    mask_diff[mask_id] = output_val;
+  }
+}
+
+#endif  // CARAFE_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..48230c632f223b736aa72a9d5fd682c97b3aa93a
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
@@ -0,0 +1,111 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
+#define CARAFE_NAIVE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+__device__ inline int Loc2Index(const int n, const int c, const int h,
+                                const int w, const int channel_num,
+                                const int height, const int width) {
+  int index = w + (h + (c + n * channel_num) * height) * width;
+  return index;
+}
+
+template <typename scalar_t>
+__global__ void carafe_naive_forward_cuda_kernel(
+    const int nthreads, const scalar_t *bottom_data,
+    const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
+    const int group_size, const int scale_factor, const int channels,
+    const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the bottom_data
+    int pw = index % width;
+    int ph = (index / width) % height;
+    int c = (index / width / height) % channels;
+    int n = index / width / height / channels;
+
+    int mask_channels = kernel_size * kernel_size * group_size;
+    int mask_group = c / (channels / group_size);
+
+    int down_pw = pw / scale_factor;
+    int down_ph = ph / scale_factor;
+    int down_width = width / scale_factor;
+    int down_height = height / scale_factor;
+    int start_w = down_pw - (kernel_size - 1) / 2;
+    int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+    int start_h = down_ph - (kernel_size - 1) / 2;
+    int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+    scalar_t output_val = 0;
+    for (int iy = start_h; iy < end_h; iy++) {
+      for (int ix = start_w; ix < end_w; ix++) {
+        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+          continue;
+        }
+        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+        int mask_c =
+            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+        int feat_index =
+            Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+        int mask_index =
+            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+        output_val += bottom_data[feat_index] * bottom_masks[mask_index];
+      }
+    }
+    top_data[index] = output_val;
+  }
+}
+
+template <typename scalar_t>
+__global__ void carafe_naive_backward_cuda_kernel(
+    const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
+    const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
+    const int kernel_size, const int group_size, const int scale_factor,
+    const int channels, const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the bottom_data
+    int pw = index % width;
+    int ph = (index / width) % height;
+    int c = (index / width / height) % channels;
+    int n = index / width / height / channels;
+
+    int mask_channels = kernel_size * kernel_size * group_size;
+    int mask_group = c / (channels / group_size);
+
+    int down_pw = pw / scale_factor;
+    int down_ph = ph / scale_factor;
+    int down_width = width / scale_factor;
+    int down_height = height / scale_factor;
+    int start_w = down_pw - (kernel_size - 1) / 2;
+    int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+    int start_h = down_ph - (kernel_size - 1) / 2;
+    int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+    for (int iy = start_h; iy < end_h; iy++) {
+      for (int ix = start_w; ix < end_w; ix++) {
+        if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+          continue;
+        }
+        int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+        int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+        int mask_c =
+            (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+        int feat_index =
+            Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+        int mask_index =
+            Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+        atomicAdd(bottom_diff + feat_index,
+                  bottom_masks[mask_index] * top_diff[index]);
+        atomicAdd(mask_diff + mask_index,
+                  bottom_data[feat_index] * top_diff[index]);
+      }
+    }
+  }
+}
+
+#endif  // CARAFE_NAIVE_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..89feea4a546a5093967f26393ca6be3b9fe6ae05
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh
@@ -0,0 +1,101 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
+#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH
+#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAX_SHARED_SCALAR_T 6144  // 49152 / 8 = 6144
+
+template <typename scalar_t>
+__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,
+                                                     const scalar_t* xyz, int m,
+                                                     const scalar_t* xyz2,
+                                                     scalar_t* result,
+                                                     int* result_i) {
+  __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
+  for (int i = blockIdx.x; i < b; i += gridDim.x) {
+    for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {
+      int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;
+      for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {
+        buf[j] = xyz2[(i * m + k2) * 2 + j];
+      }
+      __syncthreads();
+      for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
+        scalar_t x1 = xyz[(i * n + j) * 2 + 0];
+        scalar_t y1 = xyz[(i * n + j) * 2 + 1];
+        int best_i = 0;
+        scalar_t best = 1e10;
+        int end_ka = end_k & (~2);
+        if (end_ka == THREADS_PER_BLOCK) {
+          for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
+#pragma unroll
+            for (int j = 0; j < 4; ++j) {
+              scalar_t x2 = buf[(k + j) * 2] - x1;
+              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
+              scalar_t d = x2 * x2 + y2 * y2;
+              if (d < best) {
+                best = d;
+                best_i = k + k2 + j;
+              }
+            }
+          }
+        } else {
+          for (int k = 0; k < end_ka; k += 4) {
+#pragma unroll
+            for (int j = 0; j < 4; ++j) {
+              scalar_t x2 = buf[(k + j) * 2] - x1;
+              scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
+              scalar_t d = x2 * x2 + y2 * y2;
+              if (d < best) {
+                best = d;
+                best_i = k + k2 + j;
+              }
+            }
+          }
+        }
+        for (int k = end_ka; k < end_k; k++) {
+          scalar_t x2 = buf[k * 2 + 0] - x1;
+          scalar_t y2 = buf[k * 2 + 1] - y1;
+          scalar_t d = x2 * x2 + y2 * y2;
+          if (k == 0 || d < best) {
+            best = d;
+            best_i = k + k2;
+          }
+        }
+        if (k2 == 0 || result[(i * n + j)] > best) {
+          result[(i * n + j)] = best;
+          result_i[(i * n + j)] = best_i;
+        }
+      }
+      __syncthreads();
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void chamfer_distance_backward_cuda_kernel(
+    int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
+    const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
+    scalar_t* grad_xyz2) {
+  for (int i = blockIdx.x; i < b; i += gridDim.x) {
+    for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
+      scalar_t x1 = xyz1[(i * n + j) * 2 + 0];
+      scalar_t y1 = xyz1[(i * n + j) * 2 + 1];
+      int j2 = idx1[i * n + j];
+      scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];
+      scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];
+      scalar_t g = grad_dist1[i * n + j] * 2;
+      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));
+      atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));
+      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));
+      atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));
+    }
+  }
+}
+#endif  // CHAMFER_DISTANCE_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b12aa9a26a2cc162fd89f68ccc97e17749090a41
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
@@ -0,0 +1,120 @@
+#ifndef COMMON_CUDA_HELPER
+#define COMMON_CUDA_HELPER
+
+#include <cuda.h>
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
+  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
+       i += blockDim.x * gridDim.x)                                 \
+    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
+         j += blockDim.y * gridDim.y)
+
+#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m)          \
+  for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
+    for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
+
+#define THREADS_PER_BLOCK 512
+
+inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
+  int optimal_block_num = (N + num_threads - 1) / num_threads;
+  int max_block_num = 4096;
+  return min(optimal_block_num, max_block_num);
+}
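A minimal sketch of how these two helpers combine (hypothetical scale_demo kernel, not from mmcv): the grid-stride macro lets a capped grid cover any N, while GET_BLOCKS limits the launch to at most 4096 blocks of THREADS_PER_BLOCK threads:

  __global__ void scale_demo(const float *in, float *out, int n, float alpha) {
    CUDA_1D_KERNEL_LOOP(i, n) { out[i] = alpha * in[i]; }
  }
  // launch: scale_demo<<<GET_BLOCKS(n), THREADS_PER_BLOCK>>>(in, out, n, 2.f);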
+template <typename T>
+__device__ T bilinear_interpolate(const T* input, const int height,
+                                  const int width, T y, T x,
+                                  const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
+
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
+
+  int y_low = (int)y;
+  int x_low = (int)x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // do bilinear interpolation
+  T v1 = input[y_low * width + x_low];
+  T v2 = input[y_low * width + x_high];
+  T v3 = input[y_high * width + x_low];
+  T v4 = input[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
+    int& x_low, int& x_high, int& y_low, int& y_high,
+    const int index /* index for debug only*/) {
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    // empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
+
+  y_low = (int)y;
+  x_low = (int)x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T)y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T)x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = input[y_low * width + x_low];
+  // T v2 = input[y_low * width + x_high];
+  // T v3 = input[y_high * width + x_low];
+  // T v4 = input[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+#endif  // COMMON_CUDA_HELPER
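The gradient helper hands back the four corner weights and indices so a backward kernel can scatter with atomics. A sketch of that intended pattern (hypothetical device function; the guard works because out-of-range points come back with all indices set to -1 and zero weights):

  template <typename T>
  __device__ void scatter_bilinear_grad(T *grad_input, int height, int width,
                                        T y, T x, T top_grad) {
    T w1, w2, w3, w4;
    int x_low, x_high, y_low, y_high;
    bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,
                                  x_high, y_low, y_high, /*index=*/0);
    if (x_low >= 0 && y_low >= 0) {
      atomicAdd(grad_input + y_low * width + x_low, top_grad * w1);
      atomicAdd(grad_input + y_low * width + x_high, top_grad * w2);
      atomicAdd(grad_input + y_high * width + x_low, top_grad * w3);
      atomicAdd(grad_input + y_high * width + x_high, top_grad * w4);
    }
  }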
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2af96f7963ec347486ced942a5ef7cc4f187db8b
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
@@ -0,0 +1,831 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
+#define CONVEX_IOU_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAXN 100
+#define NMAX 512
+__device__ const double EPS = 1E-8;
+
+__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }
+
+struct Point {
+  double x, y;
+  __device__ Point() {}
+  __device__ Point(double x, double y) : x(x), y(y) {}
+};
+
+__device__ inline bool point_same(Point& a, Point& b) {
+  return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;
+}
+
+__device__ inline void swap1(Point* a, Point* b) {
+  Point temp;
+  temp.x = a->x;
+  temp.y = a->y;
+
+  a->x = b->x;
+  a->y = b->y;
+
+  b->x = temp.x;
+  b->y = temp.y;
+}
+
+__device__ inline void reverse1(Point* a, const int n) {
+  for (int i = 0; i < (n - 1) / 2.0; i++) {
+    Point* j = &(a[i]);
+    Point* k = &(a[n - 1 - i]);
+    swap1(j, k);
+  }
+}
+
+__device__ inline double cross(Point o, Point a, Point b) {
+  return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
+}
+
+__device__ inline double dis(Point a, Point b) {
+  return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
+}
+__device__ inline double area(Point* ps, int n) {
+  ps[n] = ps[0];
+  double res = 0;
+  for (int i = 0; i < n; i++) {
+    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+  }
+  return res / 2.0;
+}
+__device__ inline double polygon_area_grad(Point* ps, int n,
+                                           int* polygon_to_pred_index,
+                                           int n_pred, double* grad_C) {
+  ps[n] = ps[0];
+  double partion_grad[4 * 30 + 2];
+  double res = 0;
+  for (int i = 0; i < n; i++) {
+    res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+    partion_grad[i * 4 + 2] = ps[i + 1].y;
+    partion_grad[i * 4 + 3] = -ps[i + 1].x;
+    if (i != n - 1) {
+      partion_grad[i * 4 + 4] = -ps[i].y;
+      partion_grad[i * 4 + 5] = ps[i].x;
+    } else {
+      partion_grad[0] = -ps[i].y;
+      partion_grad[1] = ps[i].x;
+    }
+  }
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < n_pred; j++) {
+      if (i == polygon_to_pred_index[j]) {
+        grad_C[2 * polygon_to_pred_index[j + n_pred]] =
+            (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
+        break;
+      }
+    }
+    for (int j = 0; j < n_pred; j++) {
+      if (i == polygon_to_pred_index[j]) {
+        grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
+            (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
+        break;
+      }
+    }
+  }
+
+  return res / 2.0;
+}
+
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
+                                double* cut_grad, int m, int n, int i) {
+  double s1, s2;
+  double s2_s1_2;
+  double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
+  double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd,
+      dyp_dyd;
+  s1 = cross(a, b, c);
+  s2 = cross(a, b, d);
+
+  ds1_dxc = -(b.y - a.y);
+  ds1_dyc = b.x - a.x;
+  ds2_dxd = ds1_dxc;
+  ds2_dyd = ds1_dyc;
+  s2_s1_2 = (s2 - s1) * (s2 - s1);
+
+  if (sig(s1) == 0 && sig(s2) == 0) return 2;
+  if (sig(s2 - s1) == 0) return 0;
+
+  dxp_dxc =
+      ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
+      (s2_s1_2);
+  dxp_dyc =
+      ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
+      (s2_s1_2);
+  dxp_dxd =
+      ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
+      (s2_s1_2);
+  dxp_dyd =
+      ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
+      (s2_s1_2);
+
+  dyp_dxc =
+      ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
+      (s2_s1_2);
+  dyp_dyc =
+      ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
+      (s2_s1_2);
+  dyp_dxd =
+      ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
+      (s2_s1_2);
+  dyp_dyd =
+      ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
+      (s2_s1_2);
+
+  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+  if (i == n - 1) {
+    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
+    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
+    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+    cut_grad[4 * n * m + 0] = dxp_dxd;  // + dyp_dxd;
+    cut_grad[4 * n * m + 1] = dyp_dxd;
+    cut_grad[4 * n * m + 2] = dxp_dyd;  // + dyp_dyd;
+    cut_grad[4 * n * m + 3] = dyp_dyd;
+  } else {
+    cut_grad[4 * n * m + 4 * i] = dxp_dxc;  // + dyp_dxc;
+    cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+    cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc;  // + dyp_dyc;
+    cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+    cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd;  // + dyp_dxd;
+    cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
+    cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd;  // + dyp_dyd;
+    cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
+  }
+
+  return 1;
+}
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
+                                   double* cut_grad) {
+  Point pp[MAXN];
+  double ccur_grad[MAXN] = {};
+  int m = 0;
+  p[n] = p[0];
+  int k = n;
+  for (int i = 0; i < n; i++) {
+    if (sig(cross(a, b, p[i])) > 0) {
+      pp[m] = p[i];
+      ccur_grad[4 * n * m + 4 * i] = 1.0;
+      ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
+      m++;
+    }
+    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+      lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
+      m++;
+    }
+  }
+
+  n = 0;
+  for (int i = 0; i < m; i++) {
+    if (!i || !(point_same(pp[i], pp[i - 1]))) {
+      p[n] = pp[i];
+      for (int j = 0; j < 4 * k; j++) {
+        cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
+      }
+      n++;
+    }
+  }
+
+  while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
+                                       double* grad_AB, int order,
+                                       int convex_n) {
+  Point o(0, 0);
+  int res_flag = 0;
+  int s1 = sig(cross(o, a, b));
+  int s2 = sig(cross(o, c, d));
+  if (s1 == 0 || s2 == 0) return 0.0;
+  if (s1 == -1) {
+    Point* i = &a;
+    Point* j = &b;
+    swap1(i, j);
+    res_flag = 1;
+  }
+  if (s2 == -1) {
+    Point* i = &c;
+    Point* j = &d;
+    swap1(i, j);
+  }
+  Point p[10] = {o, a, b};
+  int n = 3, n0 = 3, n1, n2, n3;
+  double cut_grad1[MAXN] = {};
+  double cut_grad2[MAXN] = {};
+  double cut_grad3[MAXN] = {};
+  double p1_p_grad[10][10] = {};
+  double p2_p1_grad[10][10] = {};
+  double p3_p2_grad[10][10] = {};
+
+  double p3_p1_grad[10][10] = {};
+  double p3_p_grad[10][10] = {};
+
+  // 1
+  polygon_cut(p, n, o, c, cut_grad1);
+  n1 = n;
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < 4 * n0; j++) {
+      if (!(j % 2)) {
+        p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
+      } else {
+        p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
+      }
+    }
+  }
+
+  // 2
+  polygon_cut(p, n, c, d, cut_grad2);
+  n2 = n;
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < 4 * n1; j++) {
+      if (!(j % 2)) {
+        p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
+      } else {
+        p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
+      }
+    }
+  }
+  // 3
+  polygon_cut(p, n, d, o, cut_grad3);
+  n3 = n;
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < 4 * n2; j++) {
+      if (!(j % 2)) {
+        p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
+      } else {
+        p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
+      }
+    }
+  }
+
+  // mul
+  // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
+  for (int i = 0; i < 2 * n3; i++) {
+    for (int j = 0; j < 2 * n1; j++) {
+      double sum = 0.0;
+      for (int m = 0; m < 2 * n2; m++) {
+        sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
+      }
+      p3_p1_grad[i][j] = sum;
+    }
+  }
+
+  // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
+  for (int i = 0; i < 2 * n3; i++) {
+    for (int j = 0; j < 2 * n0; j++) {
+      double sum = 0.0;
+      for (int m = 0; m < 2 * n1; m++) {
+        sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
+      }
+      p3_p_grad[i][j] = sum;
+    }
+  }
+
+  // calculate S_grad
+  int polygon_index_box_index[20];
+  double grad_polygon[20];
+  double S_grad[6];
+
+  for (int i = 0; i < n3; i++) {
+    polygon_index_box_index[i] = i;
+    polygon_index_box_index[i + n3] = i;
+  }
+
+  double res =
+      polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);
+
+  if (s1 * s2 == -1) {
+    for (int j = 0; j < 2 * 3; j++) {
+      double sum = 0.0;
+      for (int m = 0; m < 2 * n3; m++) {
+        sum = sum - grad_polygon[m] * p3_p_grad[m][j];
+      }
+      S_grad[j] = sum;
+    }
+
+    if (order != convex_n - 1) {
+      if (res_flag) {
+        grad_AB[2 * order] += S_grad[4];
+        grad_AB[2 * order + 1] += S_grad[5];
+        grad_AB[2 * order + 2] += S_grad[2];
+        grad_AB[2 * order + 3] += S_grad[3];
+
+      } else {
+        grad_AB[2 * order] += S_grad[2];
+        grad_AB[2 * order + 1] += S_grad[3];
+        grad_AB[2 * order + 2] += S_grad[4];
+        grad_AB[2 * order + 3] += S_grad[5];
+      }
+    } else {
+      if (res_flag) {
+        grad_AB[2 * order] += S_grad[4];
+        grad_AB[2 * order + 1] += S_grad[5];
+        grad_AB[0] += S_grad[2];
+        grad_AB[1] += S_grad[3];
+
+      } else {
+        grad_AB[2 * order] += S_grad[2];
+        grad_AB[2 * order + 1] += S_grad[3];
+        grad_AB[0] += S_grad[4];
+        grad_AB[1] += S_grad[5];
+      }
+    }
+    res = -res;
+  } else {
+    for (int j = 0; j < 2 * 3; j++) {
+      double sum = 0.0;
+      for (int m = 0; m < 2 * n3; m++) {
+        sum = sum + grad_polygon[m] * p3_p_grad[m][j];
+      }
+      S_grad[j] = sum;
+    }
+
+    if (order != convex_n - 1) {
+      if (res_flag) {
+        grad_AB[2 * order] += S_grad[4];
+        grad_AB[2 * order + 1] += S_grad[5];
+        grad_AB[2 * order + 2] += S_grad[2];
+        grad_AB[2 * order + 3] += S_grad[3];
+      } else {
+        grad_AB[2 * order] += S_grad[2];
+        grad_AB[2 * order + 1] += S_grad[3];
+        grad_AB[2 * order + 2] += S_grad[4];
+        grad_AB[2 * order + 3] += S_grad[5];
+      }
+    } else {
+      if (res_flag) {
+        grad_AB[2 * order] += S_grad[4];
+        grad_AB[2 * order + 1] += S_grad[5];
+        grad_AB[0] += S_grad[2];
+        grad_AB[1] += S_grad[3];
+      } else {
+        grad_AB[2 * order] += S_grad[2];
+        grad_AB[2 * order + 1] += S_grad[3];
+        grad_AB[0] += S_grad[4];
+        grad_AB[1] += S_grad[5];
+      }
+    }
+  }
+  return res;
+}
+
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
+                                        double* grad_AB) {
+  if (area(ps1, n1) < 0) reverse1(ps1, n1);
+  if (area(ps2, n2) < 0) reverse1(ps2, n2);
+  ps1[n1] = ps1[0];
+  ps2[n2] = ps2[0];
+  double res = 0;
+  for (int i = 0; i < n1; i++) {
+    for (int j = 0; j < n2; j++) {
+      res +=
+          intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);
+    }
+  }
+  return res;
+}
+
+__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
+  Point p_max, p_k;
+  int max_index, k_index;
+  int Stack[NMAX] = {}, top1, top2;
+  double sign;
+  Point right_point[10], left_point[10];
+
+  for (int i = 0; i < n_poly; i++) {
+    if (in_poly[i].y < in_poly[0].y ||
+        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+      Point* j = &(in_poly[0]);
+      Point* k = &(in_poly[i]);
+      swap1(j, k);
+    }
+    if (i == 0) {
+      p_max = in_poly[0];
+      max_index = 0;
+    }
+    if (in_poly[i].y > p_max.y ||
+        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+      p_max = in_poly[i];
+      max_index = i;
+    }
+  }
+
+  if (max_index == 0) {
+    max_index = 1;
+    p_max = in_poly[max_index];
+  }
+
+  k_index = 0, Stack[0] = 0, top1 = 0;
+  while (k_index != max_index) {
+    p_k = p_max;
+    k_index = max_index;
+    for (int i = 1; i < n_poly; i++) {
+      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+                                         dis(in_poly[Stack[top1]], p_k)))) {
+        p_k = in_poly[i];
+        k_index = i;
+      }
+    }
+    top1++;
+    Stack[top1] = k_index;
+  }
+  for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];
+
+  k_index = 0, Stack[0] = 0, top2 = 0;
+
+  while (k_index != max_index) {
+    p_k = p_max;
+    k_index = max_index;
+    for (int i = 1; i < n_poly; i++) {
+      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+                                        dis(in_poly[Stack[top2]], p_k))) {
+        p_k = in_poly[i];
+        k_index = i;
+      }
+    }
+    top2++;
+    Stack[top2] = k_index;
+  }
+  for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];
+
+  for (int i = 0; i < top1 + top2; i++) {
+    if (i <= top1) {
+      in_poly[i] = right_point[i];
+    } else {
+      in_poly[i] = left_point[top2 - (i - top1)];
+    }
+  }
+  n_poly = top1 + top2;
+}
+
+__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
+                                           int n2, double* grad_C) {
+  Point polygon[MAXN];
+  int n = n1 + n2, n_poly = 0;
+  for (int i = 0; i < n1; i++) {
+    for (int j = 0; j < n - n1; j++) {
+      if (point_same(ps1[i], ps2[j])) {
+        for (int k = j; k < n - n1 - 1; k++) {
+          ps2[k] = ps2[k + 1];
+        }
+        n2--;
+        break;
+      }
+    }
+  }
+  n_poly = n1 + n2;
+  for (int i = 0; i < n_poly; i++) {
+    if (i < n1) {
+      polygon[i] = ps1[i];
+    } else {
+      polygon[i] = ps2[i - n1];
+    }
+  }
+
+  Jarvis(polygon, n_poly);
+
+  int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                   -1, -1, -1, -1, -1, -1, -1, -1, -1};
+  int n_pred = 0;
+  for (int i = 0; i < n_poly; i++) {
+    for (int j = 0; j < n1; j++) {
+      if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
+        polygon_to_pred_index[n_pred] = i;
+        polygon_to_pred_index[n_pred + n1] = j;
+        n_pred += 1;
+        break;
+      }
+    }
+  }
+  if (n_pred == 0) {
+    double polygon_area = fabs(area(polygon, n_poly));
+    for (int i = 0; i < 18; i++) {
+      grad_C[i] = 0.0;
+    }
+    return polygon_area;
+  } else {
+    double polygon_area =
+        polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
+    if (polygon_area < 0) {
+      for (int i = 0; i < 18; i++) {
+        grad_C[i] = -grad_C[i];
+      }
+    }
+    return fabs(polygon_area);
+  }
+}
+
+// convex_find and get the polygon_index_box_index
+__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
+                                        int* points_to_convex_ind) {
+  int n_input = n_poly;
+  Point input_poly[20];
+  for (int i = 0; i < n_input; i++) {
+    input_poly[i].x = in_poly[i].x;
+    input_poly[i].y = in_poly[i].y;
+  }
+  Point p_max, p_k;
+  int max_index, k_index;
+  int Stack[20], top1, top2;
+  double sign;
+  Point right_point[10], left_point[10];
+
+  for (int i = 0; i < n_poly; i++) {
+    if (in_poly[i].y < in_poly[0].y ||
+        in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+      Point* j = &(in_poly[0]);
+      Point* k = &(in_poly[i]);
+      swap1(j, k);
+    }
+    if (i == 0) {
+      p_max = in_poly[0];
+      max_index = 0;
+    }
+    if (in_poly[i].y > p_max.y ||
+        in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+      p_max = in_poly[i];
+      max_index = i;
+    }
+  }
+  if (max_index == 0) {
+    max_index = 1;
+    p_max = in_poly[max_index];
+  }
+
+  k_index = 0, Stack[0] = 0, top1 = 0;
+  while (k_index != max_index) {
+    p_k = p_max;
+    k_index = max_index;
+    for (int i = 1; i < n_poly; i++) {
+      sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+      if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+                                         dis(in_poly[Stack[top1]], p_k)))) {
+        p_k = in_poly[i];
+        k_index = i;
+      }
+    }
+    top1++;
+    Stack[top1] = k_index;
+  }
+  for (int i = 0; i <= top1; i++) {
+    right_point[i] = in_poly[Stack[i]];
+  }
+
+  k_index = 0, Stack[0] = 0, top2 = 0;
+
+  while (k_index != max_index) {
+    p_k = p_max;
+    k_index = max_index;
+    for (int i = 1; i < n_poly; i++) {
+      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+      if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+                                        dis(in_poly[Stack[top2]], p_k))) {
+        p_k = in_poly[i];
+        k_index = i;
+      }
+    }
+    top2++;
+    Stack[top2] = k_index;
+  }
+
+  for (int i = top2 - 1; i >= 0; i--) {
+    left_point[i] = in_poly[Stack[i]];
+  }
+
+  for (int i = 0; i < top1 + top2; i++) {
+    if (i <= top1) {
+      in_poly[i] = right_point[i];
+    } else {
+      in_poly[i] = left_point[top2 - (i - top1)];
+    }
+  }
+  n_poly = top1 + top2;
+  for (int i = 0; i < n_poly; i++) {
+    for (int j = 0; j < n_input; j++) {
+      if (point_same(in_poly[i], input_poly[j])) {
+        points_to_convex_ind[i] = j;
+        break;
+      }
+    }
+  }
+}
+
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q,
+                                T* point_grad, const int idx) {
+  Point ps1[MAXN], ps2[MAXN];
+
+  Point convex[MAXN];
+  for (int i = 0; i < 9; i++) {
+    convex[i].x = (double)p[i * 2];
+    convex[i].y = (double)p[i * 2 + 1];
+  }
+  int n_convex = 9;
+  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+  Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+
+  int n1 = n_convex;
+  int n2 = 4;
+
+  for (int i = 0; i < n1; i++) {
+    ps1[i].x = (double)convex[i].x;
+    ps1[i].y = (double)convex[i].y;
+  }
+
+  for (int i = 0; i < n2; i++) {
+    ps2[i].x = (double)q[i * 2];
+    ps2[i].y = (double)q[i * 2 + 1];
+  }
+
+  int polygon_index_box_index[18];
+  for (int i = 0; i < n1; i++) {
+    polygon_index_box_index[i] = i;
+    polygon_index_box_index[i + n1] = i;
+  }
+
+  double grad_A[18] = {};
+  double grad_AB[18] = {};
+  double grad_C[18] = {};
+
+  double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
+  double S_pred =
+      polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
+  if (S_pred < 0) {
+    for (int i = 0; i < n_convex * 2; i++) {
+      grad_A[i] = -grad_A[i];
+    }
+  }
+  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+
+  double iou = inter_area / union_area;
+  double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);
+
+  // printf("%d:live\n", idx);
+  double rot_giou = iou - (polygon_area - union_area) / polygon_area;
+
+  float grad_point_temp[18] = {};
+
+  for (int i = 0; i < n_convex; i++) {
+    int grad_point = points_to_convex_ind[i];
+    grad_point_temp[2 * grad_point] =
+        (float)((union_area + inter_area) / (union_area * union_area) *
+                    grad_AB[2 * i] -
+                iou / union_area * grad_A[2 * i] -
+                1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
+                (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
+    grad_point_temp[2 * grad_point + 1] =
+        (float)((union_area + inter_area) / (union_area * union_area) *
+                    grad_AB[2 * i + 1] -
+                iou / union_area * grad_A[2 * i + 1] -
+                1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
+                (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
+  }
+
+  for (int i = 0; i < 9; i++) {
+    point_grad[2 * i] = grad_point_temp[2 * i];
+    point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
+  }
+  return (float)rot_giou;
+}
+
+template <typename T>
+__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
+                                        const int gt_n_boxes, const T* ex_boxes,
+                                        const T* gt_boxes, T* point_grad) {
+  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+    const T* cur_box = ex_boxes + index * 18;
+    const T* cur_gt_box = gt_boxes + index * 8;
+    T* cur_grad = point_grad + index * 19;
+    T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);
+    cur_grad[18] = giou;
+  }
+}
+
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
+  double s1, s2;
+  s1 = cross(a, b, c);
+  s2 = cross(a, b, d);
+  if (sig(s1) == 0 && sig(s2) == 0) return 2;
+  if (sig(s2 - s1) == 0) return 0;
+  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+  return 1;
+}
+
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
+  Point pp[MAXN];
+  int m = 0;
+  p[n] = p[0];
+  for (int i = 0; i < n; i++) {
+    if (sig(cross(a, b, p[i])) > 0) {
+      pp[m] = p[i];
+      m++;
+    }
+    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+      lineCross(a, b, p[i], p[i + 1], pp[m]);
+      m++;
+    }
+  }
+  n = 0;
+  for (int i = 0; i < m; i++) {
+    if (!i || !(point_same(pp[i], pp[i - 1]))) {
+      p[n] = pp[i];
+      n++;
+    }
+  }
+
+  while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
+  Point o(0, 0);
+  int s1 = sig(cross(o, a, b));
+  int s2 = sig(cross(o, c, d));
+  if (s1 == 0 || s2 == 0) return 0.0;
+  if (s1 == -1) {
+    Point* i = &a;
+    Point* j = &b;
+    swap1(i, j);
+  }
+  if (s2 == -1) {
+    Point* i = &c;
+    Point* j = &d;
+    swap1(i, j);
+  }
+  Point p[10] = {o, a, b};
+  int n = 3;
+
+  polygon_cut(p, n, o, c);
+  polygon_cut(p, n, c, d);
+  polygon_cut(p, n, d, o);
+  double res = area(p, n);
+  if (s1 * s2 == -1) res = -res;
+  return res;
+}
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
+                                        int n2) {
+  if (area(ps1, n1) < 0) reverse1(ps1, n1);
+  if (area(ps2, n2) < 0) reverse1(ps2, n2);
+  ps1[n1] = ps1[0];
+  ps2[n2] = ps2[0];
+  double res = 0;
+  for (int i = 0; i < n1; i++) {
+    for (int j = 0; j < n2; j++) {
+      res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
+    }
+  }
+  return res;
+}
+
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q) {
+  Point ps1[MAXN], ps2[MAXN];
+  Point convex[MAXN];
+  for (int i = 0; i < 9; i++) {
+    convex[i].x = (double)p[i * 2];
+    convex[i].y = (double)p[i * 2 + 1];
+  }
+  int n_convex = 9;
+  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+  Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+  int n1 = n_convex;
+  for (int i = 0; i < n1; i++) {
+    ps1[i].x = (double)convex[i].x;
+    ps1[i].y = (double)convex[i].y;
+  }
+  int n2 = 4;
+  for (int i = 0; i < n2; i++) {
+    ps2[i].x = (double)q[i * 2];
+    ps2[i].y = (double)q[i * 2 + 1];
+  }
+  double inter_area = intersectAreaO(ps1, n1, ps2, n2);
+  double S_pred = area(ps1, n1);
+  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+  double iou = inter_area / union_area;
+  return (float)iou;
+}
+
+template <typename T>
+__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
+                                       const int gt_n_boxes, const T* ex_boxes,
+                                       const T* gt_boxes, T* iou) {
+  CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+    const T* cur_box = ex_boxes + index * 18;
+    for (int i = 0; i < gt_n_boxes; i++) {
+      iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);
+    }
+  }
+}
+#endif  // CONVEX_IOU_CUDA_KERNEL_CUH
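Both kernels map one thread to one predicted box (18 floats: nine convex points) against quadrilateral gt boxes (8 floats each). A host-side sketch for the IoU variant (hypothetical wrapper, reusing GET_BLOCKS and THREADS_PER_BLOCK from common_cuda_helper.hpp):

  void convex_iou_sketch(const float *ex_boxes, const float *gt_boxes,
                         float *iou, int ex_n, int gt_n, cudaStream_t stream) {
    // iou must hold ex_n * gt_n floats, written row-major per ex box.
    convex_iou_cuda_kernel<float><<<GET_BLOCKS(ex_n), THREADS_PER_BLOCK, 0,
                                    stream>>>(ex_n, gt_n, ex_boxes, gt_boxes,
                                              iou);
  }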
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..f910561ec309cd50fd6d4da131ab36cdf3ca963a
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh
@@ -0,0 +1,231 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
+// Original licence: Under MIT License
+
+#ifndef CORRELATION_CUDA
+#define CORRELATION_CUDA
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+// Using <torch/extension.h> is recommended in the official documentation in
+// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
+// However, we use <torch/types.h> for compatibility with CUDA 9.0
+// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
+#include <torch/types.h>
+
+#include <iostream>
+#include <vector>
+
+using namespace torch;
+
+#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>
+#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
+#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
+
+template <typename scalar_t>
+__global__ void correlation_forward_cuda_kernel(
+    const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,
+    int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,
+    int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW,
+    int oH, int oW) {
+  const int iH = rInput1.size(1);
+  const int iW = rInput1.size(2);
+  const int C = rInput1.size(3);
+
+  const int n = blockIdx.x;
+  const int h = blockIdx.y * blockDim.y + threadIdx.y;
+  const int w = blockIdx.z * blockDim.z + threadIdx.z;
+
+  if (h >= oH || w >= oW) return;
+
+  const int thread = threadIdx.x;
+
+  const int start_i = -padH + h * dH;
+  const int start_j = -padW + w * dW;
+
+  const int patchRadH = dilation_patchH * (patchH - 1) / 2;
+  const int patchRadW = dilation_patchW * (patchW - 1) / 2;
+
+  for (int ph = 0; ph < patchH; ++ph) {
+    int ph_dilated = ph * dilation_patchH - patchRadH;
+    for (int pw = 0; pw < patchW; ++pw) {
+      int pw_dilated = pw * dilation_patchW - patchRadW;
+      scalar_t prod_sum = 0.0f;
+      for (int i = 0; i < kH; ++i) {
+        int i1 = start_i + i * dilationH;
+        int i2 = i1 + ph_dilated;
+        if (WITHIN_BOUNDS(i1, i2, iH, iH)) {
+          for (int j = 0; j < kW; ++j) {
+            int j1 = start_j + j * dilationW;
+            int j2 = j1 + pw_dilated;
+            if (WITHIN_BOUNDS(j1, j2, iW, iW)) {
+              for (int c = thread; c < C; c += WARP_SIZE) {
+                scalar_t v1 = rInput1[n][i1][j1][c];
+                scalar_t v2 = rInput2[n][i2][j2][c];
+                prod_sum += v1 * v2;
+              }
+            }
+          }
+        }
+      }
+      // accumulate
+      for (int offset = 16; offset > 0; offset /= 2)
+#ifdef MMCV_WITH_HIP
+        prod_sum += __shfl_down(float(prod_sum), offset);
+#else
+        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
+#endif
+      if (thread == 0) {
+        output[n][ph][pw][h][w] = prod_sum;
+      }
+    }
+  }
+}
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input1(
+    const TensorAcc5R grad_output, const TensorAcc4R input2,
+    TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
+    const int patchW, const int padH, const int padW, const int dilationH,
+    const int dilationW, const int dilation_patchH, const int dilation_patchW,
+    const int dH, const int dW) {
+  const int iH = input2.size(1);
+  const int iW = input2.size(2);
+  const int C = input2.size(3);
+
+  const int H = grad_output.size(3);
+  const int W = grad_output.size(4);
+
+  const int patchRadH = (patchH - 1) / 2;
+  const int patchRadW = (patchW - 1) / 2;
+
+  const int n = blockIdx.x;
+  const int h = blockIdx.y;
+  const int w = blockIdx.z;
+
+  const int h_2 = h + padH;
+  const int w_2 = w + padW;
+  const int min_h = h_2 - kH * dilationH;
+  const int min_w = w_2 - kW * dilationW;
+
+  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+    const int ph = i / patchW;
+    const int pw = i % patchW;
+    int i1 = h + dilation_patchH * (ph - patchRadH);
+    int j1 = w + dilation_patchW * (pw - patchRadW);
+
+    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+      scalar_t grad_val = 0.0f;
+      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+        int i2 = (h_3) / dH;
+        if (i2 * dH != h_3) continue;
+        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+          int j2 = (w_3) / dW;
+          if (j2 * dW != w_3) continue;
+          if (WITHIN_BOUNDS(i2, j2, H, W)) {
+            grad_val += grad_output[n][ph][pw][i2][j2];
+          }
+        }
+      }
+      grad_cache[i] = grad_val;
+    }
+  }
+  __syncthreads();
+
+  for (int c = threadIdx.x; c < C; c += blockDim.x) {
+    scalar_t grad_input_val = 0.0f;
+    for (int ph = 0; ph < patchH; ++ph) {
+      int i1 = h + dilation_patchH * (ph - patchRadH);
+      for (int pw = 0; pw < patchW; ++pw) {
+        int j1 = w + dilation_patchW * (pw - patchRadW);
+        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+          grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+        }
+      }
+    }
+    grad_input1[n][c][h][w] = grad_input_val;
+  }
+}
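Because this kernel (and the input2 variant below) declares its cache as extern __shared__, the byte count must be supplied as the third launch parameter; the cache holds one gradient sum per (ph, pw) displacement. A sketch (hypothetical launch values; the real dispatcher derives them from the tensor shapes):

  // dim3 blocks(batch_size, iH, iW);  // one block per input pixel
  // int threads = THREADS_PER_BLOCK;  // assumed 1-D block striding channels
  // size_t shmem = patchH * patchW * sizeof(float);
  // correlation_backward_cuda_kernel_input1<float>
  //     <<<blocks, threads, shmem, stream>>>(grad_output_acc, input2_acc, ...);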
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input2(
+    const TensorAcc5R grad_output, const TensorAcc4R input1,
+    TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
+    int padW, int dilationH, int dilationW, int dilation_patchH,
+    int dilation_patchW, int dH, int dW) {
+  const int iH = input1.size(1);
+  const int iW = input1.size(2);
+  const int C = input1.size(3);
+
+  const int patchRadH = (patchH - 1) / 2;
+  const int patchRadW = (patchW - 1) / 2;
+
+  const int H = grad_output.size(3);
+  const int W = grad_output.size(4);
+
+  const int dilatedKH = kH * dilationH;
+  const int dilatedKW = kW * dilationW;
+
+  const int n = blockIdx.x;
+  const int h = blockIdx.y;
+  const int w = blockIdx.z;
+
+  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+    const int ph = i / patchW;
+    const int pw = i % patchW;
+    int i1 = h - dilation_patchH * (ph - patchRadH);
+    int j1 = w - dilation_patchW * (pw - patchRadW);
+
+    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+      scalar_t grad_val = 0.0f;
+
+      const int h_2 = i1 + padH;
+      const int w_2 = j1 + padW;
+      const int min_h = h_2 - dilatedKH;
+      const int min_w = w_2 - dilatedKW;
+
+      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+        int i2 = (h_3) / dH;
+        if (i2 * dH != h_3) continue;
+        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+          int j2 = (w_3) / dW;
+          if (j2 * dW != w_3) continue;
+          if (WITHIN_BOUNDS(i2, j2, H, W)) {
+            grad_val += grad_output[n][ph][pw][i2][j2];
+          }
+        }
+      }
+      grad_cache[i] = grad_val;
+    }
+  }
+  __syncthreads();
+
+  for (int c = threadIdx.x; c < C; c += blockDim.x) {
+    scalar_t grad_input_val = 0.0f;
+    for (int ph = 0; ph < patchH; ++ph) {
+      int i1 = h - dilation_patchH * (ph - patchRadH);
+      for (int pw = 0; pw < patchW; ++pw) {
+        int j1 = w - dilation_patchW * (pw - patchRadW);
+        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+          grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+        }
+      }
+    }
+    grad_input2[n][c][h][w] = grad_input_val;
+  }
+}
+#endif
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6b4d1bbd85bad1b87ee5d6b8a3cd3b29e3cbc411
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
@@ -0,0 +1,367 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer
+ *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California
+ * (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer
+ *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+#ifndef DEFORM_CONV_CUDA_KERNEL_CUH
+#define DEFORM_CONV_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else  // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else  // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif  // MMCV_USE_PARROTS
+#endif  // MMCV_WITH_TRT
+
+template <typename T>
+__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
+                                        const int height, const int width, T h,
+                                        T w) {
+  if (h <= -1 || height <= h || w <= -1 || width <= w) {
+    return 0;
+  }
+
+  int h_low = floorf(h);
+  int w_low = floorf(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  T lh = h - h_low;
+  T lw = w - w_low;
+  T hh = 1 - lh, hw = 1 - lw;
+
+  T v1 = 0;
+  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
+  T v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = input[h_low * data_width + w_high];
+  T v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = input[h_high * data_width + w_low];
+  T v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = input[h_high * data_width + w_high];
+
+  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename T>
+__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
+                                 const int w, const int height,
+                                 const int width) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floorf(argmax_h);
+  int argmax_w_low = floorf(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  T weight = 0;
+  if (h == argmax_h_low && w == argmax_w_low)
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  if (h == argmax_h_low && w == argmax_w_high)
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  if (h == argmax_h_high && w == argmax_w_low)
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  if (h == argmax_h_high && w == argmax_w_high)
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  return weight;
+}
+
+template <typename T>
+__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
+                                   const int width, const T *im_data,
+                                   const int data_width, const int bp_dir) {
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+      argmax_w >= width) {
+    // empty
+    return 0;
+  }
+
+  int argmax_h_low = floorf(argmax_h);
+  int argmax_w_low = floorf(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  T weight = 0;
+
+  if (bp_dir == 0) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) *
+                im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) *
+                im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) *
+                im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) *
+                im_data[argmax_h_high * data_width + argmax_w_high];
+  } else if (bp_dir == 1) {
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) *
+                im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) *
+                im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) *
+                im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) *
+                im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+template <typename T>
+__global__ void deformable_im2col_gpu_kernel(
+    const int n, const T *data_im, const T *data_offset, const int height,
+    const int width, const int kernel_h, const int kernel_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int channel_per_deformable_group, const int batch_size,
+    const int num_channels, const int deformable_group, const int height_col,
+    const int width_col, T *data_col) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+    T *data_col_ptr =
+        data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    const T *data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const T *data_offset_ptr =
+        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+        const T offset_h = data_offset_ptr[data_offset_h_ptr];
+        const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        T val = static_cast<T>(0);
+        const T h_im = h_in + i * dilation_h + offset_h;
+        const T w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+          val = deformable_im2col_bilinear(data_im_ptr, width, height, width,
+                                           h_im, w_im);
+        *data_col_ptr = val;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void deformable_col2im_gpu_kernel(
+    const int n, const T *data_col, const T *data_offset, const int channels,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int channel_per_deformable_group, const int batch_size,
+    const int deformable_group, const int height_col, const int width_col,
+    T *grad_im) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i =
+        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c =
+        index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const T *data_offset_ptr =
+        data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
+    const int data_offset_h_ptr =
+        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr =
+        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const T offset_h = data_offset_ptr[data_offset_h_ptr];
+    const T offset_w = data_offset_ptr[data_offset_w_ptr];
+    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const T cur_top_grad = data_col[index];
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++) {
+      for (int dx = -2; dx <= 2; dx++) {
+        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
+            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
+          int cur_bottom_grad_pos =
+              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          T weight =
+              get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy,
+                                  cur_w + dx, height, width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
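deformable_im2col_gpu_kernel assigns one thread per column entry (c_im, b, h_col, w_col). A launch sketch under that decomposition (hypothetical wrapper; channel_per_deformable_group is taken as channels / deformable_group, which the kernel's group indexing assumes):

  template <typename T>
  void deformable_im2col_sketch(const T *im, const T *offset, int channels,
                                int height, int width, int ksize_h, int ksize_w,
                                int pad_h, int pad_w, int stride_h, int stride_w,
                                int dilation_h, int dilation_w, int batch,
                                int deformable_group, int height_col,
                                int width_col, T *col) {
    const int num_kernels = channels * batch * height_col * width_col;
    deformable_im2col_gpu_kernel<T>
        <<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK>>>(
            num_kernels, im, offset, height, width, ksize_h, ksize_w, pad_h,
            pad_w, stride_h, stride_w, dilation_h, dilation_w,
            channels / deformable_group, batch, channels, deformable_group,
            height_col, width_col, col);
  }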
+
+template <typename T>
+__global__ void deformable_col2im_coord_gpu_kernel(
+    const int n, const T *data_col, const T *data_im, const T *data_offset,
+    const int channels, const int height, const int width, const int kernel_h,
+    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, const int dilation_h, const int dilation_w,
+    const int channel_per_deformable_group, const int batch_size,
+    const int offset_channels, const int deformable_group,
+    const int height_col, const int width_col, T *grad_offset) {
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    T val = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    int cnt = 0;
+    const T *data_col_ptr = data_col + deformable_group_index *
+                                           channel_per_deformable_group *
+                                           batch_size * width_col * height_col;
+    const T *data_im_ptr =
+        data_im + (b * deformable_group + deformable_group_index) *
+                      channel_per_deformable_group / kernel_h / kernel_w *
+                      height * width;
+    const T *data_offset_ptr =
+        data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                          kernel_h * kernel_w * height_col * width_col;
+
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const T offset_h = data_offset_ptr[data_offset_h_ptr];
+      const T offset_w = data_offset_ptr[data_offset_w_ptr];
+      T inv_h = h_in + i * dilation_h + offset_h;
+      T inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+        inv_h = inv_w = -2;
+      const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
+                                             data_im_ptr + cnt * height * width,
+                                             width, bp_dir);
+      val += weight * data_col_ptr[col_pos];
+      cnt += 1;
+    }
+
+    grad_offset[index] = val;
+  }
+}
+
+#endif  // DEFORM_CONV_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..86c4bc66dd2fb289340a4fb1714edb5db1e798c4
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh
@@ -0,0 +1,186 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void deform_roi_pool_forward_cuda_kernel(
+    const int nthreads, const T* input, const T* rois, const T* offset,
+    T* output, const int pooled_height, const int pooled_width,
+    const T spatial_scale, const int sampling_ratio, const T gamma,
+    const int channels, const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
+    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
+    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
+    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_input =
+        input + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral pooling
+    int roi_bin_grid_h =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_height / pooled_height));
+    int roi_bin_grid_w =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_width / pooled_width));
+
+    // Compute roi offset
+    if (offset != NULL) {
+      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
+                              ph * pooled_width + pw;
+      T offset_roi_w = gamma * roi_width * offset_cur_w[0];
+      T offset_roi_h =
+          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
+      roi_start_w += offset_roi_w;
+      roi_start_h += offset_roi_h;
+    }
+
+    // We do average pooling inside a bin
+    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+      const T y = roi_start_h + ph * bin_size_h +
+                  static_cast<T>(iy + .5f) * bin_size_h /
+                      static_cast<T>(roi_bin_grid_h);
+      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+        const T x = roi_start_w + pw * bin_size_w +
+                    static_cast<T>(ix + .5f) * bin_size_w /
+                        static_cast<T>(roi_bin_grid_w);
+        T val = bilinear_interpolate(offset_input, height, width, y, x, index);
+        output_val += val;
+      }
+    }
+    output[index] = output_val / count;
+  }
+}
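+// Illustrative launch for the forward kernel above (a sketch only; the
+// GET_BLOCKS/THREADS_PER_BLOCK helpers are assumed to come from the included
+// helper header). One thread produces one pooled output element:
+//   const int output_size =
+//       num_rois * channels * pooled_height * pooled_width;
+//   deform_roi_pool_forward_cuda_kernel<float>
+//       <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
+//           output_size, input, rois, offset, output, pooled_height,
+//           pooled_width, spatial_scale, sampling_ratio, gamma, channels,
+//           height, width);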
+
+template <typename T>
+__global__ void deform_roi_pool_backward_cuda_kernel(
+    const int nthreads, const T* grad_output, const T* input, const T* rois,
+    const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
+    const int pooled_width, const T spatial_scale, const int sampling_ratio,
+    const T gamma, const int channels, const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+    const T* offset_input =
+        input + ((roi_batch_ind * channels + c) * height * width);
+    T* offset_grad_input =
+        grad_input + ((roi_batch_ind * channels + c) * height * width);
+
+    // Do not use rounding; this implementation detail is critical
+    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
+    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
+    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
+    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;
+
+    T roi_width = roi_end_w - roi_start_w;
+    T roi_height = roi_end_h - roi_start_h;
+
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    // We use roi_bin_grid to sample the grid and mimic integral pooling
+    int roi_bin_grid_h =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_height / pooled_height));
+    int roi_bin_grid_w =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(ceilf(roi_width / pooled_width));
+
+    // Compute roi offset
+    if (offset != NULL) {
+      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
+                              ph * pooled_width + pw;
+      T offset_roi_w = gamma * roi_width * offset_cur_w[0];
+      T offset_roi_h =
+          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
+      roi_start_w += offset_roi_w;
+      roi_start_h += offset_roi_h;
+    }
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g.
= 4 + const T grad_output_this_bin = grad_output[index] / count; + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4); + if (offset != NULL) { + T input_00 = offset_input[y_low * width + x_low]; + T input_10 = offset_input[y_low * width + x_high]; + T input_01 = offset_input[y_high * width + x_low]; + T input_11 = offset_input[y_high * width + x_high]; + T ogx = gamma * roi_width * grad_output_this_bin * + (input_11 * (y - y_low) + input_10 * (y_high - y) + + input_01 * (y_low - y) + input_00 * (y - y_high)); + T ogy = gamma * roi_height * grad_output_this_bin * + (input_11 * (x - x_low) + input_01 * (x_high - x) + + input_10 * (x_low - x) + input_00 * (x - x_high)); + atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + + ph * pooled_width + pw, + ogx); + atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + + pooled_width * pooled_height + ph * pooled_width + pw, + ogy); + } + } + } + } + } +} + +#endif // DEFORM_ROI_POOL_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..053977a3011692b22a5dce6050fcfec4797f092c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh @@ -0,0 +1,137 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Adapted from +// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAX_NUM_VERT_IDX 9 +#define INTERSECTION_OFFSET 8 +#define EPSILON 1e-8 + +inline int opt_n_thread(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + return max(min(1 << pow_2, THREADS_PER_BLOCK), 1); +} + +/* +compare normalized vertices (vertices around (0,0)) +if vertex1 < vertex2 return true. 
+order: minimum at x-aixs, become larger in anti-clockwise direction +*/ +__device__ bool compare_vertices(float x1, float y1, float x2, float y2) { + if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON) + return false; // if equal, return false + + if (y1 > 0 && y2 < 0) return true; + if (y1 < 0 && y2 > 0) return false; + + float n1 = x1 * x1 + y1 * y1 + EPSILON; + float n2 = x2 * x2 + y2 * y2 + EPSILON; + float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2; + + if (y1 > 0 && y2 > 0) { + if (diff > EPSILON) + return true; + else + return false; + } + if (y1 < 0 && y2 < 0) { + if (diff < EPSILON) + return true; + else + return false; + } + return false; +} + +__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ vertices, + const bool *__restrict__ mask, const int *__restrict__ num_valid, + int *__restrict__ idx) { + int batch_idx = blockIdx.x; + vertices += batch_idx * n * m * 2; + mask += batch_idx * n * m; + num_valid += batch_idx * n; + idx += batch_idx * n * MAX_NUM_VERT_IDX; + + int index = threadIdx.x; // index of polygon + int stride = blockDim.x; + for (int i = index; i < n; i += stride) { + int pad; // index of arbitrary invalid intersection point (not box corner!) + for (int j = INTERSECTION_OFFSET; j < m; ++j) { + if (!mask[i * m + j]) { + pad = j; + break; + } + } + if (num_valid[i] < 3) { + // not enough vertices, take an invalid intersection point + // (zero padding) + for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } else { + // sort the valid vertices + // note the number of valid vertices is known + // note: check that num_valid[i] < MAX_NUM_VERT_IDX + for (int j = 0; j < num_valid[i]; ++j) { + // initialize with a "big" value + float x_min = 1; + float y_min = -EPSILON; + int i_take = 0; + int i2; + float x2, y2; + if (j != 0) { + i2 = idx[i * MAX_NUM_VERT_IDX + j - 1]; + x2 = vertices[i * m * 2 + i2 * 2 + 0]; + y2 = vertices[i * m * 2 + i2 * 2 + 1]; + } + for (int k = 0; k < m; ++k) { + float x = vertices[i * m * 2 + k * 2 + 0]; + float y = vertices[i * m * 2 + k * 2 + 1]; + if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) { + if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) { + x_min = x; + y_min = y; + i_take = k; + } + } + } + idx[i * MAX_NUM_VERT_IDX + j] = i_take; + } + // duplicate the first idx + idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0]; + + // pad zeros + for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + + // for corner case: the two boxes are exactly the same. 
+ // in this case, idx would have duplicate elements, which makes the + // shoelace formula broken because of the definition, the duplicate + // elements only appear in the first 8 positions (they are "corners in + // box", not "intersection of edges") + if (num_valid[i] == 8) { + int counter = 0; + for (int j = 0; j < 4; ++j) { + int check = idx[i * MAX_NUM_VERT_IDX + j]; + for (int k = 4; k < INTERSECTION_OFFSET; ++k) { + if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++; + } + } + if (counter == 4) { + idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0]; + for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } + } + + // TODO: still might need to cover some other corner cases :( + } + } +} diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..d3801a02c1c8f44874fb84fa884cc23bee25c331 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh @@ -0,0 +1,152 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH +#define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, + int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? i2 : i1; +} + +template +__global__ void furthest_point_sampling_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + // dataset: (B, N, 3) + // tmp: (B, N) + // output: + // idx: (B, M) + + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * 3; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + float x1 = dataset[old * 3 + 0]; + float y1 = dataset[old * 3 + 1]; + float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + float x2, y2, z2; + x2 = dataset[k * 3 + 0]; + y2 = dataset[k * 3 + 1]; + z2 = dataset[k * 3 + 2]; + // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); + // if (mag <= 1e-3) + // continue; + + float d = + (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? 
d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + +#pragma unroll + for (int block_size_thres = 1024; block_size_thres >= 2; + block_size_thres >>= 1) { + const int tid_thres = block_size_thres / 2; + if (block_size >= block_size_thres && tid < tid_thres) { + __update(dists, dists_i, tid, tid + tid_thres); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +// Modified from +// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu +template +__global__ void furthest_point_sampling_with_dist_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + // dataset: (B, N, N) + // tmp: (B, N) + // output: + // idx: (B, M) + + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * n; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + // float x1 = dataset[old * 3 + 0]; + // float y1 = dataset[old * 3 + 1]; + // float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + // float x2, y2, z2; + // x2 = dataset[k * 3 + 0]; + // y2 = dataset[k * 3 + 1]; + // z2 = dataset[k * 3 + 2]; + + // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * + // (z2 - z1); + float d = dataset[old * n + k]; + + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + +#pragma unroll + for (int block_size_thres = 1024; block_size_thres >= 2; + block_size_thres >>= 1) { + const int tid_thres = block_size_thres / 2; + if (block_size >= block_size_thres && tid < tid_thres) { + __update(dists, dists_i, tid, tid + tid_thres); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +#endif // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..6d932434cba245833e661b8c7e140601940bc35b --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. 
All rights reserved
+#ifndef GATHER_POINTS_CUDA_KERNEL_CUH
+#define GATHER_POINTS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define TOTAL_THREADS 1024
+
+template <typename T>
+__global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m,
+                                                  const T *points,
+                                                  const int *__restrict__ idx,
+                                                  T *out) {
+  // points: (B, C, N)
+  // idx: (B, M)
+  // output:
+  //      out: (B, C, M)
+
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    out += bs_idx * c * m + c_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
+    points += bs_idx * c * n + c_idx * n;
+    out[0] = points[idx[0]];
+  }
+}
+
+template <typename T>
+__global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m,
+                                                   const T *grad_out,
+                                                   const int *__restrict__ idx,
+                                                   T *grad_points) {
+  // grad_out: (B, C, M)
+  // idx: (B, M)
+  // output:
+  //      grad_points: (B, C, N)
+
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    grad_out += bs_idx * c * m + c_idx * m + pt_idx;
+    idx += bs_idx * m + pt_idx;
+    grad_points += bs_idx * c * n + c_idx * n;
+
+    atomicAdd(grad_points + idx[0], grad_out[0]);
+  }
+}
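+// Launch-configuration sketch (illustrative): both kernels above expect a 3D
+// grid in which blockIdx.z indexes the batch and blockIdx.y the channel,
+// while threads stride over the m gathered points, e.g.
+//   dim3 blocks(GET_BLOCKS(m, THREADS_PER_BLOCK), c, b);
+//   dim3 threads(THREADS_PER_BLOCK);
+//   gather_points_forward_cuda_kernel<float>
+//       <<<blocks, threads, 0, stream>>>(b, c, n, m, points, idx, out);
+// GET_BLOCKS/THREADS_PER_BLOCK are assumed to come from the CUDA helpers.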
+
+#endif  // GATHER_POINTS_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..dfad66fc16d8759f614d7f36fa961673976b1d95
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh
@@ -0,0 +1,65 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
+#ifndef GROUP_POINTS_CUDA_KERNEL_CUH
+#define GROUP_POINTS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void group_points_forward_cuda_kernel(int b, int c, int n,
+                                                 int npoints, int nsample,
+                                                 const T *points,
+                                                 const int *__restrict__ idx,
+                                                 T *out) {
+  // points: (B, C, N)
+  // idx: (B, npoints, nsample)
+  // output:
+  //      out: (B, C, npoints, nsample)
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    int pt_idx = index / nsample;
+    int sample_idx = index % nsample;
+
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+    int in_idx = bs_idx * c * n + c_idx * n + idx[0];
+    int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+                  pt_idx * nsample + sample_idx;
+
+    out[out_idx] = points[in_idx];
+  }
+}
+
+template <typename T>
+__global__ void group_points_backward_cuda_kernel(int b, int c, int n,
+                                                  int npoints, int nsample,
+                                                  const T *grad_out,
+                                                  const int *__restrict__ idx,
+                                                  T *grad_points) {
+  // grad_out: (B, C, npoints, nsample)
+  // idx: (B, npoints, nsample)
+  // output:
+  //      grad_points: (B, C, N)
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(index, npoints * nsample) {
+    int pt_idx = index / nsample;
+    if (bs_idx >= b || c_idx >= c) return;
+
+    int sample_idx = index % nsample;
+    grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
+                pt_idx * nsample + sample_idx;
+    idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
+
+    atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
+  }
+}
+
+#endif  // GROUP_POINTS_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9ebdcad15eee05a9f412ef34eb12d3553874a4dc
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh
@@ -0,0 +1,367 @@
+// Copyright (c) OpenMMLab.
All rights reserved +#ifndef IOU3D_CUDA_KERNEL_CUH +#define IOU3D_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +const int THREADS_PER_BLOCK_IOU3D = 16; +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; +__device__ const float EPS = 1e-8; + +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(double _x, double _y) { x = _x, y = _y; } + + __device__ void set(float _x, float _y) { + x = _x; + y = _y; + } + + __device__ Point operator+(const Point &b) const { + return Point(x + b.x, y + b.y); + } + + __device__ Point operator-(const Point &b) const { + return Point(x - b.x, y - b.y); + } +}; + +__device__ inline float cross(const Point &a, const Point &b) { + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point &p1, const Point &p2, + const Point &p0) { + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross(const Point &p1, const Point &p2, + const Point &q1, const Point &q2) { + int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && + min(q1.x, q2.x) <= max(p1.x, p2.x) && + min(p1.y, p2.y) <= max(q1.y, q2.y) && + min(q1.y, q2.y) <= max(p1.y, p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float *box, const Point &p) { + // params: box (7) [x, y, z, dx, dy, dz, heading] + const float MARGIN = 1e-2; + + float center_x = box[0], center_y = box[1]; + // rotate the point in the opposite direction of box + float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; + + return (fabs(rot_x) < box[3] / 2 + MARGIN && + fabs(rot_y) < box[4] / 2 + MARGIN); +} + +__device__ inline int intersection(const Point &p1, const Point &p0, + const Point &q1, const Point &q0, + Point &ans_point) { + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if (fabs(s5 - s1) > EPS) { + ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } else { + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans_point.x = (b0 * c1 - b1 * c0) / D; + ans_point.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center(const Point ¢er, + const float angle_cos, + const float angle_sin, Point &p) { + float new_x = + (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; + float new_y = + (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point &a, const Point &b, + const Point ¢er) { + return atan2(a.y - center.y, a.x - center.x) > + atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float *box_a, const float *box_b) { + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + + float a_angle = box_a[6], b_angle = box_b[6]; + float a_dx_half = box_a[3] / 2, b_dx_half = 
box_b[3] / 2, + a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; + float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; + float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; + float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; + float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; + + Point center_a(box_a[0], box_a[1]); + Point center_b(box_b[0], box_b[1]); + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++) { + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + flag = intersection(box_a_corners[i + 1], box_a_corners[i], + box_b_corners[j + 1], box_b_corners[j], + cross_points[cnt]); + if (flag) { + poly_center = poly_center + cross_points[cnt]; + cnt++; + } + } + } + + // check corners + for (int k = 0; k < 4; k++) { + if (check_in_box2d(box_a, box_b_corners[k])) { + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; + } + if (check_in_box2d(box_b, box_a_corners[k])) { + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++) { + for (int i = 0; i < cnt - j - 1; i++) { + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++) { + area += cross(cross_points[k] - cross_points[0], + cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float *box_a, const float *box_b) { + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + float sa = box_a[3] * box_a[4]; + float sb = box_b[3] * box_b[4]; + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( + const int num_a, const float *boxes_a, const int num_b, + const float *boxes_b, float *ans_overlap) { + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) { + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + + const float *cur_box_a = boxes_a + a_idx * 7; + const float *cur_box_b = boxes_b + b_idx * 7; + float cur_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = cur_overlap; + } +} + +__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num, + const 
float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +__device__ inline float iou_normal(float const *const a, float const *const b) { + // params: a: [x, y, z, dx, dy, dz, heading] + // params: b: [x, y, z, dx, dy, dz, heading] + + float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), + right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); + float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), + bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = a[3] * a[4]; + float Sb = b[3] * b[4]; + return interS / fmaxf(Sa + Sb - interS, EPS); +} + +__global__ void iou3d_nms3d_normal_forward_cuda_kernel( + const int boxes_num, const float nms_overlap_thresh, const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * 
col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +#endif // IOU3D_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3cf52bb90eb27d02b28c52069c760c8a38f83f08 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh @@ -0,0 +1,92 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap +#ifndef KNN_CUDA_KERNEL_CUH +#define KNN_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +inline __device__ void swap_float(float *x, float *y) { + float tmp = *x; + *x = *y; + *y = tmp; +} + +inline __device__ void swap_int(int *x, int *y) { + int tmp = *x; + *x = *y; + *y = tmp; +} + +__device__ void reheap(float *dist, int *idx, int k) { + int root = 0; + int child = root * 2 + 1; + while (child < k) { + if (child + 1 < k && dist[child + 1] > dist[child]) child++; + if (dist[root] > dist[child]) return; + swap_float(&dist[root], &dist[child]); + swap_int(&idx[root], &idx[child]); + root = child; + child = root * 2 + 1; + } +} + +__device__ void heap_sort(float *dist, int *idx, int k) { + int i; + for (i = k - 1; i > 0; i--) { + swap_float(&dist[0], &dist[i]); + swap_int(&idx[0], &idx[i]); + reheap(dist, idx, i); + } +} + +// input: xyz (b, n, 3) new_xyz (b, m, 3) +// output: idx (b, m, nsample) dist2 (b, m, nsample) +template +__global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, + const T *xyz, const T *new_xyz, + int *__restrict__ idx, T *dist2) { + int bs_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; + + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + dist2 += bs_idx * m * nsample + pt_idx * nsample; + + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; + + float best_dist[100]; + int best_idx[100]; + for (int i = 0; i < nsample; i++) { + best_dist[i] = 1e10; + best_idx[i] = 0; + } + for (int i = 0; i < n; i++) { + T x = xyz[i * 3 + 0]; + T y = xyz[i * 3 + 1]; + T z 
= xyz[i * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < best_dist[0]) { + best_dist[0] = d2; + best_idx[0] = i; + reheap(best_dist, best_idx, nsample); + } + } + heap_sort(best_dist, best_idx, nsample); + for (int i = 0; i < nsample; i++) { + idx[i] = best_idx[i]; + dist2[i] = best_dist[i]; + } + } +} + +#endif // KNN_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1a0bd040e823eaaa79f96e525f961a8b8fbeafb5 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh @@ -0,0 +1,62 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH +#define MASKED_CONV2D_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int64_t *mask_h_idx, + const int64_t *mask_w_idx, + const int mask_cnt, scalar_t *data_col) { + // mask_cnt * channels + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_col = mask_h_idx[m_index]; + const int w_col = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col - pad_h; + const int w_offset = w_col - pad_w; + scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index; + for (int i = 0; i < kernel_h; ++i) { + int h_im = h_offset + i; + for (int j = 0; j < kernel_w; ++j) { + int w_im = w_offset + j; + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + *data_col_ptr = + (scalar_t)data_im[(c_im * height + h_im) * width + w_im]; + } else { + *data_col_ptr = 0.0; + } + data_col_ptr += mask_cnt; + } + } + } +} + +template +__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col, + const int height, const int width, + const int channels, + const int64_t *mask_h_idx, + const int64_t *mask_w_idx, + const int mask_cnt, scalar_t *data_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_im = mask_h_idx[m_index]; + const int w_im = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + // compute the start and end of the output + data_im[(c_im * height + h_im) * width + w_im] = data_col[index]; + } +} + +#endif // MASKED_CONV2D_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..df56e743669c3426f6abb113e4209d0cc60f2baf --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh @@ -0,0 +1,300 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH +#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAXN 20 +__device__ const float PI = 3.1415926; + +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(float x, float y) : x(x), y(y) {} +}; + +__device__ inline void swap1(Point *a, Point *b) { + Point temp; + temp.x = a->x; + temp.y = a->y; + + a->x = b->x; + a->y = b->y; + + b->x = temp.x; + b->y = temp.y; +} +__device__ inline float cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); +} + +__device__ inline float dis(Point a, Point b) { + return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); +} +__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) { + float convex_points[2][MAXN]; + for (int j = 0; j < n_points; j++) { + convex_points[0][j] = ps[j].x; + } + for (int j = 0; j < n_points; j++) { + convex_points[1][j] = ps[j].y; + } + + Point edges[MAXN]; + float edges_angles[MAXN]; + float unique_angles[MAXN]; + int n_edges = n_points - 1; + int n_unique = 0; + int unique_flag = 0; + + for (int i = 0; i < n_edges; i++) { + edges[i].x = ps[i + 1].x - ps[i].x; + edges[i].y = ps[i + 1].y - ps[i].y; + } + for (int i = 0; i < n_edges; i++) { + edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x); + if (edges_angles[i] >= 0) { + edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2); + } else { + edges_angles[i] = + edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2); + } + } + unique_angles[0] = edges_angles[0]; + n_unique += 1; + for (int i = 1; i < n_edges; i++) { + for (int j = 0; j < n_unique; j++) { + if (edges_angles[i] == unique_angles[j]) { + unique_flag += 1; + } + } + if (unique_flag == 0) { + unique_angles[n_unique] = edges_angles[i]; + n_unique += 1; + unique_flag = 0; + } else { + unique_flag = 0; + } + } + + float minarea = 1e12; + for (int i = 0; i < n_unique; i++) { + float R[2][2]; + float rot_points[2][MAXN]; + R[0][0] = cos(unique_angles[i]); + R[0][1] = sin(unique_angles[i]); + R[1][0] = -sin(unique_angles[i]); + R[1][1] = cos(unique_angles[i]); + // R x Points + for (int m = 0; m < 2; m++) { + for (int n = 0; n < n_points; n++) { + float sum = 0.0; + for (int k = 0; k < 2; k++) { + sum = sum + R[m][k] * convex_points[k][n]; + } + rot_points[m][n] = sum; + } + } + + // xmin; + float xmin, ymin, xmax, ymax; + xmin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] < xmin) { + xmin = rot_points[0][j]; + } + } + } + // ymin + ymin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] < ymin) { + ymin = rot_points[1][j]; + } + } + } + // xmax + xmax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] > xmax) { + xmax = rot_points[0][j]; + } + } + } + // ymax + ymax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] > ymax) { + ymax = rot_points[1][j]; + } + } + } + float area = (xmax - xmin) * (ymax - ymin); + if (area < minarea) { + minarea = area; + minbox[0] = unique_angles[i]; + minbox[1] = xmin; + minbox[2] = 
ymin; + minbox[3] = xmax; + minbox[4] = ymax; + } + } +} + +// convex_find +__device__ inline void Jarvis(Point *in_poly, int &n_poly) { + int n_input = n_poly; + Point input_poly[20]; + for (int i = 0; i < n_input; i++) { + input_poly[i].x = in_poly[i].x; + input_poly[i].y = in_poly[i].y; + } + Point p_max, p_k; + int max_index, k_index; + int Stack[20], top1, top2; + // float sign; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point *j = &(in_poly[0]); + Point *k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + + for (int i = 0; i <= top1; i++) { + right_point[i] = in_poly[Stack[i]]; + } + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + + for (int i = top2 - 1; i >= 0; i--) { + left_point[i] = in_poly[Stack[i]]; + } + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; +} + +template +__device__ inline void Findminbox(T const *const p, T *minpoints) { + Point ps1[MAXN]; + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = p[i * 2]; + convex[i].y = p[i * 2 + 1]; + } + int n_convex = 9; + Jarvis(convex, n_convex); + int n1 = n_convex; + for (int i = 0; i < n1; i++) { + ps1[i].x = convex[i].x; + ps1[i].y = convex[i].y; + } + ps1[n1].x = convex[0].x; + ps1[n1].y = convex[0].y; + + float minbbox[5] = {0}; + minBoundingRect(ps1, n1 + 1, minbbox); + float angle = minbbox[0]; + float xmin = minbbox[1]; + float ymin = minbbox[2]; + float xmax = minbbox[3]; + float ymax = minbbox[4]; + float R[2][2]; + + R[0][0] = cos(angle); + R[0][1] = sin(angle); + R[1][0] = -sin(angle); + R[1][1] = cos(angle); + + minpoints[0] = xmax * R[0][0] + ymin * R[1][0]; + minpoints[1] = xmax * R[0][1] + ymin * R[1][1]; + minpoints[2] = xmin * R[0][0] + ymin * R[1][0]; + minpoints[3] = xmin * R[0][1] + ymin * R[1][1]; + minpoints[4] = xmin * R[0][0] + ymax * R[1][0]; + minpoints[5] = xmin * R[0][1] + ymax * R[1][1]; + minpoints[6] = xmax * R[0][0] + ymax * R[1][0]; + minpoints[7] = xmax * R[0][1] + ymax * R[1][1]; +} + +template +__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes, + const T *ex_boxes, T *minbox) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T *cur_box = ex_boxes + index * 18; + T *cur_min_box = minbox + index * 8; + Findminbox(cur_box, cur_min_box); + } +} + +#endif // 
MIN_AREA_POLYGONS_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ca0e91a25246569bb7de04649ab4f5afe233670c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh @@ -0,0 +1,399 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer + ***************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer + ********************* + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modified from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH +#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +template +__device__ T dmcn_im2col_bilinear(const T *input, const int data_width, + const int height, const int width, T h, T w) { + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h, + const int w, const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w, + const int height, const int width, + const T *im_data, const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + 
im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel( + const int n, const T *data_im, const T *data_offset, const T *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const T *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, + w_im); + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel( + const int n, const T *data_col, const T *data_offset, const T *data_mask, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T *grad_im) { + 
CUDA_1D_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel( + const int n, const T *data_col, const T *data_im, const T *data_offset, + const T *data_mask, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, T *grad_offset, T *grad_mask) { + CUDA_1D_KERNEL_LOOP(index, n) { + T val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T *data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T *data_offset_ptr = + data_offset + (b * deformable_group + 
deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + inv_h = inv_w = -2; + else + mval += data_col_ptr[col_pos] * + dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + const T weight = dmcn_get_coordinate_weight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + +#endif // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..12225ffdb3b1691ad9edabcd1663109f67ef1a6f --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh @@ -0,0 +1,801 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ +#ifndef DEFORM_ATTN_CUDA_KERNEL +#define DEFORM_ATTN_CUDA_KERNEL + +#include "common_cuda_helper.hpp" +#include "pytorch_cuda_helper.hpp" + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ void ms_deform_attn_col2im_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); 
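// ---------------------------------------------------------------------------
// [editor's sketch] ms_deform_attn_col2im_bilinear differentiates the bilinear
// blend with respect to the sampling location: with fractional parts (lh, lw)
// and corner values v1..v4, dval/dlh = -(1-lw)v1 - lw*v2 + (1-lw)v3 + lw*v4
// and dval/dlw = -(1-lh)v1 + (1-lh)v2 - lh*v3 + lh*v4; the kernel then scales
// by height/width because sampling locations are normalized. This standalone
// host check (not mmcv code) compares the closed forms against central finite
// differences; since the blend is linear in lh and lw they agree exactly.
#include <cstdio>

static float blend(const float v[4], float lh, float lw) {
  return (1 - lh) * (1 - lw) * v[0] + (1 - lh) * lw * v[1] +
         lh * (1 - lw) * v[2] + lh * lw * v[3];
}

int main() {
  const float v[4] = {0.2f, 1.0f, -0.5f, 2.0f};  // fabricated corner samples
  const float lh = 0.3f, lw = 0.6f, eps = 1e-3f;
  const float dh = -(1 - lw) * v[0] - lw * v[1] + (1 - lw) * v[2] + lw * v[3];
  const float dw = -(1 - lh) * v[0] + (1 - lh) * v[1] - lh * v[2] + lh * v[3];
  const float dh_fd =
      (blend(v, lh + eps, lw) - blend(v, lh - eps, lw)) / (2 * eps);
  const float dw_fd =
      (blend(v, lh, lw + eps) - blend(v, lh, lw - eps)) / (2 * eps);
  printf("dh %.4f (fd %.4f)  dw %.4f (fd %.4f)\n", dh, dh_fd, dw, dw_fd);
  return 0;
}
// ---------------------------------------------------------------------------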
+ } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void ms_deform_attn_col2im_bilinear_gm( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); 
+} + +template +__global__ void ms_deformable_im2col_gpu_kernel( + const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, const int batch_size, + const int spatial_size, const int num_heads, const int channels, + const int num_levels, const int num_query, const int num_point, + scalar_t *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = + data_value + + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, + spatial_w, num_heads, channels, + h_im, w_im, m_col, c_col) * + weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + const int qid_stride = num_heads * channels; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + 
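// ---------------------------------------------------------------------------
// [editor's sketch] Every kernel in this file starts by unflattening the 1-D
// thread index, which was packed as
//   index = ((b_col * num_query + q) * num_heads + m_col) * channels + c_col,
// peeling components off with alternating % and /, exactly like the `_temp`
// arithmetic above. A host round-trip with made-up sizes:
#include <cstdio>

int main() {
  const int num_query = 5, num_heads = 8, channels = 32;
  const int b = 2, q = 3, m = 6, c = 17;
  const int index = ((b * num_query + q) * num_heads + m) * channels + c;

  int t = index;
  const int c_col = t % channels;
  t /= channels;
  const int sampling_index = t;  // (b * num_query + q) * num_heads + m
  const int m_col = t % num_heads;
  t /= num_heads;
  const int q_col = t % num_query;  // the kernels skip this and only divide
  t /= num_query;
  const int b_col = t;
  printf("b=%d q=%d m=%d c=%d sampling_index=%d\n", b_col, q_col, m_col, c_col,
         sampling_index);  // prints b=2 q=3 m=6 c=17 sampling_index=110
  return 0;
}
// ---------------------------------------------------------------------------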
for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int _tid = 1; _tid < blockSize; ++_tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[_tid]; + sid += 2; + } + + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 1) = _grad_h; + *grad_attn_weight_out = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < 
num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const 
int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[_tid]; + sid += 2; + } + + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 1) = _grad_h; + *grad_attn_weight_out = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const 
int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int 
data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + 
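// ---------------------------------------------------------------------------
// [editor's sketch] The _shm_reduce_ variants above let each thread write its
// partial gradients into shared memory, then halve the number of active
// threads every step until thread 0 holds the block-wide sum (the _gm variant
// trades this for plain global atomicAdd). The same pattern in isolation,
// compilable with nvcc; the kernel assumes a power-of-two block size:
#include <cstdio>

__global__ void block_sum(const float *in, float *out) {
  extern __shared__ float cache[];
  const unsigned tid = threadIdx.x;
  cache[tid] = in[blockIdx.x * blockDim.x + tid];
  __syncthreads();
  for (unsigned s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction
    if (tid < s) cache[tid] += cache[tid + s];
    __syncthreads();
  }
  if (tid == 0) out[blockIdx.x] = cache[0];
}

int main() {
  const int n = 256;
  float h[256], *d_in, *d_out, result;
  for (int i = 0; i < n; ++i) h[i] = 1.0f;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h, n * sizeof(float), cudaMemcpyHostToDevice);
  block_sum<<<1, n, n * sizeof(float)>>>(d_in, d_out);
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("block sum = %.0f\n", result);  // 256
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
// ---------------------------------------------------------------------------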
_temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc_out, grad_attn_weight_out); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; + } + } + } +} +#endif // DEFORM_ATTN_CUDA_KERNEL diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..281d9f0b409f54260a81a79ad96ab09fde9580ce --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh @@ -0,0 +1,117 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef NMS_CUDA_KERNEL_CUH +#define NMS_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +int const threadsPerBlock = sizeof(unsigned long long int) * 8; + +__device__ inline bool devIoU(float const *const a, float const *const b, + const int offset, const float threshold) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left + offset, 0.f), + height = fmaxf(bottom - top + offset, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); + float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); + return interS > threshold * (Sa + Sb - interS); +} + +__global__ static void nms_cuda(const int n_boxes, const float iou_threshold, + const int offset, const float *dev_boxes, + unsigned long long *dev_mask) { + int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + const int tid = threadIdx.x; + + if (row_start > col_start) return; + + const int row_size = + fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 4]; + if (tid < col_size) { + block_boxes[tid * 4 + 0] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; + block_boxes[tid * 4 + 1] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; + block_boxes[tid * 4 + 2] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; + block_boxes[tid * 4 + 3] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + } + __syncthreads(); + + if (tid < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + tid; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + unsigned long long int t = 0; + int start = 0; + if (row_start == col_start) { + start = tid + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { + t |= 1ULL << i; + } + } + dev_mask[cur_box_idx * gridDim.y + col_start] = t; + } + } +} + +__global__ static void gather_keep_from_mask(bool *keep, + const unsigned long long *dev_mask, + const int n_boxes) { + const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + const int tid = threadIdx.x; + + // mark the bboxes which have been removed. + extern __shared__ unsigned long long removed[]; + + // initialize removed. + for (int i = tid; i < col_blocks; i += blockDim.x) { + removed[i] = 0; + } + __syncthreads(); + + for (int nblock = 0; nblock < col_blocks; ++nblock) { + auto removed_val = removed[nblock]; + __syncthreads(); + const int i_offset = nblock * threadsPerBlock; +#pragma unroll + for (int inblock = 0; inblock < threadsPerBlock; ++inblock) { + const int i = i_offset + inblock; + if (i >= n_boxes) break; + // select a candidate, check if it should kept. + if (!(removed_val & (1ULL << inblock))) { + if (tid == 0) { + // mark the output. + keep[i] = true; + } + auto p = dev_mask + i * col_blocks; + // remove all bboxes which overlap the candidate. 
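// ---------------------------------------------------------------------------
// [editor's sketch] nms_cuda above emits one 64-bit word per (box, column
// block) in which bit i means "this box suppresses box i of that block";
// gather_keep_from_mask then walks boxes in score order, keeping a box only
// if no earlier kept box has flagged it. The single-block host walk below
// shows the idea on four boxes (mask bits fabricated for illustration):
#include <cstdio>

int main() {
  const int n_boxes = 4;  // all fit in one 64-bit column block
  // box 0 suppresses box 1; box 2 suppresses box 3
  const unsigned long long mask[4] = {0x2ULL, 0x0ULL, 0x8ULL, 0x0ULL};
  unsigned long long removed = 0;
  for (int i = 0; i < n_boxes; ++i) {
    if (removed & (1ULL << i)) continue;  // already suppressed, skip
    printf("keep box %d\n", i);           // prints: keep box 0, keep box 2
    removed |= mask[i];                   // suppress everything it overlaps
  }
  return 0;
}
// ---------------------------------------------------------------------------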
+ for (int j = tid; j < col_blocks; j += blockDim.x) { + if (j >= nblock) removed[j] |= p[j]; + } + __syncthreads(); + removed_val = removed[nblock]; + } + } + } +} + +#endif // NMS_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..bba3b8258f6b8798b9d1a651bfda29c48bb5376a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh @@ -0,0 +1,141 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#ifndef NMS_QUADRI_CUDA_CUH +#define NMS_QUADRI_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +__host__ __device__ inline int divideUP(const int x, const int y) { + return (((x) + (y)-1) / (y)); +} + +namespace { +int const threadsPerBlock = sizeof(unsigned long long) * 8; +} + +template +__global__ void nms_quadri_cuda_kernel(const int n_boxes, + const float iou_threshold, + const T* dev_boxes, + unsigned long long* dev_mask, + const int multi_label) { + if (multi_label == 1) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 8 values + // (x1, y1, ..., x4, y4) here. 
+ __shared__ T block_boxes[threadsPerBlock * 8]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 8 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0]; + block_boxes[threadIdx.x * 8 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1]; + block_boxes[threadIdx.x * 8 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2]; + block_boxes[threadIdx.x * 8 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3]; + block_boxes[threadIdx.x * 8 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4]; + block_boxes[threadIdx.x * 8 + 5] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5]; + block_boxes[threadIdx.x * 8 + 6] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6]; + block_boxes[threadIdx.x * 8 + 7] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 9; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_quadri function from + // box_iou_rotated_utils.h + if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } else { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 8 values + // (x1, y1, , ..., x4, y4) here. 
+ __shared__ T block_boxes[threadsPerBlock * 8]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 8 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0]; + block_boxes[threadIdx.x * 8 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1]; + block_boxes[threadIdx.x * 8 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2]; + block_boxes[threadIdx.x * 8 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3]; + block_boxes[threadIdx.x * 8 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4]; + block_boxes[threadIdx.x * 8 + 5] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5]; + block_boxes[threadIdx.x * 8 + 6] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6]; + block_boxes[threadIdx.x * 8 + 7] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 8; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_quadri function from + // box_iou_rotated_utils.h + if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +#endif diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..747327afb83900177dd4721f1b0ba99153f658d7 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh @@ -0,0 +1,133 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu +#ifndef NMS_ROTATED_CUDA_CUH +#define NMS_ROTATED_CUDA_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif +#include "box_iou_rotated_utils.hpp" + +__host__ __device__ inline int divideUP(const int x, const int y) { + return (((x) + (y)-1) / (y)); +} + +namespace { +int const threadsPerBlock = sizeof(unsigned long long) * 8; +} + +template +__global__ void nms_rotated_cuda_kernel(const int n_boxes, + const float iou_threshold, + const T* dev_boxes, + unsigned long long* dev_mask, + const int multi_label) { + // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel + + if (multi_label == 1) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. 
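// ---------------------------------------------------------------------------
// [editor's sketch] The rotated-NMS path keeps each box as
// (x_center, y_center, width, height, angle_degrees) and compares pairs with
// single_box_iou_rotated from box_iou_rotated_utils.hpp (not shown in this
// diff). For intuition, this standalone snippet expands such a box into its
// four corners with an ordinary 2-D rotation:
#include <cmath>
#include <cstdio>

int main() {
  const float kPi = 3.14159265f;
  const float cx = 1.f, cy = 2.f, w = 4.f, h = 2.f, angle_deg = 30.f;
  const float a = angle_deg * kPi / 180.f;
  const float ca = std::cos(a), sa = std::sin(a);
  const float dx[4] = {-w / 2, w / 2, w / 2, -w / 2};
  const float dy[4] = {-h / 2, -h / 2, h / 2, h / 2};
  for (int i = 0; i < 4; ++i) {
    // rotate the axis-aligned corner offset, then translate by the center
    const float x = cx + dx[i] * ca - dy[i] * sa;
    const float y = cy + dx[i] * sa + dy[i] * ca;
    printf("corner %d: (%.3f, %.3f)\n", i, x, y);
  }
  return 0;
}
// ---------------------------------------------------------------------------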
+ __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 6; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from + // box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } else { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. 
+ __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from + // box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +#endif diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh new file mode 100644 index 0000000000000000000000000000000000000000..7918a57452bbde9dc7c249b0c3dd2774aa1961bf --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019, SenseTime. 
+ */ + +#ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ +#define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ + +#ifndef __CUDACC__ +#error cudawarpfunction.cuh should only be included by .cu files +#endif +#include + +#include + +#ifdef PARROTS_USE_HALF +#include +#endif +#ifdef __CUDA_ARCH__ +#define CUDA_INTRINSIC_FUNC(Expr) Expr +#else +#define CUDA_INTRINSIC_FUNC(Expr) +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +#ifdef PARROTS_USE_HALF + +#if CUDA_VERSION < 9000 + +__device__ inline float16 __shfl(float16 var, int srcLane, int width) { + CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width);); +} + +__device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) { + CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width);); +} + +__device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) { + CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width);); +} + +__device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) { + CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width);); +} + +#else // CUDA_VERSION >= 9000 + +__device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width); + return r;); +} + +__device__ inline float16 __shfl_up_sync(unsigned mask, float16 var, + unsigned delta, int width = warpSize) { + CUDA_INTRINSIC_FUNC( + float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;); +} + +__device__ inline float16 __shfl_down_sync(unsigned mask, float16 var, + unsigned delta, + int width = warpSize) { + CUDA_INTRINSIC_FUNC( + float16 r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;); +} + +__device__ inline float16 __shfl_xor_sync(unsigned mask, float16 var, + int laneMask, int width) { + CUDA_INTRINSIC_FUNC(float16 r; + r.y = __shfl_xor_sync(mask, var.y, laneMask, width); + return r;); +} + +#endif // CUDA_VERSION < 9000 + +#endif // PARROTS_USE_HALF + +// warp shuffle interface with a dummy mask +#if CUDA_VERSION < 9000 + +template +__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width);); +} + +template +__device__ inline T __shfl_up_sync(unsigned mask, T var, unsigned delta, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl_up(var, delta, width);); +} + +template +__device__ inline T __shfl_down_sync(unsigned mask, T var, unsigned delta, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl_down(var, delta, width);); +} + +template +__device__ inline T __shfl_xor_sync(unsigned mask, T var, int laneMask, + int width = warpSize) { + CUDA_INTRINSIC_FUNC(return __shfl_xor(var, laneMask, width);); +} + +#endif // CUDA_VERSION < 9000 + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +#endif // INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..342362079a5ce3dde6d19532b3014872f4373330 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh @@ -0,0 +1,95 @@ +// Copyright (c) OpenMMLab. 
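// ---------------------------------------------------------------------------
// [editor's sketch] The parrots header above exists to give float16 and
// pre-CUDA-9 toolchains the CUDA-9 *_sync shuffle interface. The canonical
// consumer of these intrinsics is a warp reduction that needs no shared
// memory at all; a minimal nvcc-compilable example (ours, not mmcv's):
#include <cstdio>

__global__ void warp_sum(const float *in, float *out) {
  float v = in[threadIdx.x];
  // each step folds the upper half of the active lanes onto the lower half
  for (int offset = 16; offset > 0; offset >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, offset);
  if (threadIdx.x == 0) *out = v;  // lane 0 ends up with the warp total
}

int main() {
  float h[32], *d_in, *d_out, result;
  for (int i = 0; i < 32; ++i) h[i] = (float)i;  // 0 + 1 + ... + 31 = 496
  cudaMalloc(&d_in, sizeof(h));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h, sizeof(h), cudaMemcpyHostToDevice);
  warp_sum<<<1, 32>>>(d_in, d_out);
  cudaMemcpy(&result, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("warp sum = %.0f\n", result);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}
// ---------------------------------------------------------------------------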
All rights reserved
+#ifndef POINT_IN_BOXES_CUDA_KERNEL_CUH
+#define POINT_IN_BOXES_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
+                                             T &local_x, T &local_y) {
+  T cosa = cos(-rz), sina = sin(-rz);
+  local_x = shift_x * cosa + shift_y * (-sina);
+  local_y = shift_x * sina + shift_y * cosa;
+}
+
+template <typename T>
+__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x,
+                                        T &local_y) {
+  // param pt: (x, y, z)
+  // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate,
+  // cz is the bottom center
+  T x = pt[0], y = pt[1], z = pt[2];
+  T cx = box3d[0], cy = box3d[1], cz = box3d[2];
+  T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6];
+  cz += z_size / 2.0;  // shift to the center since cz in box3d is the bottom center
+
+  if (fabsf(z - cz) > z_size / 2.0) return 0;
+  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
+  float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) &
+                  (local_y > -y_size / 2.0) & (local_y < y_size / 2.0);
+  return in_flag;
+}
+
+template <typename T>
+__global__ void points_in_boxes_part_forward_cuda_kernel(
+    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
+    int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
+  // coordinate, z is the bottom center; boxes DO NOT overlap each other
+  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
+  // params box_idx_of_points: (B, npoints), default -1
+
+  int bs_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (bs_idx >= batch_size) return;
+
+    boxes += bs_idx * boxes_num * 7;
+    pts += bs_idx * pts_num * 3 + pt_idx * 3;
+    box_idx_of_points += bs_idx * pts_num + pt_idx;
+
+    T local_x = 0, local_y = 0;
+    int cur_in_flag = 0;
+    for (int k = 0; k < boxes_num; k++) {
+      cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+      if (cur_in_flag) {
+        box_idx_of_points[0] = k;
+        break;
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void points_in_boxes_all_forward_cuda_kernel(
+    int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts,
+    int *box_idx_of_points) {
+  // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR
+  // coordinate, z is the bottom center; boxes DO NOT overlap each other
+  // params pts: (B, npoints, 3) [x, y, z] in LiDAR coordinate
+  // params box_idx_of_points: (B, npoints), default -1
+
+  int bs_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (bs_idx >= batch_size) return;
+
+    boxes += bs_idx * boxes_num * 7;
+    pts += bs_idx * pts_num * 3 + pt_idx * 3;
+    box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num;
+
+    T local_x = 0, local_y = 0;
+    for (int k = 0; k < boxes_num; k++) {
+      const int cur_in_flag =
+          check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y);
+      if (cur_in_flag) {
+        box_idx_of_points[k] = 1;
+      }
+    }
+  }
+}
+
+#endif  // POINT_IN_BOXES_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a0769d75a29ce8d7eac00931d6f51caa292b2693
--- /dev/null
+++
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..a0769d75a29ce8d7eac00931d6f51caa292b2693
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh
@@ -0,0 +1,79 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
+#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+struct point {
+  float x, y;
+};
+
+template <typename scalar_t>
+__global__ void points_in_polygons_forward_cuda_kernel(
+    const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2,
+    const int rows, const int cols, scalar_t *inside_flag) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int row = index / cols;
+    int col = index % cols;
+
+    const scalar_t *offset_vertex1 = vertex1 + row * 2;
+    const scalar_t *offset_vertex2 = vertex2 + col * 8;
+
+    point point_[1];
+    point polygon[4];
+
+    point_[0].x = offset_vertex1[0];
+    point_[0].y = offset_vertex1[1];
+
+    polygon[0].x = offset_vertex2[0];
+    polygon[0].y = offset_vertex2[1];
+    polygon[1].x = offset_vertex2[2];
+    polygon[1].y = offset_vertex2[3];
+    polygon[2].x = offset_vertex2[4];
+    polygon[2].y = offset_vertex2[5];
+    polygon[3].x = offset_vertex2[6];
+    polygon[3].y = offset_vertex2[7];
+
+    int nCross = 0;
+    int i, j;
+    float sx, sy, tx, ty, px, py, x;
+    for (i = 0, j = 3; i < 4; j = i, i++) {
+      sx = polygon[i].x;
+      sy = polygon[i].y;
+      tx = polygon[j].x;
+      ty = polygon[j].y;
+
+      px = point_[0].x;
+      py = point_[0].y;
+
+      if (py < min(sy, ty)) continue;
+      if (py > max(sy, ty)) continue;
+
+      if ((sx == px && sy == py) || (tx == px && ty == py)) {
+        break;
+      } else {
+        if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
+          x = sx + (py - sy) * (tx - sx) / (ty - sy);
+          if (x == px) {
+            break;
+          }
+          if (x > px) {
+            nCross++;
+          }
+        }
+      }
+    }
+    if (nCross % 2 == 1) {
+      inside_flag[index] = 1.0;
+    } else {
+      inside_flag[index] = 0.0;
+    }
+    return;
+  }
+}
+
+#endif  // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH
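The kernel decides containment with the classic even-odd rule: cast a horizontal ray to the right of the query point and count edge crossings. A host-side reference of the same test, handy for spot-checking a few points against the kernel output (hypothetical helper, not part of mmcv):

// Even-odd test for one point against a quad given as (x0,y0,...,x3,y3).
inline bool point_in_quad_ref(float px, float py, const float poly[8]) {
  int crossings = 0;
  for (int i = 0, j = 3; i < 4; j = i, i++) {
    float sx = poly[2 * i], sy = poly[2 * i + 1];
    float tx = poly[2 * j], ty = poly[2 * j + 1];
    if ((sy < py && ty >= py) || (sy >= py && ty < py)) {
      // x-coordinate where the edge crosses the ray's height
      float x = sx + (py - sy) * (tx - sx) / (ty - sy);
      if (x > px) crossings++;
    }
  }
  return (crossings % 2) == 1;  // odd crossing count means inside
}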
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e2f5a11b8dd6058f8d2fd288fc943dc235b39c37
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh
@@ -0,0 +1,381 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// Modified from
+// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
+// Distributed under terms of the MIT license.
+#ifndef PRROI_POOL_CUDA_KERNEL_CUH
+#define PRROI_POOL_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
+                                                        const int h,
+                                                        const int w,
+                                                        const int height,
+                                                        const int width) {
+  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
+  T retVal = overflow ? 0.0f : data[h * width + w];
+  return retVal;
+}
+
+template <typename T>
+__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
+  return (1.0f - abs(dh)) * (1.0f - abs(dw));
+}
+
+template <typename T>
+__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
+                                                                   T c1, T c2) {
+  return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;
+}
+
+template <typename T>
+__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
+                                              const T w, const int height,
+                                              const int width) {
+  T retVal = 0.0f;
+  int h1 = floorf(h);
+  int w1 = floorf(w);
+  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
+            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
+  h1 = floorf(h) + 1;
+  w1 = floorf(w);
+  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
+            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
+  h1 = floorf(h);
+  w1 = floorf(w) + 1;
+  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
+            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
+  h1 = floorf(h) + 1;
+  w1 = floorf(w) + 1;
+  retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
+            PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
+  return retVal;
+}
+
+template <typename T>
+__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
+                                               const int s_h, const int s_w,
+                                               const int e_h, const int e_w,
+                                               const T y0, const T x0,
+                                               const T y1, const T x1,
+                                               const int h0, const int w0) {
+  T alpha, beta, lim_alpha, lim_beta, tmp;
+  T sum_out = 0;
+
+  alpha = x0 - T(s_w);
+  beta = y0 - T(s_h);
+  lim_alpha = x1 - T(s_w);
+  lim_beta = y1 - T(s_h);
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;
+
+  alpha = T(e_w) - x1;
+  lim_alpha = T(e_w) - x0;
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;
+
+  alpha = x0 - T(s_w);
+  beta = T(e_h) - y1;
+  lim_alpha = x1 - T(s_w);
+  lim_beta = T(e_h) - y0;
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;
+
+  alpha = T(e_w) - x1;
+  lim_alpha = T(e_w) - x0;
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;
+
+  return sum_out;
+}
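PrRoIPoolingSingleCoorIntegral is the closed-form 1-D piece of the "precise" pooling integral: between two neighbouring samples with values c1 and c2 the bilinearly interpolated signal is linear, so its integral over a sub-interval [s, t] of the unit cell has a closed form, consistent with the helper above:

\int_s^t \bigl((1-u)\,c_1 + u\,c_2\bigr)\,du = \tfrac{1}{2}\,(t^2 - s^2)\,(c_2 - c_1) + (t - s)\,c_1 .

PrRoIPoolingMatCalculation applies the same idea in 2-D, accumulating the exact integral contribution of the four pixels surrounding one unit cell.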
+
+template <typename T>
+__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
+                                                  const int h, const int w,
+                                                  const int height,
+                                                  const int width,
+                                                  const T coeff) {
+  bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
+  if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);
+}
+
+template <typename T>
+__device__ static void PrRoIPoolingMatDistributeDiff(
+    T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
+    const int e_w, const T y0, const T x0, const T y1, const T x1,
+    const int h0, const int w0) {
+  T alpha, beta, lim_alpha, lim_beta, tmp;
+
+  alpha = x0 - T(s_w);
+  beta = y0 - T(s_h);
+  lim_alpha = x1 - T(s_w);
+  lim_beta = y1 - T(s_h);
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);
+
+  alpha = T(e_w) - x1;
+  lim_alpha = T(e_w) - x0;
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);
+
+  alpha = x0 - T(s_w);
+  beta = T(e_h) - y1;
+  lim_alpha = x1 - T(s_w);
+  lim_beta = T(e_h) - y0;
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);
+
+  alpha = T(e_w) - x1;
+  lim_alpha = T(e_w) - x0;
+  tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
+         0.5f * alpha * alpha) *
+        (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
+  PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
+}
+
+template <typename T>
+__global__ void prroi_pool_forward_cuda_kernel(
+    const int nthreads, const T *input, const T *rois, T *output,
+    const int pooled_height, const int pooled_width, const T spatial_scale,
+    const int channels, const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T *offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+
+    T roi_x1 = offset_rois[1] * spatial_scale;
+    T roi_y1 = offset_rois[2] * spatial_scale;
+    T roi_x2 = offset_rois[3] * spatial_scale;
+    T roi_y2 = offset_rois[4] * spatial_scale;
+
+    T roi_width = max(roi_x2 - roi_x1, ((T)0.0));
+    T roi_height = max(roi_y2 - roi_y1, ((T)0.0));
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    const T *this_data =
+        input + (roi_batch_ind * channels + c) * height * width;
+    T *this_out = output + index;
+
+    T bin_x1 = roi_x1 + bin_size_w * pw;
+    T bin_y1 = roi_y1 + bin_size_h * ph;
+    T bin_x2 = bin_x1 + bin_size_w;
+    T bin_y2 = bin_y1 + bin_size_h;
+
+    T bin_size = max(T(0.0), bin_size_w * bin_size_h);
+    if (bin_size == 0) {
+      *this_out = 0;
+      continue;
+    }
+
+    T sum_out = 0;
+
+    int start_x, start_y, end_x, end_y;
+
+    start_x = floorf(bin_x1);
+    end_x = ceilf(bin_x2);
+    start_y = floorf(bin_y1);
+    end_y = ceilf(bin_y2);
+
+    for (int bin_x = start_x; bin_x < end_x; ++bin_x)
+      for (int bin_y = start_y; bin_y < end_y; ++bin_y)
+        sum_out += PrRoIPoolingMatCalculation(
+            this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,
+            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
+            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
+            width);
+    *this_out = sum_out / bin_size;
+  }
+}
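The forward kernel assigns one thread per pooled output element, so nthreads is num_rois * channels * pooled_height * pooled_width. A hedged sketch of a host-side launch (mmcv's real binding lives under ops/csrc/pytorch/cuda/; GET_BLOCKS and THREADS_PER_BLOCK are mmcv helper names, and the tensors here are placeholders):

// Launch one thread per pooled output element.
int output_size = num_rois * channels * pooled_height * pooled_width;
AT_DISPATCH_FLOATING_TYPES(
    input.scalar_type(), "prroi_pool_forward_cuda_kernel", [&] {
      prroi_pool_forward_cuda_kernel<scalar_t>
          <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
              output_size, input.data_ptr<scalar_t>(),
              rois.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),
              pooled_height, pooled_width,
              static_cast<scalar_t>(spatial_scale), channels, height, width);
    });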
+
+template <typename T>
+__global__ void prroi_pool_backward_cuda_kernel(
+    const int nthreads, const T *grad_output, const T *rois, T *grad_input,
+    const int pooled_height, const int pooled_width, const T spatial_scale,
+    const int channels, const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+    auto rois_cur = rois + n * 5;
+
+    int roi_batch_ind = rois_cur[0];
+    T roi_x1 = rois_cur[1] * spatial_scale;
+    T roi_y1 = rois_cur[2] * spatial_scale;
+    T roi_x2 = rois_cur[3] * spatial_scale;
+    T roi_y2 = rois_cur[4] * spatial_scale;
+
+    T roi_width = max(roi_x2 - roi_x1, (T)0);
+    T roi_height = max(roi_y2 - roi_y1, (T)0);
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    const T *this_out_grad = grad_output + index;
+    T *this_data_grad =
+        grad_input + (roi_batch_ind * channels + c) * height * width;
+
+    T bin_x1 = roi_x1 + bin_size_w * pw;
+    T bin_y1 = roi_y1 + bin_size_h * ph;
+    T bin_x2 = bin_x1 + bin_size_w;
+    T bin_y2 = bin_y1 + bin_size_h;
+
+    T bin_size = max(T(0.0), bin_size_w * bin_size_h);
+
+    T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size;
+
+    int start_x, start_y, end_x, end_y;
+
+    start_x = floorf(bin_x1);
+    end_x = ceilf(bin_x2);
+    start_y = floorf(bin_y1);
+    end_y = ceilf(bin_y2);
+
+    for (int bin_x = start_x; bin_x < end_x; ++bin_x)
+      for (int bin_y = start_y; bin_y < end_y; ++bin_y)
+        PrRoIPoolingMatDistributeDiff(
+            this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,
+            max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
+            min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
+            width);
+  }
+}
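The data-gradient kernel scatters into grad_input with atomicAdd, since bins from overlapping ROIs write to the same pixels; the gradient buffer must therefore be zero-filled before launch. A minimal host-side sketch (PyTorch tensor API, placeholder names):

// grad_input accumulates across overlapping ROIs, hence zeros + atomicAdd.
at::Tensor grad_input =
    at::zeros({batch_size, channels, height, width}, grad_output.options());

The coordinate-gradient kernel that follows uses the same integral helpers to differentiate with respect to the ROI borders.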
+
+template <typename T>
+__global__ void prroi_pool_coor_backward_cuda_kernel(
+    const int nthreads, const T *output, const T *grad_output, const T *input,
+    const T *rois, T *grad_rois, const int pooled_height,
+    const int pooled_width, const T spatial_scale, const int channels,
+    const int height, const int width) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+    auto rois_cur = rois + n * 5;
+
+    int roi_batch_ind = rois_cur[0];
+    T roi_x1 = rois_cur[1] * spatial_scale;
+    T roi_y1 = rois_cur[2] * spatial_scale;
+    T roi_x2 = rois_cur[3] * spatial_scale;
+    T roi_y2 = rois_cur[4] * spatial_scale;
+
+    T roi_width = max(roi_x2 - roi_x1, (T)0);
+    T roi_height = max(roi_y2 - roi_y1, (T)0);
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    const T output_grad_val = grad_output[index];
+    const T *this_input_data =
+        input + (roi_batch_ind * channels + c) * height * width;
+    const T output_val = output[index];
+    T *this_rois_grad = grad_rois + n * 5;
+
+    T bin_x1 = roi_x1 + bin_size_w * pw;
+    T bin_y1 = roi_y1 + bin_size_h * ph;
+    T bin_x2 = bin_x1 + bin_size_w;
+    T bin_y2 = bin_y1 + bin_size_h;
+
+    T bin_size = max(T(0.0), bin_size_w * bin_size_h);
+
+    T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;
+
+    // WARNING: to be discussed
+    if (sum_out == 0) continue;
+
+    int start_x, start_y, end_x, end_y;
+
+    start_x = floorf(bin_x1);
+    end_x = ceilf(bin_x2);
+    start_y = floorf(bin_y1);
+    end_y = ceilf(bin_y2);
+
+    T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
+    for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
+      grad_x1_y += PrRoIPoolingSingleCoorIntegral(
+          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
+          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,
+                                    height, width),
+          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,
+                                    height, width));
+
+      grad_x2_y += PrRoIPoolingSingleCoorIntegral(
+          max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
+          PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,
+                                    height, width),
+          PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,
+                                    height, width));
+    }
+
+    for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
+      grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
+          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
+          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),
+                                    height, width),
+          PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),
+                                    height, width));
+
+      grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
+          max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
+          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),
+                                    height, width),
+          PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),
+                                    height, width));
+    }
+
+    T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
+    T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
+    T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
+    T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;
+
+    partial_x1 = partial_x1 / bin_size * spatial_scale;
+    partial_x2 = partial_x2 / bin_size * spatial_scale;
+    partial_y1 = partial_y1 / bin_size * spatial_scale;
+    partial_y2 = partial_y2 / bin_size * spatial_scale;
+
+    // (index, x1, y1, x2, y2)
+    this_rois_grad[0] = 0;
+    atomicAdd(this_rois_grad + 1,
+              (partial_x1 * (1.0f - T(pw) / pooled_width) +
+               partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
+                  output_grad_val);
+    atomicAdd(this_rois_grad + 2,
+              (partial_y1 * (1.0f - T(ph) / pooled_height) +
+               partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
+                  output_grad_val);
+    atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +
+                                   partial_x1 * T(pw) / pooled_width) *
+                                      output_grad_val);
+    atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +
+                                   partial_y1 * T(ph) / pooled_height) *
+                                      output_grad_val);
+  }
+}
+
+#endif  // PRROI_POOL_CUDA_KERNEL_CUH
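The coordinate-gradient kernel above differentiates the bin average with respect to the bin borders via the Leibniz integral rule: for a bin B = [x_1, x_2] \times [y_1, y_2] with area A and average \mathrm{out} = \frac{1}{A}\iint_B f,

\frac{\partial\,\mathrm{out}}{\partial x_2} = \frac{1}{A}\int_{y_1}^{y_2} f(x_2, y)\,dy \;-\; \frac{(y_2 - y_1)}{A}\,\mathrm{out},

which is exactly partial_x2 = (grad_x2_y - (bin_y2 - bin_y1) * output_val) / bin_size in the code, times spatial_scale for the chain rule back to input coordinates; the x1/y1 variants pick up the opposite sign because they are lower limits of integration.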
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5d946686bdd5fdfbf8a27f6d040e15861202f471
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh
@@ -0,0 +1,141 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef PSAMASK_CUDA_KERNEL_CUH
+#define PSAMASK_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+// CUDA: grid stride looping
+#ifndef CUDA_KERNEL_LOOP
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+#endif
+
+template <typename T>
+__global__ void psamask_collect_forward_cuda(
+    const int nthreads, const int h_feature, const int w_feature,
+    const int h_mask, const int w_mask, const int half_h_mask,
+    const int half_w_mask, const T* mask_data, T* buffer_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % w_feature;
+    const int h = (index / w_feature) % h_feature;
+    const int n = index / w_feature / h_feature;
+    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+    const int hstart = max(0, half_h_mask - h);
+    const int hend = min(h_mask, h_feature + half_h_mask - h);
+    const int wstart = max(0, half_w_mask - w);
+    const int wend = min(w_mask, w_feature + half_w_mask - w);
+    // (hidx, widx) with mask-indexed
+    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+    for (int hidx = hstart; hidx < hend; hidx++) {
+      for (int widx = wstart; widx < wend; widx++) {
+        buffer_data[(n * h_feature * w_feature +
+                     (hidx + h - half_h_mask) * w_feature +
+                     (widx + w - half_w_mask)) *
+                        h_feature * w_feature +
+                    h * w_feature + w] = mask_data
+            [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *
+                 w_feature +
+             w];
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void psamask_distribute_forward_cuda(
+    const int nthreads, const int h_feature, const int w_feature,
+    const int h_mask, const int w_mask, const int half_h_mask,
+    const int half_w_mask, const T* mask_data, T* buffer_data) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % w_feature;
+    const int h = (index / w_feature) % h_feature;
+    const int n = index / w_feature / h_feature;
+    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+    const int hstart = max(0, half_h_mask - h);
+    const int hend = min(h_mask, h_feature + half_h_mask - h);
+    const int wstart = max(0, half_w_mask - w);
+    const int wend = min(w_mask, w_feature + half_w_mask - w);
+    // (hidx, widx) with mask-indexed
+    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+    for (int hidx = hstart; hidx < hend; hidx++) {
+      for (int widx = wstart; widx < wend; widx++) {
+        buffer_data[(n * h_feature * w_feature + h * w_feature + w) *
+                        h_feature * w_feature +
+                    (hidx + h - half_h_mask) * w_feature +
+                    (widx + w - half_w_mask)] = mask_data
+            [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) *
+                 w_feature +
+             w];
+      }
+    }
+  }
+}
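Both forward variants launch one thread per (n, h, w) feature position, not per channel; the h_mask x w_mask window is walked inside the kernel and the hstart/hend/wstart/wend bounds clip it to the feature map. A hedged launch sketch (GET_BLOCKS, THREADS_PER_BLOCK and the raw pointers are assumed helper names and placeholders, not mmcv's actual binding):

// One thread per (n, h, w) position over the feature map.
int nthreads = num_ * h_feature * w_feature;
psamask_collect_forward_cuda<float>
    <<<GET_BLOCKS(nthreads), THREADS_PER_BLOCK, 0, stream>>>(
        nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask,
        half_w_mask, mask_ptr, buffer_ptr);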
+
+template <typename T>
+__global__ void psamask_collect_backward_cuda(
+    const int nthreads, const int h_feature, const int w_feature,
+    const int h_mask, const int w_mask, const int half_h_mask,
+    const int half_w_mask, const T* buffer_diff, T* mask_diff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % w_feature;
+    const int h = (index / w_feature) % h_feature;
+    const int n = index / w_feature / h_feature;
+    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+    const int hstart = max(0, half_h_mask - h);
+    const int hend = min(h_mask, h_feature + half_h_mask - h);
+    const int wstart = max(0, half_w_mask - w);
+    const int wend = min(w_mask, w_feature + half_w_mask - w);
+    // (hidx, widx) with mask-indexed
+    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+    for (int hidx = hstart; hidx < hend; hidx++) {
+      for (int widx = wstart; widx < wend; widx++) {
+        mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +
+                   h) *
+                      w_feature +
+                  w] = buffer_diff[(n * h_feature * w_feature +
+                                    (hidx + h - half_h_mask) * w_feature +
+                                    (widx + w - half_w_mask)) *
+                                       h_feature * w_feature +
+                                   h * w_feature + w];
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void psamask_distribute_backward_cuda(
+    const int nthreads, const int h_feature, const int w_feature,
+    const int h_mask, const int w_mask, const int half_h_mask,
+    const int half_w_mask, const T* buffer_diff, T* mask_diff) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
+    const int w = index % w_feature;
+    const int h = (index / w_feature) % h_feature;
+    const int n = index / w_feature / h_feature;
+    // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed
+    const int hstart = max(0, half_h_mask - h);
+    const int hend = min(h_mask, h_feature + half_h_mask - h);
+    const int wstart = max(0, half_w_mask - w);
+    const int wend = min(w_mask, w_feature + half_w_mask - w);
+    // (hidx, widx) with mask-indexed
+    // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed
+    for (int hidx = hstart; hidx < hend; hidx++) {
+      for (int widx = wstart; widx < wend; widx++) {
+        mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature +
+                   h) *
+                      w_feature +
+                  w] =
+            buffer_diff[(n * h_feature * w_feature + h * w_feature + w) *
+                            h_feature * w_feature +
+                        (hidx + h - half_h_mask) * w_feature +
+                        (widx + w - half_w_mask)];
+      }
+    }
+  }
+}
+
+#endif  // PSAMASK_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4383d9e82cce97362f53cf799b8dfa30c7b4cd02
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh
@@ -0,0 +1,242 @@
+// Modified from
+// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu
+#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
+#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH
+
+#include <float.h>  // assumed: bracketed include target lost in extraction
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else  // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif  // MMCV_USE_PARROTS
+
+/*** Forward ***/
+template <typename scalar_t>
+__global__ void riroi_align_rotated_forward_cuda_kernel(
+    const int nthreads, const scalar_t *bottom_data,
+    const scalar_t *bottom_rois, const scalar_t spatial_scale,
+    const int num_samples, const bool clockwise, const int channels,
+    const int height, const int width, const int pooled_height,
+    const int pooled_width, const int num_orientations, scalar_t *top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int o = (index / pooled_width / pooled_height) % num_orientations;
+    int c =
+        (index / pooled_width / pooled_height / num_orientations) % channels;
+    int n = index / pooled_width / pooled_height / num_orientations / channels;
+
+    const scalar_t *offset_bottom_rois = bottom_rois + n * 6;
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is
critical + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + + const scalar_t *offset_bottom_data_plus = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + scalar_t val_plus = bilinear_interpolate( + offset_bottom_data_plus, height, width, y, x, index); + output_val += r_var * val + l_var * val_plus; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void riroi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int num_samples, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int num_orientations, + scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int o = (index / pooled_width / pooled_height) % num_orientations; + int c = + (index / pooled_width / pooled_height / num_orientations) % channels; + int n = index / pooled_width / pooled_height / num_orientations / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + scalar_t *offset_bottom_diff_plus = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + int top_offset = + (n * channels * num_orientations + c * num_orientations + o) * + pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use 
roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var); + + atomicAdd(offset_bottom_diff_plus + y_low * width + x_low, + g1 * l_var); + atomicAdd(offset_bottom_diff_plus + y_low * width + x_high, + g2 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_low, + g3 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_high, + g4 * l_var); + + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RiRoIAlignBackward + +#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4541462afd6bd77ee794badd7d84bdd6c91b2c43 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh @@ -0,0 +1,212 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef ROI_ALIGN_CUDA_KERNEL_CUH +#define ROI_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +/*** Forward ***/ +template +__global__ void roi_align_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, + T* argmax_x, const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not using rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? 
sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + if (pool_mode == 0) { + // We do max pooling inside a bin + T maxval = -FLT_MAX; + T maxidx_y = -1.f, maxidx_x = -1.f; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = + bilinear_interpolate(offset_input, height, width, y, x, index); + if (val > maxval) { + maxval = val; + maxidx_y = y; + maxidx_x = x; + } + } + } + output[index] = maxval; + argmax_y[index] = maxidx_y; + argmax_x[index] = maxidx_x; + } else if (pool_mode == 1) { + // We do average pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = + bilinear_interpolate(offset_input, height, width, y, x, index); + output_val += val; + } + } + output[index] = output_val / count; + } + } +} + +/*** Backward ***/ +template +__global__ void roi_align_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* rois, const T* argmax_y, + const T* argmax_x, T* grad_input, const int pooled_height, + const int pooled_width, const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T grad_output_this_bin = grad_output[index]; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + if (pool_mode == 0) { + T y = argmax_y[index], x = argmax_x[index]; + if (y != -1.f) { + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4); + } + } + } else if (pool_mode == 1) { + // Do not using rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1 / count); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2 / count); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3 / count); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4 / count); + } + } + } + } + } +} + +#endif // ROI_ALIGN_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..8274dc50c709630c4ee456efd543aa1265049b41 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh @@ -0,0 +1,202 @@ +// Modified from +// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH +#define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS +#endif // MMCV_WITH_TRT + +/*** Forward ***/ +template +__global__ void roi_align_rotated_forward_cuda_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sampling_ratio, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void roi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + +#endif // ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3d7eae66b99b7812b92d9fc8bad237cbcbd59436 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef ROI_POOL_CUDA_KERNEL_CUH +#define ROI_POOL_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void roi_pool_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, T* output, int* argmax, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + // calculate the roi region on feature maps + T roi_x1 = offset_rois[1] * spatial_scale; + T roi_y1 = offset_rois[2] * spatial_scale; + T roi_x2 = (offset_rois[3] + 1) * spatial_scale; + T roi_y2 = (offset_rois[4] + 1) * spatial_scale; + + // force malformed rois to be 1x1 + T roi_w = roi_x2 - roi_x1; + T roi_h = roi_y2 - roi_y1; + if (roi_w <= 0 || roi_h <= 0) continue; + + T bin_size_w = roi_w / static_cast(pooled_width); + T bin_size_h = roi_h / static_cast(pooled_height); + + // the corresponding bin region + int bin_x1 = floorf(static_cast(pw) * bin_size_w + roi_x1); + int bin_y1 = floorf(static_cast(ph) * bin_size_h + roi_y1); + int bin_x2 = ceilf(static_cast(pw + 1) * bin_size_w + roi_x1); + int bin_y2 = ceilf(static_cast(ph + 1) * bin_size_h + roi_y1); + + // add roi offsets and clip to input boundaries + bin_x1 = min(max(bin_x1, 0), width); + bin_y1 = min(max(bin_y1, 0), height); + bin_x2 = min(max(bin_x2, 0), width); + bin_y2 = min(max(bin_y2, 0), height); + bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + // Define an empty pooling region to be zero + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + T max_val = is_empty ? 
0 : -FLT_MAX; + int max_idx = -1; + for (int h = bin_y1; h < bin_y2; ++h) { + for (int w = bin_x1; w < bin_x2; ++w) { + int offset = h * width + w; + if (offset_input[offset] > max_val) { + max_val = offset_input[offset]; + max_idx = offset; + } + } + } + output[index] = max_val; + if (argmax != NULL) argmax[index] = max_idx; + } +} + +template +__global__ void roi_pool_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* rois, const int* argmax, + T* grad_input, const int pooled_height, const int pooled_width, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c) is an element in the pooled output + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + int roi_batch_ind = rois[n * 5]; + T* grad_input_offset = + grad_input + ((roi_batch_ind * channels + c) * height * width); + int argmax_index = argmax[index]; + + if (argmax_index != -1) { + atomicAdd(grad_input_offset + argmax_index, grad_output[index]); + } + } +} + +#endif // ROI_POOL_CUDA_KERNEL_CUH diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..fc0aacf1435f8715fae92de535bf01bac07ac39a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh @@ -0,0 +1,260 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef ROIAWARE_POOL3D_CUDA_KERNEL_CUH +#define ROIAWARE_POOL3D_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, + T &local_x, T &local_y) { + T cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +template +__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, + T &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, + // cz in the bottom center + T x = pt[0], y = pt[1], z = pt[2]; + T cx = box3d[0], cy = box3d[1], cz = box3d[2]; + T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / + 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > z_size / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); + return in_flag; +} + +template +__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, + int out_x, int out_y, int out_z, + const T *rois, const T *pts, + int *pts_mask) { + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, + // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, + // y_idxs, z_idxs) by binary bit + int box_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (box_idx >= boxes_num) return; + + pts += pt_idx * 3; + rois += box_idx * 7; + pts_mask += box_idx * pts_num + pt_idx; + + T local_x = 0, local_y = 0; + int cur_in_flag = 
check_pt_in_box3d(pts, rois, local_x, local_y); + + pts_mask[0] = -1; + if (cur_in_flag > 0) { + T local_z = pts[2] - rois[2]; + T x_size = rois[3], y_size = rois[4], z_size = rois[5]; + + T x_res = x_size / out_x; + T y_res = y_size / out_y; + T z_res = z_size / out_z; + + unsigned int x_idx = int((local_x + x_size / 2) / x_res); + unsigned int y_idx = int((local_y + y_size / 2) / y_res); + unsigned int z_idx = int(local_z / z_res); + + x_idx = min(max(x_idx, 0), out_x - 1); + y_idx = min(max(y_idx, 0), out_y - 1); + z_idx = min(max(z_idx, 0), out_z - 1); + + unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; + + pts_mask[0] = idx_encoding; + } + } +} + +template +__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, + int max_pts_each_voxel, int out_x, + int out_y, int out_z, + const int *pts_mask, + T *pts_idx_of_voxels) { + // params pts_mask: (N, npoints) 0 or 1 + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) { + int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; + + for (int k = 0; k < pts_num; k++) { + if (pts_mask[box_idx * pts_num + k] != -1) { + unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; + unsigned int x_idx = (idx_encoding >> 16) & 0xFF; + unsigned int y_idx = (idx_encoding >> 8) & 0xFF; + unsigned int z_idx = idx_encoding & 0xFF; + unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + + y_idx * out_z * max_pts_each_voxel + + z_idx * max_pts_each_voxel; + unsigned int cnt = pts_idx_of_voxels[base_offset]; + if (cnt < max_num_pts) { + pts_idx_of_voxels[base_offset + cnt + 1] = k; + pts_idx_of_voxels[base_offset]++; + } + } + } + } +} + +template +__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const T *pts_feature, + const int *pts_idx_of_voxels, + T *pooled_features, int *argmax) { + // params pts_feature: (npoints, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), + // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) + // params argmax: (N, out_x, out_y, out_z, C) + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int argmax_idx = -1; + float max_val = -1e50; + + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > + max_val) { + max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + argmax_idx = pts_idx_of_voxels[k]; + } + } + + if (argmax_idx != -1) { + pooled_features[0] = max_val; + } + argmax[0] = argmax_idx; + } +} + +template +__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, + int 
+                                   int max_pts_each_voxel, int out_x, int out_y,
+                                   int out_z, const T *pts_feature,
+                                   const int *pts_idx_of_voxels,
+                                   T *pooled_features) {
+  // params pts_feature: (npoints, C)
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel),
+  // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C)
+  // params argmax: (N, out_x, out_y, out_z, C)
+
+  int box_idx = blockIdx.z;
+  int channel_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
+    pooled_features += box_idx * out_x * out_y * out_z * channels +
+                       offset_base * channels + channel_idx;
+
+    float sum_val = 0;
+    int total_pts = pts_idx_of_voxels[0];
+
+    for (int k = 1; k <= total_pts; k++) {
+      sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx];
+    }
+
+    if (total_pts > 0) {
+      pooled_features[0] = sum_val / total_pts;
+    }
+  }
+}
+
+template <typename T>
+__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels,
+                                            int out_x, int out_y, int out_z,
+                                            const int *argmax,
+                                            const T *grad_out, T *grad_in) {
+  // params argmax: (N, out_x, out_y, out_z, C)
+  // params grad_out: (N, out_x, out_y, out_z, C)
+  // params grad_in: (npoints, C), return value
+
+  int box_idx = blockIdx.z;
+  int channel_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    argmax += box_idx * out_x * out_y * out_z * channels +
+              offset_base * channels + channel_idx;
+    grad_out += box_idx * out_x * out_y * out_z * channels +
+                offset_base * channels + channel_idx;
+
+    if (argmax[0] == -1) return;
+
+    atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1);
+  }
+}
+
+template <typename T>
+__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels,
+                                            int out_x, int out_y, int out_z,
+                                            int max_pts_each_voxel,
+                                            const int *pts_idx_of_voxels,
+                                            const T *grad_out, T *grad_in) {
+  // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel)
+  // params grad_out: (N, out_x, out_y, out_z, C)
+  // params grad_in: (npoints, C), return value
+
+  int box_idx = blockIdx.z;
+  int channel_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) {
+    int x_idx = voxel_idx_flat / (out_y * out_z);
+    int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z;
+    int z_idx = voxel_idx_flat % out_z;
+    if (box_idx >= boxes_num || channel_idx >= channels) return;
+
+    int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx;
+    pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel +
+                         offset_base * max_pts_each_voxel;
+    grad_out += box_idx * out_x * out_y * out_z * channels +
+                offset_base * channels + channel_idx;
+
+    int total_pts = pts_idx_of_voxels[0];
+    float cur_grad = 1 / fmaxf(float(total_pts), 1.0);
+    for (int k = 1; k <= total_pts; k++) {
+      atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx,
+                grad_out[0] * cur_grad);
+    }
+  }
+}
+
+#endif  // ROIAWARE_POOL3D_CUDA_KERNEL_CUH
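A note on the voxel mask in the file above: generate_pts_mask_for_box3d packs a point's voxel coordinate into a single int as (x_idx << 16) + (y_idx << 8) + z_idx, and collect_inside_pts_for_box3d unpacks all three fields with 0xFF masks, so every grid dimension must stay below 256. A minimal host-side round-trip sketch (plain C++; the helper names are hypothetical, not part of the diff'd header):

    #include <cassert>

    // Pack (x, y, z) voxel indices the way generate_pts_mask_for_box3d does:
    // x in the high bits, y and z in 8-bit fields.
    unsigned int encode_voxel(unsigned int x, unsigned int y, unsigned int z) {
      return (x << 16) + (y << 8) + z;
    }

    void decode_voxel(unsigned int code, unsigned int &x, unsigned int &y,
                      unsigned int &z) {
      x = (code >> 16) & 0xFF;  // the kernel masks x with 0xFF too, so x < 256
      y = (code >> 8) & 0xFF;
      z = code & 0xFF;
    }

    int main() {
      unsigned int x, y, z;
      decode_voxel(encode_voxel(12, 3, 7), x, y, z);
      assert(x == 12 && y == 3 && z == 7);
    }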
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..545f6ffa09d4a6cae49f1f1e68c191c1fd54de68
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
@@ -0,0 +1,134 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef ROIPOINT_POOL3D_CUDA_KERNEL_CUH
+#define ROIPOINT_POOL3D_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz,
+                                             T &local_x, T &local_y) {
+  T cosa = cos(-rz), sina = sin(-rz);
+  local_x = shift_x * cosa + shift_y * (-sina);
+  local_y = shift_x * sina + shift_y * cosa;
+}
+
+template <typename T>
+__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d,
+                                        T &local_x, T &local_y) {
+  // param pt: (x, y, z)
+  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the
+  // bottom center
+  T x = pt[0], y = pt[1], z = pt[2];
+  T cx = box3d[0], cy = box3d[1], cz = box3d[2];
+  T dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6];
+  cz += dz / 2.0;  // shift to the center since cz in box3d is the bottom center
+
+  if (fabsf(z - cz) > dz / 2.0) return 0;
+  lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y);
+  T in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) &
+              (local_y > -dy / 2.0) & (local_y < dy / 2.0);
+  return in_flag;
+}
+
+template <typename T>
+__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
+                                    const T *xyz, const T *boxes3d,
+                                    int *pts_assign) {
+  // params xyz: (B, N, 3)
+  // params boxes3d: (B, M, 7)
+  // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
+  // background points
+  int box_idx = blockIdx.y;
+  int bs_idx = blockIdx.z;
+  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
+    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
+
+    int assign_idx =
+        bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
+    pts_assign[assign_idx] = 0;
+
+    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
+    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
+
+    T local_x = 0, local_y = 0;
+    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
+                                        local_x, local_y);
+    pts_assign[assign_idx] = cur_in_flag;
+  }
+}
+
+__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
+                               int sampled_pts_num, const int *pts_assign,
+                               int *pts_idx, int *pooled_empty_flag) {
+  // params xyz: (B, N, 3)
+  // params pts_feature: (B, N, C)
+  // params pts_assign: (B, N)
+  // params pts_idx: (B, M, 512)
+  // params pooled_empty_flag: (B, M)
+  CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
+    int bs_idx = blockIdx.y;
+
+    int cnt = 0;
+    for (int k = 0; k < pts_num; k++) {
+      if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
+                     boxes_idx]) {
+        if (cnt < sampled_pts_num) {
+          pts_idx[bs_idx * boxes_num * sampled_pts_num +
+                  boxes_idx * sampled_pts_num + cnt] = k;
+          cnt++;
+        } else
+          break;
+      }
+    }
+
+    if (cnt == 0) {
+      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
+    } else if (cnt < sampled_pts_num) {
+      // duplicate same points for sampling
+      for (int k = cnt; k < sampled_pts_num; k++) {
+        int duplicate_idx = k % cnt;
+        int base_offset =
+            bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
+        pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void roipoint_pool3d_forward(
+    int batch_size, int pts_num, int boxes_num, int feature_in_len,
+    int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature,
+    T *pooled_features, int *pooled_empty_flag) {
+  // params xyz: (B, N, 3)
+  // params pts_idx: (B, M, 512)
+  // params pts_feature: (B, N, C)
+  // params pooled_features: (B, M, 512, 3+C)
+  // params pooled_empty_flag: (B, M)
+  int box_idx = blockIdx.y;
+  int bs_idx = blockIdx.z;
+  CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
+    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
+    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;
+
+    int temp_idx = bs_idx * boxes_num * sampled_pts_num +
+                   box_idx * sampled_pts_num + sample_pt_idx;
+    int src_pt_idx = pts_idx[temp_idx];
+    int dst_feature_offset = temp_idx * (3 + feature_in_len);
+
+    for (int j = 0; j < 3; j++)
+      pooled_features[dst_feature_offset + j] =
+          xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
+
+    int src_feature_offset =
+        bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
+    memcpy(pooled_features + dst_feature_offset + 3,
+           pts_feature + src_feature_offset, feature_in_len * sizeof(T));
+  }
+}
+
+#endif  // ROIPOINT_POOL3D_CUDA_KERNEL_CUH
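get_pooled_idx above keeps the first sampled_pts_num interior points per box; boxes with fewer points are padded by cycling over the hits (k % cnt), and empty boxes are only flagged. A host-side sketch of that padding rule, assuming a std::vector of found indices (pad_by_duplication is a hypothetical helper):

    #include <vector>

    // Pad `found` up to `sampled` entries by cycling over the existing ones,
    // mirroring the k % cnt duplication in get_pooled_idx. Returns false for
    // an empty box (the kernel sets pooled_empty_flag instead).
    bool pad_by_duplication(std::vector<int> &found, int sampled) {
      int cnt = static_cast<int>(found.size());
      if (cnt == 0) return false;
      for (int k = cnt; k < sampled; ++k) found.push_back(found[k % cnt]);
      return true;
    }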
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..ffcc658ccb1f5e3059c0428159bc2e80fbeee3d4
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh
@@ -0,0 +1,129 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
+#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
+#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename scalar_t>
+__global__ void rotated_feature_align_forward_kernel(
+    const int nthreads, const int points, const scalar_t* bottom_data,
+    const scalar_t* best_bboxes, const scalar_t spatial_scale,
+    const int channels, const int height, const int width, scalar_t* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int w = index % width;
+    int h = (index / width) % height;
+    int c = (index / width / height) % channels;
+    int n = index / width / height / channels;
+
+    const scalar_t* bbox_offset =
+        best_bboxes + ((n * height + h) * width + w) * 5;
+    scalar_t roi_y = bbox_offset[0] * spatial_scale;
+    scalar_t roi_x = bbox_offset[1] * spatial_scale;
+
+    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
+    scalar_t py[5] = {roi_y, 0, 0, 0, 0};
+
+    if (points > 1) {
+      scalar_t roi_w = bbox_offset[2] * spatial_scale;
+      scalar_t roi_h = bbox_offset[3] * spatial_scale;
+      scalar_t roi_a = bbox_offset[4];
+
+      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
+      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
+      scalar_t wx = cosa * w_2, wy = sina * w_2;
+      scalar_t hx = -sina * h_2, hy = cosa * h_2;
+
+      px[1] = roi_x + wx + hx;
+      py[1] = roi_y + wy + hy;
+      px[2] = roi_x - wx + hx;
+      py[2] = roi_y - wy + hy;
+      px[3] = roi_x - wx - hx;
+      py[3] = roi_y - wy - hy;
+      px[4] = roi_x + wx - hx;
+      py[4] = roi_y + wy - hy;
+    }
+
+    const scalar_t* offset_bottom_data =
+        bottom_data + (n * channels + c) * height * width;
+
+    scalar_t output_val = bottom_data[index];
+    for (int i = 0; i < points; i++) {
+      output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
+                                                   width, py[i], px[i], i);
+    }
+    top_data[index] = output_val;
+  }
+}
+
+template <typename scalar_t>
+__global__ void rotated_feature_align_backward_kernel(
+    const int nthreads, const int points, const scalar_t* top_diff,
+    const scalar_t* best_bboxes, const scalar_t spatial_scale,
+    const int channels, const int height, const int width,
+    scalar_t* bottom_diff) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int w = index % width;
+    int h = (index / width) % height;
+    int c = (index / width / height) % channels;
+    int n = index / width / height / channels;
+
+    const scalar_t* bbox_offset =
+        best_bboxes + ((n * height + h) * width + w) * 5;
+    scalar_t roi_y = bbox_offset[0] * spatial_scale;
+    scalar_t roi_x = bbox_offset[1] * spatial_scale;
+
+    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
+    scalar_t py[5] = {roi_y, 0, 0, 0, 0};
+
+    if (points > 1) {
+      scalar_t roi_w = bbox_offset[2] * spatial_scale;
+      scalar_t roi_h = bbox_offset[3] * spatial_scale;
+      scalar_t roi_a = bbox_offset[4];
+
+      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
+      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
+      scalar_t wx = cosa * w_2, wy = sina * w_2;
+      scalar_t hx = -sina * h_2, hy = cosa * h_2;
+
+      px[1] = roi_x + wx + hx;
+      py[1] = roi_y + wy + hy;
+      px[2] = roi_x - wx + hx;
+      py[2] = roi_y - wy + hy;
+      px[3] = roi_x - wx - hx;
+      py[3] = roi_y - wy - hy;
+      px[4] = roi_x + wx - hx;
+      py[4] = roi_y + wy - hy;
+    }
+
+    scalar_t* offset_bottom_diff =
+        bottom_diff + (n * channels + c) * height * width;
+    scalar_t value_top_diff = top_diff[index];
+
+    atomicAdd(bottom_diff + index, value_top_diff);
+    for (int i = 0; i < points; i++) {
+      scalar_t w1, w2, w3, w4;
+      int x_low, x_high, y_low, y_high;
+      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
+                                              w2, w3, w4, x_low, x_high, y_low,
+                                              y_high, i);
+      scalar_t g1 = value_top_diff * w1;
+      scalar_t g2 = value_top_diff * w2;
+      scalar_t g3 = value_top_diff * w3;
+      scalar_t g4 = value_top_diff * w4;
+      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
+        atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
+        atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
+        atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
+        atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
+      }
+    }
+  }
+}
+#endif  // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..af5b9f67b12060ae5dfa52738dba52c8fe674105
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh
@@ -0,0 +1,187 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef SCATTER_POINTS_CUDA_KERNEL_CUH
+#define SCATTER_POINTS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
+int const maxGridDim = 50000;
+
+__device__ __forceinline__ static void reduceMax(float *address, float val) {
+  int *address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_i, assumed,
+                    __float_as_int(fmaxf(val, __int_as_float(assumed))));
+  } while (assumed != old || __int_as_float(old) < val);
+}
+
+__device__ __forceinline__ static void reduceMax(double *address, double val) {
+  unsigned long long *address_as_ull =
+      reinterpret_cast<unsigned long long *>(address);
+  unsigned long long old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(
+        address_as_ull, assumed,
+        __double_as_longlong(fmax(val, __longlong_as_double(assumed))));
+  } while (assumed != old || __longlong_as_double(old) < val);
+}
+
+// get rid of meaningless warnings when compiling host code
+#ifdef MMCV_WITH_HIP
+__device__ __forceinline__ static void reduceAdd(float *address, float val) {
+  atomicAdd(address, val);
+}
+__device__ __forceinline__ static void reduceAdd(double *address, double val) {
+  atomicAdd(address, val);
+}
+#else
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ static void reduceAdd(float *address, float val) {
+#if (__CUDA_ARCH__ < 200)
+#ifdef _MSC_VER
+#pragma message( \
+    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32")
+#else
+#warning \
+    "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32"
+#endif
+  int *address_as_i = reinterpret_cast<int *>(address);
+  int old = *address_as_i, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_i, assumed,
+                    __float_as_int(val + __int_as_float(assumed)));
+  } while (assumed != old);
+#else
+  atomicAdd(address, val);
+#endif
+}
+
+__device__ __forceinline__ static void reduceAdd(double *address, double val) {
+#if (__CUDA_ARCH__ < 600)
+#ifdef _MSC_VER
+#pragma message( \
+    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64")
+#else
+#warning \
+    "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64"
+#endif
+  unsigned long long *address_as_ull =
+      reinterpret_cast<unsigned long long *>(address);
+  unsigned long long old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+  } while (assumed != old);
+#else
+  atomicAdd(address, val);
+#endif
+}
+#endif  // __CUDA_ARCH__
+#endif  // MMCV_WITH_HIP
+
+template <typename T>
+__global__ void feats_reduce_kernel(
+    const T *feats, const int32_t *coors_map,
+    T *reduced_feats,  // shall be 0 at initialization
+    const int num_input, const int num_feats, const reduce_t reduce_type) {
+  CUDA_1D_KERNEL_LOOP(x, num_input) {
+    int32_t reduce_to = coors_map[x];
+    if (reduce_to == -1) continue;
+
+    const T *feats_offset = feats + x * num_feats;
+    T *reduced_feats_offset = reduced_feats + reduce_to * num_feats;
+    if (reduce_type == reduce_t::MAX) {
+      for (int i = 0; i < num_feats; i++) {
+        reduceMax(&reduced_feats_offset[i], feats_offset[i]);
+      }
+    } else {
+      for (int i = 0; i < num_feats; i++) {
+        reduceAdd(&reduced_feats_offset[i], feats_offset[i]);
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void add_reduce_traceback_grad_kernel(
+    T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map,
+    const int32_t *reduce_count, const int num_input, const int num_feats,
+    const reduce_t reduce_type) {
+  CUDA_1D_KERNEL_LOOP(x, num_input) {
+    int32_t reduce_to = coors_map[x];
+    if (reduce_to == -1) {
+      continue;
+    }
+
+    const int input_offset = x * num_feats;
+    T *grad_feats_offset = grad_feats + input_offset;
+    const int reduced_offset = reduce_to * num_feats;
+    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
+
+    if (reduce_type == reduce_t::SUM) {
+      for (int i = 0; i < num_feats; i++) {
+        grad_feats_offset[i] = grad_reduced_feats_offset[i];
+      }
+    } else if (reduce_type == reduce_t::MEAN) {
+      for (int i = 0; i < num_feats; i++) {
+        grad_feats_offset[i] = grad_reduced_feats_offset[i] /
+                               static_cast<T>(reduce_count[reduce_to]);
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void max_reduce_traceback_scatter_idx_kernel(
+    const T *feats, const T *reduced_feats, int32_t *reduce_from,
+    const int32_t *coors_map, const int num_input, const int num_feats) {
+  CUDA_1D_KERNEL_LOOP(x, num_input) {
+    int32_t reduce_to = coors_map[x];
+
+    const int input_offset = x * num_feats;
+    const T *feats_offset = feats + input_offset;
+
+    if (reduce_to == -1) {
+      continue;
+    }
+
+    const int reduced_offset = reduce_to * num_feats;
+    const T *reduced_feats_offset = reduced_feats + reduced_offset;
+    int32_t *reduce_from_offset = reduce_from + reduced_offset;
+
+    for (int i = 0; i < num_feats; i++) {
+      if (feats_offset[i] == reduced_feats_offset[i]) {
+        atomicMin(&reduce_from_offset[i], static_cast<int32_t>(x));
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void max_reduce_scatter_grad_kernel(T *grad_feats,
+                                               const T *grad_reduced_feats,
+                                               const int32_t *reduce_from,
+                                               const int num_reduced,
+                                               const int num_feats) {
+  CUDA_1D_KERNEL_LOOP(x, num_reduced) {
+    const int reduced_offset = x * num_feats;
+    const int32_t *scatter_to_offset = reduce_from + reduced_offset;
+    const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset;
+
+    for (int i = 0; i < num_feats; i++) {
+      grad_feats[scatter_to_offset[i] * num_feats + i] =
+          grad_reduced_feats_offset[i];
+    }
+  }
+}
+
+#endif  // SCATTER_POINTS_CUDA_KERNEL_CUH
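reduceMax above emulates a floating-point atomic max with atomicCAS on the value's bit pattern, retrying until the stored value is no smaller than val. The same idiom in a minimal standalone form (device-code sketch; atomic_max_float is a hypothetical name, not part of the header):

    // Keep trying to install `val` until the stored value is already >= val.
    __device__ void atomic_max_float(float *addr, float val) {
      int *addr_as_int = reinterpret_cast<int *>(addr);
      int old = *addr_as_int;
      while (__int_as_float(old) < val) {
        int assumed = old;
        old = atomicCAS(addr_as_int, assumed, __float_as_int(val));
        if (old == assumed) break;  // our write landed
      }
    }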
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1eb5f8fcccbaafdb62972652e3979803c0acd1ca
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh
@@ -0,0 +1,71 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
+#define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void sigmoid_focal_loss_forward_cuda_kernel(
+    const int nthreads, const T* input, const int64_t* target, const T* weight,
+    T* output, const T gamma, const T alpha, const int num_classes) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index / num_classes;
+    int c = index % num_classes;
+
+    int64_t t = target[n];
+    T flag_p = (t == c);
+    T flag_n = (t != c);
+
+    // p = sigmoid(x) = 1. / 1. + expf(-x)
+    T p = (T)1. / ((T)1. + expf(-input[index]));
+
+    // (1 - p)**gamma * log(p)
+    T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN));
+    // p**gamma * log(1 - p)
+    T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN));
+
+    output[index] = (T)0.;
+    output[index] += -flag_p * alpha * term_p;
+    output[index] += -flag_n * ((T)1. - alpha) * term_n;
+    if (weight != NULL) {
+      output[index] *= weight[t];
+    }
+  }
+}
+
+template <typename T>
+__global__ void sigmoid_focal_loss_backward_cuda_kernel(
+    const int nthreads, const T* input, const int64_t* target, const T* weight,
+    T* grad_input, const T gamma, const T alpha, const int num_classes) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index / num_classes;
+    int c = index % num_classes;
+
+    int64_t t = target[n];
+    T flag_p = (t == c);
+    T flag_n = (t != c);
+
+    // p = sigmoid(x) = 1. / 1. + expf(-x)
+    T p = (T)1. / ((T)1. + exp(-input[index]));
+
+    // (1 - p)**gamma * (1 - p - gamma*p*log(p))
+    T term_p = pow(((T)1. - p), gamma) *
+               ((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN))));
+    // p**gamma * (gamma * (1 - p) * log(1 - p) - p)
+    T term_n = pow(p, gamma) *
+               (gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p);
+
+    grad_input[index] = (T)0.;
+    grad_input[index] += -flag_p * alpha * term_p;
+    grad_input[index] += -flag_n * ((T)1. - alpha) * term_n;
+    if (weight != NULL) {
+      grad_input[index] *= weight[t];
+    }
+  }
+}
+
+#endif  // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH
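For reference, the forward kernel above computes, per (sample, class) logit x with p = sigmoid(x), the value -alpha * (1-p)^gamma * log(p) on the target class and -(1-alpha) * p^gamma * log(1-p) elsewhere, clamping log arguments at FLT_MIN. A scalar float reference of the same arithmetic (sigmoid_focal_loss_ref is a hypothetical helper):

    #include <algorithm>
    #include <cfloat>
    #include <cmath>

    // Scalar reference for one (sample, class) pair of the sigmoid focal
    // loss above; `is_target` selects the positive/negative branch.
    float sigmoid_focal_loss_ref(float x, bool is_target, float gamma,
                                 float alpha) {
      float p = 1.f / (1.f + std::exp(-x));
      float term_p = std::pow(1.f - p, gamma) * std::log(std::max(p, FLT_MIN));
      float term_n = std::pow(p, gamma) * std::log(std::max(1.f - p, FLT_MIN));
      return is_target ? -alpha * term_p : -(1.f - alpha) * term_n;
    }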
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..631b2c6175412a9503f6c385ee6597d9527d754f
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh
@@ -0,0 +1,72 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
+#define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void softmax_focal_loss_forward_cuda_kernel(
+    const int nthreads, const T* softmax, const int64_t* target,
+    const T* weight, T* output, const T gamma, const T alpha,
+    const int num_classes) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int64_t label = target[index];
+    T pred = softmax[index * num_classes + label];
+
+    if (label >= 0) {
+      output[index] =
+          -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN));
+    } else {
+      output[index] = 0;
+    }
+    if (weight != NULL) {
+      output[index] *= weight[label];
+    }
+  }
+}
+
+template <typename T>
+__global__ void softmax_focal_loss_backward_cuda1_kernel(
+    const int nthreads, const T* softmax, const int64_t* target,
+    const T* weight, T* buff, const T gamma, const T alpha,
+    const int num_classes) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int64_t label = target[index];
+    T pred = softmax[index * num_classes + label];
+
+    if (label >= 0) {
+      buff[index] = alpha * (-pow((T)1. - pred, gamma) +
+                             gamma * pow((T)1. - pred, gamma - 1) * pred *
+                                 log(max(pred, (T)FLT_MIN)));
+    } else {
+      buff[index] = 0;
+    }
+    if (weight != NULL) {
+      buff[index] *= weight[label];
+    }
+  }
+}
+
+template <typename T>
+__global__ void softmax_focal_loss_backward_cuda2_kernel(
+    const int nthreads, const T* softmax, const int64_t* target, const T* buff,
+    T* grad_input, const int num_classes) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    int n = index / num_classes;
+    int c = index % num_classes;
+    int64_t label = target[n];
+
+    if (label >= 0) {
+      T flag = (label == c ? (T)1. : (T)0.);
+      grad_input[index] = buff[n] * (flag - softmax[index]);
+    } else {
+      grad_input[index] = 0;
+    }
+  }
+}
+
+#endif  // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH
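The softmax focal backward is split in two passes: cuda1 writes a per-sample scalar buff[n] (the derivative with respect to the target-class probability), and cuda2 pushes it through the softmax Jacobian as grad[n,c] = buff[n] * (1{c==label} - softmax[n,c]). A scalar sketch of that second step (softmax_focal_grad_row is a hypothetical helper):

    #include <vector>

    // Reference for softmax_focal_loss_backward_cuda2_kernel: given
    // d(loss)/d(p_label) in `buff` and one softmax row, produce the
    // per-class gradient.
    std::vector<float> softmax_focal_grad_row(
        const std::vector<float> &softmax, int label, float buff) {
      std::vector<float> grad(softmax.size(), 0.f);
      if (label < 0) return grad;  // ignored sample, as in the kernel
      for (size_t c = 0; c < softmax.size(); ++c) {
        float flag = (static_cast<int>(c) == label) ? 1.f : 0.f;
        grad[c] = buff * (flag - softmax[c]);
      }
      return grad;
    }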
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/spconv/indice.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/spconv/indice.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..5ef0009a10f8effeb447e398cff5103b400056de
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/spconv/indice.cuh
@@ -0,0 +1,236 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef INDICE_CU_H_
+#define INDICE_CU_H_
+#include <utils/spconv/spconv/geometry.h>
+#include <utils/spconv/tensorview/helper_launch.h>
+
+#include <utils/spconv/tensorview/tensorview.h>
+
+template <typename Index, typename IndexGrid, unsigned NDim,
+          int KernelMaxVolume = 256>
+__global__ void prepareIndicePairsKernel(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
+    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
+    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
+    const tv::SimpleVector<Index, NDim> kernelSize,
+    const tv::SimpleVector<Index, NDim> stride,
+    const tv::SimpleVector<Index, NDim> padding,
+    const tv::SimpleVector<Index, NDim> dilation,
+    const tv::SimpleVector<Index, NDim> outSpatialShape) {
+  auto numActIn = indicesIn.dim(0);
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  Index validPoints[KernelMaxVolume * (NDim + 1)];
+  Index *pointPtr = nullptr;
+  auto indicePairsDim2 = indicePairs.dim(2);
+  Index index;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
+        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
+        validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+      indicePairs(offset, 0, oldNum) = ix;
+      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
+              spatialVolume * indicesIn(ix, 0);
+      indicePairs(offset, 1, oldNum) = index;
+      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
+    }
+  }
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim,
+          int KernelMaxVolume = 256>
+__global__ void prepareDeConvIndicePairsKernel(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
+    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
+    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
+    const tv::SimpleVector<Index, NDim> kernelSize,
+    const tv::SimpleVector<Index, NDim> stride,
+    const tv::SimpleVector<Index, NDim> padding,
+    const tv::SimpleVector<Index, NDim> dilation,
+    const tv::SimpleVector<Index, NDim> outSpatialShape) {
+  auto numActIn = indicesIn.dim(0);
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  Index validPoints[KernelMaxVolume * (NDim + 1)];
+  Index *pointPtr = nullptr;
+  auto indicePairsDim2 = indicePairs.dim(2);
+  Index index;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    numValidPoints = getValidOutPosTranspose<Index, NDim>(
+        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
+        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
+        validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+      indicePairs(offset, 0, oldNum) = ix;
+      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
+              spatialVolume * indicesIn(ix, 0);
+      indicePairs(offset, 1, oldNum) = index;
+      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
+    }
+  }
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+__global__ void assignGridAndIndiceOutKernel(
+    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
+    int numAct, tv::TensorView<Index> indicePairs,
+    tv::TensorView<Index> indicePairUnique,
+    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
+  Index index;
+  auto indicesOutPtr = indicesOut.data();
+  for (int ix : tv::KernelLoopX<int>(numAct)) {
+    index = indicePairUnique[ix];
+    gridsOut[index] = ix;
+    index = tv::rowArrayIdxInv<Index, NDim>(
+        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
+    indicesOut[ix * (NDim + 1)] = index % batchSize;
+  }
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+__global__ void assignIndicePairsKernel(
+    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
+    int numActIn, tv::TensorView<Index> indicePairs,
+    tv::TensorView<Index> indicePairUnique,
+    const tv::SimpleVector<Index, NDim> outSpatialShape) {
+  Index index;
+  int kernelVolume = indicePairs.dim(0);
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    for (int i = 0; i < kernelVolume; ++i) {
+      index = indicePairs(i, 1, ix);
+      if (index > -1) {
+        indicePairs(i, 1, ix) = gridsOut[index];
+      }
+    }
+  }
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+__global__ void prepareSubMGridKernel(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
+    const tv::SimpleVector<Index, NDim> outSpatialShape) {
+  auto numActIn = indicesIn.dim(0);
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index index = 0;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
+                                         outSpatialShape.data()) +
+            spatialVolume * indicesIn(ix, 0);
+    gridsOut[index] = ix;
+  }
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim,
+          int KernelMaxVolume = 256>
+__global__ void getSubMIndicePairsKernel(
+    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
+    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
+    const tv::SimpleVector<Index, NDim> kernelSize,
+    const tv::SimpleVector<Index, NDim> stride,
+    const tv::SimpleVector<Index, NDim> padding,
+    const tv::SimpleVector<Index, NDim> dilation,
+    const tv::SimpleVector<Index, NDim> outSpatialShape) {
+  auto numActIn = indicesIn.dim(0);
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index numValidPoints = 0;
+  Index validPoints[KernelMaxVolume * (NDim + 1)];
+  Index *pointPtr = nullptr;
+  Index index = 0;
+  for (int ix : tv::KernelLoopX<int>(numActIn)) {
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
+        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
+        validPoints);
+    for (int i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
+              spatialVolume * indicesIn(ix, 0);
+      if (gridsOut[index] > -1) {
+        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
+        indicePairs(offset, 1, oldNum) = gridsOut[index];
+        indicePairs(offset, 0, oldNum) = ix;
+      }
+    }
+  }
+}
+
+template <typename Index, typename IndexGrid>
+__global__ void resetGridKernel(const Index *indicePairUnique,
+                                tv::TensorView<IndexGrid> gridsOut,
+                                int numAct) {
+  for (int ix : tv::KernelLoopX<int>(numAct)) {
+    gridsOut[indicePairUnique[ix]] = -1;
+  }
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+__global__ void resetGridSubMKernel(
+    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
+    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
+  int outSpatialShapeReg[NDim];
+  for (int i = 0; i < NDim; ++i) {
+    outSpatialShapeReg[i] = outSpatialShape[i];
+  }
+  Index spatialVolume = 1;
+  auto indsPtr = indices;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index index;
+  for (int ix : tv::KernelLoopX<int>(numAct)) {
+    indsPtr = indices + ix * (NDim + 1);
+    index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);
+    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
+  }
+}
+
+#endif
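The kernels above only build bookkeeping: for each kernel offset, indicePairs collects (input row, output row) pairs and indiceNum their counts; the sparse convolution itself then runs gather, per-offset GEMM, and scatter-add over those pairs. A CPU sketch of how a consumer walks the pairs (types are hypothetical and the per-offset weight multiply is elided to identity):

    #include <vector>

    struct Pair { int in, out; };

    // Consume indicePairs the way a sparse conv forward does: gather the
    // active input rows per kernel offset and scatter-add into output rows.
    void sparse_conv_consume(
        const std::vector<std::vector<Pair>> &pairs_per_offset,
        const std::vector<std::vector<float>> &in_feats,
        std::vector<std::vector<float>> &out_feats) {
      for (const auto &pairs : pairs_per_offset) {
        for (const Pair &p : pairs) {
          // A real implementation multiplies by this offset's weight matrix.
          for (size_t c = 0; c < in_feats[p.in].size(); ++c)
            out_feats[p.out][c] += in_feats[p.in][c];
        }
      }
    }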
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e3ec68b937b0507e3a119d63a49ad79e8f48eec7
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh
@@ -0,0 +1,160 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef REORDERING_CU_H_
+#define REORDERING_CU_H_
+#include <utils/spconv/tensorview/tensorview.h>
+
+template <typename scalar_t, typename Index, int NumTLP, int NumILP>
+__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,
+                                    const Index *indices, int size,
+                                    int numPlanes) {
+  int ILPStrideX[NumILP];
+  Index inds[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+
+  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      if (ix + ILPStrideX[ilp] < size)
+        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        if (ix + ILPStrideX[ilp] < size)
+          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
+              features[inds[ilp] + iy];
+      }
+    }
+  }
+}
+
+template <typename scalar_t, typename Index, int NumTLP, int NumILP,
+          typename VecType>
+__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,
+                                const Index *indices, int size, int numPlanes) {
+  int ILPStrideX[NumILP];
+  Index inds[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+
+  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      if (ix + ILPStrideX[ilp] < size)
+        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        if (ix + ILPStrideX[ilp] < size)
+          reinterpret_cast<VecType *>(
+              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
+              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
+      }
+    }
+  }
+}
+
+template <typename scalar_t, typename Index, int NumTLP, int NumILP,
+          typename VecType>
+__global__ void gatherVecBlockKernel(scalar_t *buffer,
+                                     const scalar_t *features,
+                                     const Index *indices, int size,
+                                     int numPlanes) {
+  int ILPStrideY[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
+  features += blockIdx.x * NumTLP;
+  buffer += blockIdx.x * NumTLP;
+
+  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ++ilp) {
+      reinterpret_cast<VecType *>(
+          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
+          reinterpret_cast<const VecType *>(
+              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
+                        threadIdx.x];
+    }
+  }
+}
+
+template <typename scalar_t, typename Index, int NumTLP, int NumILP>
+__global__ void scatterAddGenericKernel(scalar_t *outFeatures,
+                                        const scalar_t *buffer,
+                                        const Index *indices, int size,
+                                        int numPlanes) {
+  int ILPStrideX[NumILP];
+  Index inds[NumILP];
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
+  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ilp++) {
+      if (ix + ILPStrideX[ilp] < size)
+        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
+    }
+    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
+#pragma unroll
+      for (int ilp = 0; ilp < NumILP; ++ilp) {
+        if (ix + ILPStrideX[ilp] < size) {
+          outFeatures[inds[ilp] + iy] +=
+              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
+        }
+      }
+    }
+  }
+}
+
+template <typename scalar_t, typename Index, int NumTLP, int NumILP,
+          typename VecType>
+__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,
+                                         const scalar_t *buffer,
+                                         const Index *indices, int size,
+                                         int numPlanes) {
+  int ILPStrideY[NumILP];
+  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
+#pragma unroll
+  for (int ilp = 0; ilp < NumILP; ilp++)
+    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
+  outFeatures += blockIdx.x * NumTLP;
+  buffer += blockIdx.x * NumTLP;
+  scalar_t buf[vecloadFactor];
+  scalar_t buf2[vecloadFactor];
+  Index idx;
+  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
+#pragma unroll
+    for (int ilp = 0; ilp < NumILP; ++ilp) {
+      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
+      reinterpret_cast<VecType *>(buf)[0] =
+          reinterpret_cast<VecType *>(outFeatures)[idx];
+      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
+          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
+#pragma unroll
+      for (int i = 0; i < vecloadFactor; i++) {
+        buf[i] += buf2[i];
+      }
+      reinterpret_cast<VecType *>(outFeatures)[idx] =
+          reinterpret_cast<VecType *>(buf)[0];
+    }
+  }
+}
+
+#endif
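These gather/scatter kernels unroll a compile-time NumILP factor: each thread services NumILP rows spaced gridDim.x * blockDim.x apart, and the Vec variants move int4-sized chunks per transaction. A stripped-down CUDA sketch of just the striding pattern (gather_rows is hypothetical; the NumTLP tiling is omitted):

    // Each thread copies NumILP strided rows per loop iteration, covering
    // every row index tid + k * stride, as gatherGenericKernel does.
    template <int NumILP>
    __global__ void gather_rows(float *dst, const float *src, const int *idx,
                                int rows, int cols) {
      int stride = gridDim.x * blockDim.x;
      for (int r = blockIdx.x * blockDim.x + threadIdx.x; r < rows;
           r += stride * NumILP) {
    #pragma unroll
        for (int ilp = 0; ilp < NumILP; ++ilp) {
          int row = r + ilp * stride;
          if (row >= rows) continue;
          for (int c = 0; c < cols; ++c)
            dst[row * cols + c] = src[idx[row] * cols + c];
        }
      }
    }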
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..06caefa18d47be11b6cb8770ceb8951479add902
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh
@@ -0,0 +1,68 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
+#ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH
+#define STACK_BALL_QUERY_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void stack_ball_query_forward_cuda_kernel(
+    int B, int M, float radius, int nsample, const T *new_xyz,
+    const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt,
+    int *idx) {
+  // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
+  // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
+  // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
+  // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
+  // output:
+  //      idx: (M, nsample)
+  const T *cur_xyz = xyz;
+  int *cur_idx = idx;
+  CUDA_1D_KERNEL_LOOP(pt_idx, M) {
+    int bs_idx = 0;
+    for (int pt_cnt = 0; bs_idx < B; bs_idx++) {
+      pt_cnt += new_xyz_batch_cnt[bs_idx];
+      if (pt_idx < pt_cnt) break;
+    }
+
+    int xyz_batch_start_idx = 0;
+    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];
+
+    const T *new_xyz_p = new_xyz + pt_idx * 3;
+    cur_xyz += xyz_batch_start_idx * 3;
+    cur_idx += pt_idx * nsample;
+
+    float radius2 = radius * radius;
+    T new_x = new_xyz_p[0];
+    T new_y = new_xyz_p[1];
+    T new_z = new_xyz_p[2];
+    int n = xyz_batch_cnt[bs_idx];
+
+    int cnt = 0;
+    for (int k = 0; k < n; ++k) {
+      T x = cur_xyz[k * 3 + 0];
+      T y = cur_xyz[k * 3 + 1];
+      T z = cur_xyz[k * 3 + 2];
+      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+             (new_z - z) * (new_z - z);
+      if (d2 < radius2) {
+        if (cnt == 0) {
+          for (int l = 0; l < nsample; ++l) {
+            cur_idx[l] = k;
+          }
+        }
+        cur_idx[cnt] = k;
+        ++cnt;
+        if (cnt >= nsample) break;
+      }
+    }
+    if (cnt == 0) cur_idx[0] = -1;
+  }
+}
+
+#endif  // STACK_BALL_QUERY_CUDA_KERNEL_CUH
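The stack_* kernels drop the padded (B, N, ...) layout and concatenate all batches along dim 0, so each thread first recovers its batch id and start offset by walking the per-batch counts, exactly as the loop at the top of the kernel above does. A host-side sketch of that lookup (locate_in_stack is a hypothetical helper):

    #include <vector>

    // Recover (batch id, batch start offset) for a flat point index from
    // per-batch counts, as the stacked kernels do with xyz_batch_cnt.
    void locate_in_stack(const std::vector<int> &batch_cnt, int pt_idx,
                         int &bs_idx, int &start) {
      bs_idx = 0;
      start = 0;
      int end = batch_cnt.empty() ? 0 : batch_cnt[0];
      while (pt_idx >= end && bs_idx + 1 < (int)batch_cnt.size()) {
        start = end;
        end += batch_cnt[++bs_idx];
      }
    }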
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4ef3663d05bcd9146e15dd93bb979734538919cb
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh
@@ -0,0 +1,97 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
+#ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH
+#define STACK_GROUP_POINTS_CUDA_KERNEL_CUH
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include <stdio.h>
+template <typename T>
+__global__ void stack_group_points_forward_cuda_kernel(
+    int b, int c, int m, int nsample, const T *features,
+    const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt,
+    T *out) {
+  // :param features: (N1 + N2 ..., C) tensor of features to group
+  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing
+  // the indices of features to group with :param idx: (M1 + M2 ..., nsample)
+  // tensor containing the indices of features to group with :param
+  // idx_batch_cnt: (batch_size) [M1 + M2 ...] tensor containing the indices
+  // of features to group with :return:
+  //     output: (M1 + M2, C, nsample) tensor
+  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
+    const T *cur_features = features;
+    const int *cur_idx = idx;
+    int sample_idx = index % nsample;
+    int c_idx = (index / nsample) % c;
+    int pt_idx = (index / nsample / c);
+
+    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;
+    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
+    for (int k = 1; k < b; k++) {
+      if (pt_idx < pt_cnt) break;
+      pt_cnt += idx_batch_cnt[k];
+      bs_idx = k;
+    }
+
+    int features_batch_start_idx = 0;
+    int features_batch_end_idx = features_batch_cnt[0];
+    for (int k = 0; k < bs_idx; k++) {
+      features_batch_start_idx += features_batch_cnt[k];
+      features_batch_end_idx =
+          features_batch_start_idx + features_batch_cnt[k + 1];
+    }
+    cur_features += features_batch_start_idx * c;
+
+    cur_idx += pt_idx * nsample + sample_idx;
+    int in_idx = cur_idx[0] * c + c_idx;
+    int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx;
+    if (in_idx < features_batch_end_idx * c) {
+      out[out_idx] = cur_features[in_idx];
+    }
+  }
+}
+
+template <typename T>
+__global__ void stack_group_points_backward_cuda_kernel(
+    int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx,
+    const int *idx_batch_cnt, const int *features_batch_cnt,
+    T *grad_features) {
+  // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the
+  // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing
+  // the indices of features to group with :param idx_batch_cnt: (batch_size)
+  // [M1 + M2 ...] tensor containing the indices of features to group with
+  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing
+  // the indices of features to group with :return:
+  //     grad_features: (N1 + N2 ..., C) gradient of the features
+  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
+    const T *cur_grad_out = grad_out;
+    const int *cur_idx = idx;
+    T *cur_grad_features = grad_features;
+    int sample_idx = index % nsample;
+    int c_idx = (index / nsample) % c;
+    int pt_idx = (index / nsample / c);
+
+    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;
+
+    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
+    for (int k = 1; k < b; k++) {
+      if (pt_idx < pt_cnt) break;
+      pt_cnt += idx_batch_cnt[k];
+      bs_idx = k;
+    }
+
+    int features_batch_start_idx = 0;
+    for (int k = 0; k < bs_idx; k++)
+      features_batch_start_idx += features_batch_cnt[k];
+
+    cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx;
+    cur_idx += pt_idx * nsample + sample_idx;
+    cur_grad_features += (features_batch_start_idx + cur_idx[0]) * c + c_idx;
+
+    atomicAdd(cur_grad_features, cur_grad_out[0]);
+  }
+}
+
+#endif  // GROUP_POINTS_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4ec6a466886832d38c72da6e3a3574e72d53cec8
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh
@@ -0,0 +1,331 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef SYNCBN_CUDA_KERNEL_CUH
+#define SYNCBN_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean,
+                                                 int num, int channels,
+                                                 int spatial) {
+  __shared__ float buffer[THREADS_PER_BLOCK];
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  buffer[tid] = 0;
+  for (int i = tid; i < num * spatial; i += blockDim.x) {
+    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
+    buffer[tid] += input[index];
+  }
+  __syncthreads();
+
+  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      buffer[tid] += buffer[tid + s];
+    }
+    __syncthreads();
+  }
+  int total = num * spatial;
+  if (tid == 0) {
+    mean[c] = buffer[0] / total;
+  }
+}
+
+template <>
+__global__ void sync_bn_forward_mean_cuda_kernel(const phalf *input,
+                                                 float *mean, int num,
+                                                 int channels, int spatial) {
+  __shared__ float buffer[THREADS_PER_BLOCK];
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  buffer[tid] = 0;
+  for (int i = tid; i < num * spatial; i += blockDim.x) {
+    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
+    buffer[tid] += static_cast<float>(input[index]);
+  }
+  __syncthreads();
+
+  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      buffer[tid] += buffer[tid + s];
+    }
+    __syncthreads();
+  }
+  int total = num * spatial;
+  if (tid == 0) {
+    mean[c] = buffer[0] / total;
+  }
+}
+
+template <typename T>
+__global__ void sync_bn_forward_var_cuda_kernel(const T *input,
+                                                const float *mean, float *var,
+                                                int num, int channels,
+                                                int spatial) {
+  __shared__ float buffer[THREADS_PER_BLOCK];
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  buffer[tid] = 0;
+  for (int i = tid; i < num * spatial; i += blockDim.x) {
+    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
+    float td = input[index] - mean[c];
+    buffer[tid] += td * td;
+  }
+  __syncthreads();
+  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      buffer[tid] += buffer[tid + s];
+    }
+    __syncthreads();
+  }
+  int total = num * spatial;
+  if (tid == 0) {
+    var[c] = buffer[0] / total;
+  }
+}
+
+template <>
+__global__ void sync_bn_forward_var_cuda_kernel(const phalf *input,
+                                                const float *mean, float *var,
+                                                int num, int channels,
+                                                int spatial) {
+  __shared__ float buffer[THREADS_PER_BLOCK];
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  buffer[tid] = 0;
+  for (int i = tid; i < num * spatial; i += blockDim.x) {
+    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
+    float td = static_cast<float>(input[index]) - mean[c];
+    buffer[tid] += td * td;
+  }
+  __syncthreads();
+  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      buffer[tid] += buffer[tid + s];
+    }
+    __syncthreads();
+  }
+  int total = num * spatial;
+  if (tid == 0) {
+    var[c] = buffer[0] / total;
+  }
+}
+
+template <typename T>
+__global__ void sync_bn_forward_output_cuda_kernel(
+    const T *input, const float *mean, const float *var, float *running_mean,
+    float *running_var, const float *weight, const float *bias, float *norm,
+    float *std, T *output, int num, int channels, int spatial, float eps,
+    float momentum, int group_size) {
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  float mean_value = mean[c];
+  float std_value = sqrt(var[c] + eps);
+
+  if (weight != nullptr) {
+    float weight_value = weight[c];
+    float bias_value = bias[c];
+    if (norm != nullptr) {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        norm[index] = (input[index] - mean_value) / std_value;
+        output[index] = norm[index] * weight_value + bias_value;
+      }
+    } else {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        output[index] =
+            (input[index] - mean_value) / std_value * weight_value + bias_value;
+      }
+    }
+  } else {
+    if (norm != nullptr) {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        output[index] = norm[index] = (input[index] - mean_value) / std_value;
+      }
+    } else {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        output[index] = (input[index] - mean_value) / std_value;
+      }
+    }
+  }
+  if (tid == 0) {
+    if (std != nullptr) std[c] = std_value;
+    if (running_mean != nullptr) {
+      running_mean[c] =
+          momentum * mean_value + (1 - momentum) * running_mean[c];
+      int count = num * spatial * group_size;
+      float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];
+      running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];
+    }
+  }
+}
+
+template <>
+__global__ void sync_bn_forward_output_cuda_kernel(
+    const phalf *input, const float *mean, const float *var,
+    float *running_mean, float *running_var, const float *weight,
+    const float *bias, float *norm, float *std, phalf *output, int num,
+    int channels, int spatial, float eps, float momentum, int group_size) {
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  float mean_value = mean[c];
+  float std_value = sqrt(var[c] + eps);
+  if (weight != nullptr) {
+    float weight_value = weight[c];
+    float bias_value = bias[c];
+    if (norm != nullptr) {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        norm[index] =
+            (static_cast<float>(input[index]) - mean_value) / std_value;
+        output[index] =
+            static_cast<phalf>(norm[index] * weight_value + bias_value);
+      }
+    } else {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        output[index] =
+            static_cast<phalf>((static_cast<float>(input[index]) - mean_value) /
+                                   std_value * weight_value +
+                               bias_value);
+      }
+    }
+  } else {
+    if (norm != nullptr) {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        norm[index] =
+            (static_cast<float>(input[index]) - mean_value) / std_value;
+        output[index] = static_cast<phalf>(norm[index]);
+      }
+    } else {
+      for (int i = tid; i < num * spatial; i += blockDim.x) {
+        int index =
+            (i / spatial) * channels * spatial + c * spatial + i % spatial;
+        output[index] = static_cast<phalf>(
+            (static_cast<float>(input[index]) - mean_value) / std_value);
+      }
+    }
+  }
+  if (tid == 0) {
+    if (std != nullptr) std[c] = std_value;
+    if (running_mean != nullptr) {
+      running_mean[c] =
+          momentum * mean_value + (1 - momentum) * running_mean[c];
+      int count = num * spatial * group_size;
+      float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c];
+      running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c];
+    }
+  }
+}
+
+template <typename T>
+__global__ void sync_bn_backward_param_cuda_kernel(const T *grad_output,
+                                                   const float *norm,
+                                                   float *grad_weight,
+                                                   float *grad_bias, int num,
+                                                   int channels, int spatial) {
+  __shared__ float buffer1[THREADS_PER_BLOCK];
+  __shared__ float buffer2[THREADS_PER_BLOCK];
+
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  buffer1[tid] = buffer2[tid] = 0;
+  for (int i = tid; i < num * spatial; i += blockDim.x) {
+    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
+    buffer1[tid] += grad_output[index] * norm[index];
+    buffer2[tid] += grad_output[index];
+  }
+  __syncthreads();
+
+  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      buffer1[tid] += buffer1[tid + s];
+      buffer2[tid] += buffer2[tid + s];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    grad_weight[c] = buffer1[0];
+    grad_bias[c] = buffer2[0];
+  }
+}
+
+template <>
+__global__ void sync_bn_backward_param_cuda_kernel(const phalf *grad_output,
+                                                   const float *norm,
+                                                   float *grad_weight,
+                                                   float *grad_bias, int num,
+                                                   int channels, int spatial) {
+  __shared__ float buffer1[THREADS_PER_BLOCK];
+  __shared__ float buffer2[THREADS_PER_BLOCK];
+
+  int tid = threadIdx.x;
+  int c = blockIdx.x;
+  buffer1[tid] = buffer2[tid] = 0;
+  for (int i = tid; i < num * spatial; i += blockDim.x) {
+    int index = (i / spatial) * channels * spatial + c * spatial + i % spatial;
+    buffer1[tid] += static_cast<float>(grad_output[index]) * norm[index];
+    buffer2[tid] += static_cast<float>(grad_output[index]);
+  }
+  __syncthreads();
+
+  for (int s = blockDim.x / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      buffer1[tid] += buffer1[tid + s];
+      buffer2[tid] += buffer2[tid + s];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    grad_weight[c] = buffer1[0];
+    grad_bias[c] = buffer2[0];
+  }
+}
+
+template <typename T>
+__global__ void sync_bn_backward_data_cuda_kernel(
+    int output_size, const T *grad_output, const float *weight,
+    const float *grad_weight, const float *grad_bias, const float *norm,
+    const float *std, T *grad_input, int num, int channels, int spatial) {
+  int factor = num * spatial;
+  CUDA_1D_KERNEL_LOOP(index, output_size) {
+    int c = (index / spatial) % channels;
+    grad_input[index] =
+        weight[c] *
+        (grad_output[index] -
+         (grad_weight[c] * norm[index] + grad_bias[c]) / factor) /
+        std[c];
+  }
+}
+
+template <>
+__global__ void sync_bn_backward_data_cuda_kernel(
+    int output_size, const phalf *grad_output, const float *weight,
+    const float *grad_weight, const float *grad_bias, const float *norm,
+    const float *std, phalf *grad_input, int num, int channels, int spatial) {
+  int factor = num * spatial;
+  CUDA_1D_KERNEL_LOOP(index, output_size) {
+    int c = (index / spatial) % channels;
+    grad_input[index] = static_cast<phalf>(
+        weight[c] *
+        (static_cast<float>(grad_output[index]) -
+         (grad_weight[c] * norm[index] + grad_bias[c]) / factor) /
+        std[c]);
+  }
+}
+
+#endif  // SYNCBN_CUDA_KERNEL_CUH
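All the sync BN statistics kernels above share one pattern: one block per channel, per-thread partial sums in a THREADS_PER_BLOCK shared buffer, then a power-of-two tree reduction (s >>= 1) that leaves the block total in buffer[0]. The reduction step in isolation (a sketch; assumes blockDim.x is a power of two, as the kernels do):

    // Tree-reduce a shared-memory buffer of per-thread partial sums.
    // `buffer` must have blockDim.x entries, already filled by every thread.
    __device__ float block_reduce_sum(float *buffer) {
      int tid = threadIdx.x;
      __syncthreads();  // make every thread's partial sum visible
      for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) buffer[tid] += buffer[tid + s];
        __syncthreads();
      }
      return buffer[0];  // valid for all threads after the final sync
    }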
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..971b496e589d2210131351305cbaf0ed1a027cb1
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
@@ -0,0 +1,61 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef THREE_INTERPOLATE_CUDA_KERNEL_CUH
+#define THREE_INTERPOLATE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void three_interpolate_forward_cuda_kernel(
+    int b, int c, int m, int n, const T *points, const int *__restrict__ idx,
+    const T *weight, T *out) {
+  // points: (B, C, M)
+  // idx: (B, N, 3)
+  // weight: (B, N, 3)
+  // output:
+  //      out: (B, C, N)
+
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    weight += bs_idx * n * 3 + pt_idx * 3;
+    points += bs_idx * c * m + c_idx * m;
+    idx += bs_idx * n * 3 + pt_idx * 3;
+    out += bs_idx * c * n + c_idx * n;
+
+    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
+                  weight[2] * points[idx[2]];
+  }
+}
+
+template <typename T>
+__global__ void three_interpolate_backward_cuda_kernel(
+    int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx,
+    const T *weight, T *grad_points) {
+  // grad_out: (B, C, N)
+  // weight: (B, N, 3)
+  // output:
+  //      grad_points: (B, C, M)
+
+  int bs_idx = blockIdx.z;
+  int c_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b || c_idx >= c) return;
+
+    grad_out += bs_idx * c * n + c_idx * n + pt_idx;
+    weight += bs_idx * n * 3 + pt_idx * 3;
+    grad_points += bs_idx * c * m + c_idx * m;
+    idx += bs_idx * n * 3 + pt_idx * 3;
+
+    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
+    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
+    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
+  }
+}
+
+#endif  // THREE_INTERPOLATE_CUDA_KERNEL_CUH
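three_interpolate takes its weight input as given and assumes it sums to one over the three neighbours; callers commonly derive it from three_nn's squared distances as normalized inverse distances with an eps guard (a caller-side convention, not something these headers enforce). Sketch:

    #include <array>

    // One common way to turn three_nn's squared distances into the `weight`
    // input of three_interpolate: normalized inverse distances.
    std::array<float, 3> inverse_distance_weights(
        const std::array<float, 3> &d2) {
      std::array<float, 3> w;
      float sum = 0.f;
      for (int i = 0; i < 3; ++i) {
        w[i] = 1.0f / (d2[i] + 1e-8f);  // eps avoids division by zero
        sum += w[i];
      }
      for (int i = 0; i < 3; ++i) w[i] /= sum;
      return w;
    }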
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef THREE_NN_CUDA_KERNEL_CUH
+#define THREE_NN_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void three_nn_forward_cuda_kernel(int b, int n, int m,
+                                             const T *unknown, const T *known,
+                                             T *dist2, int *__restrict__ idx) {
+  // unknown: (B, N, 3)
+  // known: (B, M, 3)
+  // output:
+  //      dist2: (B, N, 3)
+  //      idx: (B, N, 3)
+
+  int bs_idx = blockIdx.y;
+  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
+    if (bs_idx >= b) return;
+
+    unknown += bs_idx * n * 3 + pt_idx * 3;
+    known += bs_idx * m * 3;
+    dist2 += bs_idx * n * 3 + pt_idx * 3;
+    idx += bs_idx * n * 3 + pt_idx * 3;
+
+    T ux = unknown[0];
+    T uy = unknown[1];
+    T uz = unknown[2];
+
+    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+    int besti1 = 0, besti2 = 0, besti3 = 0;
+    for (int k = 0; k < m; ++k) {
+      T x = known[k * 3 + 0];
+      T y = known[k * 3 + 1];
+      T z = known[k * 3 + 2];
+      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+      if (d < best1) {
+        best3 = best2;
+        besti3 = besti2;
+        best2 = best1;
+        besti2 = besti1;
+        best1 = d;
+        besti1 = k;
+      } else if (d < best2) {
+        best3 = best2;
+        besti3 = besti2;
+        best2 = d;
+        besti2 = k;
+      } else if (d < best3) {
+        best3 = d;
+        besti3 = k;
+      }
+    }
+    dist2[0] = best1;
+    dist2[1] = best2;
+    dist2[2] = best3;
+    idx[0] = besti1;
+    idx[1] = besti2;
+    idx[2] = besti3;
+  }
+}
+
+#endif  // THREE_NN_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..4d1159a515f4de2666c25ba4bd5e4f2cbbca1e10
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh
@@ -0,0 +1,61 @@
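// Illustration only, not part of the mmcv diff: the running "best three"
// update used inside three_nn_forward_cuda_kernel above, pulled out as a
// tiny helper. A new squared distance d with index k is inserted into the
// sorted triple (best[0] <= best[1] <= best[2]), shifting larger entries down.
inline void update_best3(double d, int k, double best[3], int besti[3]) {
  if (d < best[0]) {
    best[2] = best[1]; besti[2] = besti[1];
    best[1] = best[0]; besti[1] = besti[0];
    best[0] = d;       besti[0] = k;
  } else if (d < best[1]) {
    best[2] = best[1]; besti[2] = besti[1];
    best[1] = d;       besti[1] = k;
  } else if (d < best[2]) {
    best[2] = d;       besti[2] = k;
  }
}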
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef TIN_SHIFT_CUDA_KERNEL_CUH
+#define TIN_SHIFT_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void tin_shift_forward_cuda_kernel(
+    const int nthreads, const T* input, const int* shift, T* output,
+    const int batch_size, const int channels, const int t_size,
+    const int hw_size, const int group_size, const int group_channel) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int hw_index = index % hw_size;
+    const int j = (index / hw_size) % channels;
+
+    const int n_index = (index / hw_size / channels) % batch_size;
+    int group_id = j / group_channel;
+    int t_shift = shift[n_index * group_size + group_id];
+    int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index;
+    for (int i = 0; i < t_size; i++) {
+      int now_t = i + t_shift;
+      int data_id = i * hw_size * channels + offset;
+      if (now_t < 0 || now_t >= t_size) {
+        continue;
+      }
+      int out_id = now_t * hw_size * channels + offset;
+      output[out_id] = input[data_id];
+    }
+  }
+}
+
+template <typename T>
+__global__ void tin_shift_backward_cuda_kernel(
+    const int nthreads, const T* input, const int* shift, T* output,
+    const int batch_size, const int channels, const int t_size,
+    const int hw_size, const int group_size, const int group_channel) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int hw_index = index % hw_size;
+    const int j = (index / hw_size) % channels;
+
+    const int n_index = (index / hw_size / channels) % batch_size;
+    int group_id = j / group_channel;
+    int t_shift = shift[n_index * group_size + group_id];
+    int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index;
+    for (int i = 0; i < t_size; i++) {
+      int now_t = i + t_shift;
+      int data_id = i * hw_size * channels + offset;
+      if (now_t < 0 || now_t >= t_size) {
+        continue;
+      }
+      int out_id = now_t * hw_size * channels + offset;
+      output[out_id] = input[data_id];
+    }
+  }
+}
+
+#endif  // TIN_SHIFT_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..021b488d8d716c9e8132173bf04491d42b7b6fa2
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
@@ -0,0 +1,216 @@
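// Illustration only, not part of the mmcv diff: a CPU reference of the
// temporal shift performed by tin_shift_forward_cuda_kernel above. Layout
// matches the kernel's indexing, (batch, t_size, channels, hw_size); each
// channel group g of sample n is shifted by shift[n][g] frames, frames
// shifted out of range are dropped, and the output is assumed to be
// zero-initialized by the caller since the kernel only writes in-range frames.
#include <vector>

void tin_shift_forward_cpu(const std::vector<float>& in,
                           const std::vector<int>& shift,
                           std::vector<float>& out, int batch, int t_size,
                           int channels, int hw_size, int group_size) {
  const int group_channel = channels / group_size;
  for (int n = 0; n < batch; ++n)
    for (int t = 0; t < t_size; ++t)
      for (int c = 0; c < channels; ++c) {
        const int now_t = t + shift[n * group_size + c / group_channel];
        if (now_t < 0 || now_t >= t_size) continue;  // shifted out: stays zero
        for (int hw = 0; hw < hw_size; ++hw)
          out[((n * t_size + now_t) * channels + c) * hw_size + hw] =
              in[((n * t_size + t) * channels + c) * hw_size + hw];
      }
}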
+// Copyright (c) OpenMMLab. All rights reserved.
+#ifndef VOXELIZATION_CUDA_KERNEL_CUH
+#define VOXELIZATION_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t;
+
+template <typename T, typename T_int>
+__global__ void dynamic_voxelize_kernel(
+    const T* points, T_int* coors, const float voxel_x, const float voxel_y,
+    const float voxel_z, const float coors_x_min, const float coors_y_min,
+    const float coors_z_min, const float coors_x_max, const float coors_y_max,
+    const float coors_z_max, const int grid_x, const int grid_y,
+    const int grid_z, const int num_points, const int num_features,
+    const int NDim) {
+  //   const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+  CUDA_1D_KERNEL_LOOP(index, num_points) {
+    // To save some computation
+    auto points_offset = points + index * num_features;
+    auto coors_offset = coors + index * NDim;
+    int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x);
+    if (c_x < 0 || c_x >= grid_x) {
+      coors_offset[0] = -1;
+      continue;
+    }
+
+    int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y);
+    if (c_y < 0 || c_y >= grid_y) {
+      coors_offset[0] = -1;
+      coors_offset[1] = -1;
+      continue;
+    }
+
+    int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z);
+    if (c_z < 0 || c_z >= grid_z) {
+      coors_offset[0] = -1;
+      coors_offset[1] = -1;
+      coors_offset[2] = -1;
+    } else {
+      coors_offset[0] = c_z;
+      coors_offset[1] = c_y;
+      coors_offset[2] = c_x;
+    }
+  }
+}
+
+template <typename T, typename T_int>
+__global__ void assign_point_to_voxel(const int nthreads, const T* points,
+                                      T_int* point_to_voxelidx,
+                                      T_int* coor_to_voxelidx, T* voxels,
+                                      const int max_points,
+                                      const int num_features,
+                                      const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+    int index = thread_idx / num_features;
+
+    int num = point_to_voxelidx[index];
+    int voxelidx = coor_to_voxelidx[index];
+    if (num > -1 && voxelidx > -1) {
+      auto voxels_offset =
+          voxels + voxelidx * max_points * num_features + num * num_features;
+
+      int k = thread_idx % num_features;
+      voxels_offset[k] = points[thread_idx];
+    }
+  }
+}
+
+template <typename T_int>
+__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
+                                   T_int* point_to_voxelidx,
+                                   T_int* coor_to_voxelidx, T_int* voxel_coors,
+                                   const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    // const int index = blockIdx.x * threadsPerBlock + threadIdx.x;
+    // if (index >= num_points) return;
+    int index = thread_idx / NDim;
+    int num = point_to_voxelidx[index];
+    int voxelidx = coor_to_voxelidx[index];
+    if (num == 0 && voxelidx > -1) {
+      auto coors_offset = voxel_coors + voxelidx * NDim;
+      int k = thread_idx % NDim;
+      coors_offset[k] = coor[thread_idx];
+    }
+  }
+}
+
+template <typename T_int>
+__global__ void point_to_voxelidx_kernel(const T_int* coor,
+                                         T_int* point_to_voxelidx,
+                                         T_int* point_to_pointidx,
+                                         const int max_points,
+                                         const int max_voxels,
+                                         const int num_points, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(index, num_points) {
+    auto coor_offset = coor + index * NDim;
+    // skip invalid points
+    if (coor_offset[0] == -1) continue;
+
+    int num = 0;
+    int coor_x = coor_offset[0];
+    int coor_y = coor_offset[1];
+    int coor_z = coor_offset[2];
+    // only calculate the coors before this coor[index]
+    for (int i = 0; i < index; ++i) {
+      auto prev_coor = coor + i * NDim;
+      if (prev_coor[0] == -1) continue;
+
+      // Find all previous points that have the same coors
+      // if find the same coor, record it
+      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
+          (prev_coor[2] == coor_z)) {
+        num++;
+        if (num == 1) {
+          // point to the same coor that first show up
+          point_to_pointidx[index] = i;
+        } else if (num >= max_points) {
+          // out of boundary
+          break;
+        }
+      }
+    }
+    if (num == 0) {
+      point_to_pointidx[index] = index;
+    }
+    if (num < max_points) {
+      point_to_voxelidx[index] = num;
+    }
+  }
+}
+
+template <typename T_int>
+__global__ void determin_voxel_num(
+    // const T_int* coor,
+    T_int* num_points_per_voxel, T_int* point_to_voxelidx,
+    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
+    const int max_points, const int max_voxels, const int num_points) {
+  // only calculate the coors before this coor[index]
+  for (int i = 0; i < num_points; ++i) {
+    int point_pos_in_voxel = point_to_voxelidx[i];
+    // record voxel
+    if (point_pos_in_voxel == -1) {
+      // out of max_points or invalid point
+      continue;
+    } else if (point_pos_in_voxel == 0) {
+      // record new voxel
+      int voxelidx = voxel_num[0];
+      if (voxel_num[0] >= max_voxels) continue;
+      voxel_num[0] += 1;
+      coor_to_voxelidx[i] = voxelidx;
+      num_points_per_voxel[voxelidx] = 1;
+    } else {
+      int point_idx = point_to_pointidx[i];
+      int voxelidx = coor_to_voxelidx[point_idx];
+      if (voxelidx != -1) {
+        coor_to_voxelidx[i] = voxelidx;
+        num_points_per_voxel[voxelidx] += 1;
+      }
+    }
+  }
+}
+
+__global__ void nondeterministic_get_assign_pos(
+    const int nthreads, const int32_t* coors_map, int32_t* pts_id,
+    int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    int coors_idx = coors_map[thread_idx];
+    if (coors_idx > -1) {
+      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
+      pts_id[thread_idx] = coors_pts_pos;
+      if (coors_pts_pos == 0) {
+        coors_order[coors_idx] = atomicAdd(coors_count, 1);
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void nondeterministic_assign_point_voxel(
+    const int nthreads, const T* points, const int32_t* coors_map,
+    const int32_t* pts_id, const int32_t* coors_in,
+    const int32_t* reduce_count, const int32_t* coors_order, T* voxels,
+    int32_t* coors, int32_t* pts_count, const int max_voxels,
+    const int max_points, const int num_features, const int NDim) {
+  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
+    int coors_idx = coors_map[thread_idx];
+    int coors_pts_pos = pts_id[thread_idx];
+    if (coors_idx > -1 && coors_pts_pos < max_points) {
+      int coors_pos = coors_order[coors_idx];
+      if (coors_pos < max_voxels) {
+        auto voxels_offset =
+            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
+        auto points_offset = points + thread_idx * num_features;
+        for (int k = 0; k < num_features; k++) {
+          voxels_offset[k] = points_offset[k];
+        }
+        if (coors_pts_pos == 0) {
+          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
+          auto coors_offset = coors + coors_pos * NDim;
+          auto coors_in_offset = coors_in + coors_idx * NDim;
+          for (int k = 0; k < NDim; k++) {
+            coors_offset[k] = coors_in_offset[k];
+          }
+        }
+      }
+    }
+  }
+}
+
+#endif  // VOXELIZATION_CUDA_KERNEL_CUH
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..852737224183c1852f1394903e1106219d9ad40e
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
@@ -0,0 +1,256 @@
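// Illustration only, not part of the mmcv diff: the per-point coordinate
// computation from dynamic_voxelize_kernel above, as a CPU helper. A point is
// mapped to an integer grid cell; out-of-range points get the sentinel -1.
// The kernel stores coordinates in (z, y, x) order and, for speed, writes -1
// only into the leading coordinate(s); this sketch marks all three.
#include <cmath>

void dynamic_voxelize_point(const float* point, int* coor, float voxel_x,
                            float voxel_y, float voxel_z, float x_min,
                            float y_min, float z_min, int grid_x, int grid_y,
                            int grid_z) {
  const int c_x = static_cast<int>(std::floor((point[0] - x_min) / voxel_x));
  const int c_y = static_cast<int>(std::floor((point[1] - y_min) / voxel_y));
  const int c_z = static_cast<int>(std::floor((point[2] - z_min) / voxel_z));
  const bool ok = c_x >= 0 && c_x < grid_x && c_y >= 0 && c_y < grid_y &&
                  c_z >= 0 && c_z < grid_z;
  coor[0] = ok ? c_z : -1;  // (z, y, x) order, matching the kernel
  coor[1] = ok ? c_y : -1;
  coor[2] = ok ? c_x : -1;
}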
+/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef COMMON_MLU_HELPER_HPP_ +#define COMMON_MLU_HELPER_HPP_ + +#define NFU_ALIGN_SIZE 128 // Byte +#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc + +#ifdef __BANG_ARCH__ +#define MAX_NRAM_SIZE \ + (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#define MAX_SRAM_SIZE \ + (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#else +#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value +#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value +#endif + +#ifndef PAD_UP +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) +#endif + +#ifndef PAD_DOWN +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) +#endif + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +template +__mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} + +/*! + * @brief Converts int32 to float32 data type. + * + * @param[out] dst + * Pointer to NRAM that stores int32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
+ */ +__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_int2float((float *)dst, (int32_t *)src, src_count, 0); +#else + // get sign bit + const float move_23bit = 8388608.0; + // 0x80000000 = 1,000000000,0000000000000000000000000000 + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + // get 1 or 0 from sign bit + // judg is Odd + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * sizeof(float), + NFU_ALIGN_SIZE); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000001); + __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // minus xor, positive num invariant + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); + // convert int32 to float32 + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x7fffff); + __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x4b000000); + __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __bang_sub_scalar(dst, dst, move_23bit, src_count); + // add one + __bang_add(dst, dst, dst_addition, src_count); + // set sign for float32 + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * 4, 128); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); +#endif // __BANG_ARCH__ >= 300 +} + +/*! + * @brief Converts float32 to int32 data type with to_zero round mode. + * + * @param[out] dst + * Pointer to NRAM that stores float32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores int32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
+ */ +__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0); +#else + // sign ===> src_addition + // dst=-1.0 : when src[i] is a negative number + // dst=+1.0 : when src[i] is a positive number + const int floatDchar = sizeof(float) / sizeof(char); + __bang_active_sign((float *)dst, src, src_count); + // dst_addition = abs(src) + __bang_mul(dst_addition, src, (float *)dst, src_count); + // if dst_addition < 1.0 , then src_addition + 1, to fix add error. + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 1.0f); + __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xbf800000); + // set negative flag -1.0 = 0xbf80000 + __bang_cycle_eq( + (float *)dst, (float *)dst, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] + __bang_active_abs(dst_addition, src, src_count); + __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 8388608.0f); + // mask shift move 23 + __bang_cycle_add_tz( + dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit + // two`s complement for negatibe + // dst=1.0 , when src <-1.0 + // dst=0.0 , when src >=-1.0 + __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); + // to fix max value + // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, + // means max value. + __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); + __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, + src_count * floatDchar); + // get low 23bit + __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + (unsigned)0x007fffff); + // mask low 23bit is 1 + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * floatDchar, + NFU_ALIGN_SIZE / sizeof(char)); + // set 9 high bit ===> dst + // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 + // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 + __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); + __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // src or dst_addition + __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, + src_count * floatDchar); + __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, + src_count * floatDchar); +#endif // __BANG_ARCH__ >= 300 +} + +/*! + * @brief Converts float32 to half data type, + * the rounding mode on MLU200 is rd, on MLU300 is rn. + * + * @param[out] dst + * Pointer to NRAM that stores half type data. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in] src_count + * The count of elements in src. + */ +__mlu_func__ inline void convertFloat2half(half *dst, float *src, + int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2half_rn(dst, src, src_count); +#else + __bang_float2half_rd(dst, src, src_count); +#endif +} + +/*! + * @brief recursiveSumPool. + * @param[in,out] dst + * Pointer to NRAM that stores the input and output data. 
+ * @param[in] low_dim
+ *     Which is the number of low dim.
+ * @param[in] high_dim
+ *     Which is the number of high dim.
+ * @param[in] kernel_limit
+ *     Which is the high_dim of sumpool per time.
+ ******************************************************************************/
+template <typename T>
+__mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim,
+                                   int kernel_limit) {
+  for (; high_dim > 1;) {
+    int repeat_s = high_dim / kernel_limit;
+    int remain_s = high_dim % kernel_limit;
+
+    if (remain_s) {
+      __bang_sumpool((T *)dst, (T *)dst, low_dim, 1, remain_s, 1, remain_s, 1,
+                     1);
+    }
+    if (repeat_s) {
+      __bang_sumpool((T *)dst + (remain_s > 0 ? low_dim : 0),
+                     (T *)dst + remain_s * low_dim, low_dim,
+                     kernel_limit * repeat_s, 1, kernel_limit, 1, 1,
+                     kernel_limit);
+    }
+    high_dim = repeat_s + (bool)remain_s;
+  }
+  return;
+}
+
+#endif  // COMMON_MLU_HELPER_HPP_
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSDevice.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSDevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1d9d49618d7aea6a30b42630350c5a7b77ea0ac
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSDevice.h
@@ -0,0 +1,64 @@
+// Copyright © 2022 Apple Inc.
+
+// This file is modify from:
+// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h
+
+#pragma once
+#include <c10/core/Allocator.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+
+#ifdef __OBJC__
+#include <Foundation/Foundation.h>
+#include <Metal/Metal.h>
+#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
+typedef id<MTLDevice> MTLDevice_t;
+#else
+typedef void* MTLDevice;
+typedef void* MTLDevice_t;
+#endif
+
+using namespace std;
+
+namespace at {
+namespace mps {
+
+//-----------------------------------------------------------------
+//  MPSDevice
+//
+// MPSDevice is a singleton class that returns the default device
+//-----------------------------------------------------------------
+
+class TORCH_API MPSDevice {
+ public:
+  /**
+   * MPSDevice should not be cloneable.
+   */
+  MPSDevice(MPSDevice& other) = delete;
+  /**
+   * MPSDevice should not be assignable.
+   */
+  void operator=(const MPSDevice&) = delete;
+  /**
+   * Gets single instance of the Device.
+   */
+  static MPSDevice* getInstance();
+  /**
+   * Returns the single device.
+ */ + MTLDevice_t device() { return _mtl_device; } + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MPSDevice(); +}; + +TORCH_API bool is_available(); + +TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +} // namespace mps +} // namespace at diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSLibrary.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSLibrary.h new file mode 100644 index 0000000000000000000000000000000000000000..41c33fba8cbdd43cc5b3285603c11c6f9eee617b --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSLibrary.h @@ -0,0 +1,61 @@ +#ifndef _MPS_LIBRARY_H_ +#define _MPS_LIBRARY_H_ + +#include +#include + +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLComputePipelineState_t; +typedef id MTLLibrary_t; +#else +typedef void* MTLComputePipelineState; +typedef void* MTLComputePipelineState_t; +typedef void* MTLLibrary; +typedef void* MTLLibrary_t; +#endif + +class MPSLibrary { + public: + // disable constructor for singleton + static MPSLibrary* createFromUrl(const std::string& library_url); + static MPSLibrary* createFromSource(const std::string& source); + ~MPSLibrary(); + + MTLLibrary_t library() { return _library; } + + MTLComputePipelineState_t getComputePipelineState( + const std::string& function_name); + + private: + MTLLibrary_t _library; + std::unordered_map _pso_map; +}; + +class MPSLibraryManager { + public: + // disable constructor for singleton + MPSLibraryManager(const MPSLibraryManager&) = delete; + MPSLibraryManager& operator=(const MPSLibraryManager&) = delete; + MPSLibraryManager(MPSLibraryManager&&) = delete; + MPSLibraryManager& operator=(MPSLibraryManager&&) = delete; + + static MPSLibraryManager* getInstance(); + + bool hasLibrary(const std::string& name); + + MPSLibrary* getLibrary(const std::string& library_url); + + MPSLibrary* createLibraryFromSouce(const std::string& name, + const std::string& sources); + + ~MPSLibraryManager(); + + private: + MPSLibraryManager(); + std::unordered_map> _library_map; +}; +#endif diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSLibrary.mm b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSLibrary.mm new file mode 100644 index 0000000000000000000000000000000000000000..99addc7e28222f890e0b65660bb97711b6b52305 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSLibrary.mm @@ -0,0 +1,107 @@ +#include "MPSLibrary.h" +#include "MPSDevice.h" + +static std::unique_ptr mps_library_manager=nullptr; + +MPSLibraryManager* MPSLibraryManager::getInstance() { + if(!mps_library_manager) + mps_library_manager = std::unique_ptr(new MPSLibraryManager()); + return mps_library_manager.get(); +} + +MPSLibraryManager::~MPSLibraryManager() {} + +MPSLibraryManager::MPSLibraryManager() {} + +bool MPSLibraryManager::hasLibrary(const std::string& name) { + return _library_map.find(name) != _library_map.end(); +} + +MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) { + if (_library_map.find(library_url) != _library_map.end()) { + return _library_map[library_url].get(); + } + _library_map.emplace(std::make_pair( + library_url, std::unique_ptr(MPSLibrary::createFromUrl(library_url)))); + return _library_map[library_url].get(); +} + +MPSLibrary* 
MPSLibraryManager::createLibraryFromSouce(const std::string& name, + const std::string& source) { + NSString* ns_name = [NSString stringWithCString:name.c_str()]; + if (_library_map.find(name) != _library_map.end()) { + NSLog(@"Library %@ already exist.", ns_name); + return nullptr; + } + + _library_map.emplace( + std::make_pair(name, std::unique_ptr(MPSLibrary::createFromSource(source)))); + return _library_map[name].get(); +} + +MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* utl_str = [NSString stringWithCString:library_url.c_str()]; + NSURL* metal_url = [NSURL fileURLWithPath:utl_str]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* code_str = [NSString stringWithCString:sources.c_str()]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str + options:nil + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary::~MPSLibrary() { + [_library release]; + _library = nil; +} + +MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) { + if (_pso_map.find(function_name) != _pso_map.end()) { + return _pso_map[function_name]; + } + + MTLComputePipelineState_t pso; + @autoreleasepool { + NSError* error = nil; + + // create function + NSString* function_name_str = [NSString stringWithCString:function_name.c_str()]; + id func = [_library newFunctionWithName:function_name_str]; + if (func == nil) { + NSLog(@"Failed to created pipeline state object, error %@.", error); + exit(1); + } + // create pipeline + pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func + error:&error]; + _pso_map.emplace(std::make_pair(function_name, pso)); + } + return _pso_map[function_name]; +} diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSStream.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSStream.h new file mode 100644 index 0000000000000000000000000000000000000000..54cd388494c8bbac636db44dd5c8afd1915357c6 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSStream.h @@ -0,0 +1,132 @@ +// Copyright © 2022 Apple Inc. 
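// Illustration only, not part of the mmcv diff: a hypothetical C++ usage
// sketch for the MPSLibrary/MPSLibraryManager classes defined above. A Metal
// source string is compiled once, cached under a name, and a compute pipeline
// is fetched for one of its kernels. The name, source, and kernel_name
// arguments are placeholders; createLibraryFromSouce is the (sic) upstream
// spelling of the cache-and-compile entry point.
#include <string>

MTLComputePipelineState_t getPipelineForSource(const std::string& name,
                                               const std::string& metal_source,
                                               const std::string& kernel_name) {
  MPSLibraryManager* manager = MPSLibraryManager::getInstance();
  MPSLibrary* library = manager->hasLibrary(name)
                            ? manager->getLibrary(name)
                            : manager->createLibraryFromSouce(name, metal_source);
  // getComputePipelineState() builds the pipeline state object on first use
  // and caches it in _pso_map for subsequent calls.
  return library->getComputePipelineState(kernel_name);
}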
+ +// This file is modify from: +// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h + +#pragma once + +#include +#include + +#include +#include +#include +#include "MPSDevice.h" + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef id MTLCommandQueue_t; +typedef id MTLCommandBuffer_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +#else +typedef void* MTLCommandQueue_t; +typedef void* MTLCommandQueue; +typedef void* MTLCommandBuffer_t; +typedef void* MTLCommandBuffer; +typedef void* MTLSharedEvent_t; +typedef void* dispatch_queue_t; +typedef void* MTLDevice_t; +#define nil NULL; +#endif + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +class TORCH_API MPSStream { + public: + enum Unchecked { UNCHECKED }; + /// Construct a MPSStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. + explicit MPSStream(Stream stream); + + ~MPSStream(); + MTLCommandQueue_t commandQueue() const { return _commandQueue; }; + dispatch_queue_t queue() const { return _serialQueue; } + + MTLCommandBuffer_t commandBuffer(); + void commit(bool flush); + void commitAndWait(); + void synchronize(); + + void flush(); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { return _stream.device_index(); } + + MTLCommandQueue_t stream() const { return _commandQueue; }; + + MTLDevice_t device() const { return [_commandQueue device]; } + + /// Explicit conversion to Stream. + Stream unwrap() const { return _stream; } + + private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MTLCommandBuffer_t _commandBuffer = nil; + void _flush(bool commitAndWait) const; + + dispatch_queue_t _serialQueue = nullptr; +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl { + public: + /** + * Gets single instance of the MPSStream. 
+ */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +struct TORCH_API MPSEvent { + MPSEvent(); + // MPSEvent(id device); + + ~MPSEvent(); + MTLSharedEvent_t event() const { return _event; } + + void recordEvent(MPSStream* stream); + void waitForEvent(MPSStream* queue); // waits on the cpu + bool queryEvent(); + uint64_t getCurrentValue() { return _currentValue; } + void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } + + private: + bool _isRecorded = false; + uint64_t _currentValue = 0; + MTLSharedEvent_t _event; +}; + +typedef MPSEvent* mpsEvent_t; + +} // namespace mps +} // namespace at diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSUtils.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..2a4ce6d7978d566e88dd22ee4f9722df914ff0de --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/mps/MPSUtils.h @@ -0,0 +1,51 @@ +#ifndef _MPS_UTILS_H_ +#define _MPS_UTILS_H_ +#include +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLBuffer_t; +typedef id MTLComputeCommandEncoder_t; +#else +typedef void* MTLBuffer; +typedef void* MTLBuffer_t; +typedef void* MTLComputeCommandEncoder; +typedef void* MTLComputeCommandEncoder_t; +#endif + +// utils +static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) { + return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data()); +} + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t); + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index]; +} + +template , at::Tensor>::value, bool>> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBytes:&t length:sizeof(t) atIndex:index]; +} + +inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {} + +template +void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) { + setMTLArg(encoder, index, std::forward(t)); + setMTLArgsImpl(encoder, index + 1, std::forward(args)...); +} + +template +void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) { + [encoder setComputePipelineState:pso]; + setMTLArgsImpl(encoder, 0, std::forward(args)...); +} +#endif diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/parrots_cpp_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/parrots_cpp_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..72701890dd727db911a1c0ce4d6790c1b531348d --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/parrots_cpp_helper.hpp @@ -0,0 +1,40 @@ +#ifndef PARROTS_CPP_HELPER +#define PARROTS_CPP_HELPER +#include +#include +#include +#include +#include + +using namespace parrots; + +#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \ + case prim_type: { \ + using scalar_t = type; \ + return __VA_ARGS__(); \ + } + +#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) 
\ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +#endif // PARROTS_CPP_HELPER diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/parrots_cuda_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/parrots_cuda_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..539009c3f91b46ea58a3a64f0875d799e8bd0b65 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/parrots_cuda_helper.hpp @@ -0,0 +1,111 @@ +#ifndef PARROTS_CUDA_HELPER +#define PARROTS_CUDA_HELPER + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "common_cuda_helper.hpp" +#include "parrots_cudawarpfunction.cuh" + +using namespace parrots; +using phalf = float16; + +#define __PHALF(x) (x.y) + +#define PARROTS_CUDA_CHECK(exp) \ + do { \ + cudaError_t err = exp; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "cudaCheckError() failed : %s\n", \ + cudaGetErrorString(err)); \ + exit(-1); \ + } \ + } while (0) + +#define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \ + case prim_type: { \ + using scalar_t = type; \ + return __VA_ARGS__(); \ + } + +#define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +#define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) \ + [&] { \ + const auto& the_type = TYPE; \ + switch (the_type) { \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ + PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \ + default: \ + PARROTS_NOTSUPPORTED; \ + } \ + }() + +/** atomicAdd **/ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 + +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + if (val == 0.0) return __longlong_as_double(old); + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +#endif + +static __inline__ __device__ float16 atomicAdd(float16* address, float16 val) { + unsigned int* aligned = + (unsigned int*)((size_t)address - ((size_t)address & 2)); + unsigned int old = *aligned; + unsigned int assumed; + unsigned short old_as_us; + do { + assumed = old; + old_as_us = + (unsigned short)((size_t)address & 2 ? 
old >> 16 : old & 0xffff); + +#if __CUDACC_VER_MAJOR__ >= 9 + float16 tmp; + tmp.x = old_as_us; + float16 sum = tmp + val; + unsigned short sum_as_us = sum.x; +// half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us)) +// + (float)(val)); unsigned short sum_as_us = __half_as_ushort(sum); +#else + unsigned short sum_as_us = + __float2half_rn(__half2float(old_as_us) + (float)(val)); +#endif + + unsigned int sum_as_ui = (size_t)address & 2 + ? (sum_as_us << 16) | (old & 0xffff) + : (old & 0xffff0000) | sum_as_us; + old = atomicCAS(aligned, assumed, sum_as_ui); + } while (assumed != old); + //__half_raw raw = {old_as_us}; + // return float16(raw); + return *reinterpret_cast(&old_as_us); +} +#endif // PARROTS_CUDA_HELPER diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f68e8740561ef833c09e1ba9f999922f5d04bce5 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp @@ -0,0 +1,27 @@ +#ifndef PYTORCH_CPP_HELPER +#define PYTORCH_CPP_HELPER +#include + +#include + +using namespace at; + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_MLU(x) \ + TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor") +#define CHECK_CPU(x) \ + TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_CUDA_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) +#define CHECK_MLU_INPUT(x) \ + CHECK_MLU(x); \ + CHECK_CONTIGUOUS(x) +#define CHECK_CPU_INPUT(x) \ + CHECK_CPU(x); \ + CHECK_CONTIGUOUS(x) + +#endif // PYTORCH_CPP_HELPER diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..52e512695a403abe2688f9bffeece633a02f189a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp @@ -0,0 +1,20 @@ +#ifndef PYTORCH_CUDA_HELPER +#define PYTORCH_CUDA_HELPER + +#include +#include +#include + +#include +#include + +#include "common_cuda_helper.hpp" + +using at::Half; +using at::Tensor; +using phalf = at::Half; + +#define __PHALF(x) (x) +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#endif // PYTORCH_CUDA_HELPER diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_device_registry.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_device_registry.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2a32b7270c3521f960394af7d18cbbd03ba50df1 --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_device_registry.hpp @@ -0,0 +1,141 @@ +#ifndef PYTORCH_DEVICE_REGISTRY_H +#define PYTORCH_DEVICE_REGISTRY_H + +// Using is recommended in the official documentation in +// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op. 
+// However, we use <torch/types.h> for compatibility with CUDA 9.0
+// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
+#include <torch/types.h>
+
+#include <cassert>
+#include <functional>
+#include <map>
+#include <type_traits>
+
+inline std::string GetDeviceStr(const at::Device& device) {
+  std::string str = DeviceTypeName(device.type(), true);
+  if (device.has_index()) {
+    str.push_back(':');
+    str.append(std::to_string(device.index()));
+  }
+  return str;
+}
+
+// Registry
+template <typename F, F f>
+class DeviceRegistry;
+
+template <typename Ret, typename... Args, Ret (*f)(Args...)>
+class DeviceRegistry<Ret (*)(Args...), f> {
+ public:
+  using FunctionType = Ret (*)(Args...);
+  static const int MAX_DEVICE_TYPES =
+      int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES);
+
+  void Register(at::DeviceType device, FunctionType function) {
+    funcs_[int8_t(device)] = function;
+  }
+
+  FunctionType Find(at::DeviceType device) const {
+    return funcs_[int8_t(device)];
+  }
+
+  static DeviceRegistry& instance() {
+    static DeviceRegistry inst;
+    return inst;
+  }
+
+ private:
+  DeviceRegistry() {
+    for (size_t i = 0; i < MAX_DEVICE_TYPES; ++i) {
+      funcs_[i] = nullptr;
+    }
+  };
+  FunctionType funcs_[MAX_DEVICE_TYPES];
+};
+
+// get device of first tensor param
+
+template <typename T, typename... Args,
+          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
+                           bool> = true>
+at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
+  return std::forward<T>(t).device();
+}
+template <typename T, typename... Args,
+          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
+                           bool> = true>
+at::Device GetFirstTensorDevice(T&& t, Args&&... args) {
+  return GetFirstTensorDevice(std::forward<Args>(args)...);
+}
+
+// check device consistency
+
+inline std::pair<int, at::Device> CheckDeviceConsistency(
+    const at::Device& device, int index) {
+  return {index, device};
+}
+
+template <typename T, typename... Args,
+          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value,
+                           bool> = true>
+std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
+                                                  int index, T&& t,
+                                                  Args&&... args);
+
+template <typename T, typename... Args,
+          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value,
+                           bool> = true>
+std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
+                                                  int index, T&& t,
+                                                  Args&&... args) {
+  auto new_device = std::forward<T>(t).device();
+  if (new_device.type() != device.type() ||
+      new_device.index() != device.index()) {
+    return {index, new_device};
+  }
+  return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
+}
+
+template <
+    typename T, typename... Args,
+    std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
+std::pair<int, at::Device> CheckDeviceConsistency(const at::Device& device,
+                                                  int index, T&& t,
+                                                  Args&&... args) {
+  return CheckDeviceConsistency(device, index + 1, std::forward<Args>(args)...);
+}
+
+// dispatch
+
+template <typename R, typename... Args>
+auto Dispatch(const R& registry, const char* name, Args&&... args) {
+  auto device = GetFirstTensorDevice(std::forward<Args>(args)...);
+  auto inconsist =
+      CheckDeviceConsistency(device, 0, std::forward<Args>(args)...);
+  TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, ": at param ",
+              inconsist.first,
+              ", inconsistent device: ", GetDeviceStr(inconsist.second).c_str(),
+              " vs ", GetDeviceStr(device).c_str(), "\n")
+  auto f_ptr = registry.Find(device.type());
+  TORCH_CHECK(f_ptr != nullptr, name, ": implementation for device ",
+              GetDeviceStr(device).c_str(), " not found.\n")
+  return f_ptr(std::forward<Args>(args)...);
+}
+
+// helper macro
+
+#define DEVICE_REGISTRY(key) DeviceRegistry<decltype(&(key)), key>::instance()
+
+#define REGISTER_DEVICE_IMPL(key, device, value)           \
+  struct key##_##device##_registerer {                     \
+    key##_##device##_registerer() {                        \
+      DEVICE_REGISTRY(key).Register(at::k##device, value); \
+    }                                                      \
+  };                                                       \
+  static key##_##device##_registerer _##key##_##device##_registerer;
+
+#define DISPATCH_DEVICE_IMPL(key, ...)
\ + Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__) + +#endif // PYTORCH_DEVICE_REGISTRY diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e49572ca841211e2960192f1e0955b54819086cc --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef PYTORCH_MLU_HELPER_HPP_ +#define PYTORCH_MLU_HELPER_HPP_ + +#ifdef MMCV_WITH_MLU +#include "aten.h" + +#define NFU_ALIGN_SIZE 128 + +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) + +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) + +#define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +inline int32_t getJobLimitCapability() { + CNcontext drv_ctx; + TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails"); + CNctxConfigParam ctx_conf_param; + TORCH_CHECK( + CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT, + &ctx_conf_param), + "cnGetCtxConfigParam fails."); + return (int32_t)ctx_conf_param.unionLimit; +} + +inline int32_t getCoreNumOfJobLimitCapability() { + switch (getJobLimitCapability()) { + default: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * + getJobLimitCapability(); + case CN_KERNEL_CLASS_BLOCK: + return 1; + case CN_KERNEL_CLASS_UNION: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + case CN_KERNEL_CLASS_UNION2: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2; + case CN_KERNEL_CLASS_UNION4: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4; + case CN_KERNEL_CLASS_UNION8: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8; + case CN_KERNEL_CLASS_UNION16: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16; + } +} + +#endif // MMCV_WITH_MLU + +#endif // PYTORCH_MLU_HELPER_HPP_ diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_npu_helper.hpp b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_npu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..073d6b38c345ed480542c2dd68d9fc256a4665ae --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/pytorch_npu_helper.hpp @@ -0,0 +1,47 @@ +/****************************************************************************** + * Copyright (c) 2022 Huawei Technologies Co., Ltd + * All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSD-3-Clause + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef PYTORCH_NPU_HELPER_HPP_ +#define PYTORCH_NPU_HELPER_HPP_ + +#include +#include +#include + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +#define NPU_NAME_SPACE at_npu::native + +#ifdef MMCV_WITH_XLA +#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value) +#else +#define REGISTER_NPU_IMPL(key, value) \ + REGISTER_DEVICE_IMPL(key, PrivateUse1, value) +#endif + +#ifdef MMCV_WITH_XLA +#define CHECK_NPU(x) \ + TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor") +#else +#define CHECK_NPU(x) \ + TORCH_CHECK(x.device().type() == at::kPrivateUse1, #x \ + " must be a NPU " \ + "tensor") + +#endif +#endif // PYTORCH_NPU_HELPER_HPP_ diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h new file mode 100644 index 0000000000000000000000000000000000000000..f23ff4482324c51012865c42f2a5f9e59d54848a --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h @@ -0,0 +1,70 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PARAMS_GRID_H_ +#define PARAMS_GRID_H_ +#include +#include + +namespace detail { +template +int getTotalSize(std::vector arg) { + return arg.size(); +} + +template +int getTotalSize(std::vector arg, std::vector... args) { + return arg.size() * getTotalSize(args...); +} + +template +int getSize(std::vector arg) { + return arg.size(); +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg) { + std::get(src) = arg[counter[Idx]]; +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg, + std::vector &... args) { + std::get(src) = arg[counter[Idx]]; + assigner(src, counter, args...); +} +} // namespace detail + +template +std::vector> paramsGrid(std::vector... 
args) { + int length = detail::getTotalSize(args...); + std::vector sizes = {detail::getSize(args)...}; + int size = sizes.size(); + + std::vector> params(length); + std::vector counter(size); + for (int i = 0; i < length; ++i) { + detail::assigner<0>(params[i], counter, args...); + counter[size - 1] += 1; + for (int c = size - 1; c >= 0; --c) { + if (counter[c] == sizes[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return params; +} + +#endif diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/prettyprint.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/prettyprint.h new file mode 100644 index 0000000000000000000000000000000000000000..0a6bdc3361dc1ada31fdebef87989672c9aeb51c --- /dev/null +++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/prettyprint.h @@ -0,0 +1,493 @@ +// Copyright Louis Delacroix 2010 - 2014. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// A pretty printing library for C++ +// +// Usage: +// Include this header, and operator<< will "just work". + +#ifndef H_PRETTY_PRINT +#define H_PRETTY_PRINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pretty_print { +namespace detail { +// SFINAE type trait to detect whether T::const_iterator exists. + +struct sfinae_base { + using yes = char; + using no = yes[2]; +}; + +template +struct has_const_iterator : private sfinae_base { + private: + template + static yes &test(typename C::const_iterator *); + template + static no &test(...); + + public: + static const bool value = sizeof(test(nullptr)) == sizeof(yes); + using type = T; +}; + +template +struct has_begin_end : private sfinae_base { + private: + template + static yes & + f(typename std::enable_if< + std::is_same(&C::begin)), + typename C::const_iterator (C::*)() const>::value>::type *); + + template + static no &f(...); + + template + static yes &g(typename std::enable_if< + std::is_same(&C::end)), + typename C::const_iterator (C::*)() const>::value, + void>::type *); + + template + static no &g(...); + + public: + static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); + static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); +}; + +} // namespace detail + +// Holds the delimiter values for a specific character type + +template +struct delimiters_values { + using char_type = TChar; + const char_type *prefix; + const char_type *delimiter; + const char_type *postfix; +}; + +// Defines the delimiter values for a specific container and character type + +template +struct delimiters { + using type = delimiters_values; + static const type values; +}; + +// Functor to print containers. You can use this directly if you want +// to specify a non-default delimiters type. The printing logic can +// be customized by specializing the nested template. 
+ +template , + typename TDelimiters = delimiters> +struct print_container_helper { + using delimiters_type = TDelimiters; + using ostream_type = std::basic_ostream; + + template + struct printer { + static void print_body(const U &c, ostream_type &stream) { + using std::begin; + using std::end; + + auto it = begin(c); + const auto the_end = end(c); + + if (it != the_end) { + for (;;) { + stream << *it; + + if (++it == the_end) break; + + if (delimiters_type::values.delimiter != NULL) + stream << delimiters_type::values.delimiter; + } + } + } + }; + + print_container_helper(const T &container) : container_(container) {} + + inline void operator()(ostream_type &stream) const { + if (delimiters_type::values.prefix != NULL) + stream << delimiters_type::values.prefix; + + printer::print_body(container_, stream); + + if (delimiters_type::values.postfix != NULL) + stream << delimiters_type::values.postfix; + } + + private: + const T &container_; +}; + +// Specialization for pairs + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + + static void print_body(const std::pair &c, ostream_type &stream) { + stream << c.first; + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + stream << c.second; + } +}; + +// Specialization for tuples + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + using element_type = std::tuple; + + template + struct Int {}; + + static void print_body(const element_type &c, ostream_type &stream) { + tuple_print(c, stream, Int<0>()); + } + + static void tuple_print(const element_type &, ostream_type &, + Int) {} + + static void tuple_print( + const element_type &c, ostream_type &stream, + typename std::conditional, + std::nullptr_t>::type) { + stream << std::get<0>(c); + tuple_print(c, stream, Int<1>()); + } + + template + static void tuple_print(const element_type &c, ostream_type &stream, Int) { + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + + stream << std::get(c); + + tuple_print(c, stream, Int()); + } +}; + +// Prints a print_container_helper to the specified stream. 
+
+template <typename T, typename TChar, typename TCharTraits,
+          typename TDelimiters>
+inline std::basic_ostream<TChar, TCharTraits> &operator<<(
+    std::basic_ostream<TChar, TCharTraits> &stream,
+    const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
+  helper(stream);
+  return stream;
+}
+
+// Basic is_container template; specialize to derive from std::true_type for all
+// desired container types
+
+template <typename T>
+struct is_container
+    : public std::integral_constant<bool,
+                                    detail::has_const_iterator<T>::value &&
+                                        detail::has_begin_end<T>::beg_value &&
+                                        detail::has_begin_end<T>::end_value> {};
+
+template <typename T, std::size_t N>
+struct is_container<T[N]> : std::true_type {};
+
+template <std::size_t N>
+struct is_container<char[N]> : std::false_type {};
+
+template <typename T>
+struct is_container<std::valarray<T>> : std::true_type {};
+
+template <typename T1, typename T2>
+struct is_container<std::pair<T1, T2>> : std::true_type {};
+
+template <typename... Args>
+struct is_container<std::tuple<Args...>> : std::true_type {};
+
+// Default delimiters
+
+template <typename T>
+struct delimiters<T, char> {
+  static const delimiters_values<char> values;
+};
+template <typename T>
+const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
+template <typename T>
+struct delimiters<T, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+template <typename T>
+const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
+                                                                   L"]"};
+
+// Delimiters for (multi)set and unordered_(multi)set
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<::std::set<T, TComp, TAllocator>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<char>
+    delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
+                                                                  "}"};
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<wchar_t>
+    delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
+        L"{", L", ", L"}"};
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<char>
+    delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
+        "{", ", ", "}"};
+
+template <typename T, typename TComp, typename TAllocator>
+struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename TComp, typename TAllocator>
+const delimiters_values<wchar_t>
+    delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
+        L"{", L", ", L"}"};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<char> delimiters<
+    ::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
+    "{", ", ", "}"};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>,
+                  wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<wchar_t> delimiters<
+    ::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
+    L"{", L", ", L"}"};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
+                  char> {
+  static const delimiters_values<char> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<char> delimiters<
+    ::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
+    "{", ", ", "}"};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
+                  wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+
+template <typename T, typename THash, typename TEqual, typename TAllocator>
+const delimiters_values<wchar_t>
+    delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
+               wchar_t>::values = {L"{", L", ", L"}"};
+
+// Delimiters for pair and tuple
+
+template <typename T1, typename T2>
+struct delimiters<std::pair<T1, T2>, char> {
+  static const delimiters_values<char> values;
+};
+template <typename T1, typename T2>
+const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
+    "(", ", ", ")"};
+template <typename T1, typename T2>
+struct delimiters<::std::pair<T1, T2>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+template <typename T1, typename T2>
+const delimiters_values<wchar_t>
+    delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
+
+template <typename... Args>
+struct delimiters<std::tuple<Args...>, char> {
+  static const delimiters_values<char> values;
+};
+template <typename... Args>
+const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
+    "(", ", ", ")"};
+template <typename... Args>
+struct delimiters<::std::tuple<Args...>, wchar_t> {
+  static const delimiters_values<wchar_t> values;
+};
+template <typename... Args>
+const delimiters_values<wchar_t>
+    delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
wchar_t>::values = {L"(", L", ", L")"}; + +// Type-erasing helper class for easy use of custom delimiters. +// Requires TCharTraits = std::char_traits and TChar = char or wchar_t, +// and MyDelims needs to be defined for TChar. Usage: "cout << +// pretty_print::custom_delims(x)". + +struct custom_delims_base { + virtual ~custom_delims_base() {} + virtual std::ostream &stream(::std::ostream &) = 0; + virtual std::wostream &stream(::std::wostream &) = 0; +}; + +template +struct custom_delims_wrapper : custom_delims_base { + custom_delims_wrapper(const T &t_) : t(t_) {} + + std::ostream &stream(std::ostream &s) { + return s << print_container_helper, Delims>( + t); + } + + std::wostream &stream(std::wostream &s) { + return s << print_container_helper, + Delims>(t); + } + + private: + const T &t; +}; + +template +struct custom_delims { + template + custom_delims(const Container &c) + : base(new custom_delims_wrapper(c)) {} + + std::unique_ptr base; +}; + +template +inline std::basic_ostream &operator<<( + std::basic_ostream &s, const custom_delims &p) { + return p.base->stream(s); +} + +// A wrapper for a C-style array given as pointer-plus-size. +// Usage: std::cout << pretty_print_array(arr, n) << std::endl; + +template +struct array_wrapper_n { + typedef const T *const_iterator; + typedef T value_type; + + array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {} + inline const_iterator begin() const { return _array; } + inline const_iterator end() const { return _array + _n; } + + private: + const T *const _array; + size_t _n; +}; + +// A wrapper for hash-table based containers that offer local iterators to each +// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket +// 5 of container m.) + +template +struct bucket_print_wrapper { + typedef typename T::const_local_iterator const_iterator; + typedef typename T::size_type size_type; + + const_iterator begin() const { return m_map.cbegin(n); } + + const_iterator end() const { return m_map.cend(n); } + + bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {} + + private: + const T &m_map; + const size_type n; +}; + +} // namespace pretty_print + +// Global accessor functions for the convenience wrappers + +template +inline pretty_print::array_wrapper_n pretty_print_array(const T *const a, + size_t n) { + return pretty_print::array_wrapper_n(a, n); +} + +template +pretty_print::bucket_print_wrapper bucket_print(const T &m, + typename T::size_type n) { + return pretty_print::bucket_print_wrapper(m, n); +} + +// Main magic entry point: An overload snuck into namespace std. +// Can we do better? 
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..026e35b1a6b52ec74fee27fbccd2dfda5ef845ce
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h
@@ -0,0 +1,60 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+
+#include <utils/spconv/tensorview/tensorview.h>
+#include <vector>
+
+namespace py = pybind11;
+
+template <typename T, typename TPyObject>
+std::vector<T> array2Vector(TPyObject arr) {
+  py::array arr_np = arr;
+  size_t size = arr.attr("size").template cast<size_t>();
+  py::array_t<T> arr_cc = arr_np;
+  std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
+  return data;
+}
+
+template <typename T>
+std::vector<T> arrayT2Vector(py::array_t<T> arr) {
+  std::vector<T> data(arr.data(), arr.data() + arr.size());
+  return data;
+}
+
+template <typename T, typename TPyObject>
+tv::TensorView<T> array2TensorView(TPyObject arr) {
+  py::array arr_np = arr;
+  py::array_t<T> arr_cc = arr_np;
+  tv::Shape shape;
+  for (int i = 0; i < arr_cc.ndim(); ++i) {
+    shape.push_back(arr_cc.shape(i));
+  }
+  return tv::TensorView<T>(arr_cc.mutable_data(), shape);
+}
+template <typename T>
+tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr) {
+  tv::Shape shape;
+  for (int i = 0; i < arr.ndim(); ++i) {
+    shape.push_back(arr.shape(i));
+  }
+  return tv::TensorView<T>(arr.mutable_data(), shape);
+}
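A hedged sketch of how these helpers are typically used from a binding. The module name `spconv_utils_demo` and the function are hypothetical; note that `arrayT2Vector` copies the buffer, while `arrayT2TensorView` merely wraps it, so a view must not outlive the Python array it came from.

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>

#include <vector>

#include "pybind11_utils.h"  // vendored header above; include path illustrative

double sum_array(py::array_t<double> arr) {
  // Copying path: detach from the Python buffer entirely.
  std::vector<double> v = arrayT2Vector<double>(arr);
  double s = 0.0;
  for (double x : v) s += x;
  return s;
}

PYBIND11_MODULE(spconv_utils_demo, m) {  // hypothetical module name
  m.def("sum_array", &sum_array, "Sum a 1-D float64 array");
}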
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h
new file mode 100644
index 0000000000000000000000000000000000000000..def6fe5e125a4e8c7e38f889887a6af80557f219
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h
@@ -0,0 +1,295 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPCONV_GEOMETRY_H_
+#define SPCONV_GEOMETRY_H_
+
+#include <utils/spconv/tensorview/tensorview.h>
+
+#include <iostream>
+#include <vector>
+
+template <typename Index, unsigned NDim>
+TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
+                                    const Index *kernelSize,
+                                    const Index *stride, const Index *padding,
+                                    const Index *dilation,
+                                    const Index *outSpatialShape, Index *out) {
+  Index lowers[NDim];
+  Index uppers[NDim];
+  Index counter[NDim];
+  Index counterSize[NDim];
+  Index pointCounter = 0;
+  Index val;
+  Index numPoints = 1;
+  Index m, offset;
+  bool valid = false;
+#pragma unroll
+  for (unsigned i = 0; i < NDim; ++i) {
+    lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
+                 stride[i] + padding[i]) /
+                stride[i];
+    uppers[i] = (input_pos[i] + padding[i]) / stride[i];
+  }
+
+#pragma unroll
+  for (unsigned i = 0; i < NDim; ++i) {
+    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
+    numPoints *= counterSize[i];
+  }
+
+#pragma unroll
+  for (unsigned i = 0; i < NDim; ++i) {
+    counter[i] = 0;
+  }
+  for (int i = 0; i < numPoints; ++i) {
+    valid = true;
+    m = 1;
+    offset = 0;
+#pragma unroll
+    for (int j = NDim - 1; j >= 0; --j) {
+      val = uppers[j] - counter[j] * dilation[j];
+      out[pointCounter * (NDim + 1) + j] = val;
+      if (val < 0 || (val > outSpatialShape[j] - 1)) {
+        valid = false;
+        // break;
+      }
+      offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
+      m *= kernelSize[j];
+    }
+
+    out[pointCounter * (NDim + 1) + NDim] = offset;
+    if (valid) ++pointCounter;
+    counter[NDim - 1] += 1;
+#pragma unroll
+    for (int c = NDim - 1; c >= 0; --c) {
+      if (counter[c] == counterSize[c] && c > 0) {
+        counter[c - 1] += 1;
+        counter[c] = 0;
+      }
+    }
+  }
+  return pointCounter;
+}
+
+template <typename Index, unsigned NDim>
+TV_HOST_DEVICE Index getValidOutPosTranspose(
+    const Index *input_pos, const Index *kernelSize, const Index *stride,
+    const Index *padding, const Index *dilation, const Index *outSpatialShape,
+    Index *out) {
+  Index lowers[NDim];
+  Index uppers[NDim];
+  Index counter[NDim];
+  Index counterSize[NDim];
+  Index pointCounter = 0;
+  Index val;
+  Index numPoints = 1;
+  Index m, offset;
+  bool valid = false;
+#pragma unroll
+  for (unsigned i = 0; i < NDim; ++i) {
+    lowers[i] = input_pos[i] * stride[i] - padding[i];
+    uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
+  }
+#pragma unroll
+  for (unsigned i = 0; i < NDim; ++i) {
+    counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
+    numPoints *= counterSize[i];
+  }
+#pragma unroll
+  for (unsigned i = 0; i < NDim; ++i) {
+    counter[i] = 0;
+  }
+  for (int i = 0; i < numPoints; ++i) {
+    valid = true;
+    m = 1;
+    offset = 0;
+#pragma unroll
+    for (int j = NDim - 1; j >= 0; --j) {
+      val = uppers[j] - counter[j] * dilation[j];
+      out[pointCounter * (NDim + 1) + j] = val;
+      if (val < 0 || (val > outSpatialShape[j] - 1)) {
+        valid = false;
+      }
+      offset += m * (val - lowers[j]) / dilation[j];
+      m *= kernelSize[j];
+    }
+    out[pointCounter * (NDim + 1) + NDim] = offset;
+    if (valid) ++pointCounter;
+    counter[NDim - 1] += 1;
+#pragma unroll
+    for (int c = NDim - 1; c >= 0; --c) {
+      if (counter[c] == counterSize[c] && c > 0) {
+        counter[c - 1] += 1;
+        counter[c] = 0;
+      }
+    }
+  }
+  return pointCounter;
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
+                         tv::TensorView<Index> indicesOut,
+                         tv::TensorView<IndexGrid> gridsOut,
+                         tv::TensorView<Index> indicePairs,
+                         tv::TensorView<Index> indiceNum,
+                         const Index *kernelSize, const Index *stride,
+                         const Index *padding, const Index *dilation,
+                         const Index *outSpatialShape) {
+  // indicesOut: num_active * kernelVolume * (NDim + 1)
+  Index numAct = 0;
+  auto numActIn = indicesIn.dim(0);
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
+  Index *pointPtr = nullptr;
+  for (int j = 0; j < numActIn; ++j) {
+    batchIdx = indicesIn(j, 0);
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+                   spatialVolume * batchIdx;
+      if (gridsOut[index] == -1) {
+        for (unsigned k = 1; k < NDim + 1; ++k) {
+          indicesOut(numAct, k) = pointPtr[k - 1];
+        }
+        indicesOut(numAct, 0) = batchIdx;
+        gridsOut[index] = numAct++;
+      }
+      // indicePairs: [K, 2, L]
+      indicePairs(offset, 0, indiceNum[offset]) = j;
+      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+    }
+  }
+  return numAct;
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
+                           tv::TensorView<Index> indicesOut,
+                           tv::TensorView<IndexGrid> gridsOut,
+                           tv::TensorView<Index> indicePairs,
+                           tv::TensorView<Index> indiceNum,
+                           const Index *kernelSize, const Index *stride,
+                           const Index *padding, const Index *dilation,
+                           const Index *outSpatialShape) {
+  Index numAct = 0;
+  auto numActIn = indicesIn.dim(0);
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
+  Index *pointPtr = nullptr;
+  for (int j = 0; j < numActIn; ++j) {
+    batchIdx = indicesIn(j, 0);
+    numValidPoints = getValidOutPosTranspose<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+                   spatialVolume * batchIdx;
+      if (gridsOut[index] == -1) {
+        for (unsigned k = 1; k < NDim + 1; ++k) {
+          indicesOut(numAct, k) = pointPtr[k - 1];
+        }
+        indicesOut(numAct, 0) = batchIdx;
+        gridsOut[index] = numAct++;
+      }
+      // indicePairs: [K, 2, L]
+      indicePairs(offset, 0, indiceNum[offset]) = j;
+      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+    }
+  }
+  return numAct;
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
+                         tv::TensorView<IndexGrid> gridsOut,
+                         tv::TensorView<Index> indicePairs,
+                         tv::TensorView<Index> indiceNum,
+                         const Index *const kernelSize,
+                         const Index *const stride, const Index *const padding,
+                         const Index *dilation,
+                         const Index *const outSpatialShape) {
+  auto numActIn = indicesIn.dim(0);
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  // Index validPoints[kernelVolume * (NDim + 1)];
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
+  Index *pointPtr = nullptr;
+  Index index = 0;
+  for (int j = 0; j < numActIn; ++j) {
+    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
+                                         outSpatialShape) +
+            spatialVolume * indicesIn(j, 0);
+    gridsOut[index] = j;
+  }
+  for (int j = 0; j < numActIn; ++j) {
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+              spatialVolume * indicesIn(j, 0);
+      if (gridsOut[index] > -1) {
+        indicePairs(offset, 0, indiceNum[offset]) = j;
+        indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+      }
+    }
+  }
+  return numActIn;
+}
+
+#endif
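To make the index arithmetic in getValidOutPos concrete, here is a standalone 1-D check (no tensorview dependency; it mirrors the lowers/uppers/offset formulas above). With kernelSize 3, stride 2, padding 1, dilation 1, outSpatialShape 4 and input position x = 3, it finds output positions 2 and 1 with kernel-tap offsets 0 and 2, and each valid pair satisfies out * stride - padding + tap * dilation == x.

#include <cstdio>

int main() {
  const int k = 3, s = 2, p = 1, d = 1, outShape = 4, x = 3;
  // Same integer-division formulas as the header above, for NDim == 1.
  const int lower = (x - (k - 1) * d - 1 + s + p) / s;  // == 1
  const int upper = (x + p) / s;                        // == 2
  for (int out = upper; out >= lower; --out) {
    const int tap = (x - out * s + p) / d;  // flattened kernel offset
    const bool valid = out >= 0 && out <= outShape - 1;
    std::printf("out=%d tap=%d valid=%d\n", out, tap, valid);
  }
}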
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h
new file mode 100644
index 0000000000000000000000000000000000000000..96ce34e3b456f0c999002bd53b8b1a6ab082edae
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h
@@ -0,0 +1,78 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
+#define SPARSE_CONV_INDICE_FUNCTOR_H_
+#include <utils/spconv/tensorview/tensorview.h>
+
+namespace functor {
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP1 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP2 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateSubMIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+}  // namespace functor
+
+#endif
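The structs above are only declarations; each backend supplies the operator() definition, in the usual device-dispatch functor pattern. Below is a hedged sketch of what a host-side specialization could look like, forwarding to geometry.h. The tv::CPU device tag, SimpleVector::data(), and the include paths are assumptions based on the tensorview conventions in this patch; the real definitions live in the per-backend sources, not here.

#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/spconv/indice.h>

namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid = false) {
    // Dispatch to the header-only reference implementation in geometry.h;
    // resetGrid handling is omitted in this sketch.
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    return getIndicePairsConv<Index, IndexGrid, NDim>(
        indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
        kernelSize.data(), stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data());
  }
};
}  // namespace functor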
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..78f32edd4db70724d38826809672aa461a6d065e
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h
@@ -0,0 +1,37 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
+#define SPARSE_MAXPOOL_FUNCTOR_H_
+#include <utils/spconv/tensorview/tensorview.h>
+
+namespace functor {
+template <typename Device, typename T, typename Index>
+struct SparseMaxPoolForwardFunctor {
+  void operator()(const Device& d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const Index> indices, int size);
+};
+
+template <typename Device, typename T, typename Index>
+struct SparseMaxPoolBackwardFunctor {
+  void operator()(const Device& d, tv::TensorView<const T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const T> fout,
+                  tv::TensorView<T> fin,
+                  tv::TensorView<const Index> indices, int size);
+};
+}  // namespace functor
+
+#endif
diff --git a/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..8262b30efb5e127d7e079ebdde0693c671fb96d6
--- /dev/null
+++ b/head_extractor/mmcv-2.1.0/build/lib.linux-x86_64-cpython-311/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h
@@ -0,0 +1,50 @@
+#ifndef MP_HELPER_H_
+#define MP_HELPER_H_
+#include <type_traits>
+#include <utility>
+
+template <class... T>
+struct mp_list {};
+
+template <class T, T... I>
+using mp_list_c = mp_list<std::integral_constant<T, I>...>;
+
+namespace detail {
+
+template <class... T, class F>
+constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
+  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
+}
+
+template <class F>
+constexpr F mp_for_each_impl(mp_list<>, F &&f) {
+  return std::forward<F>(f);
+}
+
+}  // namespace detail
+
+namespace detail {
+
+template <class A, template <class...> class B>
+struct mp_rename_impl {
+  // An error "no type named 'type'" here means that the first argument to
+  // mp_rename is not a list
+};
+
+template